mcp-scraper 0.1.7 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/bin/api-server.cjs +388 -75
- package/dist/bin/api-server.cjs.map +1 -1
- package/dist/bin/api-server.js +2 -2
- package/dist/bin/mcp-stdio-server.cjs +243 -11
- package/dist/bin/mcp-stdio-server.cjs.map +1 -1
- package/dist/bin/mcp-stdio-server.js +1 -1
- package/dist/bin/paa-harvest.cjs +14 -4
- package/dist/bin/paa-harvest.cjs.map +1 -1
- package/dist/bin/paa-harvest.js +4 -3
- package/dist/bin/paa-harvest.js.map +1 -1
- package/dist/{chunk-3OIRNUF5.js → chunk-RE6HCRYC.js} +244 -12
- package/dist/chunk-RE6HCRYC.js.map +1 -0
- package/dist/{chunk-LUBDFS67.js → chunk-TM22BLWP.js} +15 -3
- package/dist/chunk-TM22BLWP.js.map +1 -0
- package/dist/index.cjs +12 -2
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +1 -1
- package/dist/{server-YNJHP5PU.js → server-QXVVTKJP.js} +80 -30
- package/dist/server-QXVVTKJP.js.map +1 -0
- package/dist/{worker-PBG6LGET.js → worker-AUCXFHEL.js} +4 -3
- package/dist/worker-AUCXFHEL.js.map +1 -0
- package/package.json +1 -1
- package/dist/chunk-3OIRNUF5.js.map +0 -1
- package/dist/chunk-LUBDFS67.js.map +0 -1
- package/dist/server-YNJHP5PU.js.map +0 -1
- package/dist/worker-PBG6LGET.js.map +0 -1
package/dist/bin/paa-harvest.js
CHANGED
|
@@ -1,13 +1,14 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
import {
|
|
3
|
+
browserServiceApiKey,
|
|
3
4
|
harvest
|
|
4
|
-
} from "../chunk-LUBDFS67.js";
|
|
5
|
+
} from "../chunk-TM22BLWP.js";
|
|
5
6
|
import "../chunk-ZMOWIBMK.js";
|
|
6
7
|
|
|
7
8
|
// src/cli.ts
|
|
8
9
|
import { Command } from "commander";
|
|
9
10
|
var program = new Command();
|
|
10
|
-
program.name("paa-harvest").description("Recursively extract Google People Also Ask questions").requiredOption("-q, --query <query>", "Seed query").option("-l, --location <location>", 'Location name (e.g. "austin" or "Austin,Texas,United States")').option("--gl <gl>", "Google country code", "us").option("--hl <hl>", "Google language code", "en").option("-d, --depth <depth>", "BFS depth (1-30)", "3").option("-m, --max-questions <n>", "Max questions to harvest", "100").option("-o, --output <dir>", "Output directory", "./paa-output").option("-f, --format <format>", "Output format: json, csv, or both", "both").option("--headless", "Run browser in headless mode", false).option("--profile <dir>", "Persistent browser profile directory").option("--proxy <url>", "Proxy server URL").option("--kernel-api-key <key>", "
|
|
11
|
+
program.name("paa-harvest").description("Recursively extract Google People Also Ask questions").requiredOption("-q, --query <query>", "Seed query").option("-l, --location <location>", 'Location name (e.g. "austin" or "Austin,Texas,United States")').option("--gl <gl>", "Google country code", "us").option("--hl <hl>", "Google language code", "en").option("-d, --depth <depth>", "BFS depth (1-30)", "3").option("-m, --max-questions <n>", "Max questions to harvest", "100").option("-o, --output <dir>", "Output directory", "./paa-output").option("-f, --format <format>", "Output format: json, csv, or both", "both").option("--headless", "Run browser in headless mode", false).option("--profile <dir>", "Persistent browser profile directory").option("--proxy <url>", "Proxy server URL").option("--kernel-api-key <key>", "Browser service API key (or set BROWSER_SERVICE_API_KEY env var)").action(async (opts) => {
|
|
11
12
|
try {
|
|
12
13
|
const result = await harvest({
|
|
13
14
|
query: opts.query,
|
|
@@ -21,7 +22,7 @@ program.name("paa-harvest").description("Recursively extract Google People Also
|
|
|
21
22
|
headless: opts.headless,
|
|
22
23
|
profileDir: opts.profile,
|
|
23
24
|
proxy: opts.proxy,
|
|
24
|
-
kernelApiKey: opts.kernelApiKey ??
|
|
25
|
+
kernelApiKey: opts.kernelApiKey ?? browserServiceApiKey()
|
|
25
26
|
});
|
|
26
27
|
console.log(JSON.stringify({ totalQuestions: result.totalQuestions, outputDir: result.stats.seed }));
|
|
27
28
|
} catch (err) {
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../../src/cli.ts","../../bin/paa-harvest.ts"],"sourcesContent":["import { Command } from 'commander'\nimport { harvest } from './harvest.js'\n\nconst program = new Command()\n\nprogram\n .name('paa-harvest')\n .description('Recursively extract Google People Also Ask questions')\n .requiredOption('-q, --query <query>', 'Seed query')\n .option('-l, --location <location>', 'Location name (e.g. \"austin\" or \"Austin,Texas,United States\")')\n .option('--gl <gl>', 'Google country code', 'us')\n .option('--hl <hl>', 'Google language code', 'en')\n .option('-d, --depth <depth>', 'BFS depth (1-30)', '3')\n .option('-m, --max-questions <n>', 'Max questions to harvest', '100')\n .option('-o, --output <dir>', 'Output directory', './paa-output')\n .option('-f, --format <format>', 'Output format: json, csv, or both', 'both')\n .option('--headless', 'Run browser in headless mode', false)\n .option('--profile <dir>', 'Persistent browser profile directory')\n .option('--proxy <url>', 'Proxy server URL')\n .option('--kernel-api-key <key>', '
|
|
1
|
+
{"version":3,"sources":["../../src/cli.ts","../../bin/paa-harvest.ts"],"sourcesContent":["import { Command } from 'commander'\nimport { browserServiceApiKey } from './lib/browser-service-env.js'\nimport { harvest } from './harvest.js'\n\nconst program = new Command()\n\nprogram\n .name('paa-harvest')\n .description('Recursively extract Google People Also Ask questions')\n .requiredOption('-q, --query <query>', 'Seed query')\n .option('-l, --location <location>', 'Location name (e.g. \"austin\" or \"Austin,Texas,United States\")')\n .option('--gl <gl>', 'Google country code', 'us')\n .option('--hl <hl>', 'Google language code', 'en')\n .option('-d, --depth <depth>', 'BFS depth (1-30)', '3')\n .option('-m, --max-questions <n>', 'Max questions to harvest', '100')\n .option('-o, --output <dir>', 'Output directory', './paa-output')\n .option('-f, --format <format>', 'Output format: json, csv, or both', 'both')\n .option('--headless', 'Run browser in headless mode', false)\n .option('--profile <dir>', 'Persistent browser profile directory')\n .option('--proxy <url>', 'Proxy server URL')\n .option('--kernel-api-key <key>', 'Browser service API key (or set BROWSER_SERVICE_API_KEY env var)')\n .action(async (opts) => {\n try {\n const result = await harvest({\n query: opts.query,\n location: opts.location,\n gl: opts.gl,\n hl: opts.hl,\n depth: parseInt(opts.depth, 10),\n maxQuestions: parseInt(opts.maxQuestions, 10),\n outputDir: opts.output,\n format: opts.format,\n headless: opts.headless,\n profileDir: opts.profile,\n proxy: opts.proxy,\n kernelApiKey: opts.kernelApiKey ?? browserServiceApiKey(),\n })\n console.log(JSON.stringify({ totalQuestions: result.totalQuestions, outputDir: result.stats.seed }))\n } catch (err) {\n console.error(err instanceof Error ? 
err.message : String(err))\n process.exit(1)\n }\n })\n\nexport async function runCli(): Promise<void> {\n await program.parseAsync()\n}\n","#!/usr/bin/env node\nimport { runCli } from '../src/cli.js'\nrunCli()\n"],"mappings":";;;;;;;;AAAA,SAAS,eAAe;AAIxB,IAAM,UAAU,IAAI,QAAQ;AAE5B,QACG,KAAK,aAAa,EAClB,YAAY,sDAAsD,EAClE,eAAe,uBAAuB,YAAY,EAClD,OAAO,6BAA6B,+DAA+D,EACnG,OAAO,aAAa,uBAAuB,IAAI,EAC/C,OAAO,aAAa,wBAAwB,IAAI,EAChD,OAAO,uBAAuB,oBAAoB,GAAG,EACrD,OAAO,2BAA2B,4BAA4B,KAAK,EACnE,OAAO,sBAAsB,oBAAoB,cAAc,EAC/D,OAAO,yBAAyB,qCAAqC,MAAM,EAC3E,OAAO,cAAc,gCAAgC,KAAK,EAC1D,OAAO,mBAAmB,sCAAsC,EAChE,OAAO,iBAAiB,kBAAkB,EAC1C,OAAO,0BAA0B,kEAAkE,EACnG,OAAO,OAAO,SAAS;AACtB,MAAI;AACF,UAAM,SAAS,MAAM,QAAQ;AAAA,MAC3B,OAAO,KAAK;AAAA,MACZ,UAAU,KAAK;AAAA,MACf,IAAI,KAAK;AAAA,MACT,IAAI,KAAK;AAAA,MACT,OAAO,SAAS,KAAK,OAAO,EAAE;AAAA,MAC9B,cAAc,SAAS,KAAK,cAAc,EAAE;AAAA,MAC5C,WAAW,KAAK;AAAA,MAChB,QAAQ,KAAK;AAAA,MACb,UAAU,KAAK;AAAA,MACf,YAAY,KAAK;AAAA,MACjB,OAAO,KAAK;AAAA,MACZ,cAAc,KAAK,gBAAgB,qBAAqB;AAAA,IAC1D,CAAC;AACD,YAAQ,IAAI,KAAK,UAAU,EAAE,gBAAgB,OAAO,gBAAgB,WAAW,OAAO,MAAM,KAAK,CAAC,CAAC;AAAA,EACrG,SAAS,KAAK;AACZ,YAAQ,MAAM,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG,CAAC;AAC9D,YAAQ,KAAK,CAAC;AAAA,EAChB;AACF,CAAC;AAEH,eAAsB,SAAwB;AAC5C,QAAM,QAAQ,WAAW;AAC3B;;;AC5CA,OAAO;","names":[]}
|
|
@@ -20,7 +20,7 @@ function harvestTimeoutBudget(maxQuestions, serpOnly = false) {
|
|
|
20
20
|
import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
|
|
21
21
|
|
|
22
22
|
// src/version.ts
|
|
23
|
-
var PACKAGE_VERSION = "0.1.7";
|
|
23
|
+
var PACKAGE_VERSION = "0.1.8";
|
|
24
24
|
|
|
25
25
|
// src/mcp/mcp-tool-schemas.ts
|
|
26
26
|
import { z } from "zod";
|
|
@@ -116,6 +116,120 @@ var MapsSearchOutputSchema = {
|
|
|
116
116
|
})),
|
|
117
117
|
durationMs: z.number().int().min(0)
|
|
118
118
|
};
|
|
119
|
+
var OrganicResultOutput = z.object({
|
|
120
|
+
position: z.number().int(),
|
|
121
|
+
title: z.string(),
|
|
122
|
+
url: z.string(),
|
|
123
|
+
domain: z.string(),
|
|
124
|
+
snippet: NullableString
|
|
125
|
+
});
|
|
126
|
+
var AiOverviewOutput = z.object({
|
|
127
|
+
detected: z.boolean(),
|
|
128
|
+
text: NullableString
|
|
129
|
+
}).nullable();
|
|
130
|
+
var EntityIdsOutput = z.object({
|
|
131
|
+
kgIds: z.array(z.string()),
|
|
132
|
+
cids: z.array(z.string()),
|
|
133
|
+
gcids: z.array(z.string())
|
|
134
|
+
}).nullable();
|
|
135
|
+
var HarvestPaaOutputSchema = {
|
|
136
|
+
query: z.string(),
|
|
137
|
+
location: NullableString,
|
|
138
|
+
questionCount: z.number().int().min(0),
|
|
139
|
+
completionStatus: NullableString,
|
|
140
|
+
questions: z.array(z.object({
|
|
141
|
+
question: z.string(),
|
|
142
|
+
answer: NullableString,
|
|
143
|
+
sourceTitle: NullableString,
|
|
144
|
+
sourceSite: NullableString
|
|
145
|
+
})),
|
|
146
|
+
organicResults: z.array(OrganicResultOutput),
|
|
147
|
+
aiOverview: AiOverviewOutput,
|
|
148
|
+
entityIds: EntityIdsOutput,
|
|
149
|
+
durationMs: z.number().min(0).nullable()
|
|
150
|
+
};
|
|
151
|
+
var SearchSerpOutputSchema = {
|
|
152
|
+
query: z.string(),
|
|
153
|
+
location: NullableString,
|
|
154
|
+
organicResults: z.array(OrganicResultOutput),
|
|
155
|
+
localPack: z.array(z.object({
|
|
156
|
+
position: z.number().int(),
|
|
157
|
+
name: z.string(),
|
|
158
|
+
rating: NullableString,
|
|
159
|
+
reviewCount: NullableString,
|
|
160
|
+
websiteUrl: NullableString
|
|
161
|
+
})),
|
|
162
|
+
aiOverview: AiOverviewOutput,
|
|
163
|
+
entityIds: EntityIdsOutput
|
|
164
|
+
};
|
|
165
|
+
var ExtractUrlOutputSchema = {
|
|
166
|
+
url: z.string(),
|
|
167
|
+
title: NullableString,
|
|
168
|
+
headings: z.array(z.object({
|
|
169
|
+
level: z.number().int(),
|
|
170
|
+
text: z.string()
|
|
171
|
+
})),
|
|
172
|
+
schemaBlockCount: z.number().int().min(0),
|
|
173
|
+
entityName: NullableString,
|
|
174
|
+
entityTypes: z.array(z.string()),
|
|
175
|
+
napScore: z.number().nullable(),
|
|
176
|
+
missingSchemaFields: z.array(z.string()),
|
|
177
|
+
screenshotSaved: NullableString
|
|
178
|
+
};
|
|
179
|
+
var ExtractSiteOutputSchema = {
|
|
180
|
+
url: z.string(),
|
|
181
|
+
pageCount: z.number().int().min(0),
|
|
182
|
+
pages: z.array(z.object({
|
|
183
|
+
url: z.string(),
|
|
184
|
+
title: NullableString,
|
|
185
|
+
schemaTypes: z.array(z.string())
|
|
186
|
+
})),
|
|
187
|
+
durationMs: z.number().min(0)
|
|
188
|
+
};
|
|
189
|
+
var MapsPlaceIntelOutputSchema = {
|
|
190
|
+
name: z.string(),
|
|
191
|
+
rating: NullableString,
|
|
192
|
+
reviewCount: NullableString,
|
|
193
|
+
category: NullableString,
|
|
194
|
+
address: NullableString,
|
|
195
|
+
phone: NullableString,
|
|
196
|
+
website: NullableString,
|
|
197
|
+
hoursSummary: NullableString,
|
|
198
|
+
bookingUrl: NullableString,
|
|
199
|
+
kgmid: NullableString,
|
|
200
|
+
cidDecimal: NullableString,
|
|
201
|
+
cidUrl: NullableString,
|
|
202
|
+
lat: z.number().nullable(),
|
|
203
|
+
lng: z.number().nullable(),
|
|
204
|
+
reviewsStatus: z.string(),
|
|
205
|
+
reviewsCollected: z.number().int().min(0),
|
|
206
|
+
reviewTopics: z.array(z.object({
|
|
207
|
+
label: z.string(),
|
|
208
|
+
count: z.string()
|
|
209
|
+
}))
|
|
210
|
+
};
|
|
211
|
+
var CreditsInfoOutputSchema = {
|
|
212
|
+
balanceCredits: z.number().nullable(),
|
|
213
|
+
matchedCost: z.object({
|
|
214
|
+
label: z.string(),
|
|
215
|
+
credits: z.number(),
|
|
216
|
+
unit: z.string(),
|
|
217
|
+
notes: NullableString
|
|
218
|
+
}).nullable(),
|
|
219
|
+
costs: z.array(z.object({
|
|
220
|
+
key: z.string(),
|
|
221
|
+
label: z.string(),
|
|
222
|
+
credits: z.number(),
|
|
223
|
+
unit: z.string(),
|
|
224
|
+
notes: NullableString
|
|
225
|
+
})),
|
|
226
|
+
ledger: z.array(z.object({
|
|
227
|
+
createdAt: z.string(),
|
|
228
|
+
operation: z.string(),
|
|
229
|
+
credits: z.number(),
|
|
230
|
+
description: NullableString
|
|
231
|
+
}))
|
|
232
|
+
};
|
|
119
233
|
var MapSiteUrlsOutputSchema = {
|
|
120
234
|
startUrl: z.string(),
|
|
121
235
|
totalFound: z.number().int().min(0),
|
|
@@ -322,7 +436,7 @@ function debugSection(debug) {
|
|
|
322
436
|
if (!debug || typeof debug !== "object") return "";
|
|
323
437
|
const request = debug.request ?? {};
|
|
324
438
|
const browser = debug.browser ?? {};
|
|
325
|
-
const kernel = browser.kernel ?? {};
|
|
439
|
+
const kernel = browser.browserRuntime ?? browser.kernel ?? {};
|
|
326
440
|
const network = browser.networkLocation ?? {};
|
|
327
441
|
const nav = browser.serpNavigation ?? {};
|
|
328
442
|
const proxyResolution = kernel.proxyResolution ?? {};
|
|
@@ -348,12 +462,14 @@ function errorAttemptsSection(body) {
|
|
|
348
462
|
const lines = attempts.slice(0, 5).map((attempt) => {
|
|
349
463
|
const debug = attempt.debug ?? {};
|
|
350
464
|
const browser = debug.browser ?? {};
|
|
351
|
-
const kernel = browser.kernel ?? {};
|
|
465
|
+
const kernel = browser.browserRuntime ?? browser.kernel ?? {};
|
|
352
466
|
const proxyResolution = kernel.proxyResolution ?? {};
|
|
353
467
|
const network = browser.networkLocation ?? {};
|
|
354
468
|
const nav = browser.serpNavigation ?? {};
|
|
355
469
|
const geo = [network.ip, network.city, network.region].filter(Boolean).join(" / ") || "geo unknown";
|
|
356
|
-
|
|
470
|
+
const sessionId = attempt.browser_session_id ?? attempt.kernel_session_id ?? kernel.sessionId ?? "unknown";
|
|
471
|
+
const cleanupSucceeded = attempt.session_cleanup_succeeded ?? attempt.kernel_delete_succeeded;
|
|
472
|
+
return `- Attempt ${attempt.attempt_number ?? "?"}: ${attempt.outcome ?? attempt.status ?? "unknown"} \xB7 session ${sessionId} \xB7 proxy ${debug.request?.proxyMode ?? kernel.proxyMode ?? "unknown"}${proxyResolution.source ? `/${proxyResolution.source}` : ""} \xB7 ${geo} \xB7 CAPTCHA ${nav.captchaDetected === true ? "yes" : nav.captchaDetected === false ? "no" : "unknown"} \xB7 cleanup ${cleanupSucceeded === true ? "yes" : cleanupSucceeded === false ? "no" : "unknown"}`;
|
|
357
473
|
});
|
|
358
474
|
return `
|
|
359
475
|
|
|
@@ -400,7 +516,31 @@ ${serpRows}` : "";
|
|
|
400
516
|
const full = `# PAA Report: "${input.query}"${input.location ? ` \xB7 ${input.location}` : ""}
|
|
401
517
|
|
|
402
518
|
${paaTable}${serpTable}${entityIdsSection(entityIds)}${aiSection}${statsLine}${debugSection(diagnostics?.debug)}${tips}`;
|
|
403
|
-
return
|
|
519
|
+
return {
|
|
520
|
+
...oneBlock(full),
|
|
521
|
+
structuredContent: {
|
|
522
|
+
query: input.query,
|
|
523
|
+
location: input.location ?? null,
|
|
524
|
+
questionCount: flat.length,
|
|
525
|
+
completionStatus: diagnostics?.completionStatus ?? null,
|
|
526
|
+
questions: flat.map((r) => ({
|
|
527
|
+
question: String(r.question ?? ""),
|
|
528
|
+
answer: r.answer ?? null,
|
|
529
|
+
sourceTitle: r.source_title ?? null,
|
|
530
|
+
sourceSite: r.source_site ?? null
|
|
531
|
+
})),
|
|
532
|
+
organicResults: organic.map((r) => ({
|
|
533
|
+
position: Number(r.position) || 0,
|
|
534
|
+
title: String(r.title ?? ""),
|
|
535
|
+
url: String(r.url ?? ""),
|
|
536
|
+
domain: String(r.domain ?? ""),
|
|
537
|
+
snippet: r.snippet ?? null
|
|
538
|
+
})),
|
|
539
|
+
aiOverview: aiOvw ? { detected: aiOvw.detected === true, text: aiOvw.text ?? null } : null,
|
|
540
|
+
entityIds: entityIds ? { kgIds: entityIds.kgIds ?? [], cids: entityIds.cids ?? [], gcids: entityIds.gcids ?? [] } : null,
|
|
541
|
+
durationMs: durationMs ?? null
|
|
542
|
+
}
|
|
543
|
+
};
|
|
404
544
|
}
|
|
405
545
|
function formatSearchSerp(raw, input) {
|
|
406
546
|
const parsed = parseData(raw);
|
|
@@ -438,7 +578,29 @@ ${localRows}` : "";
|
|
|
438
578
|
const full = `# SERP Report: "${input.query}"${input.location ? ` \xB7 ${input.location}` : ""}
|
|
439
579
|
|
|
440
580
|
${serpTable}${localSection}${entityIdsSection(entityIds)}${aiSection}${debugSection(diagnostics?.debug)}${tips}`;
|
|
441
|
-
return
|
|
581
|
+
return {
|
|
582
|
+
...oneBlock(full),
|
|
583
|
+
structuredContent: {
|
|
584
|
+
query: input.query,
|
|
585
|
+
location: input.location ?? null,
|
|
586
|
+
organicResults: organic.map((r) => ({
|
|
587
|
+
position: Number(r.position) || 0,
|
|
588
|
+
title: String(r.title ?? ""),
|
|
589
|
+
url: String(r.url ?? ""),
|
|
590
|
+
domain: String(r.domain ?? ""),
|
|
591
|
+
snippet: r.snippet ?? null
|
|
592
|
+
})),
|
|
593
|
+
localPack: localPack.map((b) => ({
|
|
594
|
+
position: Number(b.position) || 0,
|
|
595
|
+
name: String(b.name ?? ""),
|
|
596
|
+
rating: b.rating ?? null,
|
|
597
|
+
reviewCount: b.reviewCount ?? null,
|
|
598
|
+
websiteUrl: b.websiteUrl ?? null
|
|
599
|
+
})),
|
|
600
|
+
aiOverview: aiOvw ? { detected: aiOvw.detected === true, text: aiOvw.text ?? null } : null,
|
|
601
|
+
entityIds: entityIds ? { kgIds: entityIds.kgIds ?? [], cids: entityIds.cids ?? [], gcids: entityIds.gcids ?? [] } : null
|
|
602
|
+
}
|
|
603
|
+
};
|
|
442
604
|
}
|
|
443
605
|
function formatExtractUrl(raw, input) {
|
|
444
606
|
const parsed = parseData(raw);
|
|
@@ -507,15 +669,27 @@ ${bodyMd.slice(0, 3e3)}${bodyMd.length > 3e3 ? "\n\n*(truncated)*" : ""}` : "";
|
|
|
507
669
|
**${title}**
|
|
508
670
|
${headingSection}${kpoSection}${brandingSection}${bodySection}${screenshotSection}${mediaSection}${tips}`;
|
|
509
671
|
const textResult = oneBlock(full);
|
|
672
|
+
const structuredContent = {
|
|
673
|
+
url,
|
|
674
|
+
title: d.title ?? null,
|
|
675
|
+
headings: headings.map((h) => ({ level: Number(h.level) || 0, text: String(h.text ?? "") })),
|
|
676
|
+
schemaBlockCount: schemaCount,
|
|
677
|
+
entityName: kpo?.entityName ?? null,
|
|
678
|
+
entityTypes: kpo?.type ?? [],
|
|
679
|
+
napScore: kpo?.napScore ?? null,
|
|
680
|
+
missingSchemaFields: kpo?.missingFields ?? [],
|
|
681
|
+
screenshotSaved: screenshotPath ?? null
|
|
682
|
+
};
|
|
510
683
|
if (screenshotMeta?.base64) {
|
|
511
684
|
return {
|
|
512
685
|
content: [
|
|
513
686
|
...textResult.content,
|
|
514
687
|
{ type: "image", data: screenshotMeta.base64, mimeType: "image/png" }
|
|
515
|
-
]
|
|
688
|
+
],
|
|
689
|
+
structuredContent
|
|
516
690
|
};
|
|
517
691
|
}
|
|
518
|
-
return textResult;
|
|
692
|
+
return { ...textResult, structuredContent };
|
|
519
693
|
}
|
|
520
694
|
function formatMapSiteUrls(raw, input) {
|
|
521
695
|
const parsed = parseData(raw);
|
|
@@ -585,7 +759,19 @@ ${pageRows}`,
|
|
|
585
759
|
- Map URLs first: use \`map_site_urls\`
|
|
586
760
|
- Inspect a single page: use \`extract_url\``
|
|
587
761
|
].join("\n");
|
|
588
|
-
return
|
|
762
|
+
return {
|
|
763
|
+
...oneBlock(full),
|
|
764
|
+
structuredContent: {
|
|
765
|
+
url: input.url,
|
|
766
|
+
pageCount: pages.length,
|
|
767
|
+
pages: pages.map((p) => ({
|
|
768
|
+
url: String(p.url ?? ""),
|
|
769
|
+
title: p.title ?? null,
|
|
770
|
+
schemaTypes: p.kpo?.type ?? []
|
|
771
|
+
})),
|
|
772
|
+
durationMs: d.durationMs ?? 0
|
|
773
|
+
}
|
|
774
|
+
};
|
|
589
775
|
}
|
|
590
776
|
function formatYoutubeHarvest(raw, input) {
|
|
591
777
|
const parsed = parseData(raw);
|
|
@@ -782,7 +968,26 @@ ${costRows}` : "",
|
|
|
782
968
|
|------|-----------|---------|-------------|
|
|
783
969
|
${ledgerRows}` : ""
|
|
784
970
|
].filter(Boolean).join("\n");
|
|
785
|
-
return
|
|
971
|
+
return {
|
|
972
|
+
...oneBlock(full),
|
|
973
|
+
structuredContent: {
|
|
974
|
+
balanceCredits: typeof balance === "number" ? balance : null,
|
|
975
|
+
matchedCost: matched ? { label: matched.label, credits: matched.credits, unit: matched.unit, notes: matched.notes ?? null } : null,
|
|
976
|
+
costs: costs.map((c) => ({
|
|
977
|
+
key: c.key,
|
|
978
|
+
label: c.label,
|
|
979
|
+
credits: c.credits,
|
|
980
|
+
unit: c.unit,
|
|
981
|
+
notes: c.notes ?? null
|
|
982
|
+
})),
|
|
983
|
+
ledger: ledger.map((row) => ({
|
|
984
|
+
createdAt: String(row.created_at ?? ""),
|
|
985
|
+
operation: String(row.operation ?? ""),
|
|
986
|
+
credits: row.amount_mc / 1e3,
|
|
987
|
+
description: row.description ?? null
|
|
988
|
+
}))
|
|
989
|
+
}
|
|
990
|
+
};
|
|
786
991
|
}
|
|
787
992
|
function formatMapsSearch(raw, input) {
|
|
788
993
|
const parsed = parseData(raw);
|
|
@@ -931,7 +1136,28 @@ ${entitySection}` : null,
|
|
|
931
1136
|
---
|
|
932
1137
|
*Extracted in ${(durationMs / 1e3).toFixed(1)}s*` : null
|
|
933
1138
|
].filter(Boolean).join("\n");
|
|
934
|
-
return
|
|
1139
|
+
return {
|
|
1140
|
+
...oneBlock(full),
|
|
1141
|
+
structuredContent: {
|
|
1142
|
+
name,
|
|
1143
|
+
rating: rating ?? null,
|
|
1144
|
+
reviewCount: reviewCount ?? null,
|
|
1145
|
+
category: category ?? null,
|
|
1146
|
+
address: address ?? null,
|
|
1147
|
+
phone: phone ?? null,
|
|
1148
|
+
website: website ?? null,
|
|
1149
|
+
hoursSummary: hoursSummary ?? null,
|
|
1150
|
+
bookingUrl: bookingUrl ?? null,
|
|
1151
|
+
kgmid: kgmid ?? null,
|
|
1152
|
+
cidDecimal: cidDecimal ?? null,
|
|
1153
|
+
cidUrl: cidUrl ?? null,
|
|
1154
|
+
lat: lat ?? null,
|
|
1155
|
+
lng: lng ?? null,
|
|
1156
|
+
reviewsStatus,
|
|
1157
|
+
reviewsCollected: reviews.length,
|
|
1158
|
+
reviewTopics: topics.map((t) => ({ label: String(t.label ?? ""), count: String(t.count ?? "") }))
|
|
1159
|
+
}
|
|
1160
|
+
};
|
|
935
1161
|
}
|
|
936
1162
|
function formatFacebookAdTranscribe(raw, input) {
|
|
937
1163
|
const parsed = parseData(raw);
|
|
@@ -983,18 +1209,21 @@ function buildPaaExtractorMcpServer(executor, options = {}) {
|
|
|
983
1209
|
title: "Google PAA + SERP Harvest",
|
|
984
1210
|
description: withReportNote('Best default tool for Google search research. Extracts People Also Ask questions plus answers/source URLs, organic SERP, local pack when present, entity IDs (CID/GCID/KG MID), and AI Overview. Infer the user language: split topic from location (e.g. "best hvac company in Denver CO" => query "best hvac company", location "Denver, CO", gl "us", hl "en"). Use maxQuestions 30 normally, 100-150 for "full", "deep", "all", or comprehensive research. Credits are charged by extracted question; unused request hold is refunded.'),
|
|
985
1211
|
inputSchema: HarvestPaaInputSchema,
|
|
1212
|
+
outputSchema: HarvestPaaOutputSchema,
|
|
986
1213
|
annotations: liveWebToolAnnotations("Google PAA + SERP Harvest")
|
|
987
1214
|
}, async (input) => formatHarvestPaa(await executor.harvestPaa(input), input));
|
|
988
1215
|
server.registerTool("search_serp", {
|
|
989
1216
|
title: "Google SERP Lookup",
|
|
990
1217
|
description: withReportNote("Fast Google SERP lookup without PAA expansion. Use when the user asks for rankings, organic results, local pack, quick SERP, or positions. Split topic from location and infer gl/hl from the user request."),
|
|
991
1218
|
inputSchema: SearchSerpInputSchema,
|
|
1219
|
+
outputSchema: SearchSerpOutputSchema,
|
|
992
1220
|
annotations: liveWebToolAnnotations("Google SERP Lookup")
|
|
993
1221
|
}, async (input) => formatSearchSerp(await executor.searchSerp(input), input));
|
|
994
1222
|
server.registerTool("extract_url", {
|
|
995
1223
|
title: "Single URL Extract",
|
|
996
1224
|
description: withReportNote("Extract structured data from one public URL: page content as Markdown, heading structure, JSON-LD schema, entity details, NAP score, metadata, and missing schema fields. Use when the user provides a single URL or asks to inspect/scrape one page."),
|
|
997
1225
|
inputSchema: ExtractUrlInputSchema,
|
|
1226
|
+
outputSchema: ExtractUrlOutputSchema,
|
|
998
1227
|
annotations: liveWebToolAnnotations("Single URL Extract")
|
|
999
1228
|
}, async (input) => formatExtractUrl(await executor.extractUrl(input), input));
|
|
1000
1229
|
server.registerTool("map_site_urls", {
|
|
@@ -1008,6 +1237,7 @@ function buildPaaExtractorMcpServer(executor, options = {}) {
|
|
|
1008
1237
|
title: "Multi-Page Site Extract",
|
|
1009
1238
|
description: withReportNote("Run multi-page extraction across a public website. Returns per-page titles, H1s, metadata, headings, schema/entity data, canonical URLs, and content. Use for website audits, competitor audits, and full-site extraction."),
|
|
1010
1239
|
inputSchema: ExtractSiteInputSchema,
|
|
1240
|
+
outputSchema: ExtractSiteOutputSchema,
|
|
1011
1241
|
annotations: liveWebToolAnnotations("Multi-Page Site Extract")
|
|
1012
1242
|
}, async (input) => formatExtractSite(await executor.extractSite(input), input));
|
|
1013
1243
|
server.registerTool("youtube_harvest", {
|
|
@@ -1047,6 +1277,7 @@ function buildPaaExtractorMcpServer(executor, options = {}) {
|
|
|
1047
1277
|
title: "Google Maps Business Profile Details",
|
|
1048
1278
|
description: withReportNote('Extract Google Maps business intelligence for one known/named business: rating, review count, category, address, phone, website, hours, booking URL, review histogram, review topics, about attributes, entity IDs, and optional review cards. Do not use this for category searches, local market prospect lists, or requests for multiple GMB/GBP profiles; use maps_search first for those. Split business name from location (e.g. "Elite Roofing Denver CO" => businessName "Elite Roofing", location "Denver, CO"). Pass includeReviews true when the user asks for reviews/customer pain.'),
|
|
1049
1279
|
inputSchema: MapsPlaceIntelInputSchema,
|
|
1280
|
+
outputSchema: MapsPlaceIntelOutputSchema,
|
|
1050
1281
|
annotations: liveWebToolAnnotations("Google Maps Business Profile Details")
|
|
1051
1282
|
}, async (input) => formatMapsPlaceIntel(await executor.mapsPlaceIntel(input), input));
|
|
1052
1283
|
server.registerTool("maps_search", {
|
|
@@ -1060,6 +1291,7 @@ function buildPaaExtractorMcpServer(executor, options = {}) {
|
|
|
1060
1291
|
title: "MCP Scraper Credits & Costs",
|
|
1061
1292
|
description: "Answer questions about MCP Scraper credits: current credit balance, what a specific tool/action costs, the full cost table, and optionally recent credit ledger entries. Does not expose payment methods or credit card information.",
|
|
1062
1293
|
inputSchema: CreditsInfoInputSchema,
|
|
1294
|
+
outputSchema: CreditsInfoOutputSchema,
|
|
1063
1295
|
annotations: {
|
|
1064
1296
|
title: "MCP Scraper Credits & Costs",
|
|
1065
1297
|
readOnlyHint: true,
|
|
@@ -1183,4 +1415,4 @@ export {
|
|
|
1183
1415
|
buildPaaExtractorMcpServer,
|
|
1184
1416
|
HttpMcpToolExecutor
|
|
1185
1417
|
};
|
|
1186
|
-
//# sourceMappingURL=chunk-3OIRNUF5.js.map
|
|
1418
|
+
//# sourceMappingURL=chunk-RE6HCRYC.js.map
|