mcp-scraper 0.1.7 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -17,8 +17,8 @@ loadDotEnv();
17
17
  async function main() {
18
18
  const [{ serve }, { app }, { startWorker }, { migrate }] = await Promise.all([
19
19
  import("@hono/node-server"),
20
- import("../server-YNJHP5PU.js"),
21
- import("../worker-PBG6LGET.js"),
20
+ import("../server-QXVVTKJP.js"),
21
+ import("../worker-AUCXFHEL.js"),
22
22
  import("../db-YWCNHBLH.js")
23
23
  ]);
24
24
  const PORT = parseInt(process.env.PORT ?? "3001");
@@ -128,7 +128,7 @@ var HttpMcpToolExecutor = class {
128
128
  var import_mcp = require("@modelcontextprotocol/sdk/server/mcp.js");
129
129
 
130
130
  // src/version.ts
131
- var PACKAGE_VERSION = "0.1.7";
131
+ var PACKAGE_VERSION = "0.1.8";
132
132
 
133
133
  // src/mcp/mcp-tool-schemas.ts
134
134
  var import_zod = require("zod");
@@ -224,6 +224,120 @@ var MapsSearchOutputSchema = {
224
224
  })),
225
225
  durationMs: import_zod.z.number().int().min(0)
226
226
  };
227
+ var OrganicResultOutput = import_zod.z.object({
228
+ position: import_zod.z.number().int(),
229
+ title: import_zod.z.string(),
230
+ url: import_zod.z.string(),
231
+ domain: import_zod.z.string(),
232
+ snippet: NullableString
233
+ });
234
+ var AiOverviewOutput = import_zod.z.object({
235
+ detected: import_zod.z.boolean(),
236
+ text: NullableString
237
+ }).nullable();
238
+ var EntityIdsOutput = import_zod.z.object({
239
+ kgIds: import_zod.z.array(import_zod.z.string()),
240
+ cids: import_zod.z.array(import_zod.z.string()),
241
+ gcids: import_zod.z.array(import_zod.z.string())
242
+ }).nullable();
243
+ var HarvestPaaOutputSchema = {
244
+ query: import_zod.z.string(),
245
+ location: NullableString,
246
+ questionCount: import_zod.z.number().int().min(0),
247
+ completionStatus: NullableString,
248
+ questions: import_zod.z.array(import_zod.z.object({
249
+ question: import_zod.z.string(),
250
+ answer: NullableString,
251
+ sourceTitle: NullableString,
252
+ sourceSite: NullableString
253
+ })),
254
+ organicResults: import_zod.z.array(OrganicResultOutput),
255
+ aiOverview: AiOverviewOutput,
256
+ entityIds: EntityIdsOutput,
257
+ durationMs: import_zod.z.number().min(0).nullable()
258
+ };
259
+ var SearchSerpOutputSchema = {
260
+ query: import_zod.z.string(),
261
+ location: NullableString,
262
+ organicResults: import_zod.z.array(OrganicResultOutput),
263
+ localPack: import_zod.z.array(import_zod.z.object({
264
+ position: import_zod.z.number().int(),
265
+ name: import_zod.z.string(),
266
+ rating: NullableString,
267
+ reviewCount: NullableString,
268
+ websiteUrl: NullableString
269
+ })),
270
+ aiOverview: AiOverviewOutput,
271
+ entityIds: EntityIdsOutput
272
+ };
273
+ var ExtractUrlOutputSchema = {
274
+ url: import_zod.z.string(),
275
+ title: NullableString,
276
+ headings: import_zod.z.array(import_zod.z.object({
277
+ level: import_zod.z.number().int(),
278
+ text: import_zod.z.string()
279
+ })),
280
+ schemaBlockCount: import_zod.z.number().int().min(0),
281
+ entityName: NullableString,
282
+ entityTypes: import_zod.z.array(import_zod.z.string()),
283
+ napScore: import_zod.z.number().nullable(),
284
+ missingSchemaFields: import_zod.z.array(import_zod.z.string()),
285
+ screenshotSaved: NullableString
286
+ };
287
+ var ExtractSiteOutputSchema = {
288
+ url: import_zod.z.string(),
289
+ pageCount: import_zod.z.number().int().min(0),
290
+ pages: import_zod.z.array(import_zod.z.object({
291
+ url: import_zod.z.string(),
292
+ title: NullableString,
293
+ schemaTypes: import_zod.z.array(import_zod.z.string())
294
+ })),
295
+ durationMs: import_zod.z.number().min(0)
296
+ };
297
+ var MapsPlaceIntelOutputSchema = {
298
+ name: import_zod.z.string(),
299
+ rating: NullableString,
300
+ reviewCount: NullableString,
301
+ category: NullableString,
302
+ address: NullableString,
303
+ phone: NullableString,
304
+ website: NullableString,
305
+ hoursSummary: NullableString,
306
+ bookingUrl: NullableString,
307
+ kgmid: NullableString,
308
+ cidDecimal: NullableString,
309
+ cidUrl: NullableString,
310
+ lat: import_zod.z.number().nullable(),
311
+ lng: import_zod.z.number().nullable(),
312
+ reviewsStatus: import_zod.z.string(),
313
+ reviewsCollected: import_zod.z.number().int().min(0),
314
+ reviewTopics: import_zod.z.array(import_zod.z.object({
315
+ label: import_zod.z.string(),
316
+ count: import_zod.z.string()
317
+ }))
318
+ };
319
+ var CreditsInfoOutputSchema = {
320
+ balanceCredits: import_zod.z.number().nullable(),
321
+ matchedCost: import_zod.z.object({
322
+ label: import_zod.z.string(),
323
+ credits: import_zod.z.number(),
324
+ unit: import_zod.z.string(),
325
+ notes: NullableString
326
+ }).nullable(),
327
+ costs: import_zod.z.array(import_zod.z.object({
328
+ key: import_zod.z.string(),
329
+ label: import_zod.z.string(),
330
+ credits: import_zod.z.number(),
331
+ unit: import_zod.z.string(),
332
+ notes: NullableString
333
+ })),
334
+ ledger: import_zod.z.array(import_zod.z.object({
335
+ createdAt: import_zod.z.string(),
336
+ operation: import_zod.z.string(),
337
+ credits: import_zod.z.number(),
338
+ description: NullableString
339
+ }))
340
+ };
227
341
  var MapSiteUrlsOutputSchema = {
228
342
  startUrl: import_zod.z.string(),
229
343
  totalFound: import_zod.z.number().int().min(0),
@@ -434,7 +548,7 @@ function debugSection(debug) {
434
548
  if (!debug || typeof debug !== "object") return "";
435
549
  const request = debug.request ?? {};
436
550
  const browser = debug.browser ?? {};
437
- const kernel = browser.kernel ?? {};
551
+ const kernel = browser.browserRuntime ?? browser.kernel ?? {};
438
552
  const network = browser.networkLocation ?? {};
439
553
  const nav = browser.serpNavigation ?? {};
440
554
  const proxyResolution = kernel.proxyResolution ?? {};
@@ -460,12 +574,14 @@ function errorAttemptsSection(body) {
460
574
  const lines = attempts.slice(0, 5).map((attempt) => {
461
575
  const debug = attempt.debug ?? {};
462
576
  const browser = debug.browser ?? {};
463
- const kernel = browser.kernel ?? {};
577
+ const kernel = browser.browserRuntime ?? browser.kernel ?? {};
464
578
  const proxyResolution = kernel.proxyResolution ?? {};
465
579
  const network = browser.networkLocation ?? {};
466
580
  const nav = browser.serpNavigation ?? {};
467
581
  const geo = [network.ip, network.city, network.region].filter(Boolean).join(" / ") || "geo unknown";
468
- return `- Attempt ${attempt.attempt_number ?? "?"}: ${attempt.outcome ?? attempt.status ?? "unknown"} \xB7 session ${attempt.kernel_session_id ?? kernel.sessionId ?? "unknown"} \xB7 proxy ${debug.request?.proxyMode ?? kernel.proxyMode ?? "unknown"}${proxyResolution.source ? `/${proxyResolution.source}` : ""} \xB7 ${geo} \xB7 CAPTCHA ${nav.captchaDetected === true ? "yes" : nav.captchaDetected === false ? "no" : "unknown"} \xB7 deleted ${attempt.kernel_delete_succeeded === true ? "yes" : attempt.kernel_delete_succeeded === false ? "no" : "unknown"}`;
582
+ const sessionId = attempt.browser_session_id ?? attempt.kernel_session_id ?? kernel.sessionId ?? "unknown";
583
+ const cleanupSucceeded = attempt.session_cleanup_succeeded ?? attempt.kernel_delete_succeeded;
584
+ return `- Attempt ${attempt.attempt_number ?? "?"}: ${attempt.outcome ?? attempt.status ?? "unknown"} \xB7 session ${sessionId} \xB7 proxy ${debug.request?.proxyMode ?? kernel.proxyMode ?? "unknown"}${proxyResolution.source ? `/${proxyResolution.source}` : ""} \xB7 ${geo} \xB7 CAPTCHA ${nav.captchaDetected === true ? "yes" : nav.captchaDetected === false ? "no" : "unknown"} \xB7 cleanup ${cleanupSucceeded === true ? "yes" : cleanupSucceeded === false ? "no" : "unknown"}`;
469
585
  });
470
586
  return `
471
587
 
@@ -512,7 +628,31 @@ ${serpRows}` : "";
512
628
  const full = `# PAA Report: "${input.query}"${input.location ? ` \xB7 ${input.location}` : ""}
513
629
 
514
630
  ${paaTable}${serpTable}${entityIdsSection(entityIds)}${aiSection}${statsLine}${debugSection(diagnostics?.debug)}${tips}`;
515
- return oneBlock(full);
631
+ return {
632
+ ...oneBlock(full),
633
+ structuredContent: {
634
+ query: input.query,
635
+ location: input.location ?? null,
636
+ questionCount: flat.length,
637
+ completionStatus: diagnostics?.completionStatus ?? null,
638
+ questions: flat.map((r) => ({
639
+ question: String(r.question ?? ""),
640
+ answer: r.answer ?? null,
641
+ sourceTitle: r.source_title ?? null,
642
+ sourceSite: r.source_site ?? null
643
+ })),
644
+ organicResults: organic.map((r) => ({
645
+ position: Number(r.position) || 0,
646
+ title: String(r.title ?? ""),
647
+ url: String(r.url ?? ""),
648
+ domain: String(r.domain ?? ""),
649
+ snippet: r.snippet ?? null
650
+ })),
651
+ aiOverview: aiOvw ? { detected: aiOvw.detected === true, text: aiOvw.text ?? null } : null,
652
+ entityIds: entityIds ? { kgIds: entityIds.kgIds ?? [], cids: entityIds.cids ?? [], gcids: entityIds.gcids ?? [] } : null,
653
+ durationMs: durationMs ?? null
654
+ }
655
+ };
516
656
  }
517
657
  function formatSearchSerp(raw, input) {
518
658
  const parsed = parseData(raw);
@@ -550,7 +690,29 @@ ${localRows}` : "";
550
690
  const full = `# SERP Report: "${input.query}"${input.location ? ` \xB7 ${input.location}` : ""}
551
691
 
552
692
  ${serpTable}${localSection}${entityIdsSection(entityIds)}${aiSection}${debugSection(diagnostics?.debug)}${tips}`;
553
- return oneBlock(full);
693
+ return {
694
+ ...oneBlock(full),
695
+ structuredContent: {
696
+ query: input.query,
697
+ location: input.location ?? null,
698
+ organicResults: organic.map((r) => ({
699
+ position: Number(r.position) || 0,
700
+ title: String(r.title ?? ""),
701
+ url: String(r.url ?? ""),
702
+ domain: String(r.domain ?? ""),
703
+ snippet: r.snippet ?? null
704
+ })),
705
+ localPack: localPack.map((b) => ({
706
+ position: Number(b.position) || 0,
707
+ name: String(b.name ?? ""),
708
+ rating: b.rating ?? null,
709
+ reviewCount: b.reviewCount ?? null,
710
+ websiteUrl: b.websiteUrl ?? null
711
+ })),
712
+ aiOverview: aiOvw ? { detected: aiOvw.detected === true, text: aiOvw.text ?? null } : null,
713
+ entityIds: entityIds ? { kgIds: entityIds.kgIds ?? [], cids: entityIds.cids ?? [], gcids: entityIds.gcids ?? [] } : null
714
+ }
715
+ };
554
716
  }
555
717
  function formatExtractUrl(raw, input) {
556
718
  const parsed = parseData(raw);
@@ -619,15 +781,27 @@ ${bodyMd.slice(0, 3e3)}${bodyMd.length > 3e3 ? "\n\n*(truncated)*" : ""}` : "";
619
781
  **${title}**
620
782
  ${headingSection}${kpoSection}${brandingSection}${bodySection}${screenshotSection}${mediaSection}${tips}`;
621
783
  const textResult = oneBlock(full);
784
+ const structuredContent = {
785
+ url,
786
+ title: d.title ?? null,
787
+ headings: headings.map((h) => ({ level: Number(h.level) || 0, text: String(h.text ?? "") })),
788
+ schemaBlockCount: schemaCount,
789
+ entityName: kpo?.entityName ?? null,
790
+ entityTypes: kpo?.type ?? [],
791
+ napScore: kpo?.napScore ?? null,
792
+ missingSchemaFields: kpo?.missingFields ?? [],
793
+ screenshotSaved: screenshotPath ?? null
794
+ };
622
795
  if (screenshotMeta?.base64) {
623
796
  return {
624
797
  content: [
625
798
  ...textResult.content,
626
799
  { type: "image", data: screenshotMeta.base64, mimeType: "image/png" }
627
- ]
800
+ ],
801
+ structuredContent
628
802
  };
629
803
  }
630
- return textResult;
804
+ return { ...textResult, structuredContent };
631
805
  }
632
806
  function formatMapSiteUrls(raw, input) {
633
807
  const parsed = parseData(raw);
@@ -697,7 +871,19 @@ ${pageRows}`,
697
871
  - Map URLs first: use \`map_site_urls\`
698
872
  - Inspect a single page: use \`extract_url\``
699
873
  ].join("\n");
700
- return oneBlock(full);
874
+ return {
875
+ ...oneBlock(full),
876
+ structuredContent: {
877
+ url: input.url,
878
+ pageCount: pages.length,
879
+ pages: pages.map((p) => ({
880
+ url: String(p.url ?? ""),
881
+ title: p.title ?? null,
882
+ schemaTypes: p.kpo?.type ?? []
883
+ })),
884
+ durationMs: d.durationMs ?? 0
885
+ }
886
+ };
701
887
  }
702
888
  function formatYoutubeHarvest(raw, input) {
703
889
  const parsed = parseData(raw);
@@ -894,7 +1080,26 @@ ${costRows}` : "",
894
1080
  |------|-----------|---------|-------------|
895
1081
  ${ledgerRows}` : ""
896
1082
  ].filter(Boolean).join("\n");
897
- return oneBlock(full);
1083
+ return {
1084
+ ...oneBlock(full),
1085
+ structuredContent: {
1086
+ balanceCredits: typeof balance === "number" ? balance : null,
1087
+ matchedCost: matched ? { label: matched.label, credits: matched.credits, unit: matched.unit, notes: matched.notes ?? null } : null,
1088
+ costs: costs.map((c) => ({
1089
+ key: c.key,
1090
+ label: c.label,
1091
+ credits: c.credits,
1092
+ unit: c.unit,
1093
+ notes: c.notes ?? null
1094
+ })),
1095
+ ledger: ledger.map((row) => ({
1096
+ createdAt: String(row.created_at ?? ""),
1097
+ operation: String(row.operation ?? ""),
1098
+ credits: row.amount_mc / 1e3,
1099
+ description: row.description ?? null
1100
+ }))
1101
+ }
1102
+ };
898
1103
  }
899
1104
  function formatMapsSearch(raw, input) {
900
1105
  const parsed = parseData(raw);
@@ -1043,7 +1248,28 @@ ${entitySection}` : null,
1043
1248
  ---
1044
1249
  *Extracted in ${(durationMs / 1e3).toFixed(1)}s*` : null
1045
1250
  ].filter(Boolean).join("\n");
1046
- return oneBlock(full);
1251
+ return {
1252
+ ...oneBlock(full),
1253
+ structuredContent: {
1254
+ name,
1255
+ rating: rating ?? null,
1256
+ reviewCount: reviewCount ?? null,
1257
+ category: category ?? null,
1258
+ address: address ?? null,
1259
+ phone: phone ?? null,
1260
+ website: website ?? null,
1261
+ hoursSummary: hoursSummary ?? null,
1262
+ bookingUrl: bookingUrl ?? null,
1263
+ kgmid: kgmid ?? null,
1264
+ cidDecimal: cidDecimal ?? null,
1265
+ cidUrl: cidUrl ?? null,
1266
+ lat: lat ?? null,
1267
+ lng: lng ?? null,
1268
+ reviewsStatus,
1269
+ reviewsCollected: reviews.length,
1270
+ reviewTopics: topics.map((t) => ({ label: String(t.label ?? ""), count: String(t.count ?? "") }))
1271
+ }
1272
+ };
1047
1273
  }
1048
1274
  function formatFacebookAdTranscribe(raw, input) {
1049
1275
  const parsed = parseData(raw);
@@ -1095,18 +1321,21 @@ function buildPaaExtractorMcpServer(executor2, options = {}) {
1095
1321
  title: "Google PAA + SERP Harvest",
1096
1322
  description: withReportNote('Best default tool for Google search research. Extracts People Also Ask questions plus answers/source URLs, organic SERP, local pack when present, entity IDs (CID/GCID/KG MID), and AI Overview. Infer the user language: split topic from location (e.g. "best hvac company in Denver CO" => query "best hvac company", location "Denver, CO", gl "us", hl "en"). Use maxQuestions 30 normally, 100-150 for "full", "deep", "all", or comprehensive research. Credits are charged by extracted question; unused request hold is refunded.'),
1097
1323
  inputSchema: HarvestPaaInputSchema,
1324
+ outputSchema: HarvestPaaOutputSchema,
1098
1325
  annotations: liveWebToolAnnotations("Google PAA + SERP Harvest")
1099
1326
  }, async (input) => formatHarvestPaa(await executor2.harvestPaa(input), input));
1100
1327
  server2.registerTool("search_serp", {
1101
1328
  title: "Google SERP Lookup",
1102
1329
  description: withReportNote("Fast Google SERP lookup without PAA expansion. Use when the user asks for rankings, organic results, local pack, quick SERP, or positions. Split topic from location and infer gl/hl from the user request."),
1103
1330
  inputSchema: SearchSerpInputSchema,
1331
+ outputSchema: SearchSerpOutputSchema,
1104
1332
  annotations: liveWebToolAnnotations("Google SERP Lookup")
1105
1333
  }, async (input) => formatSearchSerp(await executor2.searchSerp(input), input));
1106
1334
  server2.registerTool("extract_url", {
1107
1335
  title: "Single URL Extract",
1108
1336
  description: withReportNote("Extract structured data from one public URL: page content as Markdown, heading structure, JSON-LD schema, entity details, NAP score, metadata, and missing schema fields. Use when the user provides a single URL or asks to inspect/scrape one page."),
1109
1337
  inputSchema: ExtractUrlInputSchema,
1338
+ outputSchema: ExtractUrlOutputSchema,
1110
1339
  annotations: liveWebToolAnnotations("Single URL Extract")
1111
1340
  }, async (input) => formatExtractUrl(await executor2.extractUrl(input), input));
1112
1341
  server2.registerTool("map_site_urls", {
@@ -1120,6 +1349,7 @@ function buildPaaExtractorMcpServer(executor2, options = {}) {
1120
1349
  title: "Multi-Page Site Extract",
1121
1350
  description: withReportNote("Run multi-page extraction across a public website. Returns per-page titles, H1s, metadata, headings, schema/entity data, canonical URLs, and content. Use for website audits, competitor audits, and full-site extraction."),
1122
1351
  inputSchema: ExtractSiteInputSchema,
1352
+ outputSchema: ExtractSiteOutputSchema,
1123
1353
  annotations: liveWebToolAnnotations("Multi-Page Site Extract")
1124
1354
  }, async (input) => formatExtractSite(await executor2.extractSite(input), input));
1125
1355
  server2.registerTool("youtube_harvest", {
@@ -1159,6 +1389,7 @@ function buildPaaExtractorMcpServer(executor2, options = {}) {
1159
1389
  title: "Google Maps Business Profile Details",
1160
1390
  description: withReportNote('Extract Google Maps business intelligence for one known/named business: rating, review count, category, address, phone, website, hours, booking URL, review histogram, review topics, about attributes, entity IDs, and optional review cards. Do not use this for category searches, local market prospect lists, or requests for multiple GMB/GBP profiles; use maps_search first for those. Split business name from location (e.g. "Elite Roofing Denver CO" => businessName "Elite Roofing", location "Denver, CO"). Pass includeReviews true when the user asks for reviews/customer pain.'),
1161
1391
  inputSchema: MapsPlaceIntelInputSchema,
1392
+ outputSchema: MapsPlaceIntelOutputSchema,
1162
1393
  annotations: liveWebToolAnnotations("Google Maps Business Profile Details")
1163
1394
  }, async (input) => formatMapsPlaceIntel(await executor2.mapsPlaceIntel(input), input));
1164
1395
  server2.registerTool("maps_search", {
@@ -1172,6 +1403,7 @@ function buildPaaExtractorMcpServer(executor2, options = {}) {
1172
1403
  title: "MCP Scraper Credits & Costs",
1173
1404
  description: "Answer questions about MCP Scraper credits: current credit balance, what a specific tool/action costs, the full cost table, and optionally recent credit ledger entries. Does not expose payment methods or credit card information.",
1174
1405
  inputSchema: CreditsInfoInputSchema,
1406
+ outputSchema: CreditsInfoOutputSchema,
1175
1407
  annotations: {
1176
1408
  title: "MCP Scraper Credits & Costs",
1177
1409
  readOnlyHint: true,