mcp-scraper 0.1.7 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -50,6 +50,109 @@ var init_harvest_timeout = __esm({
50
50
  }
51
51
  });
52
52
 
53
+ // src/lib/browser-service-env.ts
54
/**
 * Resolves the browser-service API key from the environment.
 *
 * `BROWSER_SERVICE_API_KEY` takes precedence and `KERNEL_API_KEY` is the
 * fallback. Each candidate is trimmed individually, and a candidate that is
 * unset, empty, or whitespace-only is skipped, so a blank primary variable
 * does not mask a usable fallback value.
 *
 * @returns {string | undefined} The first non-blank trimmed value, or
 *   `undefined` when neither variable holds one.
 */
function browserServiceApiKey() {
  const candidates = [process.env.BROWSER_SERVICE_API_KEY, process.env.KERNEL_API_KEY];
  for (const raw of candidates) {
    const value = raw?.trim();
    if (value) return value;
  }
  return void 0;
}
58
/**
 * Resolves the configured proxy id from the environment.
 *
 * `BROWSER_SERVICE_PROXY_ID` takes precedence and `KERNEL_PROXY_ID` is the
 * fallback. Each candidate is trimmed individually, and a candidate that is
 * unset, empty, or whitespace-only is skipped, so a blank primary variable
 * does not mask a usable fallback value.
 *
 * @returns {string | undefined} The first non-blank trimmed value, or
 *   `undefined` when neither variable holds one.
 */
function browserServiceProxyId() {
  const candidates = [process.env.BROWSER_SERVICE_PROXY_ID, process.env.KERNEL_PROXY_ID];
  for (const raw of candidates) {
    const value = raw?.trim();
    if (value) return value;
  }
  return void 0;
}
62
// Module initializer for src/lib/browser-service-env.ts, registered through
// the bundle's `__esm` helper (defined elsewhere in the bundle). The module
// body only carries the strict-mode directive; it sets up no state.
var init_browser_service_env = __esm({
  "src/lib/browser-service-env.ts"() {
    "use strict";
  }
});
67
+
68
+ // src/errors.ts
69
/**
 * Removes the vendor name ("kernel" / "kernel.sh") from a message.
 *
 * Rewrites are applied in order, most specific first:
 *   - "kernel.sh sessions" / "kernel sessions" -> "sessions"
 *   - "kernel.sh session"  / "kernel session"  -> "this session"
 *   - any remaining "kernel.sh" or standalone "kernel" -> "the service"
 * Runs of spaces are then collapsed to one space and the result is trimmed.
 *
 * The plural rule matches the literal plural only. An optional-"s" pattern
 * there would also consume the singular form and leave the "this session"
 * rule unreachable.
 *
 * @param {string} message - Text that may mention the vendor.
 * @returns {string} The message with vendor references rewritten.
 */
function sanitizeVendorName(message) {
  const rewrites = [
    [/kernel\.sh\s+sessions/gi, "sessions"],
    [/kernel\.sh\s+session/gi, "this session"],
    [/kernel\.sh/gi, "the service"],
    [/kernel\s+sessions/gi, "sessions"],
    [/kernel\s+session/gi, "this session"],
    [/\bkernel\b/gi, "the service"],
    [/ +/g, " "]
  ];
  let sanitized = message;
  for (const [pattern, replacement] of rewrites) {
    sanitized = sanitized.replace(pattern, replacement);
  }
  return sanitized.trim();
}
72
// Bindings populated by init_errors(); declared with `var` at bundle scope so
// other modules in the bundle can reference them.
var RECAPTCHA_INSTRUCTIONS, CaptchaError, ExtractionError, RequestAbortedError;

// Module initializer for src/errors.ts: assigns the CAPTCHA guidance text and
// the three Error subclasses declared above.
var init_errors = __esm({
  "src/errors.ts"() {
    "use strict";

    RECAPTCHA_INSTRUCTIONS = "Google returned a CAPTCHA. Run with --headless=false to re-warm the browser profile, then retry.";

    // Error whose message is "CAPTCHA detected. " followed by the supplied
    // instructions; the instructions are also kept on the instance.
    CaptchaError = class extends Error {
      instructions;
      name = "CaptchaError";
      constructor(instructions) {
        super(`CAPTCHA detected. ${instructions}`);
        this.instructions = instructions;
      }
    };

    // Error carrying an optional underlying `cause` next to its message.
    ExtractionError = class extends Error {
      cause;
      name = "ExtractionError";
      constructor(message, cause) {
        super(message);
        this.cause = cause;
      }
    };

    // Error with a default message for requests aborted before the harvest
    // completed.
    RequestAbortedError = class extends Error {
      name = "RequestAbortedError";
      constructor(message = "Request aborted before harvest completed") {
        super(message);
      }
    };
  }
});
101
+
102
+ // src/api/outbound-sanitize.ts
103
/**
 * Recursively copies a diagnostics value, renaming vendor-specific keys and
 * scrubbing the vendor name out of selected string values.
 *
 * - Strings are passed through `sanitizeVendorName` only when the key they
 *   were found under matches `SANITIZED_VALUE_KEYS` and the text contains
 *   "kernel" (case-insensitive); otherwise they are returned unchanged.
 * - Arrays are mapped element by element; elements inherit the array's key.
 * - Plain objects are rebuilt with keys renamed through `KEY_RENAMES`; each
 *   child is sanitized using its ORIGINAL key as the parent key.
 * - Everything else (numbers, booleans, null, undefined, ...) is returned as is.
 *
 * @param {*} value - Value to sanitize.
 * @param {string} [parentKey=""] - Key under which `value` was found.
 * @returns {*} A sanitized copy for objects/arrays, or the original primitive.
 */
function sanitizeOutboundDiagnostics(value, parentKey = "") {
  if (typeof value === "string") {
    const shouldScrub = SANITIZED_VALUE_KEYS.test(parentKey) && /kernel/i.test(value);
    return shouldScrub ? sanitizeVendorName(value) : value;
  }
  if (Array.isArray(value)) {
    return value.map((item) => sanitizeOutboundDiagnostics(item, parentKey));
  }
  if (value === null || typeof value !== "object") return value;

  const out = {};
  for (const [key, val] of Object.entries(value)) {
    // Consult own entries only: a bare KEY_RENAMES[key] lookup would resolve
    // inherited members such as "constructor" or "toString" and produce a
    // stringified function as the output key.
    const mapped = Object.prototype.hasOwnProperty.call(KEY_RENAMES, key) ? KEY_RENAMES[key] : void 0;
    const renamed = mapped ?? key;
    // defineProperty keeps a "__proto__" key as ordinary data instead of
    // replacing the prototype of `out`.
    Object.defineProperty(out, renamed, {
      value: sanitizeOutboundDiagnostics(val, key),
      enumerable: true,
      writable: true,
      configurable: true
    });
  }
  return out;
}
121
/**
 * Sanitizes every entry of an attempts list.
 *
 * Each entry is passed to `sanitizeOutboundDiagnostics` on its own, so the
 * extra arguments `map` supplies (index, array) never reach it as a parent key.
 *
 * @param {Array<*>} attempts - Attempt records to sanitize.
 * @returns {Array<*>} A new array of sanitized attempt records.
 */
function sanitizeAttempts(attempts) {
  return attempts.map(function sanitizeOne(attempt) {
    return sanitizeOutboundDiagnostics(attempt);
  });
}
124
/**
 * Returns a harvest result whose `diagnostics.debug` payload has been passed
 * through `sanitizeOutboundDiagnostics`.
 *
 * When the result has no diagnostics, or the diagnostics carry no truthy
 * `debug` value, the input is returned untouched (same reference). Otherwise
 * shallow copies of the result and its diagnostics are returned; the input is
 * not mutated.
 *
 * @param {*} result - Harvest result, possibly null/undefined.
 * @returns {*} The original result or a shallow copy with sanitized debug data.
 */
function sanitizeHarvestResult(result) {
  const diagnostics = result?.diagnostics;
  const hasDebug = Boolean(diagnostics?.debug);
  if (!hasDebug) {
    return result;
  }
  return {
    ...result,
    diagnostics: { ...diagnostics, debug: sanitizeOutboundDiagnostics(diagnostics.debug) }
  };
}
135
// Bindings populated by init_outbound_sanitize().
var KEY_RENAMES, SANITIZED_VALUE_KEYS;

// Module initializer for src/api/outbound-sanitize.ts. Runs init_errors()
// first, then fills in the key-rename table and the key pattern that decides
// which string values get their vendor name scrubbed.
var init_outbound_sanitize = __esm({
  "src/api/outbound-sanitize.ts"() {
    "use strict";
    init_errors();

    // Vendor-specific diagnostic key -> neutral outbound key.
    KEY_RENAMES = {
      // nested runtime object
      kernel: "browserRuntime",
      // snake_case variants
      kernel_session_id: "browser_session_id",
      kernel_delete_started: "session_cleanup_started",
      kernel_delete_succeeded: "session_cleanup_succeeded",
      kernel_delete_error: "session_cleanup_error",
      // camelCase variants
      kernelSessionId: "browserSessionId",
      kernelDeleteStarted: "sessionCleanupStarted",
      kernelDeleteSucceeded: "sessionCleanupSucceeded",
      kernelDeleteError: "sessionCleanupError",
      kernelProxyId: "proxyId"
    };

    // Case-insensitive match on keys containing "error" or "message".
    SANITIZED_VALUE_KEYS = /error|message/i;
  }
});
155
+
53
156
  // src/blog/registry.ts
54
157
  var posts;
55
158
  var init_registry = __esm({
@@ -3425,7 +3528,7 @@ var init_url_utils = __esm({
3425
3528
 
3426
3529
  // src/api/kernel-fetch.ts
3427
3530
  async function fetchWithKernel(url) {
3428
- const apiKey = process.env.KERNEL_API_KEY;
3531
+ const apiKey = browserServiceApiKey();
3429
3532
  if (!apiKey) throw new Error("Browser backend API key not set");
3430
3533
  const client = new import_sdk.default({ apiKey });
3431
3534
  const kb = await client.browsers.create({ stealth: true, timeout_seconds: 60 });
@@ -3450,6 +3553,7 @@ var init_kernel_fetch = __esm({
3450
3553
  "src/api/kernel-fetch.ts"() {
3451
3554
  "use strict";
3452
3555
  import_sdk = __toESM(require("@onkernel/sdk"), 1);
3556
+ init_browser_service_env();
3453
3557
  import_playwright = require("playwright");
3454
3558
  }
3455
3559
  });
@@ -4239,8 +4343,8 @@ async function downloadAsset(url, destDir, filename) {
4239
4343
  }
4240
4344
  const writer = (0, import_node_fs.createWriteStream)(dest);
4241
4345
  await (0, import_promises2.pipeline)(import_node_stream.Readable.fromWeb(res.body), writer);
4242
- const { statSync } = await import("fs");
4243
- const sizeBytes = statSync(dest).size;
4346
+ const { statSync: statSync2 } = await import("fs");
4347
+ const sizeBytes = statSync2(dest).size;
4244
4348
  return { savedPath: dest, sizeBytes, mimeType };
4245
4349
  }
4246
4350
  async function harvestPageMedia(html, pageUrl, options = {}) {
@@ -8494,40 +8598,6 @@ var init_selectors = __esm({
8494
8598
  }
8495
8599
  });
8496
8600
 
8497
- // src/errors.ts
8498
- function sanitizeVendorName(message) {
8499
- return message.replace(/kernel\.sh\s+sessions?/gi, "sessions").replace(/kernel\.sh\s+session/gi, "this session").replace(/kernel\.sh/gi, "the service").replace(/kernel\s+sessions?/gi, "sessions").replace(/kernel\s+session/gi, "this session").replace(/\bkernel\b/gi, "the service").replace(/ +/g, " ").trim();
8500
- }
8501
- var RECAPTCHA_INSTRUCTIONS, CaptchaError, ExtractionError, RequestAbortedError;
8502
- var init_errors = __esm({
8503
- "src/errors.ts"() {
8504
- "use strict";
8505
- RECAPTCHA_INSTRUCTIONS = "Google returned a CAPTCHA. Run with --headless=false to re-warm the browser profile, then retry.";
8506
- CaptchaError = class extends Error {
8507
- constructor(instructions) {
8508
- super(`CAPTCHA detected. ${instructions}`);
8509
- this.instructions = instructions;
8510
- }
8511
- instructions;
8512
- name = "CaptchaError";
8513
- };
8514
- ExtractionError = class extends Error {
8515
- constructor(message, cause) {
8516
- super(message);
8517
- this.cause = cause;
8518
- }
8519
- cause;
8520
- name = "ExtractionError";
8521
- };
8522
- RequestAbortedError = class extends Error {
8523
- name = "RequestAbortedError";
8524
- constructor(message = "Request aborted before harvest completed") {
8525
- super(message);
8526
- }
8527
- };
8528
- }
8529
- });
8530
-
8531
8601
  // src/driver/BrowserDriver.ts
8532
8602
  function positiveIntFromEnv(name, fallback) {
8533
8603
  const raw = process.env[name];
@@ -9509,7 +9579,7 @@ async function writeOutputs(result, outputDir) {
9509
9579
  }
9510
9580
  }
9511
9581
  async function ytHarvest(rawOptions) {
9512
- const kernelApiKey = process.env.KERNEL_API_KEY;
9582
+ const kernelApiKey = browserServiceApiKey();
9513
9583
  if (!kernelApiKey) {
9514
9584
  throw new Error("A browser backend API key is required \u2014 YouTube harvesting requires a stealth session.");
9515
9585
  }
@@ -9543,6 +9613,7 @@ var init_youtube_harvest = __esm({
9543
9613
  "src/youtube/youtube-harvest.ts"() {
9544
9614
  "use strict";
9545
9615
  import_node_fs2 = require("fs");
9616
+ init_browser_service_env();
9546
9617
  import_node_path4 = __toESM(require("path"), 1);
9547
9618
  import_papaparse = __toESM(require("papaparse"), 1);
9548
9619
  init_schemas2();
@@ -9620,7 +9691,7 @@ function parseTimedtextXml(xml) {
9620
9691
  return results;
9621
9692
  }
9622
9693
  async function fetchViaKernelInnertube(videoId) {
9623
- const kernelApiKey = process.env.KERNEL_API_KEY;
9694
+ const kernelApiKey = browserServiceApiKey();
9624
9695
  if (!kernelApiKey) return null;
9625
9696
  const driver = new BrowserDriver();
9626
9697
  const start = Date.now();
@@ -9763,7 +9834,7 @@ async function attemptKernelWhisper(videoId, kernelApiKey, falKey, start) {
9763
9834
  }
9764
9835
  }
9765
9836
  async function fetchViaKernelWhisper(videoId) {
9766
- const kernelApiKey = process.env.KERNEL_API_KEY;
9837
+ const kernelApiKey = browserServiceApiKey();
9767
9838
  const falKey = process.env.FAL_KEY;
9768
9839
  if (!kernelApiKey || !falKey) return null;
9769
9840
  const start = Date.now();
@@ -9803,6 +9874,7 @@ var init_CaptionFetcher = __esm({
9803
9874
  "src/youtube/CaptionFetcher.ts"() {
9804
9875
  "use strict";
9805
9876
  init_BrowserDriver();
9877
+ init_browser_service_env();
9806
9878
  import_client2 = require("@fal-ai/client");
9807
9879
  WHISPER_RECORD_SECONDS = 90;
9808
9880
  }
@@ -10044,6 +10116,7 @@ var init_screenshot_routes = __esm({
10044
10116
  "src/api/screenshot-routes.ts"() {
10045
10117
  "use strict";
10046
10118
  import_hono3 = require("hono");
10119
+ init_browser_service_env();
10047
10120
  import_zod14 = require("zod");
10048
10121
  init_screenshot();
10049
10122
  init_api_auth();
@@ -10078,7 +10151,7 @@ var init_screenshot_routes = __esm({
10078
10151
  }
10079
10152
  const device2 = body.device === "mobile" ? "mobile" : "desktop";
10080
10153
  try {
10081
- const buf = await captureScreenshot(parsedFallback.href, process.env.KERNEL_API_KEY?.trim(), device2);
10154
+ const buf = await captureScreenshot(parsedFallback.href, browserServiceApiKey(), device2);
10082
10155
  return new Response(new Uint8Array(buf), {
10083
10156
  status: 200,
10084
10157
  headers: {
@@ -10094,7 +10167,7 @@ var init_screenshot_routes = __esm({
10094
10167
  }
10095
10168
  const device = body.device === "mobile" ? "mobile" : "desktop";
10096
10169
  try {
10097
- const buf = await captureScreenshot(urlCheck.parsed.href, process.env.KERNEL_API_KEY?.trim(), device);
10170
+ const buf = await captureScreenshot(urlCheck.parsed.href, browserServiceApiKey(), device);
10098
10171
  return new Response(new Uint8Array(buf), {
10099
10172
  status: 200,
10100
10173
  headers: {
@@ -11379,29 +11452,30 @@ function buildPageIntelUrl(body, country) {
11379
11452
  return `https://www.facebook.com/ads/library/?active_status=all&ad_type=all&country=${country}&q=${encodeURIComponent(body.query.trim())}&search_type=keyword_unordered`;
11380
11453
  }
11381
11454
  function kernelLaunchOpts() {
11382
- return { headless: true, kernelApiKey: process.env.KERNEL_API_KEY?.trim(), kernelProxyId: process.env.KERNEL_PROXY_ID?.trim(), viewport: { width: 1280, height: 900 }, locale: "en-US" };
11455
+ return { headless: true, kernelApiKey: browserServiceApiKey(), kernelProxyId: browserServiceProxyId(), viewport: { width: 1280, height: 900 }, locale: "en-US" };
11383
11456
  }
11384
11457
  async function kernelLaunchOptsResidential() {
11385
- let proxyId = process.env.KERNEL_PROXY_ID?.trim();
11458
+ let proxyId = browserServiceProxyId();
11386
11459
  try {
11387
11460
  const resolution2 = await resolveKernelProxyId({
11388
- kernelApiKey: process.env.KERNEL_API_KEY?.trim(),
11461
+ kernelApiKey: browserServiceApiKey(),
11389
11462
  proxyMode: "location",
11390
- configuredKernelProxyId: process.env.KERNEL_PROXY_ID?.trim(),
11463
+ configuredKernelProxyId: browserServiceProxyId(),
11391
11464
  location: "New York, NY",
11392
11465
  gl: "us"
11393
11466
  });
11394
11467
  if (resolution2.kernelProxyId) proxyId = resolution2.kernelProxyId;
11395
11468
  } catch {
11396
- proxyId = process.env.KERNEL_PROXY_ID?.trim();
11469
+ proxyId = browserServiceProxyId();
11397
11470
  }
11398
- return { headless: true, kernelApiKey: process.env.KERNEL_API_KEY?.trim(), kernelProxyId: proxyId, viewport: { width: 1280, height: 900 }, locale: "en-US" };
11471
+ return { headless: true, kernelApiKey: browserServiceApiKey(), kernelProxyId: proxyId, viewport: { width: 1280, height: 900 }, locale: "en-US" };
11399
11472
  }
11400
11473
  var import_hono4, import_zod15, import_client3, FacebookAdBodySchema, FacebookPageIntelBodySchema, FacebookTranscribeBodySchema, FacebookSearchBodySchema, FacebookMediaBodySchema, facebookAdApp, ALLOWED_MEDIA_HOSTS;
11401
11474
  var init_facebook_ad_routes = __esm({
11402
11475
  "src/api/facebook-ad-routes.ts"() {
11403
11476
  "use strict";
11404
11477
  import_hono4 = require("hono");
11478
+ init_browser_service_env();
11405
11479
  import_zod15 = require("zod");
11406
11480
  init_db();
11407
11481
  init_rates();
@@ -14321,8 +14395,8 @@ async function harvest(rawOptions) {
14321
14395
  const onAttemptEvent = getAttemptLogSink(rawOptions);
14322
14396
  const requestedProxyMode = raw.proxyMode;
14323
14397
  const proxyMode = requestedProxyMode === "none" ? "none" : requestedProxyMode === "configured" ? "configured" : "location";
14324
- const kernelApiKey = typeof raw.kernelApiKey === "string" ? raw.kernelApiKey.trim() : process.env.KERNEL_API_KEY?.trim();
14325
- const configuredKernelProxyId = typeof raw.kernelProxyId === "string" ? raw.kernelProxyId.trim() : process.env.KERNEL_PROXY_ID?.trim();
14398
+ const kernelApiKey = typeof raw.kernelApiKey === "string" ? raw.kernelApiKey.trim() : browserServiceApiKey();
14399
+ const configuredKernelProxyId = typeof raw.kernelProxyId === "string" ? raw.kernelProxyId.trim() : browserServiceProxyId();
14326
14400
  const proxyOpts = {
14327
14401
  kernelApiKey,
14328
14402
  proxyMode,
@@ -14509,6 +14583,7 @@ var init_harvest = __esm({
14509
14583
  "src/harvest.ts"() {
14510
14584
  "use strict";
14511
14585
  init_schemas3();
14586
+ init_browser_service_env();
14512
14587
  init_BrowserDriver();
14513
14588
  init_PAAExtractor();
14514
14589
  init_OutputSerializer();
@@ -14933,8 +15008,8 @@ async function captureSerpIntelligenceSnapshot(rawInput, runtimeOptions = {}) {
14933
15008
  debug,
14934
15009
  serpOnly: true,
14935
15010
  headless: runtimeOptions.headless ?? true,
14936
- kernelApiKey: runtimeOptions.kernelApiKey ?? process.env.KERNEL_API_KEY?.trim(),
14937
- kernelProxyId: runtimeOptions.kernelProxyId ?? process.env.KERNEL_PROXY_ID?.trim(),
15011
+ kernelApiKey: runtimeOptions.kernelApiKey ?? browserServiceApiKey(),
15012
+ kernelProxyId: runtimeOptions.kernelProxyId ?? browserServiceProxyId(),
14938
15013
  format: "json",
14939
15014
  outputDir: runtimeOptions.outputDir ?? "/tmp/serp-intelligence-output",
14940
15015
  signal: runtimeOptions.signal,
@@ -14945,7 +15020,7 @@ async function captureSerpIntelligenceSnapshot(rawInput, runtimeOptions = {}) {
14945
15020
  const pageSnapshotLimit = normalizePageSnapshotLimit(parsedInput);
14946
15021
  const pageSnapshotTargets = collectPageSnapshotTargets(harvestResult, pageSnapshotLimit);
14947
15022
  const pageSnapshotArtifacts = pageSnapshotTargets.length > 0 ? (await capturePageSnapshotsFn(pageSnapshotTargets, {
14948
- kernelApiKey: runtimeOptions.kernelApiKey ?? process.env.KERNEL_API_KEY?.trim(),
15023
+ kernelApiKey: runtimeOptions.kernelApiKey ?? browserServiceApiKey(),
14949
15024
  timeoutMs: runtimeOptions.pageSnapshotTimeoutMs,
14950
15025
  maxConcurrency: runtimeOptions.pageSnapshotMaxConcurrency,
14951
15026
  debug,
@@ -14967,6 +15042,7 @@ var init_serp_capture_service = __esm({
14967
15042
  "src/serp-intelligence/serp-capture-service.ts"() {
14968
15043
  "use strict";
14969
15044
  init_harvest();
15045
+ init_browser_service_env();
14970
15046
  init_harvest_problems();
14971
15047
  init_page_snapshot_extractor();
14972
15048
  init_schemas4();
@@ -15071,6 +15147,7 @@ var init_serp_intelligence_routes = __esm({
15071
15147
  "src/api/serp-intelligence-routes.ts"() {
15072
15148
  "use strict";
15073
15149
  import_hono6 = require("hono");
15150
+ init_browser_service_env();
15074
15151
  init_page_snapshot_extractor();
15075
15152
  init_serp_capture_service();
15076
15153
  init_schemas4();
@@ -15103,8 +15180,8 @@ var init_serp_intelligence_routes = __esm({
15103
15180
  if (!ok) return c.json(insufficientBalanceResponse(balance_mc, cost), 402);
15104
15181
  try {
15105
15182
  const result = await captureSerpIntelligenceSnapshot(parsed.data, {
15106
- kernelApiKey: process.env.KERNEL_API_KEY?.trim(),
15107
- kernelProxyId: process.env.KERNEL_PROXY_ID?.trim(),
15183
+ kernelApiKey: browserServiceApiKey(),
15184
+ kernelProxyId: browserServiceProxyId(),
15108
15185
  signal: c.req.raw.signal,
15109
15186
  billing: { creditsUsed: cost / 1e3 }
15110
15187
  });
@@ -15159,7 +15236,7 @@ var init_serp_intelligence_routes = __esm({
15159
15236
  if (!ok) return c.json(insufficientBalanceResponse(balance_mc, cost), 402);
15160
15237
  try {
15161
15238
  const result = await capturePageSnapshots(targets, {
15162
- kernelApiKey: process.env.KERNEL_API_KEY?.trim(),
15239
+ kernelApiKey: browserServiceApiKey(),
15163
15240
  timeoutMs: parsed.data.timeoutMs,
15164
15241
  maxConcurrency: parsed.data.maxConcurrency,
15165
15242
  debug: parsed.data.debug
@@ -15199,207 +15276,7 @@ var PACKAGE_VERSION;
15199
15276
  var init_version = __esm({
15200
15277
  "src/version.ts"() {
15201
15278
  "use strict";
15202
- PACKAGE_VERSION = "0.1.7";
15203
- }
15204
- });
15205
-
15206
- // src/mcp/mcp-tool-schemas.ts
15207
- var import_zod19, HarvestPaaInputSchema, ExtractUrlInputSchema, MapSiteUrlsInputSchema, ExtractSiteInputSchema, YoutubeHarvestInputSchema, YoutubeTranscribeInputSchema, FacebookPageIntelInputSchema, FacebookAdSearchInputSchema, FacebookAdTranscribeInputSchema, MapsPlaceIntelInputSchema, MapsSearchInputSchema, NullableString, MapsSearchOutputSchema, MapSiteUrlsOutputSchema, YoutubeHarvestOutputSchema, FacebookAdSearchOutputSchema, FacebookPageIntelOutputSchema, CreditsInfoInputSchema, SearchSerpInputSchema, CaptureSerpSnapshotInputSchema, ScreenshotInputSchema, CaptureSerpPageSnapshotsInputSchema;
15208
- var init_mcp_tool_schemas = __esm({
15209
- "src/mcp/mcp-tool-schemas.ts"() {
15210
- "use strict";
15211
- import_zod19 = require("zod");
15212
- HarvestPaaInputSchema = {
15213
- query: import_zod19.z.string().min(1).describe('Core search topic only. If the user says "best hvac company in Denver CO", use query="best hvac company" and location="Denver, CO". Do not include the location in query when it can be separated.'),
15214
- location: import_zod19.z.string().optional().describe('City, region, or country for geo-targeted results, inferred from the user request when present, e.g. "Denver, CO", "Tokyo, Japan", "London, UK".'),
15215
- maxQuestions: import_zod19.z.number().int().min(1).max(200).default(30).describe("Number of PAA questions to extract. Default 30. Maximum 200. Use 10 for quick probes, 30 for normal research, 100-200 when the user asks for everything/full/deep research. Larger harvests get a longer server time budget (151-200 questions \u2192 up to 280s). Credits are charged by extracted question; unused request hold is refunded."),
15216
- gl: import_zod19.z.string().length(2).default("us").describe("Google country code inferred from location or user language. Examples: United States us, United Kingdom gb, Japan jp, Canada ca, Australia au."),
15217
- hl: import_zod19.z.string().default("en").describe("Google interface/content language inferred from the user request. Use en unless the user asks for another language or locale."),
15218
- device: import_zod19.z.enum(["desktop", "mobile"]).default("desktop").describe("SERP device context. Use desktop by default; use mobile only when the user asks for mobile rankings."),
15219
- proxyMode: import_zod19.z.enum(["location", "configured", "none"]).default("location").describe("Proxy targeting mode. Use location by default so city/state searches create or reuse a matching residential proxy. Use configured for the static configured proxy. Use none only for direct-network debugging."),
15220
- proxyZip: import_zod19.z.string().regex(/^\d{5}$/).optional().describe("Optional US ZIP override for residential location proxy targeting. Use only when the user gives a specific ZIP or city-center proxy targeting needs to be forced."),
15221
- debug: import_zod19.z.boolean().default(false).describe("Include sanitized browser/session/location diagnostics in the response. Use true when debugging localization, CAPTCHA, or proxy behavior.")
15222
- };
15223
- ExtractUrlInputSchema = {
15224
- url: import_zod19.z.string().url().describe("Public http/https URL to extract. Use this when the user provides one specific page URL."),
15225
- screenshot: import_zod19.z.boolean().default(false).describe("Also capture a full-page screenshot of the URL. Saved to ~/Downloads/mcp-scraper/screenshots/ and returned inline. Use when the user asks to see or capture the page visually."),
15226
- screenshotDevice: import_zod19.z.enum(["desktop", "mobile"]).default("desktop").describe("Viewport for screenshot. desktop = 1440\xD7900. mobile = 390\xD7844. Default desktop."),
15227
- extractBranding: import_zod19.z.boolean().default(false).describe("Extract brand colors, fonts, logo, and favicon using a rendered browser session. Returns colorScheme (light/dark), colors (primary/accent/background/text/heading as hex), fonts (heading/body family names), and assets (logo URL, favicon URL). Use when the user asks about brand colors, site theme, or brand assets."),
15228
- downloadMedia: import_zod19.z.boolean().default(false).describe("Extract and download all page media (images, video, audio) to ~/Downloads/mcp-scraper/media/. Ad networks, tracking pixels, and noise URLs are filtered automatically. Use when the user asks to download or harvest assets from a page."),
15229
- mediaTypes: import_zod19.z.array(import_zod19.z.enum(["image", "video", "audio"])).default(["image", "video", "audio"]).describe("Which media types to download. Default all three."),
15230
- allowLocal: import_zod19.z.boolean().default(false).describe("Allow localhost and private-network URLs. For local development only.")
15231
- };
15232
- MapSiteUrlsInputSchema = {
15233
- url: import_zod19.z.string().url().describe("Public website URL or domain to crawl for internal URLs. Use before extract_site when the user asks to audit/map/crawl a site."),
15234
- maxUrls: import_zod19.z.number().int().min(1).max(500).optional().describe("Maximum URLs to discover. Use 100 for normal maps, higher when the user asks for a full inventory.")
15235
- };
15236
- ExtractSiteInputSchema = {
15237
- url: import_zod19.z.string().url().describe("Public website URL or domain to extract across multiple pages. Use when the user asks for a site audit, website crawl, or full-site content/schema extraction."),
15238
- maxPages: import_zod19.z.number().int().min(1).max(50).optional().describe("Maximum pages to extract. Use 50 when the user asks for full results or a complete crawl within MCP limits.")
15239
- };
15240
- YoutubeHarvestInputSchema = {
15241
- mode: import_zod19.z.enum(["search", "channel"]).describe("Use search for topic/keyword requests. Use channel when the user provides @handle, channel ID, or channel URL."),
15242
- query: import_zod19.z.string().optional().describe("Required when mode is search. The YouTube search topic in the user\u2019s words."),
15243
- channelHandle: import_zod19.z.string().optional().describe("YouTube channel handle, channel ID, or URL. Examples: @mkbhd, UC..., https://youtube.com/@mkbhd."),
15244
- maxVideos: import_zod19.z.number().int().min(1).max(500).default(50).describe("Number of videos to return. Default 50. Increase when user asks for full channel/history.")
15245
- };
15246
- YoutubeTranscribeInputSchema = {
15247
- videoId: import_zod19.z.string().min(1).describe("YouTube video ID, e.g. dQw4w9WgXcQ")
15248
- };
15249
- FacebookPageIntelInputSchema = {
15250
- pageId: import_zod19.z.string().optional(),
15251
- libraryId: import_zod19.z.string().optional(),
15252
- query: import_zod19.z.string().optional().describe("Advertiser or brand name when pageId/libraryId is not known. One of pageId, libraryId, or query is required."),
15253
- maxAds: import_zod19.z.number().int().min(1).max(200).default(50),
15254
- country: import_zod19.z.string().length(2).default("US")
15255
- };
15256
- FacebookAdSearchInputSchema = {
15257
- query: import_zod19.z.string().min(1).describe("Advertiser, brand, competitor, niche, or keyword to search in Facebook Ad Library."),
15258
- country: import_zod19.z.string().length(2).default("US"),
15259
- maxResults: import_zod19.z.number().int().min(1).max(20).default(10)
15260
- };
15261
- FacebookAdTranscribeInputSchema = {
15262
- videoUrl: import_zod19.z.string().url().describe("Facebook CDN video URL from a facebook_page_intel result")
15263
- };
15264
- MapsPlaceIntelInputSchema = {
15265
- businessName: import_zod19.z.string().min(1).describe('Business name only. If user says "Elite Roofing Denver CO", use businessName="Elite Roofing" and location="Denver, CO".'),
15266
- location: import_zod19.z.string().min(1).describe('City/region/country where the business should be searched, e.g. "Denver, CO". Infer from the user request when possible.'),
15267
- gl: import_zod19.z.string().length(2).default("us").describe("Google country code inferred from location."),
15268
- hl: import_zod19.z.string().length(2).default("en").describe("Language inferred from user request."),
15269
- includeReviews: import_zod19.z.boolean().default(false).describe("Whether to fetch individual review cards"),
15270
- maxReviews: import_zod19.z.number().int().min(1).max(500).default(50).describe("Max review cards to return (requires includeReviews: true)")
15271
- };
15272
- MapsSearchInputSchema = {
15273
- query: import_zod19.z.string().min(1).describe('Business category, niche, keyword, or search term. If the user says "roofers in Denver CO", use query="roofers" and location="Denver, CO". Do not put the location here when it can be separated.'),
15274
- location: import_zod19.z.string().optional().describe('City, region, country, or service area for the Maps search, e.g. "Denver, CO". Infer from the user request when present.'),
15275
- gl: import_zod19.z.string().length(2).default("us").describe("Google country code inferred from location."),
15276
- hl: import_zod19.z.string().length(2).default("en").describe("Language inferred from user request."),
15277
- maxResults: import_zod19.z.number().int().min(1).max(50).default(10).describe("Number of Google Maps business/profile candidates to return. Default 10. Maximum 50. Use 10 unless the user asks for more.")
15278
- };
15279
- NullableString = import_zod19.z.string().nullable();
15280
- MapsSearchOutputSchema = {
15281
- query: import_zod19.z.string(),
15282
- location: import_zod19.z.string().nullable(),
15283
- searchQuery: import_zod19.z.string(),
15284
- searchUrl: import_zod19.z.string().url(),
15285
- extractedAt: import_zod19.z.string(),
15286
- requestedMaxResults: import_zod19.z.number().int().min(1).max(50),
15287
- resultCount: import_zod19.z.number().int().min(0).max(50),
15288
- results: import_zod19.z.array(import_zod19.z.object({
15289
- position: import_zod19.z.number().int().min(1),
15290
- name: import_zod19.z.string(),
15291
- placeUrl: import_zod19.z.string().url(),
15292
- cid: NullableString,
15293
- cidDecimal: NullableString,
15294
- rating: NullableString,
15295
- reviewCount: NullableString,
15296
- category: NullableString,
15297
- address: NullableString,
15298
- websiteUrl: NullableString,
15299
- directionsUrl: NullableString,
15300
- metadata: import_zod19.z.array(import_zod19.z.string())
15301
- })),
15302
- durationMs: import_zod19.z.number().int().min(0)
15303
- };
15304
- MapSiteUrlsOutputSchema = {
15305
- startUrl: import_zod19.z.string(),
15306
- totalFound: import_zod19.z.number().int().min(0),
15307
- truncated: import_zod19.z.boolean(),
15308
- okCount: import_zod19.z.number().int().min(0),
15309
- redirectCount: import_zod19.z.number().int().min(0),
15310
- brokenCount: import_zod19.z.number().int().min(0),
15311
- urls: import_zod19.z.array(import_zod19.z.object({
15312
- url: import_zod19.z.string(),
15313
- status: import_zod19.z.number().int().nullable()
15314
- })),
15315
- durationMs: import_zod19.z.number().min(0)
15316
- };
15317
- YoutubeHarvestOutputSchema = {
15318
- mode: import_zod19.z.string(),
15319
- videoCount: import_zod19.z.number().int().min(0),
15320
- channel: import_zod19.z.object({
15321
- title: NullableString,
15322
- subscriberCount: NullableString
15323
- }).nullable(),
15324
- videos: import_zod19.z.array(import_zod19.z.object({
15325
- videoId: import_zod19.z.string(),
15326
- title: import_zod19.z.string(),
15327
- channelName: NullableString,
15328
- views: NullableString,
15329
- duration: NullableString,
15330
- url: NullableString
15331
- }))
15332
- };
15333
- FacebookAdSearchOutputSchema = {
15334
- query: import_zod19.z.string(),
15335
- advertiserCount: import_zod19.z.number().int().min(0),
15336
- advertisers: import_zod19.z.array(import_zod19.z.object({
15337
- name: NullableString,
15338
- adCount: import_zod19.z.number().int().nullable(),
15339
- libraryId: NullableString
15340
- }))
15341
- };
15342
- FacebookPageIntelOutputSchema = {
15343
- advertiserName: NullableString,
15344
- totalAds: import_zod19.z.number().int().min(0),
15345
- activeCount: import_zod19.z.number().int().min(0),
15346
- videoCount: import_zod19.z.number().int().min(0),
15347
- imageCount: import_zod19.z.number().int().min(0),
15348
- ads: import_zod19.z.array(import_zod19.z.object({
15349
- libraryId: NullableString,
15350
- status: NullableString,
15351
- creativeType: NullableString,
15352
- headline: NullableString,
15353
- cta: NullableString,
15354
- startDate: NullableString,
15355
- videoUrl: NullableString,
15356
- variations: import_zod19.z.number().int().nullable()
15357
- }))
15358
- };
15359
- CreditsInfoInputSchema = {
15360
- item: import_zod19.z.string().optional().describe('Optional tool, action, or feature to look up, e.g. "maps reviews", "extract_url", or "YouTube transcription"'),
15361
- includeLedger: import_zod19.z.boolean().default(false).describe("Whether to include recent credit ledger entries")
15362
- };
15363
- SearchSerpInputSchema = {
15364
- query: import_zod19.z.string().min(1).describe('Core search topic only. Separate location when possible. If user says "best dentist in Brooklyn NY serp", use query="best dentist" and location="Brooklyn, NY".'),
15365
- location: import_zod19.z.string().optional().describe("City, region, or country for geo-targeted results, inferred from user request when present."),
15366
- gl: import_zod19.z.string().length(2).default("us").describe("Google country code inferred from location or user language."),
15367
- hl: import_zod19.z.string().default("en").describe("Google interface/content language inferred from user request."),
15368
- device: import_zod19.z.enum(["desktop", "mobile"]).default("desktop").describe("SERP device context. Use desktop by default; use mobile only when the user asks for mobile rankings."),
15369
- proxyMode: import_zod19.z.enum(["location", "configured", "none"]).default("location").describe("Proxy targeting mode. Use location by default so city/state searches create or reuse a matching residential proxy. Use configured for the static configured proxy. Use none only for direct-network debugging."),
15370
- proxyZip: import_zod19.z.string().regex(/^\d{5}$/).optional().describe("Optional US ZIP override for residential location proxy targeting. Use only when the user gives a specific ZIP or city-center proxy targeting needs to be forced."),
15371
- debug: import_zod19.z.boolean().default(false).describe("Include sanitized browser/session/location diagnostics in the response. Use true when debugging localization, CAPTCHA, or proxy behavior."),
15372
- pages: import_zod19.z.number().int().min(1).max(2).default(1).describe("Number of result pages to fetch (1\u20132)")
15373
- };
15374
- CaptureSerpSnapshotInputSchema = {
15375
- query: import_zod19.z.string().min(1).describe("Core search query to capture as a structured SERP Intelligence snapshot. Separate the place into location when the user gives a city, region, country, or ZIP."),
15376
- location: import_zod19.z.string().optional().describe("City, region, country, or service area used for localized Google results. MCP Scraper records location evidence; UULE alone is not proof of localization."),
15377
- gl: import_zod19.z.string().length(2).default("us").describe("Google country code inferred from the requested market, e.g. us, gb, ca, au."),
15378
- hl: import_zod19.z.string().default("en").describe("Google interface/content language inferred from the user request."),
15379
- device: import_zod19.z.enum(["desktop", "mobile"]).default("desktop").describe("SERP device context. Use mobile only when the user asks for mobile rankings or mobile SERP evidence."),
15380
- proxyMode: import_zod19.z.enum(["location", "configured", "none"]).default("location").describe("Proxy behavior for capture. Use location for localized residential proxy targeting, configured for the static residential proxy, and none only for direct-network debugging."),
15381
- proxyZip: import_zod19.z.string().regex(/^\d{5}$/).optional().describe("Optional US ZIP override for residential location proxy targeting when a precise city-center or ZIP proxy is needed."),
15382
- pages: import_zod19.z.number().int().min(1).max(2).default(1).describe("Number of Google result pages to capture. Use 1 normally and 2 only when the user needs deeper ranking evidence."),
15383
- debug: import_zod19.z.boolean().default(false).describe("Include sanitized browser, proxy, and location diagnostics. Use true when debugging localization, CAPTCHA, proxy selection, or capture reliability."),
15384
- includePageSnapshots: import_zod19.z.boolean().default(false).describe("Also capture ranking-page snapshots for selected SERP URLs through the same product capture path."),
15385
- pageSnapshotLimit: import_zod19.z.number().int().min(0).max(10).default(0).describe("Maximum ranking-page snapshots to capture when includePageSnapshots is true. Use 0 when only SERP evidence is needed.")
15386
- };
15387
- ScreenshotInputSchema = {
15388
- url: import_zod19.z.string().url().describe("URL to capture as a full-page screenshot. Use http or https. Pass allowLocal: true to capture localhost or private-network URLs during development."),
15389
- device: import_zod19.z.enum(["desktop", "mobile"]).default("desktop").describe("Viewport profile. desktop = 1440\xD7900. mobile = 390\xD7844. Use desktop by default; use mobile when the user asks for a mobile view."),
15390
- allowLocal: import_zod19.z.boolean().default(false).describe("Allow localhost and private-network URLs (127.x, 192.168.x, 10.x, etc.). For local development only \u2014 not for production use.")
15391
- };
15392
- CaptureSerpPageSnapshotsInputSchema = {
15393
- urls: import_zod19.z.array(import_zod19.z.string().url()).min(1).max(25).describe("Public HTTP/HTTPS URLs to capture as SERP Intelligence page snapshots. Do not pass localhost, private IPs, file URLs, or internal admin URLs."),
15394
- targets: import_zod19.z.array(import_zod19.z.object({
15395
- url: import_zod19.z.string().url().describe("Public HTTP/HTTPS URL to capture."),
15396
- sourceKind: import_zod19.z.enum(["organic", "ai_citation", "local_pack_website", "configured_target", "site_subject"]).default("configured_target").describe("Why this page is being captured for SERP Intelligence evidence."),
15397
- sourcePosition: import_zod19.z.number().int().min(1).optional().describe("Ranking or citation position when the page came from SERP evidence.")
15398
- }).strict()).min(1).max(25).optional().describe("Structured page snapshot targets. Use this instead of urls when source kind or position should be preserved."),
15399
- maxConcurrency: import_zod19.z.number().int().min(1).max(5).default(2).describe("Parallel page captures. Use 2 normally; higher values can increase proxy/browser pressure."),
15400
- timeoutMs: import_zod19.z.number().int().min(1e3).max(6e4).default(15e3).describe("Per-page capture timeout in milliseconds. Increase for slow pages; timeout artifacts are returned as structured capture failures."),
15401
- debug: import_zod19.z.boolean().default(false).describe("Include sanitized browser/proxy diagnostics for page snapshot debugging. Use true for capture, network, or proxy troubleshooting.")
15402
- };
15279
+ PACKAGE_VERSION = "0.1.9";
15403
15280
  }
15404
15281
  });
15405
15282
 
@@ -15505,7 +15382,7 @@ function debugSection(debug) {
15505
15382
  if (!debug || typeof debug !== "object") return "";
15506
15383
  const request = debug.request ?? {};
15507
15384
  const browser = debug.browser ?? {};
15508
- const kernel = browser.kernel ?? {};
15385
+ const kernel = browser.browserRuntime ?? browser.kernel ?? {};
15509
15386
  const network = browser.networkLocation ?? {};
15510
15387
  const nav = browser.serpNavigation ?? {};
15511
15388
  const proxyResolution = kernel.proxyResolution ?? {};
@@ -15531,12 +15408,14 @@ function errorAttemptsSection(body) {
15531
15408
  const lines = attempts.slice(0, 5).map((attempt) => {
15532
15409
  const debug = attempt.debug ?? {};
15533
15410
  const browser = debug.browser ?? {};
15534
- const kernel = browser.kernel ?? {};
15411
+ const kernel = browser.browserRuntime ?? browser.kernel ?? {};
15535
15412
  const proxyResolution = kernel.proxyResolution ?? {};
15536
15413
  const network = browser.networkLocation ?? {};
15537
15414
  const nav = browser.serpNavigation ?? {};
15538
15415
  const geo = [network.ip, network.city, network.region].filter(Boolean).join(" / ") || "geo unknown";
15539
- return `- Attempt ${attempt.attempt_number ?? "?"}: ${attempt.outcome ?? attempt.status ?? "unknown"} \xB7 session ${attempt.kernel_session_id ?? kernel.sessionId ?? "unknown"} \xB7 proxy ${debug.request?.proxyMode ?? kernel.proxyMode ?? "unknown"}${proxyResolution.source ? `/${proxyResolution.source}` : ""} \xB7 ${geo} \xB7 CAPTCHA ${nav.captchaDetected === true ? "yes" : nav.captchaDetected === false ? "no" : "unknown"} \xB7 deleted ${attempt.kernel_delete_succeeded === true ? "yes" : attempt.kernel_delete_succeeded === false ? "no" : "unknown"}`;
15416
+ const sessionId = attempt.browser_session_id ?? attempt.kernel_session_id ?? kernel.sessionId ?? "unknown";
15417
+ const cleanupSucceeded2 = attempt.session_cleanup_succeeded ?? attempt.kernel_delete_succeeded;
15418
+ return `- Attempt ${attempt.attempt_number ?? "?"}: ${attempt.outcome ?? attempt.status ?? "unknown"} \xB7 session ${sessionId} \xB7 proxy ${debug.request?.proxyMode ?? kernel.proxyMode ?? "unknown"}${proxyResolution.source ? `/${proxyResolution.source}` : ""} \xB7 ${geo} \xB7 CAPTCHA ${nav.captchaDetected === true ? "yes" : nav.captchaDetected === false ? "no" : "unknown"} \xB7 cleanup ${cleanupSucceeded2 === true ? "yes" : cleanupSucceeded2 === false ? "no" : "unknown"}`;
15540
15419
  });
15541
15420
  return `
15542
15421
 
@@ -15583,7 +15462,31 @@ ${serpRows}` : "";
15583
15462
  const full = `# PAA Report: "${input.query}"${input.location ? ` \xB7 ${input.location}` : ""}
15584
15463
 
15585
15464
  ${paaTable}${serpTable}${entityIdsSection(entityIds)}${aiSection}${statsLine}${debugSection(diagnostics?.debug)}${tips}`;
15586
- return oneBlock(full);
15465
+ return {
15466
+ ...oneBlock(full),
15467
+ structuredContent: {
15468
+ query: input.query,
15469
+ location: input.location ?? null,
15470
+ questionCount: flat.length,
15471
+ completionStatus: diagnostics?.completionStatus ?? null,
15472
+ questions: flat.map((r) => ({
15473
+ question: String(r.question ?? ""),
15474
+ answer: r.answer ?? null,
15475
+ sourceTitle: r.source_title ?? null,
15476
+ sourceSite: r.source_site ?? null
15477
+ })),
15478
+ organicResults: organic.map((r) => ({
15479
+ position: Number(r.position) || 0,
15480
+ title: String(r.title ?? ""),
15481
+ url: String(r.url ?? ""),
15482
+ domain: String(r.domain ?? ""),
15483
+ snippet: r.snippet ?? null
15484
+ })),
15485
+ aiOverview: aiOvw ? { detected: aiOvw.detected === true, text: aiOvw.text ?? null } : null,
15486
+ entityIds: entityIds ? { kgIds: entityIds.kgIds ?? [], cids: entityIds.cids ?? [], gcids: entityIds.gcids ?? [] } : null,
15487
+ durationMs: durationMs ?? null
15488
+ }
15489
+ };
15587
15490
  }
15588
15491
  function formatSearchSerp(raw, input) {
15589
15492
  const parsed = parseData(raw);
@@ -15621,7 +15524,29 @@ ${localRows}` : "";
15621
15524
  const full = `# SERP Report: "${input.query}"${input.location ? ` \xB7 ${input.location}` : ""}
15622
15525
 
15623
15526
  ${serpTable}${localSection}${entityIdsSection(entityIds)}${aiSection}${debugSection(diagnostics?.debug)}${tips}`;
15624
- return oneBlock(full);
15527
+ return {
15528
+ ...oneBlock(full),
15529
+ structuredContent: {
15530
+ query: input.query,
15531
+ location: input.location ?? null,
15532
+ organicResults: organic.map((r) => ({
15533
+ position: Number(r.position) || 0,
15534
+ title: String(r.title ?? ""),
15535
+ url: String(r.url ?? ""),
15536
+ domain: String(r.domain ?? ""),
15537
+ snippet: r.snippet ?? null
15538
+ })),
15539
+ localPack: localPack.map((b) => ({
15540
+ position: Number(b.position) || 0,
15541
+ name: String(b.name ?? ""),
15542
+ rating: b.rating ?? null,
15543
+ reviewCount: b.reviewCount ?? null,
15544
+ websiteUrl: b.websiteUrl ?? null
15545
+ })),
15546
+ aiOverview: aiOvw ? { detected: aiOvw.detected === true, text: aiOvw.text ?? null } : null,
15547
+ entityIds: entityIds ? { kgIds: entityIds.kgIds ?? [], cids: entityIds.cids ?? [], gcids: entityIds.gcids ?? [] } : null
15548
+ }
15549
+ };
15625
15550
  }
15626
15551
  function formatExtractUrl(raw, input) {
15627
15552
  const parsed = parseData(raw);
@@ -15690,15 +15615,27 @@ ${bodyMd.slice(0, 3e3)}${bodyMd.length > 3e3 ? "\n\n*(truncated)*" : ""}` : "";
15690
15615
  **${title}**
15691
15616
  ${headingSection}${kpoSection}${brandingSection}${bodySection}${screenshotSection}${mediaSection}${tips}`;
15692
15617
  const textResult = oneBlock(full);
15618
+ const structuredContent = {
15619
+ url,
15620
+ title: d.title ?? null,
15621
+ headings: headings.map((h) => ({ level: Number(h.level) || 0, text: String(h.text ?? "") })),
15622
+ schemaBlockCount: schemaCount,
15623
+ entityName: kpo?.entityName ?? null,
15624
+ entityTypes: kpo?.type ?? [],
15625
+ napScore: kpo?.napScore ?? null,
15626
+ missingSchemaFields: kpo?.missingFields ?? [],
15627
+ screenshotSaved: screenshotPath ?? null
15628
+ };
15693
15629
  if (screenshotMeta?.base64) {
15694
15630
  return {
15695
15631
  content: [
15696
15632
  ...textResult.content,
15697
15633
  { type: "image", data: screenshotMeta.base64, mimeType: "image/png" }
15698
- ]
15634
+ ],
15635
+ structuredContent
15699
15636
  };
15700
15637
  }
15701
- return textResult;
15638
+ return { ...textResult, structuredContent };
15702
15639
  }
15703
15640
  function formatMapSiteUrls(raw, input) {
15704
15641
  const parsed = parseData(raw);
@@ -15768,7 +15705,19 @@ ${pageRows}`,
15768
15705
  - Map URLs first: use \`map_site_urls\`
15769
15706
  - Inspect a single page: use \`extract_url\``
15770
15707
  ].join("\n");
15771
- return oneBlock(full);
15708
+ return {
15709
+ ...oneBlock(full),
15710
+ structuredContent: {
15711
+ url: input.url,
15712
+ pageCount: pages.length,
15713
+ pages: pages.map((p) => ({
15714
+ url: String(p.url ?? ""),
15715
+ title: p.title ?? null,
15716
+ schemaTypes: p.kpo?.type ?? []
15717
+ })),
15718
+ durationMs: d.durationMs ?? 0
15719
+ }
15720
+ };
15772
15721
  }
15773
15722
  function formatYoutubeHarvest(raw, input) {
15774
15723
  const parsed = parseData(raw);
@@ -15965,7 +15914,26 @@ ${costRows}` : "",
15965
15914
  |------|-----------|---------|-------------|
15966
15915
  ${ledgerRows}` : ""
15967
15916
  ].filter(Boolean).join("\n");
15968
- return oneBlock(full);
15917
+ return {
15918
+ ...oneBlock(full),
15919
+ structuredContent: {
15920
+ balanceCredits: typeof balance === "number" ? balance : null,
15921
+ matchedCost: matched ? { label: matched.label, credits: matched.credits, unit: matched.unit, notes: matched.notes ?? null } : null,
15922
+ costs: costs.map((c) => ({
15923
+ key: c.key,
15924
+ label: c.label,
15925
+ credits: c.credits,
15926
+ unit: c.unit,
15927
+ notes: c.notes ?? null
15928
+ })),
15929
+ ledger: ledger.map((row) => ({
15930
+ createdAt: String(row.created_at ?? ""),
15931
+ operation: String(row.operation ?? ""),
15932
+ credits: row.amount_mc / 1e3,
15933
+ description: row.description ?? null
15934
+ }))
15935
+ }
15936
+ };
15969
15937
  }
15970
15938
  function formatMapsSearch(raw, input) {
15971
15939
  const parsed = parseData(raw);
@@ -16114,7 +16082,28 @@ ${entitySection}` : null,
16114
16082
  ---
16115
16083
  *Extracted in ${(durationMs / 1e3).toFixed(1)}s*` : null
16116
16084
  ].filter(Boolean).join("\n");
16117
- return oneBlock(full);
16085
+ return {
16086
+ ...oneBlock(full),
16087
+ structuredContent: {
16088
+ name,
16089
+ rating: rating ?? null,
16090
+ reviewCount: reviewCount ?? null,
16091
+ category: category ?? null,
16092
+ address: address ?? null,
16093
+ phone: phone ?? null,
16094
+ website: website ?? null,
16095
+ hoursSummary: hoursSummary ?? null,
16096
+ bookingUrl: bookingUrl ?? null,
16097
+ kgmid: kgmid ?? null,
16098
+ cidDecimal: cidDecimal ?? null,
16099
+ cidUrl: cidUrl ?? null,
16100
+ lat: lat ?? null,
16101
+ lng: lng ?? null,
16102
+ reviewsStatus,
16103
+ reviewsCollected: reviews.length,
16104
+ reviewTopics: topics.map((t) => ({ label: String(t.label ?? ""), count: String(t.count ?? "") }))
16105
+ }
16106
+ };
16118
16107
  }
16119
16108
  function formatFacebookAdTranscribe(raw, input) {
16120
16109
  const parsed = parseData(raw);
@@ -16158,6 +16147,320 @@ var init_mcp_response_formatter = __esm({
16158
16147
  }
16159
16148
  });
16160
16149
 
16150
+ // src/mcp/mcp-tool-schemas.ts
16151
+ var import_zod19, HarvestPaaInputSchema, ExtractUrlInputSchema, MapSiteUrlsInputSchema, ExtractSiteInputSchema, YoutubeHarvestInputSchema, YoutubeTranscribeInputSchema, FacebookPageIntelInputSchema, FacebookAdSearchInputSchema, FacebookAdTranscribeInputSchema, MapsPlaceIntelInputSchema, MapsSearchInputSchema, NullableString, MapsSearchOutputSchema, OrganicResultOutput, AiOverviewOutput, EntityIdsOutput, HarvestPaaOutputSchema, SearchSerpOutputSchema, ExtractUrlOutputSchema, ExtractSiteOutputSchema, MapsPlaceIntelOutputSchema, CreditsInfoOutputSchema, MapSiteUrlsOutputSchema, YoutubeHarvestOutputSchema, FacebookAdSearchOutputSchema, FacebookPageIntelOutputSchema, CreditsInfoInputSchema, SearchSerpInputSchema, CaptureSerpSnapshotInputSchema, ScreenshotInputSchema, CaptureSerpPageSnapshotsInputSchema;
16152
+ var init_mcp_tool_schemas = __esm({
16153
+ "src/mcp/mcp-tool-schemas.ts"() {
16154
+ "use strict";
16155
+ import_zod19 = require("zod");
16156
+ HarvestPaaInputSchema = {
16157
+ query: import_zod19.z.string().min(1).describe('Core search topic only. If the user says "best hvac company in Denver CO", use query="best hvac company" and location="Denver, CO". Do not include the location in query when it can be separated.'),
16158
+ location: import_zod19.z.string().optional().describe('City, region, or country for geo-targeted results, inferred from the user request when present, e.g. "Denver, CO", "Tokyo, Japan", "London, UK".'),
16159
+ maxQuestions: import_zod19.z.number().int().min(1).max(200).default(30).describe("Number of PAA questions to extract. Default 30. Maximum 200. Use 10 for quick probes, 30 for normal research, 100-200 when the user asks for everything/full/deep research. Larger harvests get a longer server time budget (151-200 questions \u2192 up to 280s). Credits are charged by extracted question; unused request hold is refunded."),
16160
+ gl: import_zod19.z.string().length(2).default("us").describe("Google country code inferred from location or user language. Examples: United States us, United Kingdom gb, Japan jp, Canada ca, Australia au."),
16161
+ hl: import_zod19.z.string().default("en").describe("Google interface/content language inferred from the user request. Use en unless the user asks for another language or locale."),
16162
+ device: import_zod19.z.enum(["desktop", "mobile"]).default("desktop").describe("SERP device context. Use desktop by default; use mobile only when the user asks for mobile rankings."),
16163
+ proxyMode: import_zod19.z.enum(["location", "configured", "none"]).default("location").describe("Proxy targeting mode. Use location by default so city/state searches create or reuse a matching residential proxy. Use configured for the static configured proxy. Use none only for direct-network debugging."),
16164
+ proxyZip: import_zod19.z.string().regex(/^\d{5}$/).optional().describe("Optional US ZIP override for residential location proxy targeting. Use only when the user gives a specific ZIP or city-center proxy targeting needs to be forced."),
16165
+ debug: import_zod19.z.boolean().default(false).describe("Include sanitized browser/session/location diagnostics in the response. Use true when debugging localization, CAPTCHA, or proxy behavior.")
16166
+ };
16167
+ ExtractUrlInputSchema = {
16168
+ url: import_zod19.z.string().url().describe("Public http/https URL to extract. Use this when the user provides one specific page URL."),
16169
+ screenshot: import_zod19.z.boolean().default(false).describe("Also capture a full-page screenshot of the URL. Saved to ~/Downloads/mcp-scraper/screenshots/ and returned inline. Use when the user asks to see or capture the page visually."),
16170
+ screenshotDevice: import_zod19.z.enum(["desktop", "mobile"]).default("desktop").describe("Viewport for screenshot. desktop = 1440\xD7900. mobile = 390\xD7844. Default desktop."),
16171
+ extractBranding: import_zod19.z.boolean().default(false).describe("Extract brand colors, fonts, logo, and favicon using a rendered browser session. Returns colorScheme (light/dark), colors (primary/accent/background/text/heading as hex), fonts (heading/body family names), and assets (logo URL, favicon URL). Use when the user asks about brand colors, site theme, or brand assets."),
16172
+ downloadMedia: import_zod19.z.boolean().default(false).describe("Extract and download all page media (images, video, audio) to ~/Downloads/mcp-scraper/media/. Ad networks, tracking pixels, and noise URLs are filtered automatically. Use when the user asks to download or harvest assets from a page."),
16173
+ mediaTypes: import_zod19.z.array(import_zod19.z.enum(["image", "video", "audio"])).default(["image", "video", "audio"]).describe("Which media types to download. Default all three."),
16174
+ allowLocal: import_zod19.z.boolean().default(false).describe("Allow localhost and private-network URLs. For local development only.")
16175
+ };
16176
+ MapSiteUrlsInputSchema = {
16177
+ url: import_zod19.z.string().url().describe("Public website URL or domain to crawl for internal URLs. Use before extract_site when the user asks to audit/map/crawl a site."),
16178
+ maxUrls: import_zod19.z.number().int().min(1).max(500).optional().describe("Maximum URLs to discover. Use 100 for normal maps, higher when the user asks for a full inventory.")
16179
+ };
16180
+ ExtractSiteInputSchema = {
16181
+ url: import_zod19.z.string().url().describe("Public website URL or domain to extract across multiple pages. Use when the user asks for a site audit, website crawl, or full-site content/schema extraction."),
16182
+ maxPages: import_zod19.z.number().int().min(1).max(50).optional().describe("Maximum pages to extract. Use 50 when the user asks for full results or a complete crawl within MCP limits.")
16183
+ };
16184
+ YoutubeHarvestInputSchema = {
16185
+ mode: import_zod19.z.enum(["search", "channel"]).describe("Use search for topic/keyword requests. Use channel when the user provides @handle, channel ID, or channel URL."),
16186
+ query: import_zod19.z.string().optional().describe("Required when mode is search. The YouTube search topic in the user\u2019s words."),
16187
+ channelHandle: import_zod19.z.string().optional().describe("YouTube channel handle, channel ID, or URL. Examples: @mkbhd, UC..., https://youtube.com/@mkbhd."),
16188
+ maxVideos: import_zod19.z.number().int().min(1).max(500).default(50).describe("Number of videos to return. Default 50. Increase when user asks for full channel/history.")
16189
+ };
16190
+ YoutubeTranscribeInputSchema = {
16191
+ videoId: import_zod19.z.string().min(1).describe("YouTube video ID, e.g. dQw4w9WgXcQ")
16192
+ };
16193
+ FacebookPageIntelInputSchema = {
16194
+ pageId: import_zod19.z.string().optional(),
16195
+ libraryId: import_zod19.z.string().optional(),
16196
+ query: import_zod19.z.string().optional().describe("Advertiser or brand name when pageId/libraryId is not known. One of pageId, libraryId, or query is required."),
16197
+ maxAds: import_zod19.z.number().int().min(1).max(200).default(50),
16198
+ country: import_zod19.z.string().length(2).default("US")
16199
+ };
16200
+ FacebookAdSearchInputSchema = {
16201
+ query: import_zod19.z.string().min(1).describe("Advertiser, brand, competitor, niche, or keyword to search in Facebook Ad Library."),
16202
+ country: import_zod19.z.string().length(2).default("US"),
16203
+ maxResults: import_zod19.z.number().int().min(1).max(20).default(10)
16204
+ };
16205
+ FacebookAdTranscribeInputSchema = {
16206
+ videoUrl: import_zod19.z.string().url().describe("Facebook CDN video URL from a facebook_page_intel result")
16207
+ };
16208
+ MapsPlaceIntelInputSchema = {
16209
+ businessName: import_zod19.z.string().min(1).describe('Business name only. If user says "Elite Roofing Denver CO", use businessName="Elite Roofing" and location="Denver, CO".'),
16210
+ location: import_zod19.z.string().min(1).describe('City/region/country where the business should be searched, e.g. "Denver, CO". Infer from the user request when possible.'),
16211
+ gl: import_zod19.z.string().length(2).default("us").describe("Google country code inferred from location."),
16212
+ hl: import_zod19.z.string().length(2).default("en").describe("Language inferred from user request."),
16213
+ includeReviews: import_zod19.z.boolean().default(false).describe("Whether to fetch individual review cards"),
16214
+ maxReviews: import_zod19.z.number().int().min(1).max(500).default(50).describe("Max review cards to return (requires includeReviews: true)")
16215
+ };
16216
+ MapsSearchInputSchema = {
16217
+ query: import_zod19.z.string().min(1).describe('Business category, niche, keyword, or search term. If the user says "roofers in Denver CO", use query="roofers" and location="Denver, CO". Do not put the location here when it can be separated.'),
16218
+ location: import_zod19.z.string().optional().describe('City, region, country, or service area for the Maps search, e.g. "Denver, CO". Infer from the user request when present.'),
16219
+ gl: import_zod19.z.string().length(2).default("us").describe("Google country code inferred from location."),
16220
+ hl: import_zod19.z.string().length(2).default("en").describe("Language inferred from user request."),
16221
+ maxResults: import_zod19.z.number().int().min(1).max(50).default(10).describe("Number of Google Maps business/profile candidates to return. Default 10. Maximum 50. Use 10 unless the user asks for more.")
16222
+ };
16223
+ NullableString = import_zod19.z.string().nullable();
16224
+ MapsSearchOutputSchema = {
16225
+ query: import_zod19.z.string(),
16226
+ location: import_zod19.z.string().nullable(),
16227
+ searchQuery: import_zod19.z.string(),
16228
+ searchUrl: import_zod19.z.string().url(),
16229
+ extractedAt: import_zod19.z.string(),
16230
+ requestedMaxResults: import_zod19.z.number().int().min(1).max(50),
16231
+ resultCount: import_zod19.z.number().int().min(0).max(50),
16232
+ results: import_zod19.z.array(import_zod19.z.object({
16233
+ position: import_zod19.z.number().int().min(1),
16234
+ name: import_zod19.z.string(),
16235
+ placeUrl: import_zod19.z.string().url(),
16236
+ cid: NullableString,
16237
+ cidDecimal: NullableString,
16238
+ rating: NullableString,
16239
+ reviewCount: NullableString,
16240
+ category: NullableString,
16241
+ address: NullableString,
16242
+ websiteUrl: NullableString,
16243
+ directionsUrl: NullableString,
16244
+ metadata: import_zod19.z.array(import_zod19.z.string())
16245
+ })),
16246
+ durationMs: import_zod19.z.number().int().min(0)
16247
+ };
16248
+ OrganicResultOutput = import_zod19.z.object({
16249
+ position: import_zod19.z.number().int(),
16250
+ title: import_zod19.z.string(),
16251
+ url: import_zod19.z.string(),
16252
+ domain: import_zod19.z.string(),
16253
+ snippet: NullableString
16254
+ });
16255
+ AiOverviewOutput = import_zod19.z.object({
16256
+ detected: import_zod19.z.boolean(),
16257
+ text: NullableString
16258
+ }).nullable();
16259
+ EntityIdsOutput = import_zod19.z.object({
16260
+ kgIds: import_zod19.z.array(import_zod19.z.string()),
16261
+ cids: import_zod19.z.array(import_zod19.z.string()),
16262
+ gcids: import_zod19.z.array(import_zod19.z.string())
16263
+ }).nullable();
16264
+ HarvestPaaOutputSchema = {
16265
+ query: import_zod19.z.string(),
16266
+ location: NullableString,
16267
+ questionCount: import_zod19.z.number().int().min(0),
16268
+ completionStatus: NullableString,
16269
+ questions: import_zod19.z.array(import_zod19.z.object({
16270
+ question: import_zod19.z.string(),
16271
+ answer: NullableString,
16272
+ sourceTitle: NullableString,
16273
+ sourceSite: NullableString
16274
+ })),
16275
+ organicResults: import_zod19.z.array(OrganicResultOutput),
16276
+ aiOverview: AiOverviewOutput,
16277
+ entityIds: EntityIdsOutput,
16278
+ durationMs: import_zod19.z.number().min(0).nullable()
16279
+ };
16280
+ SearchSerpOutputSchema = {
16281
+ query: import_zod19.z.string(),
16282
+ location: NullableString,
16283
+ organicResults: import_zod19.z.array(OrganicResultOutput),
16284
+ localPack: import_zod19.z.array(import_zod19.z.object({
16285
+ position: import_zod19.z.number().int(),
16286
+ name: import_zod19.z.string(),
16287
+ rating: NullableString,
16288
+ reviewCount: NullableString,
16289
+ websiteUrl: NullableString
16290
+ })),
16291
+ aiOverview: AiOverviewOutput,
16292
+ entityIds: EntityIdsOutput
16293
+ };
16294
+ ExtractUrlOutputSchema = {
16295
+ url: import_zod19.z.string(),
16296
+ title: NullableString,
16297
+ headings: import_zod19.z.array(import_zod19.z.object({
16298
+ level: import_zod19.z.number().int(),
16299
+ text: import_zod19.z.string()
16300
+ })),
16301
+ schemaBlockCount: import_zod19.z.number().int().min(0),
16302
+ entityName: NullableString,
16303
+ entityTypes: import_zod19.z.array(import_zod19.z.string()),
16304
+ napScore: import_zod19.z.number().nullable(),
16305
+ missingSchemaFields: import_zod19.z.array(import_zod19.z.string()),
16306
+ screenshotSaved: NullableString
16307
+ };
16308
+ ExtractSiteOutputSchema = {
16309
+ url: import_zod19.z.string(),
16310
+ pageCount: import_zod19.z.number().int().min(0),
16311
+ pages: import_zod19.z.array(import_zod19.z.object({
16312
+ url: import_zod19.z.string(),
16313
+ title: NullableString,
16314
+ schemaTypes: import_zod19.z.array(import_zod19.z.string())
16315
+ })),
16316
+ durationMs: import_zod19.z.number().min(0)
16317
+ };
16318
+ MapsPlaceIntelOutputSchema = {
16319
+ name: import_zod19.z.string(),
16320
+ rating: NullableString,
16321
+ reviewCount: NullableString,
16322
+ category: NullableString,
16323
+ address: NullableString,
16324
+ phone: NullableString,
16325
+ website: NullableString,
16326
+ hoursSummary: NullableString,
16327
+ bookingUrl: NullableString,
16328
+ kgmid: NullableString,
16329
+ cidDecimal: NullableString,
16330
+ cidUrl: NullableString,
16331
+ lat: import_zod19.z.number().nullable(),
16332
+ lng: import_zod19.z.number().nullable(),
16333
+ reviewsStatus: import_zod19.z.string(),
16334
+ reviewsCollected: import_zod19.z.number().int().min(0),
16335
+ reviewTopics: import_zod19.z.array(import_zod19.z.object({
16336
+ label: import_zod19.z.string(),
16337
+ count: import_zod19.z.string()
16338
+ }))
16339
+ };
16340
+ CreditsInfoOutputSchema = {
16341
+ balanceCredits: import_zod19.z.number().nullable(),
16342
+ matchedCost: import_zod19.z.object({
16343
+ label: import_zod19.z.string(),
16344
+ credits: import_zod19.z.number(),
16345
+ unit: import_zod19.z.string(),
16346
+ notes: NullableString
16347
+ }).nullable(),
16348
+ costs: import_zod19.z.array(import_zod19.z.object({
16349
+ key: import_zod19.z.string(),
16350
+ label: import_zod19.z.string(),
16351
+ credits: import_zod19.z.number(),
16352
+ unit: import_zod19.z.string(),
16353
+ notes: NullableString
16354
+ })),
16355
+ ledger: import_zod19.z.array(import_zod19.z.object({
16356
+ createdAt: import_zod19.z.string(),
16357
+ operation: import_zod19.z.string(),
16358
+ credits: import_zod19.z.number(),
16359
+ description: NullableString
16360
+ }))
16361
+ };
16362
+ MapSiteUrlsOutputSchema = {
16363
+ startUrl: import_zod19.z.string(),
16364
+ totalFound: import_zod19.z.number().int().min(0),
16365
+ truncated: import_zod19.z.boolean(),
16366
+ okCount: import_zod19.z.number().int().min(0),
16367
+ redirectCount: import_zod19.z.number().int().min(0),
16368
+ brokenCount: import_zod19.z.number().int().min(0),
16369
+ urls: import_zod19.z.array(import_zod19.z.object({
16370
+ url: import_zod19.z.string(),
16371
+ status: import_zod19.z.number().int().nullable()
16372
+ })),
16373
+ durationMs: import_zod19.z.number().min(0)
16374
+ };
16375
+ YoutubeHarvestOutputSchema = {
16376
+ mode: import_zod19.z.string(),
16377
+ videoCount: import_zod19.z.number().int().min(0),
16378
+ channel: import_zod19.z.object({
16379
+ title: NullableString,
16380
+ subscriberCount: NullableString
16381
+ }).nullable(),
16382
+ videos: import_zod19.z.array(import_zod19.z.object({
16383
+ videoId: import_zod19.z.string(),
16384
+ title: import_zod19.z.string(),
16385
+ channelName: NullableString,
16386
+ views: NullableString,
16387
+ duration: NullableString,
16388
+ url: NullableString
16389
+ }))
16390
+ };
16391
+ FacebookAdSearchOutputSchema = {
16392
+ query: import_zod19.z.string(),
16393
+ advertiserCount: import_zod19.z.number().int().min(0),
16394
+ advertisers: import_zod19.z.array(import_zod19.z.object({
16395
+ name: NullableString,
16396
+ adCount: import_zod19.z.number().int().nullable(),
16397
+ libraryId: NullableString
16398
+ }))
16399
+ };
16400
+ FacebookPageIntelOutputSchema = {
16401
+ advertiserName: NullableString,
16402
+ totalAds: import_zod19.z.number().int().min(0),
16403
+ activeCount: import_zod19.z.number().int().min(0),
16404
+ videoCount: import_zod19.z.number().int().min(0),
16405
+ imageCount: import_zod19.z.number().int().min(0),
16406
+ ads: import_zod19.z.array(import_zod19.z.object({
16407
+ libraryId: NullableString,
16408
+ status: NullableString,
16409
+ creativeType: NullableString,
16410
+ headline: NullableString,
16411
+ cta: NullableString,
16412
+ startDate: NullableString,
16413
+ videoUrl: NullableString,
16414
+ variations: import_zod19.z.number().int().nullable()
16415
+ }))
16416
+ };
16417
+ CreditsInfoInputSchema = {
16418
+ item: import_zod19.z.string().optional().describe('Optional tool, action, or feature to look up, e.g. "maps reviews", "extract_url", or "YouTube transcription"'),
16419
+ includeLedger: import_zod19.z.boolean().default(false).describe("Whether to include recent credit ledger entries")
16420
+ };
16421
+ SearchSerpInputSchema = {
16422
+ query: import_zod19.z.string().min(1).describe('Core search topic only. Separate location when possible. If user says "best dentist in Brooklyn NY serp", use query="best dentist" and location="Brooklyn, NY".'),
16423
+ location: import_zod19.z.string().optional().describe("City, region, or country for geo-targeted results, inferred from user request when present."),
16424
+ gl: import_zod19.z.string().length(2).default("us").describe("Google country code inferred from location or user language."),
16425
+ hl: import_zod19.z.string().default("en").describe("Google interface/content language inferred from user request."),
16426
+ device: import_zod19.z.enum(["desktop", "mobile"]).default("desktop").describe("SERP device context. Use desktop by default; use mobile only when the user asks for mobile rankings."),
16427
+ proxyMode: import_zod19.z.enum(["location", "configured", "none"]).default("location").describe("Proxy targeting mode. Use location by default so city/state searches create or reuse a matching residential proxy. Use configured for the static configured proxy. Use none only for direct-network debugging."),
16428
+ proxyZip: import_zod19.z.string().regex(/^\d{5}$/).optional().describe("Optional US ZIP override for residential location proxy targeting. Use only when the user gives a specific ZIP or city-center proxy targeting needs to be forced."),
16429
+ debug: import_zod19.z.boolean().default(false).describe("Include sanitized browser/session/location diagnostics in the response. Use true when debugging localization, CAPTCHA, or proxy behavior."),
16430
+ pages: import_zod19.z.number().int().min(1).max(2).default(1).describe("Number of result pages to fetch (1\u20132)")
16431
+ };
16432
+ CaptureSerpSnapshotInputSchema = {
16433
+ query: import_zod19.z.string().min(1).describe("Core search query to capture as a structured SERP Intelligence snapshot. Separate the place into location when the user gives a city, region, country, or ZIP."),
16434
+ location: import_zod19.z.string().optional().describe("City, region, country, or service area used for localized Google results. MCP Scraper records location evidence; UULE alone is not proof of localization."),
16435
+ gl: import_zod19.z.string().length(2).default("us").describe("Google country code inferred from the requested market, e.g. us, gb, ca, au."),
16436
+ hl: import_zod19.z.string().default("en").describe("Google interface/content language inferred from the user request."),
16437
+ device: import_zod19.z.enum(["desktop", "mobile"]).default("desktop").describe("SERP device context. Use mobile only when the user asks for mobile rankings or mobile SERP evidence."),
16438
+ proxyMode: import_zod19.z.enum(["location", "configured", "none"]).default("location").describe("Proxy behavior for capture. Use location for localized residential proxy targeting, configured for the static residential proxy, and none only for direct-network debugging."),
16439
+ proxyZip: import_zod19.z.string().regex(/^\d{5}$/).optional().describe("Optional US ZIP override for residential location proxy targeting when a precise city-center or ZIP proxy is needed."),
16440
+ pages: import_zod19.z.number().int().min(1).max(2).default(1).describe("Number of Google result pages to capture. Use 1 normally and 2 only when the user needs deeper ranking evidence."),
16441
+ debug: import_zod19.z.boolean().default(false).describe("Include sanitized browser, proxy, and location diagnostics. Use true when debugging localization, CAPTCHA, proxy selection, or capture reliability."),
16442
+ includePageSnapshots: import_zod19.z.boolean().default(false).describe("Also capture ranking-page snapshots for selected SERP URLs through the same product capture path."),
16443
+ pageSnapshotLimit: import_zod19.z.number().int().min(0).max(10).default(0).describe("Maximum ranking-page snapshots to capture when includePageSnapshots is true. Use 0 when only SERP evidence is needed.")
16444
+ };
16445
+ ScreenshotInputSchema = {
16446
+ url: import_zod19.z.string().url().describe("URL to capture as a full-page screenshot. Use http or https. Pass allowLocal: true to capture localhost or private-network URLs during development."),
16447
+ device: import_zod19.z.enum(["desktop", "mobile"]).default("desktop").describe("Viewport profile. desktop = 1440\xD7900. mobile = 390\xD7844. Use desktop by default; use mobile when the user asks for a mobile view."),
16448
+ allowLocal: import_zod19.z.boolean().default(false).describe("Allow localhost and private-network URLs (127.x, 192.168.x, 10.x, etc.). For local development only \u2014 not for production use.")
16449
+ };
16450
+ CaptureSerpPageSnapshotsInputSchema = {
16451
+ urls: import_zod19.z.array(import_zod19.z.string().url()).min(1).max(25).describe("Public HTTP/HTTPS URLs to capture as SERP Intelligence page snapshots. Do not pass localhost, private IPs, file URLs, or internal admin URLs."),
16452
+ targets: import_zod19.z.array(import_zod19.z.object({
16453
+ url: import_zod19.z.string().url().describe("Public HTTP/HTTPS URL to capture."),
16454
+ sourceKind: import_zod19.z.enum(["organic", "ai_citation", "local_pack_website", "configured_target", "site_subject"]).default("configured_target").describe("Why this page is being captured for SERP Intelligence evidence."),
16455
+ sourcePosition: import_zod19.z.number().int().min(1).optional().describe("Ranking or citation position when the page came from SERP evidence.")
16456
+ }).strict()).min(1).max(25).optional().describe("Structured page snapshot targets. Use this instead of urls when source kind or position should be preserved."),
16457
+ maxConcurrency: import_zod19.z.number().int().min(1).max(5).default(2).describe("Parallel page captures. Use 2 normally; higher values can increase proxy/browser pressure."),
16458
+ timeoutMs: import_zod19.z.number().int().min(1e3).max(6e4).default(15e3).describe("Per-page capture timeout in milliseconds. Increase for slow pages; timeout artifacts are returned as structured capture failures."),
16459
+ debug: import_zod19.z.boolean().default(false).describe("Include sanitized browser/proxy diagnostics for page snapshot debugging. Use true for capture, network, or proxy troubleshooting.")
16460
+ };
16461
+ }
16462
+ });
16463
+
16161
16464
  // src/mcp/paa-mcp-server.ts
16162
16465
  function liveWebToolAnnotations(title) {
16163
16466
  return {
@@ -16168,27 +16471,65 @@ function liveWebToolAnnotations(title) {
16168
16471
  openWorldHint: true
16169
16472
  };
16170
16473
  }
16474
/**
 * List the Markdown reports currently saved in the report output directory,
 * newest first.
 *
 * The listing is best-effort: if the output directory is missing or cannot
 * be read, an empty list is returned. Each entry is inspected on its own, so
 * one entry that cannot be stat'ed (for example a file removed between the
 * directory read and the stat call, or a dangling symlink) is skipped
 * instead of emptying the whole listing. Entries ending in ".md" that are
 * not regular files (e.g. directories) are skipped because they cannot be
 * read back as reports.
 *
 * @returns {{ filename: string, mtimeMs: number }[]} At most 100 entries,
 *   sorted by descending modification time.
 */
function listSavedReports() {
  const MAX_LISTED_REPORTS = 100;
  let dir;
  let entryNames;
  try {
    dir = outputBaseDir();
    entryNames = (0, import_node_fs5.readdirSync)(dir);
  } catch {
    // No readable output directory means there is nothing to list.
    return [];
  }
  const reports = [];
  for (const filename of entryNames) {
    if (!filename.endsWith(".md")) continue;
    try {
      const stats = (0, import_node_fs5.statSync)((0, import_node_path7.join)(dir, filename));
      if (stats.isFile()) reports.push({ filename, mtimeMs: stats.mtimeMs });
    } catch {
      // The entry vanished or is unreadable; keep listing the remaining reports.
    }
  }
  return reports.sort((a, b) => b.mtimeMs - a.mtimeMs).slice(0, MAX_LISTED_REPORTS);
}
16482
/**
 * Register the "saved-report" resource template on an MCP server so clients
 * can list and read Markdown reports from the report output directory.
 *
 * Listing exposes every entry returned by listSavedReports() under a
 * `report://<url-encoded filename>` URI. Reading decodes the requested
 * filename, reduces it to its base name (so directory components in the
 * request are dropped), and rejects anything that does not end in ".md".
 *
 * @param server MCP server instance exposing `registerResource`.
 */
function registerSavedReportResources(server) {
  // Shape a saved-report listing entry as an MCP resource descriptor.
  const toResourceEntry = (report) => ({
    uri: `report://${encodeURIComponent(report.filename)}`,
    name: report.filename,
    mimeType: "text/markdown"
  });

  const template = new import_mcp.ResourceTemplate("report://{filename}", {
    list: () => {
      const resources = listSavedReports().map(toResourceEntry);
      return { resources };
    }
  });

  const metadata = {
    title: "Saved MCP Scraper Reports",
    description: "Markdown research reports saved by previous MCP Scraper tool calls. Read a report to reuse prior research without re-scraping or spending credits.",
    mimeType: "text/markdown"
  };

  // Resolve the requested report by base name only and return its contents.
  const readSavedReport = async (uri, variables) => {
    const raw = variables.filename;
    // Template variables may arrive as an array; only the first value is used.
    const requested = Array.isArray(raw) ? raw[0] : raw;
    const decoded = decodeURIComponent(String(requested ?? ""));
    const filename = (0, import_node_path7.basename)(decoded);
    if (!filename.endsWith(".md")) {
      throw new Error("Only saved .md reports can be read");
    }
    const reportPath = (0, import_node_path7.join)(outputBaseDir(), filename);
    const text = (0, import_node_fs5.readFileSync)(reportPath, "utf8");
    return { contents: [{ uri: uri.href, mimeType: "text/markdown", text }] };
  };

  server.registerResource("saved-report", template, metadata, readSavedReport);
}
16171
16508
  function buildPaaExtractorMcpServer(executor, options = {}) {
16172
16509
  const savesReports = options.savesReportsLocally !== false;
16173
16510
  const reportNote = savesReports ? " Saves a full Markdown report locally." : " Reports are returned inline; no files are saved on this hosted endpoint.";
16174
16511
  const withReportNote = (description) => `${description}${reportNote}`;
16175
16512
  const server = new import_mcp.McpServer({ name: "mcp-scraper", version: PACKAGE_VERSION });
16513
+ if (savesReports) registerSavedReportResources(server);
16176
16514
  server.registerTool("harvest_paa", {
16177
16515
  title: "Google PAA + SERP Harvest",
16178
- description: withReportNote('Best default tool for Google search research. Extracts People Also Ask questions plus answers/source URLs, organic SERP, local pack when present, entity IDs (CID/GCID/KG MID), and AI Overview. Infer the user language: split topic from location (e.g. "best hvac company in Denver CO" => query "best hvac company", location "Denver, CO", gl "us", hl "en"). Use maxQuestions 30 normally, 100-150 for "full", "deep", "all", or comprehensive research. Credits are charged by extracted question; unused request hold is refunded.'),
16516
+ description: withReportNote('Best default tool for Google search research. Extracts People Also Ask questions plus answers/source URLs, organic SERP, local pack when present, entity IDs (CID/GCID/KG MID), and AI Overview. Infer the user language: split topic from location (e.g. "best hvac company in Denver CO" => query "best hvac company", location "Denver, CO", gl "us", hl "en"). Use maxQuestions 30 normally, 100-200 for "full", "deep", "all", or comprehensive research. Deep harvests above 100 questions can run for several minutes with no interim progress \u2014 warn the user before starting one and keep maxQuestions at or below 100 unless they explicitly want a deep harvest. Credits are charged by extracted question; unused request hold is refunded.'),
16179
16517
  inputSchema: HarvestPaaInputSchema,
16518
+ outputSchema: HarvestPaaOutputSchema,
16180
16519
  annotations: liveWebToolAnnotations("Google PAA + SERP Harvest")
16181
16520
  }, async (input) => formatHarvestPaa(await executor.harvestPaa(input), input));
16182
16521
  server.registerTool("search_serp", {
16183
16522
  title: "Google SERP Lookup",
16184
16523
  description: withReportNote("Fast Google SERP lookup without PAA expansion. Use when the user asks for rankings, organic results, local pack, quick SERP, or positions. Split topic from location and infer gl/hl from the user request."),
16185
16524
  inputSchema: SearchSerpInputSchema,
16525
+ outputSchema: SearchSerpOutputSchema,
16186
16526
  annotations: liveWebToolAnnotations("Google SERP Lookup")
16187
16527
  }, async (input) => formatSearchSerp(await executor.searchSerp(input), input));
16188
16528
  server.registerTool("extract_url", {
16189
16529
  title: "Single URL Extract",
16190
16530
  description: withReportNote("Extract structured data from one public URL: page content as Markdown, heading structure, JSON-LD schema, entity details, NAP score, metadata, and missing schema fields. Use when the user provides a single URL or asks to inspect/scrape one page."),
16191
16531
  inputSchema: ExtractUrlInputSchema,
16532
+ outputSchema: ExtractUrlOutputSchema,
16192
16533
  annotations: liveWebToolAnnotations("Single URL Extract")
16193
16534
  }, async (input) => formatExtractUrl(await executor.extractUrl(input), input));
16194
16535
  server.registerTool("map_site_urls", {
@@ -16202,6 +16543,7 @@ function buildPaaExtractorMcpServer(executor, options = {}) {
16202
16543
  title: "Multi-Page Site Extract",
16203
16544
  description: withReportNote("Run multi-page extraction across a public website. Returns per-page titles, H1s, metadata, headings, schema/entity data, canonical URLs, and content. Use for website audits, competitor audits, and full-site extraction."),
16204
16545
  inputSchema: ExtractSiteInputSchema,
16546
+ outputSchema: ExtractSiteOutputSchema,
16205
16547
  annotations: liveWebToolAnnotations("Multi-Page Site Extract")
16206
16548
  }, async (input) => formatExtractSite(await executor.extractSite(input), input));
16207
16549
  server.registerTool("youtube_harvest", {
@@ -16241,6 +16583,7 @@ function buildPaaExtractorMcpServer(executor, options = {}) {
16241
16583
  title: "Google Maps Business Profile Details",
16242
16584
  description: withReportNote('Extract Google Maps business intelligence for one known/named business: rating, review count, category, address, phone, website, hours, booking URL, review histogram, review topics, about attributes, entity IDs, and optional review cards. Do not use this for category searches, local market prospect lists, or requests for multiple GMB/GBP profiles; use maps_search first for those. Split business name from location (e.g. "Elite Roofing Denver CO" => businessName "Elite Roofing", location "Denver, CO"). Pass includeReviews true when the user asks for reviews/customer pain.'),
16243
16585
  inputSchema: MapsPlaceIntelInputSchema,
16586
+ outputSchema: MapsPlaceIntelOutputSchema,
16244
16587
  annotations: liveWebToolAnnotations("Google Maps Business Profile Details")
16245
16588
  }, async (input) => formatMapsPlaceIntel(await executor.mapsPlaceIntel(input), input));
16246
16589
  server.registerTool("maps_search", {
@@ -16254,6 +16597,7 @@ function buildPaaExtractorMcpServer(executor, options = {}) {
16254
16597
  title: "MCP Scraper Credits & Costs",
16255
16598
  description: "Answer questions about MCP Scraper credits: current credit balance, what a specific tool/action costs, the full cost table, and optionally recent credit ledger entries. Does not expose payment methods or credit card information.",
16256
16599
  inputSchema: CreditsInfoInputSchema,
16600
+ outputSchema: CreditsInfoOutputSchema,
16257
16601
  annotations: {
16258
16602
  title: "MCP Scraper Credits & Costs",
16259
16603
  readOnlyHint: true,
@@ -16264,12 +16608,15 @@ function buildPaaExtractorMcpServer(executor, options = {}) {
16264
16608
  }, async (input) => formatCreditsInfo(await executor.creditsInfo(input), input));
16265
16609
  return server;
16266
16610
  }
16267
- var import_mcp;
16611
+ var import_mcp, import_node_fs5, import_node_path7;
16268
16612
  var init_paa_mcp_server = __esm({
16269
16613
  "src/mcp/paa-mcp-server.ts"() {
16270
16614
  "use strict";
16271
16615
  import_mcp = require("@modelcontextprotocol/sdk/server/mcp.js");
16616
+ import_node_fs5 = require("fs");
16617
+ import_node_path7 = require("path");
16272
16618
  init_version();
16619
+ init_mcp_response_formatter();
16273
16620
  init_mcp_tool_schemas();
16274
16621
  init_mcp_response_formatter();
16275
16622
  }
@@ -16397,7 +16744,10 @@ function mcpAuthError() {
16397
16744
  });
16398
16745
  return new Response(body, {
16399
16746
  status: 401,
16400
- headers: { "Content-Type": "application/json" }
16747
+ headers: {
16748
+ "Content-Type": "application/json",
16749
+ "WWW-Authenticate": 'Bearer realm="mcp-scraper", error="invalid_token", error_description="Pass an MCP Scraper API key as x-api-key or Bearer token"'
16750
+ }
16401
16751
  });
16402
16752
  }
16403
16753
  async function requireMcpCallerKey(c) {
@@ -16833,7 +17183,7 @@ async function processJob(job) {
16833
17183
  const opts = typeof job.options === "string" ? JSON.parse(job.options) : job.options;
16834
17184
  const result = await harvest({
16835
17185
  ...opts,
16836
- kernelApiKey: process.env.KERNEL_API_KEY,
17186
+ kernelApiKey: browserServiceApiKey(),
16837
17187
  headless: true,
16838
17188
  format: "json",
16839
17189
  outputDir: "/tmp/paa-output-api",
@@ -16898,6 +17248,7 @@ var init_worker = __esm({
16898
17248
  "src/api/worker.ts"() {
16899
17249
  "use strict";
16900
17250
  init_db();
17251
+ init_browser_service_env();
16901
17252
  init_harvest();
16902
17253
  init_webhook();
16903
17254
  init_rates();
@@ -17000,6 +17351,8 @@ var init_server = __esm({
17000
17351
  "src/api/server.ts"() {
17001
17352
  "use strict";
17002
17353
  init_harvest_timeout();
17354
+ init_browser_service_env();
17355
+ init_outbound_sanitize();
17003
17356
  init_registry();
17004
17357
  init_template();
17005
17358
  init_og();
@@ -17316,7 +17669,7 @@ var init_server = __esm({
17316
17669
  try {
17317
17670
  const result = await harvest({
17318
17671
  ...options,
17319
- kernelApiKey: process.env.KERNEL_API_KEY?.trim(),
17672
+ kernelApiKey: browserServiceApiKey(),
17320
17673
  headless: true,
17321
17674
  format: "json",
17322
17675
  outputDir: "/tmp/paa-output-api",
@@ -17331,7 +17684,7 @@ var init_server = __esm({
17331
17684
  if (diff > 0) await creditMc(user.id, diff, LedgerOperation.PAA_REFUND, "overestimate refund");
17332
17685
  else if (diff < 0) await debitMc(user.id, -diff, LedgerOperation.PAA, options.query);
17333
17686
  }
17334
- return c.json({ job_id: jobId, status: "done", result, attempts });
17687
+ return c.json({ job_id: jobId, status: "done", result: sanitizeHarvestResult(result), attempts: sanitizeAttempts(attempts) });
17335
17688
  } catch (err) {
17336
17689
  const problem = classifyHarvestProblem(err);
17337
17690
  const response = harvestProblemResponse(problem);
@@ -17339,18 +17692,19 @@ var init_server = __esm({
17339
17692
  if (problem.terminalStatus === "cancelled" || c.req.raw.signal.aborted) {
17340
17693
  await cancelJob(jobId, serializeHarvestProblem(problem));
17341
17694
  await creditMc(user.id, syncCost, LedgerOperation.REFUND, "cancelled call");
17342
- return c.json({ job_id: jobId, status: "cancelled", ...response, attempts }, problem.httpStatus);
17695
+ return c.json({ job_id: jobId, status: "cancelled", ...response, attempts: sanitizeAttempts(attempts) }, problem.httpStatus);
17343
17696
  }
17344
17697
  await failJob(jobId, serializeHarvestProblem(problem));
17345
17698
  await creditMc(user.id, syncCost, LedgerOperation.REFUND, "failed call");
17346
- return c.json({ job_id: jobId, status: "failed", ...response, attempts }, problem.httpStatus);
17699
+ return c.json({ job_id: jobId, status: "failed", ...response, attempts: sanitizeAttempts(attempts) }, problem.httpStatus);
17347
17700
  }
17348
17701
  });
17349
17702
  app.get("/jobs/:id", auth, async (c) => {
17350
17703
  const job = await getJob(c.req.param("id"), c.get("user").id);
17351
17704
  if (!job) return c.json({ error: "Job not found" }, 404);
17352
17705
  const attempts = await listHarvestAttempts(job.id, c.get("user").id);
17353
- return c.json({ ...job, attempts });
17706
+ const safeResult = job.result && typeof job.result === "object" ? sanitizeHarvestResult(job.result) : job.result;
17707
+ return c.json({ ...job, result: safeResult, attempts: sanitizeAttempts(attempts) });
17354
17708
  });
17355
17709
  app.get("/jobs", auth, async (c) => {
17356
17710
  return c.json(await listJobs(c.get("user").id));
@@ -17449,7 +17803,7 @@ var init_server = __esm({
17449
17803
  const { ok: euOk, balance_mc: euBal } = await debitMc(user.id, MC_COSTS.page_scrape, LedgerOperation.EXTRACT_URL, new URL(canonicalUrl).hostname);
17450
17804
  if (!euOk) return c.json(insufficientBalanceResponse(euBal, MC_COSTS.page_scrape), 402);
17451
17805
  try {
17452
- const kernelApiKey = process.env.KERNEL_API_KEY?.trim();
17806
+ const kernelApiKey = browserServiceApiKey();
17453
17807
  const device = screenshotDevice === "mobile" ? "mobile" : "desktop";
17454
17808
  const [result, pageData] = await Promise.all([
17455
17809
  extractKpo({ url: canonicalUrl, kernelApiKey }),
@@ -17487,7 +17841,7 @@ var init_server = __esm({
17487
17841
  startUrl: parsed.href,
17488
17842
  maxUrls: Math.min(2e3, Math.max(1, body.maxUrls ?? 500)),
17489
17843
  concurrency: Math.min(20, Math.max(1, body.concurrency ?? 12)),
17490
- kernelApiKey: body.browserFallback ?? body.kernelFallback ? process.env.KERNEL_API_KEY : void 0
17844
+ kernelApiKey: body.browserFallback ?? body.kernelFallback ? browserServiceApiKey() : void 0
17491
17845
  });
17492
17846
  await logRequestEvent({
17493
17847
  userId: user.id,
@@ -17527,7 +17881,7 @@ var init_server = __esm({
17527
17881
  const result = await extractSite({
17528
17882
  startUrl: parsed.href,
17529
17883
  maxPages: Math.min(200, Math.max(1, body.maxPages ?? 100)),
17530
- kernelApiKey: body.browserFallback ?? body.kernelFallback ? process.env.KERNEL_API_KEY : void 0
17884
+ kernelApiKey: body.browserFallback ?? body.kernelFallback ? browserServiceApiKey() : void 0
17531
17885
  });
17532
17886
  const pageCount = result.pages?.length ?? 1;
17533
17887
  const actualSiteMc = pageCount * MC_COSTS.page_scrape;
@@ -17797,10 +18151,10 @@ var init_server = __esm({
17797
18151
  });
17798
18152
 
17799
18153
  // bin/api-server.ts
17800
- var import_node_fs5 = require("fs");
18154
+ var import_node_fs6 = require("fs");
17801
18155
  function loadDotEnv() {
17802
18156
  try {
17803
- for (const line of (0, import_node_fs5.readFileSync)(".env", "utf8").split("\n")) {
18157
+ for (const line of (0, import_node_fs6.readFileSync)(".env", "utf8").split("\n")) {
17804
18158
  const eq = line.indexOf("=");
17805
18159
  if (eq < 1 || line.trimStart().startsWith("#")) continue;
17806
18160
  const k = line.slice(0, eq).trim();