mcp-scraper 0.1.7 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,6 +1,6 @@
1
1
  import {
2
2
  harvest
3
- } from "./chunk-LUBDFS67.js";
3
+ } from "./chunk-TM22BLWP.js";
4
4
  import "./chunk-ZMOWIBMK.js";
5
5
 
6
6
  // src/video/VideoGenerator.ts
@@ -6,7 +6,7 @@ import {
6
6
  configureReportSaving,
7
7
  harvestTimeoutBudget,
8
8
  liveWebToolAnnotations
9
- } from "./chunk-3OIRNUF5.js";
9
+ } from "./chunk-JNC32DMS.js";
10
10
  import {
11
11
  BALANCE_PACK_LABELS,
12
12
  BALANCE_PRICE_IDS,
@@ -32,10 +32,12 @@ import {
32
32
  RawMapsHoursRowSchema,
33
33
  RawMapsOverviewSchema,
34
34
  RawMapsReviewStatsSchema,
35
+ browserServiceApiKey,
36
+ browserServiceProxyId,
35
37
  buildYouTubeChannelVideosUrl,
36
38
  harvest,
37
39
  resolveKernelProxyId
38
- } from "./chunk-LUBDFS67.js";
40
+ } from "./chunk-TM22BLWP.js";
39
41
  import {
40
42
  CaptchaError,
41
43
  RECAPTCHA_INSTRUCTIONS,
@@ -84,6 +86,53 @@ import {
84
86
  verifyPassword
85
87
  } from "./chunk-D4CJBZBY.js";
86
88
 
89
+ // src/api/outbound-sanitize.ts
90
/**
 * Vendor-specific diagnostic keys mapped to the neutral names used in
 * outbound API payloads. Both snake_case and camelCase spellings are listed.
 */
const KEY_RENAMES = Object.freeze({
  kernel: "browserRuntime",
  kernel_session_id: "browser_session_id",
  kernel_delete_started: "session_cleanup_started",
  kernel_delete_succeeded: "session_cleanup_succeeded",
  kernel_delete_error: "session_cleanup_error",
  kernelSessionId: "browserSessionId",
  kernelDeleteStarted: "sessionCleanupStarted",
  kernelDeleteSucceeded: "sessionCleanupSucceeded",
  kernelDeleteError: "sessionCleanupError",
  kernelProxyId: "proxyId"
});

/** String values are only scrubbed when they sit under a key matching this pattern. */
const SANITIZED_VALUE_KEYS = /error|message/i;

/**
 * Recursively copies a diagnostics value, renaming vendor-specific keys and
 * scrubbing vendor names out of error/message strings.
 *
 * @param {*} value - Any JSON-like value (string, array, object, primitive).
 * @param {string} [parentKey=""] - Original (pre-rename) key the value was
 *   stored under; array items inherit the key of the array itself.
 * @returns {*} A sanitized copy for arrays and objects; strings are returned
 *   as-is unless they are scrubbed; every other value is returned unchanged.
 */
function sanitizeOutboundDiagnostics(value, parentKey = "") {
  if (typeof value === "string") {
    // sanitizeVendorName is not part of this block; it is expected to be in
    // scope from elsewhere in the bundle.
    const needsScrub = SANITIZED_VALUE_KEYS.test(parentKey) && /kernel/i.test(value);
    return needsScrub ? sanitizeVendorName(value) : value;
  }
  if (Array.isArray(value)) {
    return value.map((item) => sanitizeOutboundDiagnostics(item, parentKey));
  }
  // Dates have no own enumerable properties, so rebuilding them from their
  // entries would turn them into `{}`. Keep them so they serialize normally.
  if (value instanceof Date) return value;
  if (value !== null && typeof value === "object") {
    // Object.fromEntries defines own data properties, so a key such as
    // "__proto__" stays a plain key instead of replacing the prototype.
    return Object.fromEntries(
      Object.entries(value).map(([key, val]) => {
        // Only own entries of KEY_RENAMES count; a bare lookup would also hit
        // inherited members such as "constructor" or "toString".
        const renamed = Object.prototype.hasOwnProperty.call(KEY_RENAMES, key) ? KEY_RENAMES[key] : key;
        return [renamed, sanitizeOutboundDiagnostics(val, key)];
      })
    );
  }
  return value;
}
121
/**
 * Sanitizes every entry of an attempts list for outbound use.
 *
 * @param {Array<*>} attempts - Attempt records to sanitize.
 * @returns {Array<*>} A new array with each entry passed through
 *   sanitizeOutboundDiagnostics.
 */
function sanitizeAttempts(attempts) {
  // Wrap the call so only the entry is forwarded; handing the sanitizer to
  // map() directly would pass the array index as its parentKey argument.
  const scrubAttempt = (attempt) => sanitizeOutboundDiagnostics(attempt);
  return attempts.map(scrubAttempt);
}
124
/**
 * Returns a harvest result whose `diagnostics.debug` payload has been
 * sanitized for outbound use.
 *
 * @param {*} result - Harvest result; may be null or undefined.
 * @returns {*} The same `result` reference when there is no truthy
 *   `diagnostics.debug`; otherwise a shallow copy of `result` with a shallow
 *   copy of `diagnostics` whose `debug` is the sanitized value.
 */
function sanitizeHarvestResult(result) {
  const diagnostics = result?.diagnostics;
  const debug = diagnostics?.debug;
  if (!debug) {
    return result;
  }
  const safeDiagnostics = { ...diagnostics, debug: sanitizeOutboundDiagnostics(debug) };
  return { ...result, diagnostics: safeDiagnostics };
}
135
+
87
136
  // src/blog/registry.ts
88
137
  var posts = [
89
138
  {
@@ -3446,7 +3495,7 @@ import TurndownService from "turndown";
3446
3495
  import Kernel from "@onkernel/sdk";
3447
3496
  import { chromium } from "playwright";
3448
3497
  async function fetchWithKernel(url) {
3449
- const apiKey = process.env.KERNEL_API_KEY;
3498
+ const apiKey = browserServiceApiKey();
3450
3499
  if (!apiKey) throw new Error("Browser backend API key not set");
3451
3500
  const client = new Kernel({ apiKey });
3452
3501
  const kb = await client.browsers.create({ stealth: true, timeout_seconds: 60 });
@@ -7613,7 +7662,7 @@ async function writeOutputs(result, outputDir) {
7613
7662
  }
7614
7663
  }
7615
7664
  async function ytHarvest(rawOptions) {
7616
- const kernelApiKey = process.env.KERNEL_API_KEY;
7665
+ const kernelApiKey = browserServiceApiKey();
7617
7666
  if (!kernelApiKey) {
7618
7667
  throw new Error("A browser backend API key is required \u2014 YouTube harvesting requires a stealth session.");
7619
7668
  }
@@ -7708,7 +7757,7 @@ function parseTimedtextXml(xml) {
7708
7757
  return results;
7709
7758
  }
7710
7759
  async function fetchViaKernelInnertube(videoId) {
7711
- const kernelApiKey = process.env.KERNEL_API_KEY;
7760
+ const kernelApiKey = browserServiceApiKey();
7712
7761
  if (!kernelApiKey) return null;
7713
7762
  const driver = new BrowserDriver();
7714
7763
  const start = Date.now();
@@ -7852,7 +7901,7 @@ async function attemptKernelWhisper(videoId, kernelApiKey, falKey, start) {
7852
7901
  }
7853
7902
  }
7854
7903
  async function fetchViaKernelWhisper(videoId) {
7855
- const kernelApiKey = process.env.KERNEL_API_KEY;
7904
+ const kernelApiKey = browserServiceApiKey();
7856
7905
  const falKey = process.env.FAL_KEY;
7857
7906
  if (!kernelApiKey || !falKey) return null;
7858
7907
  const start = Date.now();
@@ -8123,7 +8172,7 @@ screenshotApp.post("/", async (c) => {
8123
8172
  }
8124
8173
  const device2 = body.device === "mobile" ? "mobile" : "desktop";
8125
8174
  try {
8126
- const buf = await captureScreenshot(parsedFallback.href, process.env.KERNEL_API_KEY?.trim(), device2);
8175
+ const buf = await captureScreenshot(parsedFallback.href, browserServiceApiKey(), device2);
8127
8176
  return new Response(new Uint8Array(buf), {
8128
8177
  status: 200,
8129
8178
  headers: {
@@ -8139,7 +8188,7 @@ screenshotApp.post("/", async (c) => {
8139
8188
  }
8140
8189
  const device = body.device === "mobile" ? "mobile" : "desktop";
8141
8190
  try {
8142
- const buf = await captureScreenshot(urlCheck.parsed.href, process.env.KERNEL_API_KEY?.trim(), device);
8191
+ const buf = await captureScreenshot(urlCheck.parsed.href, browserServiceApiKey(), device);
8143
8192
  return new Response(new Uint8Array(buf), {
8144
8193
  status: 200,
8145
8194
  headers: {
@@ -8966,23 +9015,23 @@ function buildPageIntelUrl(body, country) {
8966
9015
  return `https://www.facebook.com/ads/library/?active_status=all&ad_type=all&country=${country}&q=${encodeURIComponent(body.query.trim())}&search_type=keyword_unordered`;
8967
9016
  }
8968
9017
  // Builds the default browser launch options: headless, a 1280x900 viewport,
  // the "en-US" locale, plus an API key and proxy id.
  // Diff note: the removed line read and trimmed process.env.KERNEL_API_KEY /
  // process.env.KERNEL_PROXY_ID directly; the added line gets both values from
  // browserServiceApiKey() / browserServiceProxyId().
  function kernelLaunchOpts() {
8969
- return { headless: true, kernelApiKey: process.env.KERNEL_API_KEY?.trim(), kernelProxyId: process.env.KERNEL_PROXY_ID?.trim(), viewport: { width: 1280, height: 900 }, locale: "en-US" };
9018
+ return { headless: true, kernelApiKey: browserServiceApiKey(), kernelProxyId: browserServiceProxyId(), viewport: { width: 1280, height: 900 }, locale: "en-US" };
8970
9019
  }
8971
9020
  // Async variant of the launch options that first tries to resolve a proxy id
  // through resolveKernelProxyId (proxyMode "location", location "New York, NY",
  // gl "us"). The configured proxy id is the starting value; it is replaced only
  // when the resolution returns a truthy kernelProxyId, and any error thrown by
  // the resolution is swallowed and the configured value is used again.
  // Diff note: every direct, trimmed process.env.KERNEL_API_KEY /
  // process.env.KERNEL_PROXY_ID read is replaced by browserServiceApiKey() /
  // browserServiceProxyId().
  async function kernelLaunchOptsResidential() {
8972
- let proxyId = process.env.KERNEL_PROXY_ID?.trim();
9021
+ let proxyId = browserServiceProxyId();
8973
9022
  try {
8974
9023
  const resolution = await resolveKernelProxyId({
8975
- kernelApiKey: process.env.KERNEL_API_KEY?.trim(),
9024
+ kernelApiKey: browserServiceApiKey(),
8976
9025
  proxyMode: "location",
8977
- configuredKernelProxyId: process.env.KERNEL_PROXY_ID?.trim(),
9026
+ configuredKernelProxyId: browserServiceProxyId(),
8978
9027
  location: "New York, NY",
8979
9028
  gl: "us"
8980
9029
  });
8981
9030
  if (resolution.kernelProxyId) proxyId = resolution.kernelProxyId;
8982
9031
  } catch {
8983
- proxyId = process.env.KERNEL_PROXY_ID?.trim();
9032
+ proxyId = browserServiceProxyId();
8984
9033
  }
8985
- return { headless: true, kernelApiKey: process.env.KERNEL_API_KEY?.trim(), kernelProxyId: proxyId, viewport: { width: 1280, height: 900 }, locale: "en-US" };
9034
+ return { headless: true, kernelApiKey: browserServiceApiKey(), kernelProxyId: proxyId, viewport: { width: 1280, height: 900 }, locale: "en-US" };
8986
9035
  }
8987
9036
  var facebookAdApp = new Hono4();
8988
9037
  facebookAdApp.post("/ad", createApiKeyAuth(), async (c) => {
@@ -10628,8 +10677,8 @@ async function captureSerpIntelligenceSnapshot(rawInput, runtimeOptions = {}) {
10628
10677
  debug,
10629
10678
  serpOnly: true,
10630
10679
  headless: runtimeOptions.headless ?? true,
10631
- kernelApiKey: runtimeOptions.kernelApiKey ?? process.env.KERNEL_API_KEY?.trim(),
10632
- kernelProxyId: runtimeOptions.kernelProxyId ?? process.env.KERNEL_PROXY_ID?.trim(),
10680
+ kernelApiKey: runtimeOptions.kernelApiKey ?? browserServiceApiKey(),
10681
+ kernelProxyId: runtimeOptions.kernelProxyId ?? browserServiceProxyId(),
10633
10682
  format: "json",
10634
10683
  outputDir: runtimeOptions.outputDir ?? "/tmp/serp-intelligence-output",
10635
10684
  signal: runtimeOptions.signal,
@@ -10640,7 +10689,7 @@ async function captureSerpIntelligenceSnapshot(rawInput, runtimeOptions = {}) {
10640
10689
  const pageSnapshotLimit = normalizePageSnapshotLimit(parsedInput);
10641
10690
  const pageSnapshotTargets = collectPageSnapshotTargets(harvestResult, pageSnapshotLimit);
10642
10691
  const pageSnapshotArtifacts = pageSnapshotTargets.length > 0 ? (await capturePageSnapshotsFn(pageSnapshotTargets, {
10643
- kernelApiKey: runtimeOptions.kernelApiKey ?? process.env.KERNEL_API_KEY?.trim(),
10692
+ kernelApiKey: runtimeOptions.kernelApiKey ?? browserServiceApiKey(),
10644
10693
  timeoutMs: runtimeOptions.pageSnapshotTimeoutMs,
10645
10694
  maxConcurrency: runtimeOptions.pageSnapshotMaxConcurrency,
10646
10695
  debug,
@@ -10748,8 +10797,8 @@ serpIntelligenceApp.post("/capture", async (c) => {
10748
10797
  if (!ok) return c.json(insufficientBalanceResponse(balance_mc, cost), 402);
10749
10798
  try {
10750
10799
  const result = await captureSerpIntelligenceSnapshot(parsed.data, {
10751
- kernelApiKey: process.env.KERNEL_API_KEY?.trim(),
10752
- kernelProxyId: process.env.KERNEL_PROXY_ID?.trim(),
10800
+ kernelApiKey: browserServiceApiKey(),
10801
+ kernelProxyId: browserServiceProxyId(),
10753
10802
  signal: c.req.raw.signal,
10754
10803
  billing: { creditsUsed: cost / 1e3 }
10755
10804
  });
@@ -10804,7 +10853,7 @@ serpIntelligenceApp.post("/page-snapshots", async (c) => {
10804
10853
  if (!ok) return c.json(insufficientBalanceResponse(balance_mc, cost), 402);
10805
10854
  try {
10806
10855
  const result = await capturePageSnapshots(targets, {
10807
- kernelApiKey: process.env.KERNEL_API_KEY?.trim(),
10856
+ kernelApiKey: browserServiceApiKey(),
10808
10857
  timeoutMs: parsed.data.timeoutMs,
10809
10858
  maxConcurrency: parsed.data.maxConcurrency,
10810
10859
  debug: parsed.data.debug
@@ -10852,7 +10901,10 @@ function mcpAuthError() {
10852
10901
  });
10853
10902
  return new Response(body, {
10854
10903
  status: 401,
10855
- headers: { "Content-Type": "application/json" }
10904
+ headers: {
10905
+ "Content-Type": "application/json",
10906
+ "WWW-Authenticate": 'Bearer realm="mcp-scraper", error="invalid_token", error_description="Pass an MCP Scraper API key as x-api-key or Bearer token"'
10907
+ }
10856
10908
  });
10857
10909
  }
10858
10910
  async function requireMcpCallerKey(c) {
@@ -11517,7 +11569,7 @@ app.post("/harvest/sync", auth, async (c) => {
11517
11569
  try {
11518
11570
  const result = await harvest({
11519
11571
  ...options,
11520
- kernelApiKey: process.env.KERNEL_API_KEY?.trim(),
11572
+ kernelApiKey: browserServiceApiKey(),
11521
11573
  headless: true,
11522
11574
  format: "json",
11523
11575
  outputDir: "/tmp/paa-output-api",
@@ -11532,7 +11584,7 @@ app.post("/harvest/sync", auth, async (c) => {
11532
11584
  if (diff > 0) await creditMc(user.id, diff, LedgerOperation.PAA_REFUND, "overestimate refund");
11533
11585
  else if (diff < 0) await debitMc(user.id, -diff, LedgerOperation.PAA, options.query);
11534
11586
  }
11535
- return c.json({ job_id: jobId, status: "done", result, attempts });
11587
+ return c.json({ job_id: jobId, status: "done", result: sanitizeHarvestResult(result), attempts: sanitizeAttempts(attempts) });
11536
11588
  } catch (err) {
11537
11589
  const problem = classifyHarvestProblem(err);
11538
11590
  const response = harvestProblemResponse(problem);
@@ -11540,18 +11592,19 @@ app.post("/harvest/sync", auth, async (c) => {
11540
11592
  if (problem.terminalStatus === "cancelled" || c.req.raw.signal.aborted) {
11541
11593
  await cancelJob(jobId, serializeHarvestProblem(problem));
11542
11594
  await creditMc(user.id, syncCost, LedgerOperation.REFUND, "cancelled call");
11543
- return c.json({ job_id: jobId, status: "cancelled", ...response, attempts }, problem.httpStatus);
11595
+ return c.json({ job_id: jobId, status: "cancelled", ...response, attempts: sanitizeAttempts(attempts) }, problem.httpStatus);
11544
11596
  }
11545
11597
  await failJob(jobId, serializeHarvestProblem(problem));
11546
11598
  await creditMc(user.id, syncCost, LedgerOperation.REFUND, "failed call");
11547
- return c.json({ job_id: jobId, status: "failed", ...response, attempts }, problem.httpStatus);
11599
+ return c.json({ job_id: jobId, status: "failed", ...response, attempts: sanitizeAttempts(attempts) }, problem.httpStatus);
11548
11600
  }
11549
11601
  });
11550
11602
  // GET /jobs/:id — looks the job up by the path id and the current user's id,
  // answers 404 with { error: "Job not found" } when nothing is found, and
  // otherwise responds with the job fields plus its harvest attempts.
  // Diff note: the added lines pass job.result through sanitizeHarvestResult
  // (only when it is a truthy object) and the attempts through sanitizeAttempts
  // before responding; the removed line returned both unmodified.
  app.get("/jobs/:id", auth, async (c) => {
11551
11603
  const job = await getJob(c.req.param("id"), c.get("user").id);
11552
11604
  if (!job) return c.json({ error: "Job not found" }, 404);
11553
11605
  const attempts = await listHarvestAttempts(job.id, c.get("user").id);
11554
- return c.json({ ...job, attempts });
11606
+ const safeResult = job.result && typeof job.result === "object" ? sanitizeHarvestResult(job.result) : job.result;
11607
+ return c.json({ ...job, result: safeResult, attempts: sanitizeAttempts(attempts) });
11555
11608
  });
11556
11609
  app.get("/jobs", auth, async (c) => {
11557
11610
  return c.json(await listJobs(c.get("user").id));
@@ -11650,7 +11703,7 @@ app.post("/extract-url", auth, async (c) => {
11650
11703
  const { ok: euOk, balance_mc: euBal } = await debitMc(user.id, MC_COSTS.page_scrape, LedgerOperation.EXTRACT_URL, new URL(canonicalUrl).hostname);
11651
11704
  if (!euOk) return c.json(insufficientBalanceResponse(euBal, MC_COSTS.page_scrape), 402);
11652
11705
  try {
11653
- const kernelApiKey = process.env.KERNEL_API_KEY?.trim();
11706
+ const kernelApiKey = browserServiceApiKey();
11654
11707
  const device = screenshotDevice === "mobile" ? "mobile" : "desktop";
11655
11708
  const [result, pageData] = await Promise.all([
11656
11709
  extractKpo({ url: canonicalUrl, kernelApiKey }),
@@ -11688,7 +11741,7 @@ app.post("/map-urls", auth, async (c) => {
11688
11741
  startUrl: parsed.href,
11689
11742
  maxUrls: Math.min(2e3, Math.max(1, body.maxUrls ?? 500)),
11690
11743
  concurrency: Math.min(20, Math.max(1, body.concurrency ?? 12)),
11691
- kernelApiKey: body.browserFallback ?? body.kernelFallback ? process.env.KERNEL_API_KEY : void 0
11744
+ kernelApiKey: body.browserFallback ?? body.kernelFallback ? browserServiceApiKey() : void 0
11692
11745
  });
11693
11746
  await logRequestEvent({
11694
11747
  userId: user.id,
@@ -11728,7 +11781,7 @@ app.post("/extract-site", auth, async (c) => {
11728
11781
  const result = await extractSite({
11729
11782
  startUrl: parsed.href,
11730
11783
  maxPages: Math.min(200, Math.max(1, body.maxPages ?? 100)),
11731
- kernelApiKey: body.browserFallback ?? body.kernelFallback ? process.env.KERNEL_API_KEY : void 0
11784
+ kernelApiKey: body.browserFallback ?? body.kernelFallback ? browserServiceApiKey() : void 0
11732
11785
  });
11733
11786
  const pageCount = result.pages?.length ?? 1;
11734
11787
  const actualSiteMc = pageCount * MC_COSTS.page_scrape;
@@ -11875,7 +11928,7 @@ app.get("/cron/tick", async (c) => {
11875
11928
  if (!process.env.CRON_SECRET || secret2 !== `Bearer ${process.env.CRON_SECRET}`) {
11876
11929
  return c.json({ error: "Unauthorized" }, 401);
11877
11930
  }
11878
- const { drainQueue } = await import("./worker-PBG6LGET.js");
11931
+ const { drainQueue } = await import("./worker-AUCXFHEL.js");
11879
11932
  const budget = { maxJobs: 10, deadlineMs: Date.now() + 28e4 };
11880
11933
  const [results, sweepResult] = await Promise.all([
11881
11934
  drainQueue(budget),
@@ -11997,4 +12050,4 @@ app.get("/blog/:slug/", (c) => {
11997
12050
  export {
11998
12051
  app
11999
12052
  };
12000
- //# sourceMappingURL=server-YNJHP5PU.js.map
12053
+ //# sourceMappingURL=server-MTXAJG5J.js.map