mcp-scraper 0.1.7 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,6 +1,6 @@
1
1
  import {
2
2
  harvest
3
- } from "./chunk-LUBDFS67.js";
3
+ } from "./chunk-TM22BLWP.js";
4
4
  import "./chunk-ZMOWIBMK.js";
5
5
 
6
6
  // src/video/VideoGenerator.ts
@@ -6,7 +6,7 @@ import {
6
6
  configureReportSaving,
7
7
  harvestTimeoutBudget,
8
8
  liveWebToolAnnotations
9
- } from "./chunk-3OIRNUF5.js";
9
+ } from "./chunk-RE6HCRYC.js";
10
10
  import {
11
11
  BALANCE_PACK_LABELS,
12
12
  BALANCE_PRICE_IDS,
@@ -32,10 +32,12 @@ import {
32
32
  RawMapsHoursRowSchema,
33
33
  RawMapsOverviewSchema,
34
34
  RawMapsReviewStatsSchema,
35
+ browserServiceApiKey,
36
+ browserServiceProxyId,
35
37
  buildYouTubeChannelVideosUrl,
36
38
  harvest,
37
39
  resolveKernelProxyId
38
- } from "./chunk-LUBDFS67.js";
40
+ } from "./chunk-TM22BLWP.js";
39
41
  import {
40
42
  CaptchaError,
41
43
  RECAPTCHA_INSTRUCTIONS,
@@ -84,6 +86,53 @@ import {
84
86
  verifyPassword
85
87
  } from "./chunk-D4CJBZBY.js";
86
88
 
89
// src/api/outbound-sanitize.ts

/**
 * Vendor-specific diagnostic keys mapped to the vendor-neutral names that are
 * exposed in outbound API payloads. Keys that are not listed here are kept
 * unchanged.
 */
const KEY_RENAMES = {
  kernel: "browserRuntime",
  kernel_session_id: "browser_session_id",
  kernel_delete_started: "session_cleanup_started",
  kernel_delete_succeeded: "session_cleanup_succeeded",
  kernel_delete_error: "session_cleanup_error",
  kernelSessionId: "browserSessionId",
  kernelDeleteStarted: "sessionCleanupStarted",
  kernelDeleteSucceeded: "sessionCleanupSucceeded",
  kernelDeleteError: "sessionCleanupError",
  kernelProxyId: "proxyId"
};

/**
 * String values are only scrubbed when the key they are stored under contains
 * "error" or "message" (case-insensitive). The ORIGINAL key is tested, not the
 * renamed one.
 */
const SANITIZED_VALUE_KEYS = /error|message/i;

/**
 * Looks up the outbound name for a key.
 *
 * Only own properties of KEY_RENAMES count: a plain `KEY_RENAMES[key]` lookup
 * would also find inherited members such as `constructor` or `toString` and
 * turn them into nonsensical property names.
 *
 * @param {string} key - Property name as found in the payload.
 * @returns {string} The renamed key, or `key` itself when no rename exists.
 */
function renameOutboundKey(key) {
  return Object.prototype.hasOwnProperty.call(KEY_RENAMES, key) ? KEY_RENAMES[key] : key;
}

/**
 * Copies an object's own enumerable entries, renaming keys and sanitizing
 * every value. The input is never mutated.
 *
 * `Object.fromEntries` defines each key as an own data property, so an own
 * `"__proto__"` key (possible after `JSON.parse`) stays ordinary data instead
 * of replacing the prototype of the result.
 * If two source keys map to the same outbound name, the later entry wins.
 *
 * @param {object} source - Object whose entries are copied.
 * @returns {object} A new plain object.
 */
function sanitizeObjectEntries(source) {
  return Object.fromEntries(
    Object.entries(source).map(([key, val]) => [renameOutboundKey(key), sanitizeOutboundDiagnostics(val, key)])
  );
}

/**
 * Recursively rewrites a diagnostics payload so that vendor-specific key names
 * and vendor names inside error/message strings are not sent to API clients.
 *
 * - Strings are passed to `sanitizeVendorName` (defined outside this module
 *   excerpt) only when `parentKey` matches SANITIZED_VALUE_KEYS and the string
 *   contains "kernel" (case-insensitive); otherwise they are returned as-is.
 * - Array elements are sanitized under the array's own `parentKey`.
 * - Objects exposing `toJSON` (for example `Date`) are converted through it
 *   once, as `JSON.stringify` would do, so they keep their serialised form
 *   instead of collapsing to `{}`.
 * - Other objects are rebuilt entry by entry with renamed keys.
 * - All remaining values (numbers, booleans, null, undefined, ...) pass through.
 *
 * @param {*} value - Payload fragment to sanitize.
 * @param {string} [parentKey=""] - Key under which `value` was stored.
 * @returns {*} A sanitized copy for containers, otherwise the value itself.
 */
function sanitizeOutboundDiagnostics(value, parentKey = "") {
  if (typeof value === "string") {
    const needsScrub = SANITIZED_VALUE_KEYS.test(parentKey) && /kernel/i.test(value);
    return needsScrub ? sanitizeVendorName(value) : value;
  }
  if (Array.isArray(value)) {
    return value.map((item) => sanitizeOutboundDiagnostics(item, parentKey));
  }
  if (value === null || typeof value !== "object") {
    return value;
  }
  if (typeof value.toJSON === "function") {
    const serialised = value.toJSON(parentKey);
    // toJSON is honoured a single time; an object result is copied directly so
    // a toJSON that returns another toJSON-bearing object cannot recurse forever.
    if (serialised !== null && typeof serialised === "object" && !Array.isArray(serialised)) {
      return sanitizeObjectEntries(serialised);
    }
    return sanitizeOutboundDiagnostics(serialised, parentKey);
  }
  return sanitizeObjectEntries(value);
}

/**
 * Sanitizes every entry of a harvest-attempt list.
 *
 * @param {Array<*>} attempts - Attempt records.
 * @returns {Array<*>} A new array of sanitized records. A non-array argument
 *   (e.g. `undefined` when no attempts were collected) is returned unchanged
 *   rather than throwing, because callers use this while building error
 *   responses.
 */
function sanitizeAttempts(attempts) {
  if (!Array.isArray(attempts)) return attempts;
  return attempts.map((attempt) => sanitizeOutboundDiagnostics(attempt));
}

/**
 * Returns a harvest result whose `diagnostics.debug` subtree has been
 * sanitized. Only that subtree is rewritten; every other field is carried over
 * by shallow copy.
 *
 * @param {*} result - Harvest result, possibly null/undefined.
 * @returns {*} The same `result` reference when it has no truthy
 *   `diagnostics.debug`, otherwise a shallow copy with the sanitized subtree.
 */
function sanitizeHarvestResult(result) {
  const diagnostics = result?.diagnostics;
  if (!diagnostics?.debug) return result;
  return {
    ...result,
    diagnostics: {
      ...diagnostics,
      debug: sanitizeOutboundDiagnostics(diagnostics.debug)
    }
  };
}
135
+
87
136
  // src/blog/registry.ts
88
137
  var posts = [
89
138
  {
@@ -3446,7 +3495,7 @@ import TurndownService from "turndown";
3446
3495
  import Kernel from "@onkernel/sdk";
3447
3496
  import { chromium } from "playwright";
3448
3497
  async function fetchWithKernel(url) {
3449
- const apiKey = process.env.KERNEL_API_KEY;
3498
+ const apiKey = browserServiceApiKey();
3450
3499
  if (!apiKey) throw new Error("Browser backend API key not set");
3451
3500
  const client = new Kernel({ apiKey });
3452
3501
  const kb = await client.browsers.create({ stealth: true, timeout_seconds: 60 });
@@ -7613,7 +7662,7 @@ async function writeOutputs(result, outputDir) {
7613
7662
  }
7614
7663
  }
7615
7664
  async function ytHarvest(rawOptions) {
7616
- const kernelApiKey = process.env.KERNEL_API_KEY;
7665
+ const kernelApiKey = browserServiceApiKey();
7617
7666
  if (!kernelApiKey) {
7618
7667
  throw new Error("A browser backend API key is required \u2014 YouTube harvesting requires a stealth session.");
7619
7668
  }
@@ -7708,7 +7757,7 @@ function parseTimedtextXml(xml) {
7708
7757
  return results;
7709
7758
  }
7710
7759
  async function fetchViaKernelInnertube(videoId) {
7711
- const kernelApiKey = process.env.KERNEL_API_KEY;
7760
+ const kernelApiKey = browserServiceApiKey();
7712
7761
  if (!kernelApiKey) return null;
7713
7762
  const driver = new BrowserDriver();
7714
7763
  const start = Date.now();
@@ -7852,7 +7901,7 @@ async function attemptKernelWhisper(videoId, kernelApiKey, falKey, start) {
7852
7901
  }
7853
7902
  }
7854
7903
  async function fetchViaKernelWhisper(videoId) {
7855
- const kernelApiKey = process.env.KERNEL_API_KEY;
7904
+ const kernelApiKey = browserServiceApiKey();
7856
7905
  const falKey = process.env.FAL_KEY;
7857
7906
  if (!kernelApiKey || !falKey) return null;
7858
7907
  const start = Date.now();
@@ -8123,7 +8172,7 @@ screenshotApp.post("/", async (c) => {
8123
8172
  }
8124
8173
  const device2 = body.device === "mobile" ? "mobile" : "desktop";
8125
8174
  try {
8126
- const buf = await captureScreenshot(parsedFallback.href, process.env.KERNEL_API_KEY?.trim(), device2);
8175
+ const buf = await captureScreenshot(parsedFallback.href, browserServiceApiKey(), device2);
8127
8176
  return new Response(new Uint8Array(buf), {
8128
8177
  status: 200,
8129
8178
  headers: {
@@ -8139,7 +8188,7 @@ screenshotApp.post("/", async (c) => {
8139
8188
  }
8140
8189
  const device = body.device === "mobile" ? "mobile" : "desktop";
8141
8190
  try {
8142
- const buf = await captureScreenshot(urlCheck.parsed.href, process.env.KERNEL_API_KEY?.trim(), device);
8191
+ const buf = await captureScreenshot(urlCheck.parsed.href, browserServiceApiKey(), device);
8143
8192
  return new Response(new Uint8Array(buf), {
8144
8193
  status: 200,
8145
8194
  headers: {
@@ -8966,23 +9015,23 @@ function buildPageIntelUrl(body, country) {
8966
9015
  return `https://www.facebook.com/ads/library/?active_status=all&ad_type=all&country=${country}&q=${encodeURIComponent(body.query.trim())}&search_type=keyword_unordered`;
8967
9016
  }
8968
9017
  function kernelLaunchOpts() {
8969
- return { headless: true, kernelApiKey: process.env.KERNEL_API_KEY?.trim(), kernelProxyId: process.env.KERNEL_PROXY_ID?.trim(), viewport: { width: 1280, height: 900 }, locale: "en-US" };
9018
+ return { headless: true, kernelApiKey: browserServiceApiKey(), kernelProxyId: browserServiceProxyId(), viewport: { width: 1280, height: 900 }, locale: "en-US" };
8970
9019
  }
8971
9020
  async function kernelLaunchOptsResidential() {
8972
- let proxyId = process.env.KERNEL_PROXY_ID?.trim();
9021
+ let proxyId = browserServiceProxyId();
8973
9022
  try {
8974
9023
  const resolution = await resolveKernelProxyId({
8975
- kernelApiKey: process.env.KERNEL_API_KEY?.trim(),
9024
+ kernelApiKey: browserServiceApiKey(),
8976
9025
  proxyMode: "location",
8977
- configuredKernelProxyId: process.env.KERNEL_PROXY_ID?.trim(),
9026
+ configuredKernelProxyId: browserServiceProxyId(),
8978
9027
  location: "New York, NY",
8979
9028
  gl: "us"
8980
9029
  });
8981
9030
  if (resolution.kernelProxyId) proxyId = resolution.kernelProxyId;
8982
9031
  } catch {
8983
- proxyId = process.env.KERNEL_PROXY_ID?.trim();
9032
+ proxyId = browserServiceProxyId();
8984
9033
  }
8985
- return { headless: true, kernelApiKey: process.env.KERNEL_API_KEY?.trim(), kernelProxyId: proxyId, viewport: { width: 1280, height: 900 }, locale: "en-US" };
9034
+ return { headless: true, kernelApiKey: browserServiceApiKey(), kernelProxyId: proxyId, viewport: { width: 1280, height: 900 }, locale: "en-US" };
8986
9035
  }
8987
9036
  var facebookAdApp = new Hono4();
8988
9037
  facebookAdApp.post("/ad", createApiKeyAuth(), async (c) => {
@@ -10628,8 +10677,8 @@ async function captureSerpIntelligenceSnapshot(rawInput, runtimeOptions = {}) {
10628
10677
  debug,
10629
10678
  serpOnly: true,
10630
10679
  headless: runtimeOptions.headless ?? true,
10631
- kernelApiKey: runtimeOptions.kernelApiKey ?? process.env.KERNEL_API_KEY?.trim(),
10632
- kernelProxyId: runtimeOptions.kernelProxyId ?? process.env.KERNEL_PROXY_ID?.trim(),
10680
+ kernelApiKey: runtimeOptions.kernelApiKey ?? browserServiceApiKey(),
10681
+ kernelProxyId: runtimeOptions.kernelProxyId ?? browserServiceProxyId(),
10633
10682
  format: "json",
10634
10683
  outputDir: runtimeOptions.outputDir ?? "/tmp/serp-intelligence-output",
10635
10684
  signal: runtimeOptions.signal,
@@ -10640,7 +10689,7 @@ async function captureSerpIntelligenceSnapshot(rawInput, runtimeOptions = {}) {
10640
10689
  const pageSnapshotLimit = normalizePageSnapshotLimit(parsedInput);
10641
10690
  const pageSnapshotTargets = collectPageSnapshotTargets(harvestResult, pageSnapshotLimit);
10642
10691
  const pageSnapshotArtifacts = pageSnapshotTargets.length > 0 ? (await capturePageSnapshotsFn(pageSnapshotTargets, {
10643
- kernelApiKey: runtimeOptions.kernelApiKey ?? process.env.KERNEL_API_KEY?.trim(),
10692
+ kernelApiKey: runtimeOptions.kernelApiKey ?? browserServiceApiKey(),
10644
10693
  timeoutMs: runtimeOptions.pageSnapshotTimeoutMs,
10645
10694
  maxConcurrency: runtimeOptions.pageSnapshotMaxConcurrency,
10646
10695
  debug,
@@ -10748,8 +10797,8 @@ serpIntelligenceApp.post("/capture", async (c) => {
10748
10797
  if (!ok) return c.json(insufficientBalanceResponse(balance_mc, cost), 402);
10749
10798
  try {
10750
10799
  const result = await captureSerpIntelligenceSnapshot(parsed.data, {
10751
- kernelApiKey: process.env.KERNEL_API_KEY?.trim(),
10752
- kernelProxyId: process.env.KERNEL_PROXY_ID?.trim(),
10800
+ kernelApiKey: browserServiceApiKey(),
10801
+ kernelProxyId: browserServiceProxyId(),
10753
10802
  signal: c.req.raw.signal,
10754
10803
  billing: { creditsUsed: cost / 1e3 }
10755
10804
  });
@@ -10804,7 +10853,7 @@ serpIntelligenceApp.post("/page-snapshots", async (c) => {
10804
10853
  if (!ok) return c.json(insufficientBalanceResponse(balance_mc, cost), 402);
10805
10854
  try {
10806
10855
  const result = await capturePageSnapshots(targets, {
10807
- kernelApiKey: process.env.KERNEL_API_KEY?.trim(),
10856
+ kernelApiKey: browserServiceApiKey(),
10808
10857
  timeoutMs: parsed.data.timeoutMs,
10809
10858
  maxConcurrency: parsed.data.maxConcurrency,
10810
10859
  debug: parsed.data.debug
@@ -11517,7 +11566,7 @@ app.post("/harvest/sync", auth, async (c) => {
11517
11566
  try {
11518
11567
  const result = await harvest({
11519
11568
  ...options,
11520
- kernelApiKey: process.env.KERNEL_API_KEY?.trim(),
11569
+ kernelApiKey: browserServiceApiKey(),
11521
11570
  headless: true,
11522
11571
  format: "json",
11523
11572
  outputDir: "/tmp/paa-output-api",
@@ -11532,7 +11581,7 @@ app.post("/harvest/sync", auth, async (c) => {
11532
11581
  if (diff > 0) await creditMc(user.id, diff, LedgerOperation.PAA_REFUND, "overestimate refund");
11533
11582
  else if (diff < 0) await debitMc(user.id, -diff, LedgerOperation.PAA, options.query);
11534
11583
  }
11535
- return c.json({ job_id: jobId, status: "done", result, attempts });
11584
+ return c.json({ job_id: jobId, status: "done", result: sanitizeHarvestResult(result), attempts: sanitizeAttempts(attempts) });
11536
11585
  } catch (err) {
11537
11586
  const problem = classifyHarvestProblem(err);
11538
11587
  const response = harvestProblemResponse(problem);
@@ -11540,18 +11589,19 @@ app.post("/harvest/sync", auth, async (c) => {
11540
11589
  if (problem.terminalStatus === "cancelled" || c.req.raw.signal.aborted) {
11541
11590
  await cancelJob(jobId, serializeHarvestProblem(problem));
11542
11591
  await creditMc(user.id, syncCost, LedgerOperation.REFUND, "cancelled call");
11543
- return c.json({ job_id: jobId, status: "cancelled", ...response, attempts }, problem.httpStatus);
11592
+ return c.json({ job_id: jobId, status: "cancelled", ...response, attempts: sanitizeAttempts(attempts) }, problem.httpStatus);
11544
11593
  }
11545
11594
  await failJob(jobId, serializeHarvestProblem(problem));
11546
11595
  await creditMc(user.id, syncCost, LedgerOperation.REFUND, "failed call");
11547
- return c.json({ job_id: jobId, status: "failed", ...response, attempts }, problem.httpStatus);
11596
+ return c.json({ job_id: jobId, status: "failed", ...response, attempts: sanitizeAttempts(attempts) }, problem.httpStatus);
11548
11597
  }
11549
11598
  });
11550
11599
  app.get("/jobs/:id", auth, async (c) => {
11551
11600
  const job = await getJob(c.req.param("id"), c.get("user").id);
11552
11601
  if (!job) return c.json({ error: "Job not found" }, 404);
11553
11602
  const attempts = await listHarvestAttempts(job.id, c.get("user").id);
11554
- return c.json({ ...job, attempts });
11603
+ const safeResult = job.result && typeof job.result === "object" ? sanitizeHarvestResult(job.result) : job.result;
11604
+ return c.json({ ...job, result: safeResult, attempts: sanitizeAttempts(attempts) });
11555
11605
  });
11556
11606
  app.get("/jobs", auth, async (c) => {
11557
11607
  return c.json(await listJobs(c.get("user").id));
@@ -11650,7 +11700,7 @@ app.post("/extract-url", auth, async (c) => {
11650
11700
  const { ok: euOk, balance_mc: euBal } = await debitMc(user.id, MC_COSTS.page_scrape, LedgerOperation.EXTRACT_URL, new URL(canonicalUrl).hostname);
11651
11701
  if (!euOk) return c.json(insufficientBalanceResponse(euBal, MC_COSTS.page_scrape), 402);
11652
11702
  try {
11653
- const kernelApiKey = process.env.KERNEL_API_KEY?.trim();
11703
+ const kernelApiKey = browserServiceApiKey();
11654
11704
  const device = screenshotDevice === "mobile" ? "mobile" : "desktop";
11655
11705
  const [result, pageData] = await Promise.all([
11656
11706
  extractKpo({ url: canonicalUrl, kernelApiKey }),
@@ -11688,7 +11738,7 @@ app.post("/map-urls", auth, async (c) => {
11688
11738
  startUrl: parsed.href,
11689
11739
  maxUrls: Math.min(2e3, Math.max(1, body.maxUrls ?? 500)),
11690
11740
  concurrency: Math.min(20, Math.max(1, body.concurrency ?? 12)),
11691
- kernelApiKey: body.browserFallback ?? body.kernelFallback ? process.env.KERNEL_API_KEY : void 0
11741
+ kernelApiKey: body.browserFallback ?? body.kernelFallback ? browserServiceApiKey() : void 0
11692
11742
  });
11693
11743
  await logRequestEvent({
11694
11744
  userId: user.id,
@@ -11728,7 +11778,7 @@ app.post("/extract-site", auth, async (c) => {
11728
11778
  const result = await extractSite({
11729
11779
  startUrl: parsed.href,
11730
11780
  maxPages: Math.min(200, Math.max(1, body.maxPages ?? 100)),
11731
- kernelApiKey: body.browserFallback ?? body.kernelFallback ? process.env.KERNEL_API_KEY : void 0
11781
+ kernelApiKey: body.browserFallback ?? body.kernelFallback ? browserServiceApiKey() : void 0
11732
11782
  });
11733
11783
  const pageCount = result.pages?.length ?? 1;
11734
11784
  const actualSiteMc = pageCount * MC_COSTS.page_scrape;
@@ -11875,7 +11925,7 @@ app.get("/cron/tick", async (c) => {
11875
11925
  if (!process.env.CRON_SECRET || secret2 !== `Bearer ${process.env.CRON_SECRET}`) {
11876
11926
  return c.json({ error: "Unauthorized" }, 401);
11877
11927
  }
11878
- const { drainQueue } = await import("./worker-PBG6LGET.js");
11928
+ const { drainQueue } = await import("./worker-AUCXFHEL.js");
11879
11929
  const budget = { maxJobs: 10, deadlineMs: Date.now() + 28e4 };
11880
11930
  const [results, sweepResult] = await Promise.all([
11881
11931
  drainQueue(budget),
@@ -11997,4 +12047,4 @@ app.get("/blog/:slug/", (c) => {
11997
12047
  export {
11998
12048
  app
11999
12049
  };
12000
- //# sourceMappingURL=server-YNJHP5PU.js.map
12050
+ //# sourceMappingURL=server-QXVVTKJP.js.map