mcp-scraper 0.1.7 → 0.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -0
- package/dist/bin/api-server.cjs +635 -281
- package/dist/bin/api-server.cjs.map +1 -1
- package/dist/bin/api-server.js +2 -2
- package/dist/bin/mcp-stdio-server.cjs +479 -210
- package/dist/bin/mcp-stdio-server.cjs.map +1 -1
- package/dist/bin/mcp-stdio-server.js +1 -1
- package/dist/bin/paa-harvest.cjs +14 -4
- package/dist/bin/paa-harvest.cjs.map +1 -1
- package/dist/bin/paa-harvest.js +4 -3
- package/dist/bin/paa-harvest.js.map +1 -1
- package/dist/{chunk-3OIRNUF5.js → chunk-JNC32DMS.js} +478 -209
- package/dist/chunk-JNC32DMS.js.map +1 -0
- package/dist/{chunk-LUBDFS67.js → chunk-TM22BLWP.js} +15 -3
- package/dist/chunk-TM22BLWP.js.map +1 -0
- package/dist/index.cjs +12 -2
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +1 -1
- package/dist/{server-YNJHP5PU.js → server-MTXAJG5J.js} +84 -31
- package/dist/server-MTXAJG5J.js.map +1 -0
- package/dist/{worker-PBG6LGET.js → worker-AUCXFHEL.js} +4 -3
- package/dist/worker-AUCXFHEL.js.map +1 -0
- package/package.json +1 -1
- package/dist/chunk-3OIRNUF5.js.map +0 -1
- package/dist/chunk-LUBDFS67.js.map +0 -1
- package/dist/server-YNJHP5PU.js.map +0 -1
- package/dist/worker-PBG6LGET.js.map +0 -1
package/dist/{server-YNJHP5PU.js → server-MTXAJG5J.js}
CHANGED
|
@@ -6,7 +6,7 @@ import {
|
|
|
6
6
|
configureReportSaving,
|
|
7
7
|
harvestTimeoutBudget,
|
|
8
8
|
liveWebToolAnnotations
|
|
9
|
-
} from "./chunk-3OIRNUF5.js";
|
|
9
|
+
} from "./chunk-JNC32DMS.js";
|
|
10
10
|
import {
|
|
11
11
|
BALANCE_PACK_LABELS,
|
|
12
12
|
BALANCE_PRICE_IDS,
|
|
@@ -32,10 +32,12 @@ import {
|
|
|
32
32
|
RawMapsHoursRowSchema,
|
|
33
33
|
RawMapsOverviewSchema,
|
|
34
34
|
RawMapsReviewStatsSchema,
|
|
35
|
+
browserServiceApiKey,
|
|
36
|
+
browserServiceProxyId,
|
|
35
37
|
buildYouTubeChannelVideosUrl,
|
|
36
38
|
harvest,
|
|
37
39
|
resolveKernelProxyId
|
|
38
|
-
} from "./chunk-LUBDFS67.js";
|
|
40
|
+
} from "./chunk-TM22BLWP.js";
|
|
39
41
|
import {
|
|
40
42
|
CaptchaError,
|
|
41
43
|
RECAPTCHA_INSTRUCTIONS,
|
|
@@ -84,6 +86,53 @@ import {
|
|
|
84
86
|
verifyPassword
|
|
85
87
|
} from "./chunk-D4CJBZBY.js";
|
|
86
88
|
|
|
89
|
+
// src/api/outbound-sanitize.ts
|
|
90
|
+
var KEY_RENAMES = {
|
|
91
|
+
kernel: "browserRuntime",
|
|
92
|
+
kernel_session_id: "browser_session_id",
|
|
93
|
+
kernel_delete_started: "session_cleanup_started",
|
|
94
|
+
kernel_delete_succeeded: "session_cleanup_succeeded",
|
|
95
|
+
kernel_delete_error: "session_cleanup_error",
|
|
96
|
+
kernelSessionId: "browserSessionId",
|
|
97
|
+
kernelDeleteStarted: "sessionCleanupStarted",
|
|
98
|
+
kernelDeleteSucceeded: "sessionCleanupSucceeded",
|
|
99
|
+
kernelDeleteError: "sessionCleanupError",
|
|
100
|
+
kernelProxyId: "proxyId"
|
|
101
|
+
};
|
|
102
|
+
var SANITIZED_VALUE_KEYS = /error|message/i;
|
|
103
|
+
function sanitizeOutboundDiagnostics(value, parentKey = "") {
|
|
104
|
+
if (typeof value === "string") {
|
|
105
|
+
if (SANITIZED_VALUE_KEYS.test(parentKey) && /kernel/i.test(value)) {
|
|
106
|
+
return sanitizeVendorName(value);
|
|
107
|
+
}
|
|
108
|
+
return value;
|
|
109
|
+
}
|
|
110
|
+
if (Array.isArray(value)) return value.map((v) => sanitizeOutboundDiagnostics(v, parentKey));
|
|
111
|
+
if (value !== null && typeof value === "object") {
|
|
112
|
+
const out = {};
|
|
113
|
+
for (const [key, val] of Object.entries(value)) {
|
|
114
|
+
const renamed = KEY_RENAMES[key] ?? key;
|
|
115
|
+
out[renamed] = sanitizeOutboundDiagnostics(val, key);
|
|
116
|
+
}
|
|
117
|
+
return out;
|
|
118
|
+
}
|
|
119
|
+
return value;
|
|
120
|
+
}
|
|
121
|
+
function sanitizeAttempts(attempts) {
|
|
122
|
+
return attempts.map((a) => sanitizeOutboundDiagnostics(a));
|
|
123
|
+
}
|
|
124
|
+
function sanitizeHarvestResult(result) {
|
|
125
|
+
const diagnostics = result?.diagnostics;
|
|
126
|
+
if (!diagnostics?.debug) return result;
|
|
127
|
+
return {
|
|
128
|
+
...result,
|
|
129
|
+
diagnostics: {
|
|
130
|
+
...diagnostics,
|
|
131
|
+
debug: sanitizeOutboundDiagnostics(diagnostics.debug)
|
|
132
|
+
}
|
|
133
|
+
};
|
|
134
|
+
}
|
|
135
|
+
|
|
87
136
|
// src/blog/registry.ts
|
|
88
137
|
var posts = [
|
|
89
138
|
{
|
|
@@ -3446,7 +3495,7 @@ import TurndownService from "turndown";
|
|
|
3446
3495
|
import Kernel from "@onkernel/sdk";
|
|
3447
3496
|
import { chromium } from "playwright";
|
|
3448
3497
|
async function fetchWithKernel(url) {
|
|
3449
|
-
const apiKey =
|
|
3498
|
+
const apiKey = browserServiceApiKey();
|
|
3450
3499
|
if (!apiKey) throw new Error("Browser backend API key not set");
|
|
3451
3500
|
const client = new Kernel({ apiKey });
|
|
3452
3501
|
const kb = await client.browsers.create({ stealth: true, timeout_seconds: 60 });
|
|
@@ -7613,7 +7662,7 @@ async function writeOutputs(result, outputDir) {
|
|
|
7613
7662
|
}
|
|
7614
7663
|
}
|
|
7615
7664
|
async function ytHarvest(rawOptions) {
|
|
7616
|
-
const kernelApiKey =
|
|
7665
|
+
const kernelApiKey = browserServiceApiKey();
|
|
7617
7666
|
if (!kernelApiKey) {
|
|
7618
7667
|
throw new Error("A browser backend API key is required \u2014 YouTube harvesting requires a stealth session.");
|
|
7619
7668
|
}
|
|
@@ -7708,7 +7757,7 @@ function parseTimedtextXml(xml) {
|
|
|
7708
7757
|
return results;
|
|
7709
7758
|
}
|
|
7710
7759
|
async function fetchViaKernelInnertube(videoId) {
|
|
7711
|
-
const kernelApiKey =
|
|
7760
|
+
const kernelApiKey = browserServiceApiKey();
|
|
7712
7761
|
if (!kernelApiKey) return null;
|
|
7713
7762
|
const driver = new BrowserDriver();
|
|
7714
7763
|
const start = Date.now();
|
|
@@ -7852,7 +7901,7 @@ async function attemptKernelWhisper(videoId, kernelApiKey, falKey, start) {
|
|
|
7852
7901
|
}
|
|
7853
7902
|
}
|
|
7854
7903
|
async function fetchViaKernelWhisper(videoId) {
|
|
7855
|
-
const kernelApiKey =
|
|
7904
|
+
const kernelApiKey = browserServiceApiKey();
|
|
7856
7905
|
const falKey = process.env.FAL_KEY;
|
|
7857
7906
|
if (!kernelApiKey || !falKey) return null;
|
|
7858
7907
|
const start = Date.now();
|
|
@@ -8123,7 +8172,7 @@ screenshotApp.post("/", async (c) => {
|
|
|
8123
8172
|
}
|
|
8124
8173
|
const device2 = body.device === "mobile" ? "mobile" : "desktop";
|
|
8125
8174
|
try {
|
|
8126
|
-
const buf = await captureScreenshot(parsedFallback.href,
|
|
8175
|
+
const buf = await captureScreenshot(parsedFallback.href, browserServiceApiKey(), device2);
|
|
8127
8176
|
return new Response(new Uint8Array(buf), {
|
|
8128
8177
|
status: 200,
|
|
8129
8178
|
headers: {
|
|
@@ -8139,7 +8188,7 @@ screenshotApp.post("/", async (c) => {
|
|
|
8139
8188
|
}
|
|
8140
8189
|
const device = body.device === "mobile" ? "mobile" : "desktop";
|
|
8141
8190
|
try {
|
|
8142
|
-
const buf = await captureScreenshot(urlCheck.parsed.href,
|
|
8191
|
+
const buf = await captureScreenshot(urlCheck.parsed.href, browserServiceApiKey(), device);
|
|
8143
8192
|
return new Response(new Uint8Array(buf), {
|
|
8144
8193
|
status: 200,
|
|
8145
8194
|
headers: {
|
|
@@ -8966,23 +9015,23 @@ function buildPageIntelUrl(body, country) {
|
|
|
8966
9015
|
return `https://www.facebook.com/ads/library/?active_status=all&ad_type=all&country=${country}&q=${encodeURIComponent(body.query.trim())}&search_type=keyword_unordered`;
|
|
8967
9016
|
}
|
|
8968
9017
|
function kernelLaunchOpts() {
|
|
8969
|
-
return { headless: true, kernelApiKey:
|
|
9018
|
+
return { headless: true, kernelApiKey: browserServiceApiKey(), kernelProxyId: browserServiceProxyId(), viewport: { width: 1280, height: 900 }, locale: "en-US" };
|
|
8970
9019
|
}
|
|
8971
9020
|
async function kernelLaunchOptsResidential() {
|
|
8972
|
-
let proxyId =
|
|
9021
|
+
let proxyId = browserServiceProxyId();
|
|
8973
9022
|
try {
|
|
8974
9023
|
const resolution = await resolveKernelProxyId({
|
|
8975
|
-
kernelApiKey:
|
|
9024
|
+
kernelApiKey: browserServiceApiKey(),
|
|
8976
9025
|
proxyMode: "location",
|
|
8977
|
-
configuredKernelProxyId:
|
|
9026
|
+
configuredKernelProxyId: browserServiceProxyId(),
|
|
8978
9027
|
location: "New York, NY",
|
|
8979
9028
|
gl: "us"
|
|
8980
9029
|
});
|
|
8981
9030
|
if (resolution.kernelProxyId) proxyId = resolution.kernelProxyId;
|
|
8982
9031
|
} catch {
|
|
8983
|
-
proxyId =
|
|
9032
|
+
proxyId = browserServiceProxyId();
|
|
8984
9033
|
}
|
|
8985
|
-
return { headless: true, kernelApiKey:
|
|
9034
|
+
return { headless: true, kernelApiKey: browserServiceApiKey(), kernelProxyId: proxyId, viewport: { width: 1280, height: 900 }, locale: "en-US" };
|
|
8986
9035
|
}
|
|
8987
9036
|
var facebookAdApp = new Hono4();
|
|
8988
9037
|
facebookAdApp.post("/ad", createApiKeyAuth(), async (c) => {
|
|
@@ -10628,8 +10677,8 @@ async function captureSerpIntelligenceSnapshot(rawInput, runtimeOptions = {}) {
|
|
|
10628
10677
|
debug,
|
|
10629
10678
|
serpOnly: true,
|
|
10630
10679
|
headless: runtimeOptions.headless ?? true,
|
|
10631
|
-
kernelApiKey: runtimeOptions.kernelApiKey ??
|
|
10632
|
-
kernelProxyId: runtimeOptions.kernelProxyId ??
|
|
10680
|
+
kernelApiKey: runtimeOptions.kernelApiKey ?? browserServiceApiKey(),
|
|
10681
|
+
kernelProxyId: runtimeOptions.kernelProxyId ?? browserServiceProxyId(),
|
|
10633
10682
|
format: "json",
|
|
10634
10683
|
outputDir: runtimeOptions.outputDir ?? "/tmp/serp-intelligence-output",
|
|
10635
10684
|
signal: runtimeOptions.signal,
|
|
@@ -10640,7 +10689,7 @@ async function captureSerpIntelligenceSnapshot(rawInput, runtimeOptions = {}) {
|
|
|
10640
10689
|
const pageSnapshotLimit = normalizePageSnapshotLimit(parsedInput);
|
|
10641
10690
|
const pageSnapshotTargets = collectPageSnapshotTargets(harvestResult, pageSnapshotLimit);
|
|
10642
10691
|
const pageSnapshotArtifacts = pageSnapshotTargets.length > 0 ? (await capturePageSnapshotsFn(pageSnapshotTargets, {
|
|
10643
|
-
kernelApiKey: runtimeOptions.kernelApiKey ??
|
|
10692
|
+
kernelApiKey: runtimeOptions.kernelApiKey ?? browserServiceApiKey(),
|
|
10644
10693
|
timeoutMs: runtimeOptions.pageSnapshotTimeoutMs,
|
|
10645
10694
|
maxConcurrency: runtimeOptions.pageSnapshotMaxConcurrency,
|
|
10646
10695
|
debug,
|
|
@@ -10748,8 +10797,8 @@ serpIntelligenceApp.post("/capture", async (c) => {
|
|
|
10748
10797
|
if (!ok) return c.json(insufficientBalanceResponse(balance_mc, cost), 402);
|
|
10749
10798
|
try {
|
|
10750
10799
|
const result = await captureSerpIntelligenceSnapshot(parsed.data, {
|
|
10751
|
-
kernelApiKey:
|
|
10752
|
-
kernelProxyId:
|
|
10800
|
+
kernelApiKey: browserServiceApiKey(),
|
|
10801
|
+
kernelProxyId: browserServiceProxyId(),
|
|
10753
10802
|
signal: c.req.raw.signal,
|
|
10754
10803
|
billing: { creditsUsed: cost / 1e3 }
|
|
10755
10804
|
});
|
|
@@ -10804,7 +10853,7 @@ serpIntelligenceApp.post("/page-snapshots", async (c) => {
|
|
|
10804
10853
|
if (!ok) return c.json(insufficientBalanceResponse(balance_mc, cost), 402);
|
|
10805
10854
|
try {
|
|
10806
10855
|
const result = await capturePageSnapshots(targets, {
|
|
10807
|
-
kernelApiKey:
|
|
10856
|
+
kernelApiKey: browserServiceApiKey(),
|
|
10808
10857
|
timeoutMs: parsed.data.timeoutMs,
|
|
10809
10858
|
maxConcurrency: parsed.data.maxConcurrency,
|
|
10810
10859
|
debug: parsed.data.debug
|
|
@@ -10852,7 +10901,10 @@ function mcpAuthError() {
|
|
|
10852
10901
|
});
|
|
10853
10902
|
return new Response(body, {
|
|
10854
10903
|
status: 401,
|
|
10855
|
-
headers: {
|
|
10904
|
+
headers: {
|
|
10905
|
+
"Content-Type": "application/json",
|
|
10906
|
+
"WWW-Authenticate": 'Bearer realm="mcp-scraper", error="invalid_token", error_description="Pass an MCP Scraper API key as x-api-key or Bearer token"'
|
|
10907
|
+
}
|
|
10856
10908
|
});
|
|
10857
10909
|
}
|
|
10858
10910
|
async function requireMcpCallerKey(c) {
|
|
@@ -11517,7 +11569,7 @@ app.post("/harvest/sync", auth, async (c) => {
|
|
|
11517
11569
|
try {
|
|
11518
11570
|
const result = await harvest({
|
|
11519
11571
|
...options,
|
|
11520
|
-
kernelApiKey:
|
|
11572
|
+
kernelApiKey: browserServiceApiKey(),
|
|
11521
11573
|
headless: true,
|
|
11522
11574
|
format: "json",
|
|
11523
11575
|
outputDir: "/tmp/paa-output-api",
|
|
@@ -11532,7 +11584,7 @@ app.post("/harvest/sync", auth, async (c) => {
|
|
|
11532
11584
|
if (diff > 0) await creditMc(user.id, diff, LedgerOperation.PAA_REFUND, "overestimate refund");
|
|
11533
11585
|
else if (diff < 0) await debitMc(user.id, -diff, LedgerOperation.PAA, options.query);
|
|
11534
11586
|
}
|
|
11535
|
-
return c.json({ job_id: jobId, status: "done", result, attempts });
|
|
11587
|
+
return c.json({ job_id: jobId, status: "done", result: sanitizeHarvestResult(result), attempts: sanitizeAttempts(attempts) });
|
|
11536
11588
|
} catch (err) {
|
|
11537
11589
|
const problem = classifyHarvestProblem(err);
|
|
11538
11590
|
const response = harvestProblemResponse(problem);
|
|
@@ -11540,18 +11592,19 @@ app.post("/harvest/sync", auth, async (c) => {
|
|
|
11540
11592
|
if (problem.terminalStatus === "cancelled" || c.req.raw.signal.aborted) {
|
|
11541
11593
|
await cancelJob(jobId, serializeHarvestProblem(problem));
|
|
11542
11594
|
await creditMc(user.id, syncCost, LedgerOperation.REFUND, "cancelled call");
|
|
11543
|
-
return c.json({ job_id: jobId, status: "cancelled", ...response, attempts }, problem.httpStatus);
|
|
11595
|
+
return c.json({ job_id: jobId, status: "cancelled", ...response, attempts: sanitizeAttempts(attempts) }, problem.httpStatus);
|
|
11544
11596
|
}
|
|
11545
11597
|
await failJob(jobId, serializeHarvestProblem(problem));
|
|
11546
11598
|
await creditMc(user.id, syncCost, LedgerOperation.REFUND, "failed call");
|
|
11547
|
-
return c.json({ job_id: jobId, status: "failed", ...response, attempts }, problem.httpStatus);
|
|
11599
|
+
return c.json({ job_id: jobId, status: "failed", ...response, attempts: sanitizeAttempts(attempts) }, problem.httpStatus);
|
|
11548
11600
|
}
|
|
11549
11601
|
});
|
|
11550
11602
|
app.get("/jobs/:id", auth, async (c) => {
|
|
11551
11603
|
const job = await getJob(c.req.param("id"), c.get("user").id);
|
|
11552
11604
|
if (!job) return c.json({ error: "Job not found" }, 404);
|
|
11553
11605
|
const attempts = await listHarvestAttempts(job.id, c.get("user").id);
|
|
11554
|
-
|
|
11606
|
+
const safeResult = job.result && typeof job.result === "object" ? sanitizeHarvestResult(job.result) : job.result;
|
|
11607
|
+
return c.json({ ...job, result: safeResult, attempts: sanitizeAttempts(attempts) });
|
|
11555
11608
|
});
|
|
11556
11609
|
app.get("/jobs", auth, async (c) => {
|
|
11557
11610
|
return c.json(await listJobs(c.get("user").id));
|
|
@@ -11650,7 +11703,7 @@ app.post("/extract-url", auth, async (c) => {
|
|
|
11650
11703
|
const { ok: euOk, balance_mc: euBal } = await debitMc(user.id, MC_COSTS.page_scrape, LedgerOperation.EXTRACT_URL, new URL(canonicalUrl).hostname);
|
|
11651
11704
|
if (!euOk) return c.json(insufficientBalanceResponse(euBal, MC_COSTS.page_scrape), 402);
|
|
11652
11705
|
try {
|
|
11653
|
-
const kernelApiKey =
|
|
11706
|
+
const kernelApiKey = browserServiceApiKey();
|
|
11654
11707
|
const device = screenshotDevice === "mobile" ? "mobile" : "desktop";
|
|
11655
11708
|
const [result, pageData] = await Promise.all([
|
|
11656
11709
|
extractKpo({ url: canonicalUrl, kernelApiKey }),
|
|
@@ -11688,7 +11741,7 @@ app.post("/map-urls", auth, async (c) => {
|
|
|
11688
11741
|
startUrl: parsed.href,
|
|
11689
11742
|
maxUrls: Math.min(2e3, Math.max(1, body.maxUrls ?? 500)),
|
|
11690
11743
|
concurrency: Math.min(20, Math.max(1, body.concurrency ?? 12)),
|
|
11691
|
-
kernelApiKey: body.browserFallback ?? body.kernelFallback ?
|
|
11744
|
+
kernelApiKey: body.browserFallback ?? body.kernelFallback ? browserServiceApiKey() : void 0
|
|
11692
11745
|
});
|
|
11693
11746
|
await logRequestEvent({
|
|
11694
11747
|
userId: user.id,
|
|
@@ -11728,7 +11781,7 @@ app.post("/extract-site", auth, async (c) => {
|
|
|
11728
11781
|
const result = await extractSite({
|
|
11729
11782
|
startUrl: parsed.href,
|
|
11730
11783
|
maxPages: Math.min(200, Math.max(1, body.maxPages ?? 100)),
|
|
11731
|
-
kernelApiKey: body.browserFallback ?? body.kernelFallback ?
|
|
11784
|
+
kernelApiKey: body.browserFallback ?? body.kernelFallback ? browserServiceApiKey() : void 0
|
|
11732
11785
|
});
|
|
11733
11786
|
const pageCount = result.pages?.length ?? 1;
|
|
11734
11787
|
const actualSiteMc = pageCount * MC_COSTS.page_scrape;
|
|
@@ -11875,7 +11928,7 @@ app.get("/cron/tick", async (c) => {
|
|
|
11875
11928
|
if (!process.env.CRON_SECRET || secret2 !== `Bearer ${process.env.CRON_SECRET}`) {
|
|
11876
11929
|
return c.json({ error: "Unauthorized" }, 401);
|
|
11877
11930
|
}
|
|
11878
|
-
const { drainQueue } = await import("./worker-PBG6LGET.js");
|
|
11931
|
+
const { drainQueue } = await import("./worker-AUCXFHEL.js");
|
|
11879
11932
|
const budget = { maxJobs: 10, deadlineMs: Date.now() + 28e4 };
|
|
11880
11933
|
const [results, sweepResult] = await Promise.all([
|
|
11881
11934
|
drainQueue(budget),
|
|
@@ -11997,4 +12050,4 @@ app.get("/blog/:slug/", (c) => {
|
|
|
11997
12050
|
export {
|
|
11998
12051
|
app
|
|
11999
12052
|
};
|
|
12000
|
-
//# sourceMappingURL=server-YNJHP5PU.js.map
|
|
12053
|
+
//# sourceMappingURL=server-MTXAJG5J.js.map
|