mcp-scraper 0.1.7 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/bin/api-server.cjs +388 -75
- package/dist/bin/api-server.cjs.map +1 -1
- package/dist/bin/api-server.js +2 -2
- package/dist/bin/mcp-stdio-server.cjs +243 -11
- package/dist/bin/mcp-stdio-server.cjs.map +1 -1
- package/dist/bin/mcp-stdio-server.js +1 -1
- package/dist/bin/paa-harvest.cjs +14 -4
- package/dist/bin/paa-harvest.cjs.map +1 -1
- package/dist/bin/paa-harvest.js +4 -3
- package/dist/bin/paa-harvest.js.map +1 -1
- package/dist/{chunk-3OIRNUF5.js → chunk-RE6HCRYC.js} +244 -12
- package/dist/chunk-RE6HCRYC.js.map +1 -0
- package/dist/{chunk-LUBDFS67.js → chunk-TM22BLWP.js} +15 -3
- package/dist/chunk-TM22BLWP.js.map +1 -0
- package/dist/index.cjs +12 -2
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +1 -1
- package/dist/{server-YNJHP5PU.js → server-QXVVTKJP.js} +80 -30
- package/dist/server-QXVVTKJP.js.map +1 -0
- package/dist/{worker-PBG6LGET.js → worker-AUCXFHEL.js} +4 -3
- package/dist/worker-AUCXFHEL.js.map +1 -0
- package/package.json +1 -1
- package/dist/chunk-3OIRNUF5.js.map +0 -1
- package/dist/chunk-LUBDFS67.js.map +0 -1
- package/dist/server-YNJHP5PU.js.map +0 -1
- package/dist/worker-PBG6LGET.js.map +0 -1
package/dist/index.js
CHANGED
|
@@ -6,7 +6,7 @@ import {
|
|
|
6
6
|
configureReportSaving,
|
|
7
7
|
harvestTimeoutBudget,
|
|
8
8
|
liveWebToolAnnotations
|
|
9
|
-
} from "./chunk-3OIRNUF5.js";
|
|
9
|
+
} from "./chunk-RE6HCRYC.js";
|
|
10
10
|
import {
|
|
11
11
|
BALANCE_PACK_LABELS,
|
|
12
12
|
BALANCE_PRICE_IDS,
|
|
@@ -32,10 +32,12 @@ import {
|
|
|
32
32
|
RawMapsHoursRowSchema,
|
|
33
33
|
RawMapsOverviewSchema,
|
|
34
34
|
RawMapsReviewStatsSchema,
|
|
35
|
+
browserServiceApiKey,
|
|
36
|
+
browserServiceProxyId,
|
|
35
37
|
buildYouTubeChannelVideosUrl,
|
|
36
38
|
harvest,
|
|
37
39
|
resolveKernelProxyId
|
|
38
|
-
} from "./chunk-LUBDFS67.js";
|
|
40
|
+
} from "./chunk-TM22BLWP.js";
|
|
39
41
|
import {
|
|
40
42
|
CaptchaError,
|
|
41
43
|
RECAPTCHA_INSTRUCTIONS,
|
|
@@ -84,6 +86,53 @@ import {
|
|
|
84
86
|
verifyPassword
|
|
85
87
|
} from "./chunk-D4CJBZBY.js";
|
|
86
88
|
|
|
89
|
+
// src/api/outbound-sanitize.ts
|
|
90
|
+
/**
 * Vendor-specific diagnostic keys and the neutral names they are replaced
 * with before a payload is returned to an API client.
 */
var KEY_RENAMES = {
  kernel: "browserRuntime",
  kernel_session_id: "browser_session_id",
  kernel_delete_started: "session_cleanup_started",
  kernel_delete_succeeded: "session_cleanup_succeeded",
  kernel_delete_error: "session_cleanup_error",
  kernelSessionId: "browserSessionId",
  kernelDeleteStarted: "sessionCleanupStarted",
  kernelDeleteSucceeded: "sessionCleanupSucceeded",
  kernelDeleteError: "sessionCleanupError",
  kernelProxyId: "proxyId"
};

/** String values are only rewritten when their owning key matches this pattern. */
var SANITIZED_VALUE_KEYS = /error|message/i;

/**
 * Recursively copies a diagnostics value, renaming the keys listed in
 * KEY_RENAMES and rewriting vendor mentions inside error/message strings.
 *
 * @param {*} value - Any JSON-like value (string, array, plain object, primitive).
 * @param {string} [parentKey=""] - Original (pre-rename) key that owns `value`;
 *   array elements inherit the key of the array itself.
 * @returns {*} A sanitized copy for arrays and objects; strings are returned
 *   rewritten or as-is; every other value is returned unchanged.
 */
function sanitizeOutboundDiagnostics(value, parentKey = "") {
  if (typeof value === "string") {
    // Only touch strings stored under error/message-like keys that mention the vendor.
    // NOTE(review): sanitizeVendorName is defined outside this section — confirm it is in scope.
    if (SANITIZED_VALUE_KEYS.test(parentKey) && /kernel/i.test(value)) {
      return sanitizeVendorName(value);
    }
    return value;
  }
  if (Array.isArray(value)) return value.map((v) => sanitizeOutboundDiagnostics(v, parentKey));
  if (value !== null && typeof value === "object") {
    // Object.fromEntries defines plain data properties, so a "__proto__" key
    // (e.g. produced by JSON.parse) is copied as data instead of hitting the
    // prototype setter on the output object.
    return Object.fromEntries(
      Object.entries(value).map(([key, val]) => {
        // Own-property check: keys such as "constructor" or "toString" must not
        // resolve to members inherited from Object.prototype.
        const renamed = Object.prototype.hasOwnProperty.call(KEY_RENAMES, key) ? KEY_RENAMES[key] : key;
        // The child is sanitized against the original key, not the renamed one.
        return [renamed, sanitizeOutboundDiagnostics(val, key)];
      })
    );
  }
  return value;
}
|
|
121
|
+
/**
 * Sanitizes every harvest-attempt record before it is sent to a client.
 *
 * @param {Array<*>|null|undefined} attempts - Attempt records to sanitize.
 * @returns {Array<*>|null|undefined} A new array of sanitized records, or the
 *   input unchanged when it is not an array.
 */
function sanitizeAttempts(attempts) {
  // Callers use this while building error responses; a missing attempts list
  // must pass through rather than throw from inside a catch handler.
  if (!Array.isArray(attempts)) return attempts;
  return attempts.map((a) => sanitizeOutboundDiagnostics(a));
}
|
|
124
|
+
/**
 * Returns a harvest result whose `diagnostics.debug` payload has been passed
 * through sanitizeOutboundDiagnostics.
 *
 * @param {*} result - Harvest result; may be null/undefined or lack diagnostics.
 * @returns {*} The same `result` reference when there is no truthy
 *   `diagnostics.debug`; otherwise a shallow copy with a sanitized debug payload.
 */
function sanitizeHarvestResult(result) {
  const diag = result?.diagnostics;
  const rawDebug = diag?.debug;
  // Nothing to sanitize: hand back the caller's object untouched.
  if (!rawDebug) {
    return result;
  }
  const cleanedDiagnostics = { ...diag, debug: sanitizeOutboundDiagnostics(rawDebug) };
  return { ...result, diagnostics: cleanedDiagnostics };
}
|
|
135
|
+
|
|
87
136
|
// src/blog/registry.ts
|
|
88
137
|
var posts = [
|
|
89
138
|
{
|
|
@@ -3446,7 +3495,7 @@ import TurndownService from "turndown";
|
|
|
3446
3495
|
import Kernel from "@onkernel/sdk";
|
|
3447
3496
|
import { chromium } from "playwright";
|
|
3448
3497
|
async function fetchWithKernel(url) {
|
|
3449
|
-
const apiKey =
|
|
3498
|
+
const apiKey = browserServiceApiKey();
|
|
3450
3499
|
if (!apiKey) throw new Error("Browser backend API key not set");
|
|
3451
3500
|
const client = new Kernel({ apiKey });
|
|
3452
3501
|
const kb = await client.browsers.create({ stealth: true, timeout_seconds: 60 });
|
|
@@ -7613,7 +7662,7 @@ async function writeOutputs(result, outputDir) {
|
|
|
7613
7662
|
}
|
|
7614
7663
|
}
|
|
7615
7664
|
async function ytHarvest(rawOptions) {
|
|
7616
|
-
const kernelApiKey =
|
|
7665
|
+
const kernelApiKey = browserServiceApiKey();
|
|
7617
7666
|
if (!kernelApiKey) {
|
|
7618
7667
|
throw new Error("A browser backend API key is required \u2014 YouTube harvesting requires a stealth session.");
|
|
7619
7668
|
}
|
|
@@ -7708,7 +7757,7 @@ function parseTimedtextXml(xml) {
|
|
|
7708
7757
|
return results;
|
|
7709
7758
|
}
|
|
7710
7759
|
async function fetchViaKernelInnertube(videoId) {
|
|
7711
|
-
const kernelApiKey =
|
|
7760
|
+
const kernelApiKey = browserServiceApiKey();
|
|
7712
7761
|
if (!kernelApiKey) return null;
|
|
7713
7762
|
const driver = new BrowserDriver();
|
|
7714
7763
|
const start = Date.now();
|
|
@@ -7852,7 +7901,7 @@ async function attemptKernelWhisper(videoId, kernelApiKey, falKey, start) {
|
|
|
7852
7901
|
}
|
|
7853
7902
|
}
|
|
7854
7903
|
async function fetchViaKernelWhisper(videoId) {
|
|
7855
|
-
const kernelApiKey =
|
|
7904
|
+
const kernelApiKey = browserServiceApiKey();
|
|
7856
7905
|
const falKey = process.env.FAL_KEY;
|
|
7857
7906
|
if (!kernelApiKey || !falKey) return null;
|
|
7858
7907
|
const start = Date.now();
|
|
@@ -8123,7 +8172,7 @@ screenshotApp.post("/", async (c) => {
|
|
|
8123
8172
|
}
|
|
8124
8173
|
const device2 = body.device === "mobile" ? "mobile" : "desktop";
|
|
8125
8174
|
try {
|
|
8126
|
-
const buf = await captureScreenshot(parsedFallback.href,
|
|
8175
|
+
const buf = await captureScreenshot(parsedFallback.href, browserServiceApiKey(), device2);
|
|
8127
8176
|
return new Response(new Uint8Array(buf), {
|
|
8128
8177
|
status: 200,
|
|
8129
8178
|
headers: {
|
|
@@ -8139,7 +8188,7 @@ screenshotApp.post("/", async (c) => {
|
|
|
8139
8188
|
}
|
|
8140
8189
|
const device = body.device === "mobile" ? "mobile" : "desktop";
|
|
8141
8190
|
try {
|
|
8142
|
-
const buf = await captureScreenshot(urlCheck.parsed.href,
|
|
8191
|
+
const buf = await captureScreenshot(urlCheck.parsed.href, browserServiceApiKey(), device);
|
|
8143
8192
|
return new Response(new Uint8Array(buf), {
|
|
8144
8193
|
status: 200,
|
|
8145
8194
|
headers: {
|
|
@@ -8966,23 +9015,23 @@ function buildPageIntelUrl(body, country) {
|
|
|
8966
9015
|
return `https://www.facebook.com/ads/library/?active_status=all&ad_type=all&country=${country}&q=${encodeURIComponent(body.query.trim())}&search_type=keyword_unordered`;
|
|
8967
9016
|
}
|
|
8968
9017
|
function kernelLaunchOpts() {
|
|
8969
|
-
return { headless: true, kernelApiKey:
|
|
9018
|
+
return { headless: true, kernelApiKey: browserServiceApiKey(), kernelProxyId: browserServiceProxyId(), viewport: { width: 1280, height: 900 }, locale: "en-US" };
|
|
8970
9019
|
}
|
|
8971
9020
|
async function kernelLaunchOptsResidential() {
|
|
8972
|
-
let proxyId =
|
|
9021
|
+
let proxyId = browserServiceProxyId();
|
|
8973
9022
|
try {
|
|
8974
9023
|
const resolution = await resolveKernelProxyId({
|
|
8975
|
-
kernelApiKey:
|
|
9024
|
+
kernelApiKey: browserServiceApiKey(),
|
|
8976
9025
|
proxyMode: "location",
|
|
8977
|
-
configuredKernelProxyId:
|
|
9026
|
+
configuredKernelProxyId: browserServiceProxyId(),
|
|
8978
9027
|
location: "New York, NY",
|
|
8979
9028
|
gl: "us"
|
|
8980
9029
|
});
|
|
8981
9030
|
if (resolution.kernelProxyId) proxyId = resolution.kernelProxyId;
|
|
8982
9031
|
} catch {
|
|
8983
|
-
proxyId =
|
|
9032
|
+
proxyId = browserServiceProxyId();
|
|
8984
9033
|
}
|
|
8985
|
-
return { headless: true, kernelApiKey:
|
|
9034
|
+
return { headless: true, kernelApiKey: browserServiceApiKey(), kernelProxyId: proxyId, viewport: { width: 1280, height: 900 }, locale: "en-US" };
|
|
8986
9035
|
}
|
|
8987
9036
|
var facebookAdApp = new Hono4();
|
|
8988
9037
|
facebookAdApp.post("/ad", createApiKeyAuth(), async (c) => {
|
|
@@ -10628,8 +10677,8 @@ async function captureSerpIntelligenceSnapshot(rawInput, runtimeOptions = {}) {
|
|
|
10628
10677
|
debug,
|
|
10629
10678
|
serpOnly: true,
|
|
10630
10679
|
headless: runtimeOptions.headless ?? true,
|
|
10631
|
-
kernelApiKey: runtimeOptions.kernelApiKey ??
|
|
10632
|
-
kernelProxyId: runtimeOptions.kernelProxyId ??
|
|
10680
|
+
kernelApiKey: runtimeOptions.kernelApiKey ?? browserServiceApiKey(),
|
|
10681
|
+
kernelProxyId: runtimeOptions.kernelProxyId ?? browserServiceProxyId(),
|
|
10633
10682
|
format: "json",
|
|
10634
10683
|
outputDir: runtimeOptions.outputDir ?? "/tmp/serp-intelligence-output",
|
|
10635
10684
|
signal: runtimeOptions.signal,
|
|
@@ -10640,7 +10689,7 @@ async function captureSerpIntelligenceSnapshot(rawInput, runtimeOptions = {}) {
|
|
|
10640
10689
|
const pageSnapshotLimit = normalizePageSnapshotLimit(parsedInput);
|
|
10641
10690
|
const pageSnapshotTargets = collectPageSnapshotTargets(harvestResult, pageSnapshotLimit);
|
|
10642
10691
|
const pageSnapshotArtifacts = pageSnapshotTargets.length > 0 ? (await capturePageSnapshotsFn(pageSnapshotTargets, {
|
|
10643
|
-
kernelApiKey: runtimeOptions.kernelApiKey ??
|
|
10692
|
+
kernelApiKey: runtimeOptions.kernelApiKey ?? browserServiceApiKey(),
|
|
10644
10693
|
timeoutMs: runtimeOptions.pageSnapshotTimeoutMs,
|
|
10645
10694
|
maxConcurrency: runtimeOptions.pageSnapshotMaxConcurrency,
|
|
10646
10695
|
debug,
|
|
@@ -10748,8 +10797,8 @@ serpIntelligenceApp.post("/capture", async (c) => {
|
|
|
10748
10797
|
if (!ok) return c.json(insufficientBalanceResponse(balance_mc, cost), 402);
|
|
10749
10798
|
try {
|
|
10750
10799
|
const result = await captureSerpIntelligenceSnapshot(parsed.data, {
|
|
10751
|
-
kernelApiKey:
|
|
10752
|
-
kernelProxyId:
|
|
10800
|
+
kernelApiKey: browserServiceApiKey(),
|
|
10801
|
+
kernelProxyId: browserServiceProxyId(),
|
|
10753
10802
|
signal: c.req.raw.signal,
|
|
10754
10803
|
billing: { creditsUsed: cost / 1e3 }
|
|
10755
10804
|
});
|
|
@@ -10804,7 +10853,7 @@ serpIntelligenceApp.post("/page-snapshots", async (c) => {
|
|
|
10804
10853
|
if (!ok) return c.json(insufficientBalanceResponse(balance_mc, cost), 402);
|
|
10805
10854
|
try {
|
|
10806
10855
|
const result = await capturePageSnapshots(targets, {
|
|
10807
|
-
kernelApiKey:
|
|
10856
|
+
kernelApiKey: browserServiceApiKey(),
|
|
10808
10857
|
timeoutMs: parsed.data.timeoutMs,
|
|
10809
10858
|
maxConcurrency: parsed.data.maxConcurrency,
|
|
10810
10859
|
debug: parsed.data.debug
|
|
@@ -11517,7 +11566,7 @@ app.post("/harvest/sync", auth, async (c) => {
|
|
|
11517
11566
|
try {
|
|
11518
11567
|
const result = await harvest({
|
|
11519
11568
|
...options,
|
|
11520
|
-
kernelApiKey:
|
|
11569
|
+
kernelApiKey: browserServiceApiKey(),
|
|
11521
11570
|
headless: true,
|
|
11522
11571
|
format: "json",
|
|
11523
11572
|
outputDir: "/tmp/paa-output-api",
|
|
@@ -11532,7 +11581,7 @@ app.post("/harvest/sync", auth, async (c) => {
|
|
|
11532
11581
|
if (diff > 0) await creditMc(user.id, diff, LedgerOperation.PAA_REFUND, "overestimate refund");
|
|
11533
11582
|
else if (diff < 0) await debitMc(user.id, -diff, LedgerOperation.PAA, options.query);
|
|
11534
11583
|
}
|
|
11535
|
-
return c.json({ job_id: jobId, status: "done", result, attempts });
|
|
11584
|
+
return c.json({ job_id: jobId, status: "done", result: sanitizeHarvestResult(result), attempts: sanitizeAttempts(attempts) });
|
|
11536
11585
|
} catch (err) {
|
|
11537
11586
|
const problem = classifyHarvestProblem(err);
|
|
11538
11587
|
const response = harvestProblemResponse(problem);
|
|
@@ -11540,18 +11589,19 @@ app.post("/harvest/sync", auth, async (c) => {
|
|
|
11540
11589
|
if (problem.terminalStatus === "cancelled" || c.req.raw.signal.aborted) {
|
|
11541
11590
|
await cancelJob(jobId, serializeHarvestProblem(problem));
|
|
11542
11591
|
await creditMc(user.id, syncCost, LedgerOperation.REFUND, "cancelled call");
|
|
11543
|
-
return c.json({ job_id: jobId, status: "cancelled", ...response, attempts }, problem.httpStatus);
|
|
11592
|
+
return c.json({ job_id: jobId, status: "cancelled", ...response, attempts: sanitizeAttempts(attempts) }, problem.httpStatus);
|
|
11544
11593
|
}
|
|
11545
11594
|
await failJob(jobId, serializeHarvestProblem(problem));
|
|
11546
11595
|
await creditMc(user.id, syncCost, LedgerOperation.REFUND, "failed call");
|
|
11547
|
-
return c.json({ job_id: jobId, status: "failed", ...response, attempts }, problem.httpStatus);
|
|
11596
|
+
return c.json({ job_id: jobId, status: "failed", ...response, attempts: sanitizeAttempts(attempts) }, problem.httpStatus);
|
|
11548
11597
|
}
|
|
11549
11598
|
});
|
|
11550
11599
|
app.get("/jobs/:id", auth, async (c) => {
|
|
11551
11600
|
const job = await getJob(c.req.param("id"), c.get("user").id);
|
|
11552
11601
|
if (!job) return c.json({ error: "Job not found" }, 404);
|
|
11553
11602
|
const attempts = await listHarvestAttempts(job.id, c.get("user").id);
|
|
11554
|
-
|
|
11603
|
+
const safeResult = job.result && typeof job.result === "object" ? sanitizeHarvestResult(job.result) : job.result;
|
|
11604
|
+
return c.json({ ...job, result: safeResult, attempts: sanitizeAttempts(attempts) });
|
|
11555
11605
|
});
|
|
11556
11606
|
app.get("/jobs", auth, async (c) => {
|
|
11557
11607
|
return c.json(await listJobs(c.get("user").id));
|
|
@@ -11650,7 +11700,7 @@ app.post("/extract-url", auth, async (c) => {
|
|
|
11650
11700
|
const { ok: euOk, balance_mc: euBal } = await debitMc(user.id, MC_COSTS.page_scrape, LedgerOperation.EXTRACT_URL, new URL(canonicalUrl).hostname);
|
|
11651
11701
|
if (!euOk) return c.json(insufficientBalanceResponse(euBal, MC_COSTS.page_scrape), 402);
|
|
11652
11702
|
try {
|
|
11653
|
-
const kernelApiKey =
|
|
11703
|
+
const kernelApiKey = browserServiceApiKey();
|
|
11654
11704
|
const device = screenshotDevice === "mobile" ? "mobile" : "desktop";
|
|
11655
11705
|
const [result, pageData] = await Promise.all([
|
|
11656
11706
|
extractKpo({ url: canonicalUrl, kernelApiKey }),
|
|
@@ -11688,7 +11738,7 @@ app.post("/map-urls", auth, async (c) => {
|
|
|
11688
11738
|
startUrl: parsed.href,
|
|
11689
11739
|
maxUrls: Math.min(2e3, Math.max(1, body.maxUrls ?? 500)),
|
|
11690
11740
|
concurrency: Math.min(20, Math.max(1, body.concurrency ?? 12)),
|
|
11691
|
-
kernelApiKey: body.browserFallback ?? body.kernelFallback ?
|
|
11741
|
+
kernelApiKey: body.browserFallback ?? body.kernelFallback ? browserServiceApiKey() : void 0
|
|
11692
11742
|
});
|
|
11693
11743
|
await logRequestEvent({
|
|
11694
11744
|
userId: user.id,
|
|
@@ -11728,7 +11778,7 @@ app.post("/extract-site", auth, async (c) => {
|
|
|
11728
11778
|
const result = await extractSite({
|
|
11729
11779
|
startUrl: parsed.href,
|
|
11730
11780
|
maxPages: Math.min(200, Math.max(1, body.maxPages ?? 100)),
|
|
11731
|
-
kernelApiKey: body.browserFallback ?? body.kernelFallback ?
|
|
11781
|
+
kernelApiKey: body.browserFallback ?? body.kernelFallback ? browserServiceApiKey() : void 0
|
|
11732
11782
|
});
|
|
11733
11783
|
const pageCount = result.pages?.length ?? 1;
|
|
11734
11784
|
const actualSiteMc = pageCount * MC_COSTS.page_scrape;
|
|
@@ -11875,7 +11925,7 @@ app.get("/cron/tick", async (c) => {
|
|
|
11875
11925
|
if (!process.env.CRON_SECRET || secret2 !== `Bearer ${process.env.CRON_SECRET}`) {
|
|
11876
11926
|
return c.json({ error: "Unauthorized" }, 401);
|
|
11877
11927
|
}
|
|
11878
|
-
const { drainQueue } = await import("./worker-PBG6LGET.js");
|
|
11928
|
+
const { drainQueue } = await import("./worker-AUCXFHEL.js");
|
|
11879
11929
|
const budget = { maxJobs: 10, deadlineMs: Date.now() + 28e4 };
|
|
11880
11930
|
const [results, sweepResult] = await Promise.all([
|
|
11881
11931
|
drainQueue(budget),
|
|
@@ -11997,4 +12047,4 @@ app.get("/blog/:slug/", (c) => {
|
|
|
11997
12047
|
export {
|
|
11998
12048
|
app
|
|
11999
12049
|
};
|
|
12000
|
-
//# sourceMappingURL=server-YNJHP5PU.js.map
|
|
12050
|
+
//# sourceMappingURL=server-QXVVTKJP.js.map
|