mcp-scraper 0.1.7 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/bin/api-server.cjs +388 -75
- package/dist/bin/api-server.cjs.map +1 -1
- package/dist/bin/api-server.js +2 -2
- package/dist/bin/mcp-stdio-server.cjs +243 -11
- package/dist/bin/mcp-stdio-server.cjs.map +1 -1
- package/dist/bin/mcp-stdio-server.js +1 -1
- package/dist/bin/paa-harvest.cjs +14 -4
- package/dist/bin/paa-harvest.cjs.map +1 -1
- package/dist/bin/paa-harvest.js +4 -3
- package/dist/bin/paa-harvest.js.map +1 -1
- package/dist/{chunk-3OIRNUF5.js → chunk-RE6HCRYC.js} +244 -12
- package/dist/chunk-RE6HCRYC.js.map +1 -0
- package/dist/{chunk-LUBDFS67.js → chunk-TM22BLWP.js} +15 -3
- package/dist/chunk-TM22BLWP.js.map +1 -0
- package/dist/index.cjs +12 -2
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +1 -1
- package/dist/{server-YNJHP5PU.js → server-QXVVTKJP.js} +80 -30
- package/dist/server-QXVVTKJP.js.map +1 -0
- package/dist/{worker-PBG6LGET.js → worker-AUCXFHEL.js} +4 -3
- package/dist/worker-AUCXFHEL.js.map +1 -0
- package/package.json +1 -1
- package/dist/chunk-3OIRNUF5.js.map +0 -1
- package/dist/chunk-LUBDFS67.js.map +0 -1
- package/dist/server-YNJHP5PU.js.map +0 -1
- package/dist/worker-PBG6LGET.js.map +0 -1
package/dist/bin/api-server.cjs
CHANGED
|
@@ -50,6 +50,109 @@ var init_harvest_timeout = __esm({
|
|
|
50
50
|
}
|
|
51
51
|
});
|
|
52
52
|
|
|
53
|
+
// src/lib/browser-service-env.ts
|
|
54
|
+
/**
 * Resolve the browser-service API key from the environment.
 *
 * Reads `BROWSER_SERVICE_API_KEY` first and falls back to the legacy
 * `KERNEL_API_KEY`. Each candidate is trimmed before it is considered, so a
 * variable that is present but blank (e.g. `BROWSER_SERVICE_API_KEY=` in a
 * .env file) no longer hides a valid legacy value.
 *
 * @returns {string | undefined} The trimmed key, or `undefined` when neither
 *   variable holds a non-blank value.
 */
function browserServiceApiKey() {
  const primary = process.env.BROWSER_SERVICE_API_KEY?.trim();
  const legacy = process.env.KERNEL_API_KEY?.trim();
  // `||` (not `??`) on purpose: an empty string must fall through.
  return primary || legacy || void 0;
}
|
|
58
|
+
/**
 * Resolve the configured browser-service proxy id from the environment.
 *
 * Reads `BROWSER_SERVICE_PROXY_ID` first and falls back to the legacy
 * `KERNEL_PROXY_ID`. Each candidate is trimmed before it is considered, so a
 * blank primary variable does not mask a valid legacy value.
 *
 * @returns {string | undefined} The trimmed proxy id, or `undefined` when
 *   neither variable holds a non-blank value.
 */
function browserServiceProxyId() {
  const primary = process.env.BROWSER_SERVICE_PROXY_ID?.trim();
  const legacy = process.env.KERNEL_PROXY_ID?.trim();
  // `||` (not `??`) on purpose: an empty string must fall through.
  return primary || legacy || void 0;
}
|
|
62
|
+
// Lazy initializer for src/lib/browser-service-env.ts. The module has no
// load-time state, so the initializer body is intentionally empty.
var init_browser_service_env = __esm({
  "src/lib/browser-service-env.ts": function () {
    "use strict";
  }
});
|
|
67
|
+
|
|
68
|
+
// src/errors.ts
|
|
69
|
+
/**
 * Strip the browser vendor's name from a user-facing message.
 *
 * Replacements run from most to least specific:
 *   "kernel.sh sessions" / "kernel sessions" -> "sessions"
 *   "kernel.sh session"  / "kernel session"  -> "this session"
 *   "kernel.sh" / standalone "kernel"        -> "the service"
 * Runs of spaces are then collapsed and the result is trimmed.
 *
 * The plural patterns must not use `sessions?`: that also consumes the
 * singular form and makes the "this session" replacements unreachable.
 *
 * @param {string} message - Raw message that may mention the vendor.
 * @returns {string} The message with vendor references replaced.
 */
function sanitizeVendorName(message) {
  return message
    .replace(/kernel\.sh\s+sessions/gi, "sessions")
    .replace(/kernel\.sh\s+session/gi, "this session")
    .replace(/kernel\.sh/gi, "the service")
    .replace(/kernel\s+sessions/gi, "sessions")
    .replace(/kernel\s+session/gi, "this session")
    .replace(/\bkernel\b/gi, "the service")
    .replace(/ +/g, " ")
    .trim();
}
|
|
72
|
+
// Bindings for src/errors.ts; populated on the first init_errors() call.
var RECAPTCHA_INSTRUCTIONS;
var CaptchaError;
var ExtractionError;
var RequestAbortedError;
var init_errors = __esm({
  "src/errors.ts": function () {
    "use strict";
    // Default recovery hint attached to CAPTCHA failures.
    RECAPTCHA_INSTRUCTIONS = "Google returned a CAPTCHA. Run with --headless=false to re-warm the browser profile, then retry.";

    // Raised when a CAPTCHA is detected; carries the recovery instructions.
    CaptchaError = class CaptchaError extends Error {
      instructions;
      name = "CaptchaError";
      constructor(instructions) {
        super(`CAPTCHA detected. ${instructions}`);
        this.instructions = instructions;
      }
    };

    // Raised when extraction fails; `cause` holds the underlying error, if any.
    ExtractionError = class ExtractionError extends Error {
      cause;
      name = "ExtractionError";
      constructor(message, cause) {
        super(message);
        this.cause = cause;
      }
    };

    // Raised when a request is aborted before the harvest completes.
    RequestAbortedError = class RequestAbortedError extends Error {
      name = "RequestAbortedError";
      constructor(message = "Request aborted before harvest completed") {
        super(message);
      }
    };
  }
});
|
|
101
|
+
|
|
102
|
+
// src/api/outbound-sanitize.ts
|
|
103
|
+
/**
 * Recursively build a vendor-neutral copy of a diagnostics value.
 *
 * - Strings are passed through `sanitizeVendorName` only when the key they
 *   sit under matches `SANITIZED_VALUE_KEYS` and the text mentions "kernel".
 * - Arrays are mapped element-wise; elements inherit the array's key.
 * - Objects are copied into a new plain object with keys renamed according
 *   to `KEY_RENAMES`. Children are visited with the ORIGINAL key, so values
 *   under e.g. `kernel_delete_error` are still recognised as error text.
 * - Any other value is returned unchanged.
 *
 * @param {*} value - Value to sanitize; never mutated.
 * @param {string} [parentKey=""] - Key under which `value` was found.
 * @returns {*} The sanitized copy (or the original primitive).
 */
function sanitizeOutboundDiagnostics(value, parentKey = "") {
  if (typeof value === "string") {
    const shouldScrub = SANITIZED_VALUE_KEYS.test(parentKey) && /kernel/i.test(value);
    return shouldScrub ? sanitizeVendorName(value) : value;
  }
  if (Array.isArray(value)) {
    return value.map((item) => sanitizeOutboundDiagnostics(item, parentKey));
  }
  if (value !== null && typeof value === "object") {
    const out = {};
    for (const [key, val] of Object.entries(value)) {
      // Own-property check: a bare KEY_RENAMES[key] lookup would resolve
      // inherited members such as "constructor" or "toString" and rename
      // the key to a stringified function.
      const renamed = Object.prototype.hasOwnProperty.call(KEY_RENAMES, key) ? KEY_RENAMES[key] : key;
      out[renamed] = sanitizeOutboundDiagnostics(val, key);
    }
    return out;
  }
  return value;
}
|
|
121
|
+
/**
 * Sanitize every attempt record in a list.
 *
 * Each record is passed to `sanitizeOutboundDiagnostics` on its own, so the
 * array index that `map` supplies never leaks in as a parent key.
 *
 * @param {Array<*>} attempts - Attempt records to sanitize.
 * @returns {Array<*>} A new array of sanitized records.
 */
function sanitizeAttempts(attempts) {
  return attempts.map(function sanitizeOne(attempt) {
    return sanitizeOutboundDiagnostics(attempt);
  });
}
|
|
124
|
+
/**
 * Return a harvest result whose `diagnostics.debug` payload is sanitized.
 *
 * When the result has no truthy `diagnostics.debug`, the input is returned
 * as-is (same reference). Otherwise a shallow copy is returned in which only
 * `diagnostics.debug` is replaced; the input object is not mutated.
 *
 * @param {*} result - Harvest result, possibly null/undefined.
 * @returns {*} The original result or a shallow copy with sanitized debug data.
 */
function sanitizeHarvestResult(result) {
  const diagnostics = result?.diagnostics;
  const hasDebug = Boolean(diagnostics?.debug);
  if (!hasDebug) {
    return result;
  }
  const cleanedDiagnostics = {
    ...diagnostics,
    debug: sanitizeOutboundDiagnostics(diagnostics.debug)
  };
  return { ...result, diagnostics: cleanedDiagnostics };
}
|
|
135
|
+
// Bindings for src/api/outbound-sanitize.ts; populated on the first
// init_outbound_sanitize() call.
var KEY_RENAMES;
var SANITIZED_VALUE_KEYS;
var init_outbound_sanitize = __esm({
  "src/api/outbound-sanitize.ts": function () {
    "use strict";
    init_errors();
    // Vendor-specific diagnostic keys mapped to their neutral outbound names
    // (snake_case and camelCase variants).
    KEY_RENAMES = {
      kernel: "browserRuntime",

      kernel_session_id: "browser_session_id",
      kernel_delete_started: "session_cleanup_started",
      kernel_delete_succeeded: "session_cleanup_succeeded",
      kernel_delete_error: "session_cleanup_error",

      kernelSessionId: "browserSessionId",
      kernelDeleteStarted: "sessionCleanupStarted",
      kernelDeleteSucceeded: "sessionCleanupSucceeded",
      kernelDeleteError: "sessionCleanupError",
      kernelProxyId: "proxyId"
    };
    // String values are only scrubbed when their key matches this pattern.
    SANITIZED_VALUE_KEYS = new RegExp("error|message", "i");
  }
});
|
|
155
|
+
|
|
53
156
|
// src/blog/registry.ts
|
|
54
157
|
var posts;
|
|
55
158
|
var init_registry = __esm({
|
|
@@ -3425,7 +3528,7 @@ var init_url_utils = __esm({
|
|
|
3425
3528
|
|
|
3426
3529
|
// src/api/kernel-fetch.ts
|
|
3427
3530
|
async function fetchWithKernel(url) {
|
|
3428
|
-
const apiKey =
|
|
3531
|
+
const apiKey = browserServiceApiKey();
|
|
3429
3532
|
if (!apiKey) throw new Error("Browser backend API key not set");
|
|
3430
3533
|
const client = new import_sdk.default({ apiKey });
|
|
3431
3534
|
const kb = await client.browsers.create({ stealth: true, timeout_seconds: 60 });
|
|
@@ -3450,6 +3553,7 @@ var init_kernel_fetch = __esm({
|
|
|
3450
3553
|
"src/api/kernel-fetch.ts"() {
|
|
3451
3554
|
"use strict";
|
|
3452
3555
|
import_sdk = __toESM(require("@onkernel/sdk"), 1);
|
|
3556
|
+
init_browser_service_env();
|
|
3453
3557
|
import_playwright = require("playwright");
|
|
3454
3558
|
}
|
|
3455
3559
|
});
|
|
@@ -8494,40 +8598,6 @@ var init_selectors = __esm({
|
|
|
8494
8598
|
}
|
|
8495
8599
|
});
|
|
8496
8600
|
|
|
8497
|
-
// src/errors.ts
|
|
8498
|
-
function sanitizeVendorName(message) {
|
|
8499
|
-
return message.replace(/kernel\.sh\s+sessions?/gi, "sessions").replace(/kernel\.sh\s+session/gi, "this session").replace(/kernel\.sh/gi, "the service").replace(/kernel\s+sessions?/gi, "sessions").replace(/kernel\s+session/gi, "this session").replace(/\bkernel\b/gi, "the service").replace(/ +/g, " ").trim();
|
|
8500
|
-
}
|
|
8501
|
-
var RECAPTCHA_INSTRUCTIONS, CaptchaError, ExtractionError, RequestAbortedError;
|
|
8502
|
-
var init_errors = __esm({
|
|
8503
|
-
"src/errors.ts"() {
|
|
8504
|
-
"use strict";
|
|
8505
|
-
RECAPTCHA_INSTRUCTIONS = "Google returned a CAPTCHA. Run with --headless=false to re-warm the browser profile, then retry.";
|
|
8506
|
-
CaptchaError = class extends Error {
|
|
8507
|
-
constructor(instructions) {
|
|
8508
|
-
super(`CAPTCHA detected. ${instructions}`);
|
|
8509
|
-
this.instructions = instructions;
|
|
8510
|
-
}
|
|
8511
|
-
instructions;
|
|
8512
|
-
name = "CaptchaError";
|
|
8513
|
-
};
|
|
8514
|
-
ExtractionError = class extends Error {
|
|
8515
|
-
constructor(message, cause) {
|
|
8516
|
-
super(message);
|
|
8517
|
-
this.cause = cause;
|
|
8518
|
-
}
|
|
8519
|
-
cause;
|
|
8520
|
-
name = "ExtractionError";
|
|
8521
|
-
};
|
|
8522
|
-
RequestAbortedError = class extends Error {
|
|
8523
|
-
name = "RequestAbortedError";
|
|
8524
|
-
constructor(message = "Request aborted before harvest completed") {
|
|
8525
|
-
super(message);
|
|
8526
|
-
}
|
|
8527
|
-
};
|
|
8528
|
-
}
|
|
8529
|
-
});
|
|
8530
|
-
|
|
8531
8601
|
// src/driver/BrowserDriver.ts
|
|
8532
8602
|
function positiveIntFromEnv(name, fallback) {
|
|
8533
8603
|
const raw = process.env[name];
|
|
@@ -9509,7 +9579,7 @@ async function writeOutputs(result, outputDir) {
|
|
|
9509
9579
|
}
|
|
9510
9580
|
}
|
|
9511
9581
|
async function ytHarvest(rawOptions) {
|
|
9512
|
-
const kernelApiKey =
|
|
9582
|
+
const kernelApiKey = browserServiceApiKey();
|
|
9513
9583
|
if (!kernelApiKey) {
|
|
9514
9584
|
throw new Error("A browser backend API key is required \u2014 YouTube harvesting requires a stealth session.");
|
|
9515
9585
|
}
|
|
@@ -9543,6 +9613,7 @@ var init_youtube_harvest = __esm({
|
|
|
9543
9613
|
"src/youtube/youtube-harvest.ts"() {
|
|
9544
9614
|
"use strict";
|
|
9545
9615
|
import_node_fs2 = require("fs");
|
|
9616
|
+
init_browser_service_env();
|
|
9546
9617
|
import_node_path4 = __toESM(require("path"), 1);
|
|
9547
9618
|
import_papaparse = __toESM(require("papaparse"), 1);
|
|
9548
9619
|
init_schemas2();
|
|
@@ -9620,7 +9691,7 @@ function parseTimedtextXml(xml) {
|
|
|
9620
9691
|
return results;
|
|
9621
9692
|
}
|
|
9622
9693
|
async function fetchViaKernelInnertube(videoId) {
|
|
9623
|
-
const kernelApiKey =
|
|
9694
|
+
const kernelApiKey = browserServiceApiKey();
|
|
9624
9695
|
if (!kernelApiKey) return null;
|
|
9625
9696
|
const driver = new BrowserDriver();
|
|
9626
9697
|
const start = Date.now();
|
|
@@ -9763,7 +9834,7 @@ async function attemptKernelWhisper(videoId, kernelApiKey, falKey, start) {
|
|
|
9763
9834
|
}
|
|
9764
9835
|
}
|
|
9765
9836
|
async function fetchViaKernelWhisper(videoId) {
|
|
9766
|
-
const kernelApiKey =
|
|
9837
|
+
const kernelApiKey = browserServiceApiKey();
|
|
9767
9838
|
const falKey = process.env.FAL_KEY;
|
|
9768
9839
|
if (!kernelApiKey || !falKey) return null;
|
|
9769
9840
|
const start = Date.now();
|
|
@@ -9803,6 +9874,7 @@ var init_CaptionFetcher = __esm({
|
|
|
9803
9874
|
"src/youtube/CaptionFetcher.ts"() {
|
|
9804
9875
|
"use strict";
|
|
9805
9876
|
init_BrowserDriver();
|
|
9877
|
+
init_browser_service_env();
|
|
9806
9878
|
import_client2 = require("@fal-ai/client");
|
|
9807
9879
|
WHISPER_RECORD_SECONDS = 90;
|
|
9808
9880
|
}
|
|
@@ -10044,6 +10116,7 @@ var init_screenshot_routes = __esm({
|
|
|
10044
10116
|
"src/api/screenshot-routes.ts"() {
|
|
10045
10117
|
"use strict";
|
|
10046
10118
|
import_hono3 = require("hono");
|
|
10119
|
+
init_browser_service_env();
|
|
10047
10120
|
import_zod14 = require("zod");
|
|
10048
10121
|
init_screenshot();
|
|
10049
10122
|
init_api_auth();
|
|
@@ -10078,7 +10151,7 @@ var init_screenshot_routes = __esm({
|
|
|
10078
10151
|
}
|
|
10079
10152
|
const device2 = body.device === "mobile" ? "mobile" : "desktop";
|
|
10080
10153
|
try {
|
|
10081
|
-
const buf = await captureScreenshot(parsedFallback.href,
|
|
10154
|
+
const buf = await captureScreenshot(parsedFallback.href, browserServiceApiKey(), device2);
|
|
10082
10155
|
return new Response(new Uint8Array(buf), {
|
|
10083
10156
|
status: 200,
|
|
10084
10157
|
headers: {
|
|
@@ -10094,7 +10167,7 @@ var init_screenshot_routes = __esm({
|
|
|
10094
10167
|
}
|
|
10095
10168
|
const device = body.device === "mobile" ? "mobile" : "desktop";
|
|
10096
10169
|
try {
|
|
10097
|
-
const buf = await captureScreenshot(urlCheck.parsed.href,
|
|
10170
|
+
const buf = await captureScreenshot(urlCheck.parsed.href, browserServiceApiKey(), device);
|
|
10098
10171
|
return new Response(new Uint8Array(buf), {
|
|
10099
10172
|
status: 200,
|
|
10100
10173
|
headers: {
|
|
@@ -11379,29 +11452,30 @@ function buildPageIntelUrl(body, country) {
|
|
|
11379
11452
|
return `https://www.facebook.com/ads/library/?active_status=all&ad_type=all&country=${country}&q=${encodeURIComponent(body.query.trim())}&search_type=keyword_unordered`;
|
|
11380
11453
|
}
|
|
11381
11454
|
function kernelLaunchOpts() {
|
|
11382
|
-
return { headless: true, kernelApiKey:
|
|
11455
|
+
return { headless: true, kernelApiKey: browserServiceApiKey(), kernelProxyId: browserServiceProxyId(), viewport: { width: 1280, height: 900 }, locale: "en-US" };
|
|
11383
11456
|
}
|
|
11384
11457
|
async function kernelLaunchOptsResidential() {
|
|
11385
|
-
let proxyId =
|
|
11458
|
+
let proxyId = browserServiceProxyId();
|
|
11386
11459
|
try {
|
|
11387
11460
|
const resolution2 = await resolveKernelProxyId({
|
|
11388
|
-
kernelApiKey:
|
|
11461
|
+
kernelApiKey: browserServiceApiKey(),
|
|
11389
11462
|
proxyMode: "location",
|
|
11390
|
-
configuredKernelProxyId:
|
|
11463
|
+
configuredKernelProxyId: browserServiceProxyId(),
|
|
11391
11464
|
location: "New York, NY",
|
|
11392
11465
|
gl: "us"
|
|
11393
11466
|
});
|
|
11394
11467
|
if (resolution2.kernelProxyId) proxyId = resolution2.kernelProxyId;
|
|
11395
11468
|
} catch {
|
|
11396
|
-
proxyId =
|
|
11469
|
+
proxyId = browserServiceProxyId();
|
|
11397
11470
|
}
|
|
11398
|
-
return { headless: true, kernelApiKey:
|
|
11471
|
+
return { headless: true, kernelApiKey: browserServiceApiKey(), kernelProxyId: proxyId, viewport: { width: 1280, height: 900 }, locale: "en-US" };
|
|
11399
11472
|
}
|
|
11400
11473
|
var import_hono4, import_zod15, import_client3, FacebookAdBodySchema, FacebookPageIntelBodySchema, FacebookTranscribeBodySchema, FacebookSearchBodySchema, FacebookMediaBodySchema, facebookAdApp, ALLOWED_MEDIA_HOSTS;
|
|
11401
11474
|
var init_facebook_ad_routes = __esm({
|
|
11402
11475
|
"src/api/facebook-ad-routes.ts"() {
|
|
11403
11476
|
"use strict";
|
|
11404
11477
|
import_hono4 = require("hono");
|
|
11478
|
+
init_browser_service_env();
|
|
11405
11479
|
import_zod15 = require("zod");
|
|
11406
11480
|
init_db();
|
|
11407
11481
|
init_rates();
|
|
@@ -14321,8 +14395,8 @@ async function harvest(rawOptions) {
|
|
|
14321
14395
|
const onAttemptEvent = getAttemptLogSink(rawOptions);
|
|
14322
14396
|
const requestedProxyMode = raw.proxyMode;
|
|
14323
14397
|
const proxyMode = requestedProxyMode === "none" ? "none" : requestedProxyMode === "configured" ? "configured" : "location";
|
|
14324
|
-
const kernelApiKey = typeof raw.kernelApiKey === "string" ? raw.kernelApiKey.trim() :
|
|
14325
|
-
const configuredKernelProxyId = typeof raw.kernelProxyId === "string" ? raw.kernelProxyId.trim() :
|
|
14398
|
+
const kernelApiKey = typeof raw.kernelApiKey === "string" ? raw.kernelApiKey.trim() : browserServiceApiKey();
|
|
14399
|
+
const configuredKernelProxyId = typeof raw.kernelProxyId === "string" ? raw.kernelProxyId.trim() : browserServiceProxyId();
|
|
14326
14400
|
const proxyOpts = {
|
|
14327
14401
|
kernelApiKey,
|
|
14328
14402
|
proxyMode,
|
|
@@ -14509,6 +14583,7 @@ var init_harvest = __esm({
|
|
|
14509
14583
|
"src/harvest.ts"() {
|
|
14510
14584
|
"use strict";
|
|
14511
14585
|
init_schemas3();
|
|
14586
|
+
init_browser_service_env();
|
|
14512
14587
|
init_BrowserDriver();
|
|
14513
14588
|
init_PAAExtractor();
|
|
14514
14589
|
init_OutputSerializer();
|
|
@@ -14933,8 +15008,8 @@ async function captureSerpIntelligenceSnapshot(rawInput, runtimeOptions = {}) {
|
|
|
14933
15008
|
debug,
|
|
14934
15009
|
serpOnly: true,
|
|
14935
15010
|
headless: runtimeOptions.headless ?? true,
|
|
14936
|
-
kernelApiKey: runtimeOptions.kernelApiKey ??
|
|
14937
|
-
kernelProxyId: runtimeOptions.kernelProxyId ??
|
|
15011
|
+
kernelApiKey: runtimeOptions.kernelApiKey ?? browserServiceApiKey(),
|
|
15012
|
+
kernelProxyId: runtimeOptions.kernelProxyId ?? browserServiceProxyId(),
|
|
14938
15013
|
format: "json",
|
|
14939
15014
|
outputDir: runtimeOptions.outputDir ?? "/tmp/serp-intelligence-output",
|
|
14940
15015
|
signal: runtimeOptions.signal,
|
|
@@ -14945,7 +15020,7 @@ async function captureSerpIntelligenceSnapshot(rawInput, runtimeOptions = {}) {
|
|
|
14945
15020
|
const pageSnapshotLimit = normalizePageSnapshotLimit(parsedInput);
|
|
14946
15021
|
const pageSnapshotTargets = collectPageSnapshotTargets(harvestResult, pageSnapshotLimit);
|
|
14947
15022
|
const pageSnapshotArtifacts = pageSnapshotTargets.length > 0 ? (await capturePageSnapshotsFn(pageSnapshotTargets, {
|
|
14948
|
-
kernelApiKey: runtimeOptions.kernelApiKey ??
|
|
15023
|
+
kernelApiKey: runtimeOptions.kernelApiKey ?? browserServiceApiKey(),
|
|
14949
15024
|
timeoutMs: runtimeOptions.pageSnapshotTimeoutMs,
|
|
14950
15025
|
maxConcurrency: runtimeOptions.pageSnapshotMaxConcurrency,
|
|
14951
15026
|
debug,
|
|
@@ -14967,6 +15042,7 @@ var init_serp_capture_service = __esm({
|
|
|
14967
15042
|
"src/serp-intelligence/serp-capture-service.ts"() {
|
|
14968
15043
|
"use strict";
|
|
14969
15044
|
init_harvest();
|
|
15045
|
+
init_browser_service_env();
|
|
14970
15046
|
init_harvest_problems();
|
|
14971
15047
|
init_page_snapshot_extractor();
|
|
14972
15048
|
init_schemas4();
|
|
@@ -15071,6 +15147,7 @@ var init_serp_intelligence_routes = __esm({
|
|
|
15071
15147
|
"src/api/serp-intelligence-routes.ts"() {
|
|
15072
15148
|
"use strict";
|
|
15073
15149
|
import_hono6 = require("hono");
|
|
15150
|
+
init_browser_service_env();
|
|
15074
15151
|
init_page_snapshot_extractor();
|
|
15075
15152
|
init_serp_capture_service();
|
|
15076
15153
|
init_schemas4();
|
|
@@ -15103,8 +15180,8 @@ var init_serp_intelligence_routes = __esm({
|
|
|
15103
15180
|
if (!ok) return c.json(insufficientBalanceResponse(balance_mc, cost), 402);
|
|
15104
15181
|
try {
|
|
15105
15182
|
const result = await captureSerpIntelligenceSnapshot(parsed.data, {
|
|
15106
|
-
kernelApiKey:
|
|
15107
|
-
kernelProxyId:
|
|
15183
|
+
kernelApiKey: browserServiceApiKey(),
|
|
15184
|
+
kernelProxyId: browserServiceProxyId(),
|
|
15108
15185
|
signal: c.req.raw.signal,
|
|
15109
15186
|
billing: { creditsUsed: cost / 1e3 }
|
|
15110
15187
|
});
|
|
@@ -15159,7 +15236,7 @@ var init_serp_intelligence_routes = __esm({
|
|
|
15159
15236
|
if (!ok) return c.json(insufficientBalanceResponse(balance_mc, cost), 402);
|
|
15160
15237
|
try {
|
|
15161
15238
|
const result = await capturePageSnapshots(targets, {
|
|
15162
|
-
kernelApiKey:
|
|
15239
|
+
kernelApiKey: browserServiceApiKey(),
|
|
15163
15240
|
timeoutMs: parsed.data.timeoutMs,
|
|
15164
15241
|
maxConcurrency: parsed.data.maxConcurrency,
|
|
15165
15242
|
debug: parsed.data.debug
|
|
@@ -15199,12 +15276,12 @@ var PACKAGE_VERSION;
|
|
|
15199
15276
|
var init_version = __esm({
|
|
15200
15277
|
"src/version.ts"() {
|
|
15201
15278
|
"use strict";
|
|
15202
|
-
PACKAGE_VERSION = "0.1.
|
|
15279
|
+
PACKAGE_VERSION = "0.1.8";
|
|
15203
15280
|
}
|
|
15204
15281
|
});
|
|
15205
15282
|
|
|
15206
15283
|
// src/mcp/mcp-tool-schemas.ts
|
|
15207
|
-
var import_zod19, HarvestPaaInputSchema, ExtractUrlInputSchema, MapSiteUrlsInputSchema, ExtractSiteInputSchema, YoutubeHarvestInputSchema, YoutubeTranscribeInputSchema, FacebookPageIntelInputSchema, FacebookAdSearchInputSchema, FacebookAdTranscribeInputSchema, MapsPlaceIntelInputSchema, MapsSearchInputSchema, NullableString, MapsSearchOutputSchema, MapSiteUrlsOutputSchema, YoutubeHarvestOutputSchema, FacebookAdSearchOutputSchema, FacebookPageIntelOutputSchema, CreditsInfoInputSchema, SearchSerpInputSchema, CaptureSerpSnapshotInputSchema, ScreenshotInputSchema, CaptureSerpPageSnapshotsInputSchema;
|
|
15284
|
+
var import_zod19, HarvestPaaInputSchema, ExtractUrlInputSchema, MapSiteUrlsInputSchema, ExtractSiteInputSchema, YoutubeHarvestInputSchema, YoutubeTranscribeInputSchema, FacebookPageIntelInputSchema, FacebookAdSearchInputSchema, FacebookAdTranscribeInputSchema, MapsPlaceIntelInputSchema, MapsSearchInputSchema, NullableString, MapsSearchOutputSchema, OrganicResultOutput, AiOverviewOutput, EntityIdsOutput, HarvestPaaOutputSchema, SearchSerpOutputSchema, ExtractUrlOutputSchema, ExtractSiteOutputSchema, MapsPlaceIntelOutputSchema, CreditsInfoOutputSchema, MapSiteUrlsOutputSchema, YoutubeHarvestOutputSchema, FacebookAdSearchOutputSchema, FacebookPageIntelOutputSchema, CreditsInfoInputSchema, SearchSerpInputSchema, CaptureSerpSnapshotInputSchema, ScreenshotInputSchema, CaptureSerpPageSnapshotsInputSchema;
|
|
15208
15285
|
var init_mcp_tool_schemas = __esm({
|
|
15209
15286
|
"src/mcp/mcp-tool-schemas.ts"() {
|
|
15210
15287
|
"use strict";
|
|
@@ -15301,6 +15378,120 @@ var init_mcp_tool_schemas = __esm({
|
|
|
15301
15378
|
})),
|
|
15302
15379
|
durationMs: import_zod19.z.number().int().min(0)
|
|
15303
15380
|
};
|
|
15381
|
+
OrganicResultOutput = import_zod19.z.object({
|
|
15382
|
+
position: import_zod19.z.number().int(),
|
|
15383
|
+
title: import_zod19.z.string(),
|
|
15384
|
+
url: import_zod19.z.string(),
|
|
15385
|
+
domain: import_zod19.z.string(),
|
|
15386
|
+
snippet: NullableString
|
|
15387
|
+
});
|
|
15388
|
+
AiOverviewOutput = import_zod19.z.object({
|
|
15389
|
+
detected: import_zod19.z.boolean(),
|
|
15390
|
+
text: NullableString
|
|
15391
|
+
}).nullable();
|
|
15392
|
+
EntityIdsOutput = import_zod19.z.object({
|
|
15393
|
+
kgIds: import_zod19.z.array(import_zod19.z.string()),
|
|
15394
|
+
cids: import_zod19.z.array(import_zod19.z.string()),
|
|
15395
|
+
gcids: import_zod19.z.array(import_zod19.z.string())
|
|
15396
|
+
}).nullable();
|
|
15397
|
+
HarvestPaaOutputSchema = {
|
|
15398
|
+
query: import_zod19.z.string(),
|
|
15399
|
+
location: NullableString,
|
|
15400
|
+
questionCount: import_zod19.z.number().int().min(0),
|
|
15401
|
+
completionStatus: NullableString,
|
|
15402
|
+
questions: import_zod19.z.array(import_zod19.z.object({
|
|
15403
|
+
question: import_zod19.z.string(),
|
|
15404
|
+
answer: NullableString,
|
|
15405
|
+
sourceTitle: NullableString,
|
|
15406
|
+
sourceSite: NullableString
|
|
15407
|
+
})),
|
|
15408
|
+
organicResults: import_zod19.z.array(OrganicResultOutput),
|
|
15409
|
+
aiOverview: AiOverviewOutput,
|
|
15410
|
+
entityIds: EntityIdsOutput,
|
|
15411
|
+
durationMs: import_zod19.z.number().min(0).nullable()
|
|
15412
|
+
};
|
|
15413
|
+
SearchSerpOutputSchema = {
|
|
15414
|
+
query: import_zod19.z.string(),
|
|
15415
|
+
location: NullableString,
|
|
15416
|
+
organicResults: import_zod19.z.array(OrganicResultOutput),
|
|
15417
|
+
localPack: import_zod19.z.array(import_zod19.z.object({
|
|
15418
|
+
position: import_zod19.z.number().int(),
|
|
15419
|
+
name: import_zod19.z.string(),
|
|
15420
|
+
rating: NullableString,
|
|
15421
|
+
reviewCount: NullableString,
|
|
15422
|
+
websiteUrl: NullableString
|
|
15423
|
+
})),
|
|
15424
|
+
aiOverview: AiOverviewOutput,
|
|
15425
|
+
entityIds: EntityIdsOutput
|
|
15426
|
+
};
|
|
15427
|
+
ExtractUrlOutputSchema = {
|
|
15428
|
+
url: import_zod19.z.string(),
|
|
15429
|
+
title: NullableString,
|
|
15430
|
+
headings: import_zod19.z.array(import_zod19.z.object({
|
|
15431
|
+
level: import_zod19.z.number().int(),
|
|
15432
|
+
text: import_zod19.z.string()
|
|
15433
|
+
})),
|
|
15434
|
+
schemaBlockCount: import_zod19.z.number().int().min(0),
|
|
15435
|
+
entityName: NullableString,
|
|
15436
|
+
entityTypes: import_zod19.z.array(import_zod19.z.string()),
|
|
15437
|
+
napScore: import_zod19.z.number().nullable(),
|
|
15438
|
+
missingSchemaFields: import_zod19.z.array(import_zod19.z.string()),
|
|
15439
|
+
screenshotSaved: NullableString
|
|
15440
|
+
};
|
|
15441
|
+
ExtractSiteOutputSchema = {
|
|
15442
|
+
url: import_zod19.z.string(),
|
|
15443
|
+
pageCount: import_zod19.z.number().int().min(0),
|
|
15444
|
+
pages: import_zod19.z.array(import_zod19.z.object({
|
|
15445
|
+
url: import_zod19.z.string(),
|
|
15446
|
+
title: NullableString,
|
|
15447
|
+
schemaTypes: import_zod19.z.array(import_zod19.z.string())
|
|
15448
|
+
})),
|
|
15449
|
+
durationMs: import_zod19.z.number().min(0)
|
|
15450
|
+
};
|
|
15451
|
+
MapsPlaceIntelOutputSchema = {
|
|
15452
|
+
name: import_zod19.z.string(),
|
|
15453
|
+
rating: NullableString,
|
|
15454
|
+
reviewCount: NullableString,
|
|
15455
|
+
category: NullableString,
|
|
15456
|
+
address: NullableString,
|
|
15457
|
+
phone: NullableString,
|
|
15458
|
+
website: NullableString,
|
|
15459
|
+
hoursSummary: NullableString,
|
|
15460
|
+
bookingUrl: NullableString,
|
|
15461
|
+
kgmid: NullableString,
|
|
15462
|
+
cidDecimal: NullableString,
|
|
15463
|
+
cidUrl: NullableString,
|
|
15464
|
+
lat: import_zod19.z.number().nullable(),
|
|
15465
|
+
lng: import_zod19.z.number().nullable(),
|
|
15466
|
+
reviewsStatus: import_zod19.z.string(),
|
|
15467
|
+
reviewsCollected: import_zod19.z.number().int().min(0),
|
|
15468
|
+
reviewTopics: import_zod19.z.array(import_zod19.z.object({
|
|
15469
|
+
label: import_zod19.z.string(),
|
|
15470
|
+
count: import_zod19.z.string()
|
|
15471
|
+
}))
|
|
15472
|
+
};
|
|
15473
|
+
CreditsInfoOutputSchema = {
|
|
15474
|
+
balanceCredits: import_zod19.z.number().nullable(),
|
|
15475
|
+
matchedCost: import_zod19.z.object({
|
|
15476
|
+
label: import_zod19.z.string(),
|
|
15477
|
+
credits: import_zod19.z.number(),
|
|
15478
|
+
unit: import_zod19.z.string(),
|
|
15479
|
+
notes: NullableString
|
|
15480
|
+
}).nullable(),
|
|
15481
|
+
costs: import_zod19.z.array(import_zod19.z.object({
|
|
15482
|
+
key: import_zod19.z.string(),
|
|
15483
|
+
label: import_zod19.z.string(),
|
|
15484
|
+
credits: import_zod19.z.number(),
|
|
15485
|
+
unit: import_zod19.z.string(),
|
|
15486
|
+
notes: NullableString
|
|
15487
|
+
})),
|
|
15488
|
+
ledger: import_zod19.z.array(import_zod19.z.object({
|
|
15489
|
+
createdAt: import_zod19.z.string(),
|
|
15490
|
+
operation: import_zod19.z.string(),
|
|
15491
|
+
credits: import_zod19.z.number(),
|
|
15492
|
+
description: NullableString
|
|
15493
|
+
}))
|
|
15494
|
+
};
|
|
15304
15495
|
MapSiteUrlsOutputSchema = {
|
|
15305
15496
|
startUrl: import_zod19.z.string(),
|
|
15306
15497
|
totalFound: import_zod19.z.number().int().min(0),
|
|
@@ -15505,7 +15696,7 @@ function debugSection(debug) {
|
|
|
15505
15696
|
if (!debug || typeof debug !== "object") return "";
|
|
15506
15697
|
const request = debug.request ?? {};
|
|
15507
15698
|
const browser = debug.browser ?? {};
|
|
15508
|
-
const kernel = browser.kernel ?? {};
|
|
15699
|
+
const kernel = browser.browserRuntime ?? browser.kernel ?? {};
|
|
15509
15700
|
const network = browser.networkLocation ?? {};
|
|
15510
15701
|
const nav = browser.serpNavigation ?? {};
|
|
15511
15702
|
const proxyResolution = kernel.proxyResolution ?? {};
|
|
@@ -15531,12 +15722,14 @@ function errorAttemptsSection(body) {
|
|
|
15531
15722
|
const lines = attempts.slice(0, 5).map((attempt) => {
|
|
15532
15723
|
const debug = attempt.debug ?? {};
|
|
15533
15724
|
const browser = debug.browser ?? {};
|
|
15534
|
-
const kernel = browser.kernel ?? {};
|
|
15725
|
+
const kernel = browser.browserRuntime ?? browser.kernel ?? {};
|
|
15535
15726
|
const proxyResolution = kernel.proxyResolution ?? {};
|
|
15536
15727
|
const network = browser.networkLocation ?? {};
|
|
15537
15728
|
const nav = browser.serpNavigation ?? {};
|
|
15538
15729
|
const geo = [network.ip, network.city, network.region].filter(Boolean).join(" / ") || "geo unknown";
|
|
15539
|
-
|
|
15730
|
+
const sessionId = attempt.browser_session_id ?? attempt.kernel_session_id ?? kernel.sessionId ?? "unknown";
|
|
15731
|
+
const cleanupSucceeded2 = attempt.session_cleanup_succeeded ?? attempt.kernel_delete_succeeded;
|
|
15732
|
+
return `- Attempt ${attempt.attempt_number ?? "?"}: ${attempt.outcome ?? attempt.status ?? "unknown"} \xB7 session ${sessionId} \xB7 proxy ${debug.request?.proxyMode ?? kernel.proxyMode ?? "unknown"}${proxyResolution.source ? `/${proxyResolution.source}` : ""} \xB7 ${geo} \xB7 CAPTCHA ${nav.captchaDetected === true ? "yes" : nav.captchaDetected === false ? "no" : "unknown"} \xB7 cleanup ${cleanupSucceeded2 === true ? "yes" : cleanupSucceeded2 === false ? "no" : "unknown"}`;
|
|
15540
15733
|
});
|
|
15541
15734
|
return `
|
|
15542
15735
|
|
|
@@ -15583,7 +15776,31 @@ ${serpRows}` : "";
|
|
|
15583
15776
|
const full = `# PAA Report: "${input.query}"${input.location ? ` \xB7 ${input.location}` : ""}
|
|
15584
15777
|
|
|
15585
15778
|
${paaTable}${serpTable}${entityIdsSection(entityIds)}${aiSection}${statsLine}${debugSection(diagnostics?.debug)}${tips}`;
|
|
15586
|
-
return
|
|
15779
|
+
return {
|
|
15780
|
+
...oneBlock(full),
|
|
15781
|
+
structuredContent: {
|
|
15782
|
+
query: input.query,
|
|
15783
|
+
location: input.location ?? null,
|
|
15784
|
+
questionCount: flat.length,
|
|
15785
|
+
completionStatus: diagnostics?.completionStatus ?? null,
|
|
15786
|
+
questions: flat.map((r) => ({
|
|
15787
|
+
question: String(r.question ?? ""),
|
|
15788
|
+
answer: r.answer ?? null,
|
|
15789
|
+
sourceTitle: r.source_title ?? null,
|
|
15790
|
+
sourceSite: r.source_site ?? null
|
|
15791
|
+
})),
|
|
15792
|
+
organicResults: organic.map((r) => ({
|
|
15793
|
+
position: Number(r.position) || 0,
|
|
15794
|
+
title: String(r.title ?? ""),
|
|
15795
|
+
url: String(r.url ?? ""),
|
|
15796
|
+
domain: String(r.domain ?? ""),
|
|
15797
|
+
snippet: r.snippet ?? null
|
|
15798
|
+
})),
|
|
15799
|
+
aiOverview: aiOvw ? { detected: aiOvw.detected === true, text: aiOvw.text ?? null } : null,
|
|
15800
|
+
entityIds: entityIds ? { kgIds: entityIds.kgIds ?? [], cids: entityIds.cids ?? [], gcids: entityIds.gcids ?? [] } : null,
|
|
15801
|
+
durationMs: durationMs ?? null
|
|
15802
|
+
}
|
|
15803
|
+
};
|
|
15587
15804
|
}
|
|
15588
15805
|
function formatSearchSerp(raw, input) {
|
|
15589
15806
|
const parsed = parseData(raw);
|
|
@@ -15621,7 +15838,29 @@ ${localRows}` : "";
|
|
|
15621
15838
|
const full = `# SERP Report: "${input.query}"${input.location ? ` \xB7 ${input.location}` : ""}
|
|
15622
15839
|
|
|
15623
15840
|
${serpTable}${localSection}${entityIdsSection(entityIds)}${aiSection}${debugSection(diagnostics?.debug)}${tips}`;
|
|
15624
|
-
return
|
|
15841
|
+
return {
|
|
15842
|
+
...oneBlock(full),
|
|
15843
|
+
structuredContent: {
|
|
15844
|
+
query: input.query,
|
|
15845
|
+
location: input.location ?? null,
|
|
15846
|
+
organicResults: organic.map((r) => ({
|
|
15847
|
+
position: Number(r.position) || 0,
|
|
15848
|
+
title: String(r.title ?? ""),
|
|
15849
|
+
url: String(r.url ?? ""),
|
|
15850
|
+
domain: String(r.domain ?? ""),
|
|
15851
|
+
snippet: r.snippet ?? null
|
|
15852
|
+
})),
|
|
15853
|
+
localPack: localPack.map((b) => ({
|
|
15854
|
+
position: Number(b.position) || 0,
|
|
15855
|
+
name: String(b.name ?? ""),
|
|
15856
|
+
rating: b.rating ?? null,
|
|
15857
|
+
reviewCount: b.reviewCount ?? null,
|
|
15858
|
+
websiteUrl: b.websiteUrl ?? null
|
|
15859
|
+
})),
|
|
15860
|
+
aiOverview: aiOvw ? { detected: aiOvw.detected === true, text: aiOvw.text ?? null } : null,
|
|
15861
|
+
entityIds: entityIds ? { kgIds: entityIds.kgIds ?? [], cids: entityIds.cids ?? [], gcids: entityIds.gcids ?? [] } : null
|
|
15862
|
+
}
|
|
15863
|
+
};
|
|
15625
15864
|
}
|
|
15626
15865
|
function formatExtractUrl(raw, input) {
|
|
15627
15866
|
const parsed = parseData(raw);
|
|
@@ -15690,15 +15929,27 @@ ${bodyMd.slice(0, 3e3)}${bodyMd.length > 3e3 ? "\n\n*(truncated)*" : ""}` : "";
|
|
|
15690
15929
|
**${title}**
|
|
15691
15930
|
${headingSection}${kpoSection}${brandingSection}${bodySection}${screenshotSection}${mediaSection}${tips}`;
|
|
15692
15931
|
const textResult = oneBlock(full);
|
|
15932
|
+
const structuredContent = {
|
|
15933
|
+
url,
|
|
15934
|
+
title: d.title ?? null,
|
|
15935
|
+
headings: headings.map((h) => ({ level: Number(h.level) || 0, text: String(h.text ?? "") })),
|
|
15936
|
+
schemaBlockCount: schemaCount,
|
|
15937
|
+
entityName: kpo?.entityName ?? null,
|
|
15938
|
+
entityTypes: kpo?.type ?? [],
|
|
15939
|
+
napScore: kpo?.napScore ?? null,
|
|
15940
|
+
missingSchemaFields: kpo?.missingFields ?? [],
|
|
15941
|
+
screenshotSaved: screenshotPath ?? null
|
|
15942
|
+
};
|
|
15693
15943
|
if (screenshotMeta?.base64) {
|
|
15694
15944
|
return {
|
|
15695
15945
|
content: [
|
|
15696
15946
|
...textResult.content,
|
|
15697
15947
|
{ type: "image", data: screenshotMeta.base64, mimeType: "image/png" }
|
|
15698
|
-
]
|
|
15948
|
+
],
|
|
15949
|
+
structuredContent
|
|
15699
15950
|
};
|
|
15700
15951
|
}
|
|
15701
|
-
return textResult;
|
|
15952
|
+
return { ...textResult, structuredContent };
|
|
15702
15953
|
}
|
|
15703
15954
|
function formatMapSiteUrls(raw, input) {
|
|
15704
15955
|
const parsed = parseData(raw);
|
|
@@ -15768,7 +16019,19 @@ ${pageRows}`,
|
|
|
15768
16019
|
- Map URLs first: use \`map_site_urls\`
|
|
15769
16020
|
- Inspect a single page: use \`extract_url\``
|
|
15770
16021
|
].join("\n");
|
|
15771
|
-
return
|
|
16022
|
+
return {
|
|
16023
|
+
...oneBlock(full),
|
|
16024
|
+
structuredContent: {
|
|
16025
|
+
url: input.url,
|
|
16026
|
+
pageCount: pages.length,
|
|
16027
|
+
pages: pages.map((p) => ({
|
|
16028
|
+
url: String(p.url ?? ""),
|
|
16029
|
+
title: p.title ?? null,
|
|
16030
|
+
schemaTypes: p.kpo?.type ?? []
|
|
16031
|
+
})),
|
|
16032
|
+
durationMs: d.durationMs ?? 0
|
|
16033
|
+
}
|
|
16034
|
+
};
|
|
15772
16035
|
}
|
|
15773
16036
|
function formatYoutubeHarvest(raw, input) {
|
|
15774
16037
|
const parsed = parseData(raw);
|
|
@@ -15965,7 +16228,26 @@ ${costRows}` : "",
|
|
|
15965
16228
|
|------|-----------|---------|-------------|
|
|
15966
16229
|
${ledgerRows}` : ""
|
|
15967
16230
|
].filter(Boolean).join("\n");
|
|
15968
|
-
return
|
|
16231
|
+
return {
|
|
16232
|
+
...oneBlock(full),
|
|
16233
|
+
structuredContent: {
|
|
16234
|
+
balanceCredits: typeof balance === "number" ? balance : null,
|
|
16235
|
+
matchedCost: matched ? { label: matched.label, credits: matched.credits, unit: matched.unit, notes: matched.notes ?? null } : null,
|
|
16236
|
+
costs: costs.map((c) => ({
|
|
16237
|
+
key: c.key,
|
|
16238
|
+
label: c.label,
|
|
16239
|
+
credits: c.credits,
|
|
16240
|
+
unit: c.unit,
|
|
16241
|
+
notes: c.notes ?? null
|
|
16242
|
+
})),
|
|
16243
|
+
ledger: ledger.map((row) => ({
|
|
16244
|
+
createdAt: String(row.created_at ?? ""),
|
|
16245
|
+
operation: String(row.operation ?? ""),
|
|
16246
|
+
credits: row.amount_mc / 1e3,
|
|
16247
|
+
description: row.description ?? null
|
|
16248
|
+
}))
|
|
16249
|
+
}
|
|
16250
|
+
};
|
|
15969
16251
|
}
|
|
15970
16252
|
function formatMapsSearch(raw, input) {
|
|
15971
16253
|
const parsed = parseData(raw);
|
|
@@ -16114,7 +16396,28 @@ ${entitySection}` : null,
|
|
|
16114
16396
|
---
|
|
16115
16397
|
*Extracted in ${(durationMs / 1e3).toFixed(1)}s*` : null
|
|
16116
16398
|
].filter(Boolean).join("\n");
|
|
16117
|
-
return
|
|
16399
|
+
return {
|
|
16400
|
+
...oneBlock(full),
|
|
16401
|
+
structuredContent: {
|
|
16402
|
+
name,
|
|
16403
|
+
rating: rating ?? null,
|
|
16404
|
+
reviewCount: reviewCount ?? null,
|
|
16405
|
+
category: category ?? null,
|
|
16406
|
+
address: address ?? null,
|
|
16407
|
+
phone: phone ?? null,
|
|
16408
|
+
website: website ?? null,
|
|
16409
|
+
hoursSummary: hoursSummary ?? null,
|
|
16410
|
+
bookingUrl: bookingUrl ?? null,
|
|
16411
|
+
kgmid: kgmid ?? null,
|
|
16412
|
+
cidDecimal: cidDecimal ?? null,
|
|
16413
|
+
cidUrl: cidUrl ?? null,
|
|
16414
|
+
lat: lat ?? null,
|
|
16415
|
+
lng: lng ?? null,
|
|
16416
|
+
reviewsStatus,
|
|
16417
|
+
reviewsCollected: reviews.length,
|
|
16418
|
+
reviewTopics: topics.map((t) => ({ label: String(t.label ?? ""), count: String(t.count ?? "") }))
|
|
16419
|
+
}
|
|
16420
|
+
};
|
|
16118
16421
|
}
|
|
16119
16422
|
function formatFacebookAdTranscribe(raw, input) {
|
|
16120
16423
|
const parsed = parseData(raw);
|
|
@@ -16177,18 +16480,21 @@ function buildPaaExtractorMcpServer(executor, options = {}) {
|
|
|
16177
16480
|
title: "Google PAA + SERP Harvest",
|
|
16178
16481
|
description: withReportNote('Best default tool for Google search research. Extracts People Also Ask questions plus answers/source URLs, organic SERP, local pack when present, entity IDs (CID/GCID/KG MID), and AI Overview. Infer the user language: split topic from location (e.g. "best hvac company in Denver CO" => query "best hvac company", location "Denver, CO", gl "us", hl "en"). Use maxQuestions 30 normally, 100-150 for "full", "deep", "all", or comprehensive research. Credits are charged by extracted question; unused request hold is refunded.'),
|
|
16179
16482
|
inputSchema: HarvestPaaInputSchema,
|
|
16483
|
+
outputSchema: HarvestPaaOutputSchema,
|
|
16180
16484
|
annotations: liveWebToolAnnotations("Google PAA + SERP Harvest")
|
|
16181
16485
|
}, async (input) => formatHarvestPaa(await executor.harvestPaa(input), input));
|
|
16182
16486
|
server.registerTool("search_serp", {
|
|
16183
16487
|
title: "Google SERP Lookup",
|
|
16184
16488
|
description: withReportNote("Fast Google SERP lookup without PAA expansion. Use when the user asks for rankings, organic results, local pack, quick SERP, or positions. Split topic from location and infer gl/hl from the user request."),
|
|
16185
16489
|
inputSchema: SearchSerpInputSchema,
|
|
16490
|
+
outputSchema: SearchSerpOutputSchema,
|
|
16186
16491
|
annotations: liveWebToolAnnotations("Google SERP Lookup")
|
|
16187
16492
|
}, async (input) => formatSearchSerp(await executor.searchSerp(input), input));
|
|
16188
16493
|
server.registerTool("extract_url", {
|
|
16189
16494
|
title: "Single URL Extract",
|
|
16190
16495
|
description: withReportNote("Extract structured data from one public URL: page content as Markdown, heading structure, JSON-LD schema, entity details, NAP score, metadata, and missing schema fields. Use when the user provides a single URL or asks to inspect/scrape one page."),
|
|
16191
16496
|
inputSchema: ExtractUrlInputSchema,
|
|
16497
|
+
outputSchema: ExtractUrlOutputSchema,
|
|
16192
16498
|
annotations: liveWebToolAnnotations("Single URL Extract")
|
|
16193
16499
|
}, async (input) => formatExtractUrl(await executor.extractUrl(input), input));
|
|
16194
16500
|
server.registerTool("map_site_urls", {
|
|
@@ -16202,6 +16508,7 @@ function buildPaaExtractorMcpServer(executor, options = {}) {
|
|
|
16202
16508
|
title: "Multi-Page Site Extract",
|
|
16203
16509
|
description: withReportNote("Run multi-page extraction across a public website. Returns per-page titles, H1s, metadata, headings, schema/entity data, canonical URLs, and content. Use for website audits, competitor audits, and full-site extraction."),
|
|
16204
16510
|
inputSchema: ExtractSiteInputSchema,
|
|
16511
|
+
outputSchema: ExtractSiteOutputSchema,
|
|
16205
16512
|
annotations: liveWebToolAnnotations("Multi-Page Site Extract")
|
|
16206
16513
|
}, async (input) => formatExtractSite(await executor.extractSite(input), input));
|
|
16207
16514
|
server.registerTool("youtube_harvest", {
|
|
@@ -16241,6 +16548,7 @@ function buildPaaExtractorMcpServer(executor, options = {}) {
|
|
|
16241
16548
|
title: "Google Maps Business Profile Details",
|
|
16242
16549
|
description: withReportNote('Extract Google Maps business intelligence for one known/named business: rating, review count, category, address, phone, website, hours, booking URL, review histogram, review topics, about attributes, entity IDs, and optional review cards. Do not use this for category searches, local market prospect lists, or requests for multiple GMB/GBP profiles; use maps_search first for those. Split business name from location (e.g. "Elite Roofing Denver CO" => businessName "Elite Roofing", location "Denver, CO"). Pass includeReviews true when the user asks for reviews/customer pain.'),
|
|
16243
16550
|
inputSchema: MapsPlaceIntelInputSchema,
|
|
16551
|
+
outputSchema: MapsPlaceIntelOutputSchema,
|
|
16244
16552
|
annotations: liveWebToolAnnotations("Google Maps Business Profile Details")
|
|
16245
16553
|
}, async (input) => formatMapsPlaceIntel(await executor.mapsPlaceIntel(input), input));
|
|
16246
16554
|
server.registerTool("maps_search", {
|
|
@@ -16254,6 +16562,7 @@ function buildPaaExtractorMcpServer(executor, options = {}) {
|
|
|
16254
16562
|
title: "MCP Scraper Credits & Costs",
|
|
16255
16563
|
description: "Answer questions about MCP Scraper credits: current credit balance, what a specific tool/action costs, the full cost table, and optionally recent credit ledger entries. Does not expose payment methods or credit card information.",
|
|
16256
16564
|
inputSchema: CreditsInfoInputSchema,
|
|
16565
|
+
outputSchema: CreditsInfoOutputSchema,
|
|
16257
16566
|
annotations: {
|
|
16258
16567
|
title: "MCP Scraper Credits & Costs",
|
|
16259
16568
|
readOnlyHint: true,
|
|
@@ -16833,7 +17142,7 @@ async function processJob(job) {
|
|
|
16833
17142
|
const opts = typeof job.options === "string" ? JSON.parse(job.options) : job.options;
|
|
16834
17143
|
const result = await harvest({
|
|
16835
17144
|
...opts,
|
|
16836
|
-
kernelApiKey:
|
|
17145
|
+
kernelApiKey: browserServiceApiKey(),
|
|
16837
17146
|
headless: true,
|
|
16838
17147
|
format: "json",
|
|
16839
17148
|
outputDir: "/tmp/paa-output-api",
|
|
@@ -16898,6 +17207,7 @@ var init_worker = __esm({
|
|
|
16898
17207
|
"src/api/worker.ts"() {
|
|
16899
17208
|
"use strict";
|
|
16900
17209
|
init_db();
|
|
17210
|
+
init_browser_service_env();
|
|
16901
17211
|
init_harvest();
|
|
16902
17212
|
init_webhook();
|
|
16903
17213
|
init_rates();
|
|
@@ -17000,6 +17310,8 @@ var init_server = __esm({
|
|
|
17000
17310
|
"src/api/server.ts"() {
|
|
17001
17311
|
"use strict";
|
|
17002
17312
|
init_harvest_timeout();
|
|
17313
|
+
init_browser_service_env();
|
|
17314
|
+
init_outbound_sanitize();
|
|
17003
17315
|
init_registry();
|
|
17004
17316
|
init_template();
|
|
17005
17317
|
init_og();
|
|
@@ -17316,7 +17628,7 @@ var init_server = __esm({
|
|
|
17316
17628
|
try {
|
|
17317
17629
|
const result = await harvest({
|
|
17318
17630
|
...options,
|
|
17319
|
-
kernelApiKey:
|
|
17631
|
+
kernelApiKey: browserServiceApiKey(),
|
|
17320
17632
|
headless: true,
|
|
17321
17633
|
format: "json",
|
|
17322
17634
|
outputDir: "/tmp/paa-output-api",
|
|
@@ -17331,7 +17643,7 @@ var init_server = __esm({
|
|
|
17331
17643
|
if (diff > 0) await creditMc(user.id, diff, LedgerOperation.PAA_REFUND, "overestimate refund");
|
|
17332
17644
|
else if (diff < 0) await debitMc(user.id, -diff, LedgerOperation.PAA, options.query);
|
|
17333
17645
|
}
|
|
17334
|
-
return c.json({ job_id: jobId, status: "done", result, attempts });
|
|
17646
|
+
return c.json({ job_id: jobId, status: "done", result: sanitizeHarvestResult(result), attempts: sanitizeAttempts(attempts) });
|
|
17335
17647
|
} catch (err) {
|
|
17336
17648
|
const problem = classifyHarvestProblem(err);
|
|
17337
17649
|
const response = harvestProblemResponse(problem);
|
|
@@ -17339,18 +17651,19 @@ var init_server = __esm({
|
|
|
17339
17651
|
if (problem.terminalStatus === "cancelled" || c.req.raw.signal.aborted) {
|
|
17340
17652
|
await cancelJob(jobId, serializeHarvestProblem(problem));
|
|
17341
17653
|
await creditMc(user.id, syncCost, LedgerOperation.REFUND, "cancelled call");
|
|
17342
|
-
return c.json({ job_id: jobId, status: "cancelled", ...response, attempts }, problem.httpStatus);
|
|
17654
|
+
return c.json({ job_id: jobId, status: "cancelled", ...response, attempts: sanitizeAttempts(attempts) }, problem.httpStatus);
|
|
17343
17655
|
}
|
|
17344
17656
|
await failJob(jobId, serializeHarvestProblem(problem));
|
|
17345
17657
|
await creditMc(user.id, syncCost, LedgerOperation.REFUND, "failed call");
|
|
17346
|
-
return c.json({ job_id: jobId, status: "failed", ...response, attempts }, problem.httpStatus);
|
|
17658
|
+
return c.json({ job_id: jobId, status: "failed", ...response, attempts: sanitizeAttempts(attempts) }, problem.httpStatus);
|
|
17347
17659
|
}
|
|
17348
17660
|
});
|
|
17349
17661
|
app.get("/jobs/:id", auth, async (c) => {
|
|
17350
17662
|
const job = await getJob(c.req.param("id"), c.get("user").id);
|
|
17351
17663
|
if (!job) return c.json({ error: "Job not found" }, 404);
|
|
17352
17664
|
const attempts = await listHarvestAttempts(job.id, c.get("user").id);
|
|
17353
|
-
|
|
17665
|
+
const safeResult = job.result && typeof job.result === "object" ? sanitizeHarvestResult(job.result) : job.result;
|
|
17666
|
+
return c.json({ ...job, result: safeResult, attempts: sanitizeAttempts(attempts) });
|
|
17354
17667
|
});
|
|
17355
17668
|
app.get("/jobs", auth, async (c) => {
|
|
17356
17669
|
return c.json(await listJobs(c.get("user").id));
|
|
@@ -17449,7 +17762,7 @@ var init_server = __esm({
|
|
|
17449
17762
|
const { ok: euOk, balance_mc: euBal } = await debitMc(user.id, MC_COSTS.page_scrape, LedgerOperation.EXTRACT_URL, new URL(canonicalUrl).hostname);
|
|
17450
17763
|
if (!euOk) return c.json(insufficientBalanceResponse(euBal, MC_COSTS.page_scrape), 402);
|
|
17451
17764
|
try {
|
|
17452
|
-
const kernelApiKey =
|
|
17765
|
+
const kernelApiKey = browserServiceApiKey();
|
|
17453
17766
|
const device = screenshotDevice === "mobile" ? "mobile" : "desktop";
|
|
17454
17767
|
const [result, pageData] = await Promise.all([
|
|
17455
17768
|
extractKpo({ url: canonicalUrl, kernelApiKey }),
|
|
@@ -17487,7 +17800,7 @@ var init_server = __esm({
|
|
|
17487
17800
|
startUrl: parsed.href,
|
|
17488
17801
|
maxUrls: Math.min(2e3, Math.max(1, body.maxUrls ?? 500)),
|
|
17489
17802
|
concurrency: Math.min(20, Math.max(1, body.concurrency ?? 12)),
|
|
17490
|
-
kernelApiKey: body.browserFallback ?? body.kernelFallback ?
|
|
17803
|
+
kernelApiKey: body.browserFallback ?? body.kernelFallback ? browserServiceApiKey() : void 0
|
|
17491
17804
|
});
|
|
17492
17805
|
await logRequestEvent({
|
|
17493
17806
|
userId: user.id,
|
|
@@ -17527,7 +17840,7 @@ var init_server = __esm({
|
|
|
17527
17840
|
const result = await extractSite({
|
|
17528
17841
|
startUrl: parsed.href,
|
|
17529
17842
|
maxPages: Math.min(200, Math.max(1, body.maxPages ?? 100)),
|
|
17530
|
-
kernelApiKey: body.browserFallback ?? body.kernelFallback ?
|
|
17843
|
+
kernelApiKey: body.browserFallback ?? body.kernelFallback ? browserServiceApiKey() : void 0
|
|
17531
17844
|
});
|
|
17532
17845
|
const pageCount = result.pages?.length ?? 1;
|
|
17533
17846
|
const actualSiteMc = pageCount * MC_COSTS.page_scrape;
|