mcp-scraper 0.1.6 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +13 -2
- package/dist/bin/api-server.cjs +957 -243
- package/dist/bin/api-server.cjs.map +1 -1
- package/dist/bin/api-server.js +2 -2
- package/dist/bin/mcp-stdio-server.cjs +540 -158
- package/dist/bin/mcp-stdio-server.cjs.map +1 -1
- package/dist/bin/mcp-stdio-server.js +2 -1
- package/dist/bin/mcp-stdio-server.js.map +1 -1
- package/dist/bin/paa-harvest.cjs +36 -5
- package/dist/bin/paa-harvest.cjs.map +1 -1
- package/dist/bin/paa-harvest.js +5 -3
- package/dist/bin/paa-harvest.js.map +1 -1
- package/dist/{chunk-6TWZS2FQ.js → chunk-RE6HCRYC.js} +543 -159
- package/dist/chunk-RE6HCRYC.js.map +1 -0
- package/dist/{chunk-W4P2U5VF.js → chunk-TM22BLWP.js} +46 -34
- package/dist/chunk-TM22BLWP.js.map +1 -0
- package/dist/{chunk-7HB7NDOY.js → chunk-ZK456YXN.js} +12 -2
- package/dist/chunk-ZK456YXN.js.map +1 -0
- package/dist/chunk-ZMOWIBMK.js +36 -0
- package/dist/chunk-ZMOWIBMK.js.map +1 -0
- package/dist/index.cjs +34 -3
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +2 -1
- package/dist/index.js.map +1 -1
- package/dist/{server-2Y27U4TO.js → server-QXVVTKJP.js} +311 -48
- package/dist/server-QXVVTKJP.js.map +1 -0
- package/dist/{worker-UT4ZQU2T.js → worker-AUCXFHEL.js} +6 -4
- package/dist/worker-AUCXFHEL.js.map +1 -0
- package/docs/adr/0001-in-page-graphql-interception-for-anti-bot-scraping.md +58 -0
- package/docs/adr/README.md +11 -0
- package/docs/mcp-tool-quality-spec.md +238 -0
- package/package.json +5 -4
- package/dist/chunk-6TWZS2FQ.js.map +0 -1
- package/dist/chunk-7HB7NDOY.js.map +0 -1
- package/dist/chunk-W4P2U5VF.js.map +0 -1
- package/dist/server-2Y27U4TO.js.map +0 -1
- package/dist/worker-UT4ZQU2T.js.map +0 -1
|
@@ -3,8 +3,10 @@ import {
|
|
|
3
3
|
CaptureSerpSnapshotInputSchema,
|
|
4
4
|
HttpMcpToolExecutor,
|
|
5
5
|
buildPaaExtractorMcpServer,
|
|
6
|
-
|
|
7
|
-
|
|
6
|
+
configureReportSaving,
|
|
7
|
+
harvestTimeoutBudget,
|
|
8
|
+
liveWebToolAnnotations
|
|
9
|
+
} from "./chunk-RE6HCRYC.js";
|
|
8
10
|
import {
|
|
9
11
|
BALANCE_PACK_LABELS,
|
|
10
12
|
BALANCE_PRICE_IDS,
|
|
@@ -20,20 +22,27 @@ import {
|
|
|
20
22
|
harvestProblemResponse,
|
|
21
23
|
insufficientBalanceResponse,
|
|
22
24
|
serializeHarvestProblem
|
|
23
|
-
} from "./chunk-
|
|
25
|
+
} from "./chunk-ZK456YXN.js";
|
|
24
26
|
import {
|
|
25
27
|
BrowserDriver,
|
|
26
|
-
CaptchaError,
|
|
27
28
|
MapsPlaceOptionsSchema,
|
|
29
|
+
MapsSearchOptionsSchema,
|
|
28
30
|
MapsSelectors,
|
|
29
31
|
RawMapsAboutAttributeSchema,
|
|
30
32
|
RawMapsHoursRowSchema,
|
|
31
33
|
RawMapsOverviewSchema,
|
|
32
34
|
RawMapsReviewStatsSchema,
|
|
35
|
+
browserServiceApiKey,
|
|
36
|
+
browserServiceProxyId,
|
|
33
37
|
buildYouTubeChannelVideosUrl,
|
|
34
38
|
harvest,
|
|
35
39
|
resolveKernelProxyId
|
|
36
|
-
} from "./chunk-
|
|
40
|
+
} from "./chunk-TM22BLWP.js";
|
|
41
|
+
import {
|
|
42
|
+
CaptchaError,
|
|
43
|
+
RECAPTCHA_INSTRUCTIONS,
|
|
44
|
+
sanitizeVendorName
|
|
45
|
+
} from "./chunk-ZMOWIBMK.js";
|
|
37
46
|
import {
|
|
38
47
|
SiteAuditJobRowSchema,
|
|
39
48
|
cancelJob,
|
|
@@ -77,6 +86,53 @@ import {
|
|
|
77
86
|
verifyPassword
|
|
78
87
|
} from "./chunk-D4CJBZBY.js";
|
|
79
88
|
|
|
89
|
+
// src/api/outbound-sanitize.ts
|
|
90
|
+
// Vendor-specific diagnostic keys and the vendor-neutral names they are
// exposed under in outbound API payloads.
var KEY_RENAMES = {
  kernel: "browserRuntime",
  kernel_session_id: "browser_session_id",
  kernel_delete_started: "session_cleanup_started",
  kernel_delete_succeeded: "session_cleanup_succeeded",
  kernel_delete_error: "session_cleanup_error",
  kernelSessionId: "browserSessionId",
  kernelDeleteStarted: "sessionCleanupStarted",
  kernelDeleteSucceeded: "sessionCleanupSucceeded",
  kernelDeleteError: "sessionCleanupError",
  kernelProxyId: "proxyId"
};
// Only string values stored under keys matching this pattern are scrubbed.
var SANITIZED_VALUE_KEYS = /error|message/i;
/**
 * Recursively copies a diagnostics value, renaming the keys listed in
 * KEY_RENAMES and passing vendor-mentioning strings through sanitizeVendorName.
 *
 * A string is rewritten only when the key it is stored under (the ORIGINAL,
 * pre-rename key) matches SANITIZED_VALUE_KEYS and the string itself contains
 * "kernel" (case-insensitive). Array elements inherit the key of the array.
 * Non-string primitives and null are returned unchanged.
 *
 * @param {unknown} value - Value to sanitize; the input is never mutated.
 * @param {string} [parentKey=""] - Key under which `value` was found.
 * @returns {unknown} Sanitized copy (or the same primitive).
 */
function sanitizeOutboundDiagnostics(value, parentKey = "") {
  if (typeof value === "string") {
    if (SANITIZED_VALUE_KEYS.test(parentKey) && /kernel/i.test(value)) {
      return sanitizeVendorName(value);
    }
    return value;
  }
  if (Array.isArray(value)) return value.map((v) => sanitizeOutboundDiagnostics(v, parentKey));
  if (value !== null && typeof value === "object") {
    // Object.fromEntries defines own data properties, so an input key named
    // "__proto__" is kept as data instead of replacing the copy's prototype.
    return Object.fromEntries(
      Object.entries(value).map(([key, val]) => [
        // Own-property lookup only: a plain KEY_RENAMES[key] would resolve
        // inherited members such as "constructor" or "toString" and turn the
        // key into a stringified function.
        Object.hasOwn(KEY_RENAMES, key) ? KEY_RENAMES[key] : key,
        sanitizeOutboundDiagnostics(val, key)
      ])
    );
  }
  return value;
}
|
|
121
|
+
/**
 * Sanitizes each harvest-attempt record for outbound responses.
 *
 * @param {Array<unknown>} attempts - Attempt records to scrub.
 * @returns {Array<unknown>} New array; every entry has been passed through
 *   sanitizeOutboundDiagnostics with the default (empty) parent key.
 */
function sanitizeAttempts(attempts) {
  // Wrap the call so Array#map's index/array arguments are not forwarded as parentKey.
  const scrubAttempt = (attempt) => sanitizeOutboundDiagnostics(attempt);
  return attempts.map(scrubAttempt);
}
|
|
124
|
+
/**
 * Returns a harvest result whose `diagnostics.debug` section has been passed
 * through sanitizeOutboundDiagnostics.
 *
 * When the result has no truthy `diagnostics.debug`, the input is returned
 * as-is (same reference). Otherwise a shallow copy is returned and the input
 * is left untouched.
 *
 * @param {object|null|undefined} result - Harvest result to scrub.
 * @returns {object|null|undefined} The original result or a sanitized shallow copy.
 */
function sanitizeHarvestResult(result) {
  const diagnostics = result?.diagnostics;
  if (!diagnostics?.debug) return result;
  const scrubbed = { ...result };
  scrubbed.diagnostics = {
    ...diagnostics,
    debug: sanitizeOutboundDiagnostics(diagnostics.debug)
  };
  return scrubbed;
}
|
|
135
|
+
|
|
80
136
|
// src/blog/registry.ts
|
|
81
137
|
var posts = [
|
|
82
138
|
{
|
|
@@ -3439,7 +3495,7 @@ import TurndownService from "turndown";
|
|
|
3439
3495
|
import Kernel from "@onkernel/sdk";
|
|
3440
3496
|
import { chromium } from "playwright";
|
|
3441
3497
|
async function fetchWithKernel(url) {
|
|
3442
|
-
const apiKey =
|
|
3498
|
+
const apiKey = browserServiceApiKey();
|
|
3443
3499
|
if (!apiKey) throw new Error("Browser backend API key not set");
|
|
3444
3500
|
const client = new Kernel({ apiKey });
|
|
3445
3501
|
const kb = await client.browsers.create({ stealth: true, timeout_seconds: 60 });
|
|
@@ -3474,9 +3530,9 @@ async function extractKpo(opts) {
|
|
|
3474
3530
|
redirect: "manual"
|
|
3475
3531
|
});
|
|
3476
3532
|
if (res.status >= 300 && res.status < 400) {
|
|
3477
|
-
const
|
|
3478
|
-
if (!
|
|
3479
|
-
const next = new URL(
|
|
3533
|
+
const location2 = res.headers.get("location");
|
|
3534
|
+
if (!location2) return null;
|
|
3535
|
+
const next = new URL(location2, target).href;
|
|
3480
3536
|
const checkedRedirect = await validatePublicHttpUrl(next, { field: "redirect URL" });
|
|
3481
3537
|
if (checkedRedirect.error || !checkedRedirect.parsed) return null;
|
|
3482
3538
|
target = checkedRedirect.parsed.href;
|
|
@@ -7606,7 +7662,7 @@ async function writeOutputs(result, outputDir) {
|
|
|
7606
7662
|
}
|
|
7607
7663
|
}
|
|
7608
7664
|
async function ytHarvest(rawOptions) {
|
|
7609
|
-
const kernelApiKey =
|
|
7665
|
+
const kernelApiKey = browserServiceApiKey();
|
|
7610
7666
|
if (!kernelApiKey) {
|
|
7611
7667
|
throw new Error("A browser backend API key is required \u2014 YouTube harvesting requires a stealth session.");
|
|
7612
7668
|
}
|
|
@@ -7701,7 +7757,7 @@ function parseTimedtextXml(xml) {
|
|
|
7701
7757
|
return results;
|
|
7702
7758
|
}
|
|
7703
7759
|
async function fetchViaKernelInnertube(videoId) {
|
|
7704
|
-
const kernelApiKey =
|
|
7760
|
+
const kernelApiKey = browserServiceApiKey();
|
|
7705
7761
|
if (!kernelApiKey) return null;
|
|
7706
7762
|
const driver = new BrowserDriver();
|
|
7707
7763
|
const start = Date.now();
|
|
@@ -7845,7 +7901,7 @@ async function attemptKernelWhisper(videoId, kernelApiKey, falKey, start) {
|
|
|
7845
7901
|
}
|
|
7846
7902
|
}
|
|
7847
7903
|
async function fetchViaKernelWhisper(videoId) {
|
|
7848
|
-
const kernelApiKey =
|
|
7904
|
+
const kernelApiKey = browserServiceApiKey();
|
|
7849
7905
|
const falKey = process.env.FAL_KEY;
|
|
7850
7906
|
if (!kernelApiKey || !falKey) return null;
|
|
7851
7907
|
const start = Date.now();
|
|
@@ -8116,7 +8172,7 @@ screenshotApp.post("/", async (c) => {
|
|
|
8116
8172
|
}
|
|
8117
8173
|
const device2 = body.device === "mobile" ? "mobile" : "desktop";
|
|
8118
8174
|
try {
|
|
8119
|
-
const buf = await captureScreenshot(parsedFallback.href,
|
|
8175
|
+
const buf = await captureScreenshot(parsedFallback.href, browserServiceApiKey(), device2);
|
|
8120
8176
|
return new Response(new Uint8Array(buf), {
|
|
8121
8177
|
status: 200,
|
|
8122
8178
|
headers: {
|
|
@@ -8132,7 +8188,7 @@ screenshotApp.post("/", async (c) => {
|
|
|
8132
8188
|
}
|
|
8133
8189
|
const device = body.device === "mobile" ? "mobile" : "desktop";
|
|
8134
8190
|
try {
|
|
8135
|
-
const buf = await captureScreenshot(urlCheck.parsed.href,
|
|
8191
|
+
const buf = await captureScreenshot(urlCheck.parsed.href, browserServiceApiKey(), device);
|
|
8136
8192
|
return new Response(new Uint8Array(buf), {
|
|
8137
8193
|
status: 200,
|
|
8138
8194
|
headers: {
|
|
@@ -8959,23 +9015,23 @@ function buildPageIntelUrl(body, country) {
|
|
|
8959
9015
|
return `https://www.facebook.com/ads/library/?active_status=all&ad_type=all&country=${country}&q=${encodeURIComponent(body.query.trim())}&search_type=keyword_unordered`;
|
|
8960
9016
|
}
|
|
8961
9017
|
/**
 * Builds the default headless launch options, using the API key and proxy id
 * returned by browserServiceApiKey() / browserServiceProxyId().
 *
 * @returns {{headless: boolean, kernelApiKey: *, kernelProxyId: *, viewport: {width: number, height: number}, locale: string}}
 */
function kernelLaunchOpts() {
  const apiKey = browserServiceApiKey();
  const proxyId = browserServiceProxyId();
  return {
    headless: true,
    kernelApiKey: apiKey,
    kernelProxyId: proxyId,
    viewport: { width: 1280, height: 900 },
    locale: "en-US"
  };
}
|
|
8964
9020
|
/**
 * Builds headless launch options, preferring a proxy id resolved by
 * resolveKernelProxyId for "New York, NY" / gl "us".
 *
 * Resolution is best-effort: if it throws, or yields no proxy id, the value
 * from browserServiceProxyId() is used instead.
 *
 * @returns {Promise<{headless: boolean, kernelApiKey: *, kernelProxyId: *, viewport: {width: number, height: number}, locale: string}>}
 */
async function kernelLaunchOptsResidential() {
  let proxyId = browserServiceProxyId();
  try {
    const request = {
      kernelApiKey: browserServiceApiKey(),
      proxyMode: "location",
      configuredKernelProxyId: browserServiceProxyId(),
      location: "New York, NY",
      gl: "us"
    };
    const { kernelProxyId: resolvedProxyId } = await resolveKernelProxyId(request);
    if (resolvedProxyId) {
      proxyId = resolvedProxyId;
    }
  } catch {
    // Deliberate fallback: any resolution failure reverts to the configured proxy id.
    proxyId = browserServiceProxyId();
  }
  return {
    headless: true,
    kernelApiKey: browserServiceApiKey(),
    kernelProxyId: proxyId,
    viewport: { width: 1280, height: 900 },
    locale: "en-US"
  };
}
|
|
8980
9036
|
var facebookAdApp = new Hono4();
|
|
8981
9037
|
facebookAdApp.post("/ad", createApiKeyAuth(), async (c) => {
|
|
@@ -9242,8 +9298,8 @@ var MapsNavigator = class {
|
|
|
9242
9298
|
this.page = page;
|
|
9243
9299
|
}
|
|
9244
9300
|
page;
|
|
9245
|
-
async navigateToPlacePage(businessName,
|
|
9246
|
-
const query = `${businessName} ${
|
|
9301
|
+
async navigateToPlacePage(businessName, location2) {
|
|
9302
|
+
const query = `${businessName} ${location2}`;
|
|
9247
9303
|
const searchUrl = `https://www.google.com/maps/search/${encodeURIComponent(query)}`;
|
|
9248
9304
|
await this.page.goto(searchUrl, { waitUntil: "domcontentloaded", timeout: 45e3 });
|
|
9249
9305
|
const onPlacePage = await this.page.evaluate(() => /\/maps\/place\//.test(window.location.href));
|
|
@@ -9668,8 +9724,213 @@ var MapsExtractor = class {
|
|
|
9668
9724
|
}
|
|
9669
9725
|
};
|
|
9670
9726
|
|
|
9727
|
+
// src/extractor/MapsSearchExtractor.ts
|
|
9728
|
+
// Upper bound on the total time spent in the scroll-and-collect loop.
var MAPS_SEARCH_SCROLL_BUDGET_MS = 6e4;
// Pause after each scroll before the result list is read again.
var MAPS_SEARCH_SCROLL_STEP_MS = 1200;
// Consecutive scroll rounds without a new result before the loop gives up.
var MAPS_SEARCH_MAX_NO_GROWTH_ROUNDS = 4;
/**
 * Collects Google Maps search results by loading the search URL in a browser
 * driver, scrolling the result list and scraping the visible place cards.
 */
var MapsSearchExtractor = class {
  /**
   * @param {object} driver - Browser driver; this class calls its
   *   `launch(config)`, `getPage()` and `close()` methods.
   */
  constructor(driver) {
    this.driver = driver;
  }
  driver;
  /**
   * Runs one Maps search and returns the collected results.
   *
   * The driver is always closed before this method settles, including when it
   * throws.
   *
   * @param {object} options - Reads `query`, `location`, `hl`, `gl`,
   *   `maxResults`, `headless`, `kernelApiKey` and `kernelProxyId`.
   * @returns {Promise<object>} Summary with `query`, `location`, `searchQuery`,
   *   `searchUrl`, `extractedAt`, `requestedMaxResults`, `resultCount`,
   *   `results` and `durationMs`.
   * @throws {CaptchaError} When the loaded page looks like a block/CAPTCHA page.
   */
  async extract(options) {
    const startMs = Date.now();
    const searchQuery = [options.query, options.location].filter(Boolean).join(" ");
    const searchUrl = `https://www.google.com/maps/search/${encodeURIComponent(searchQuery)}?hl=${encodeURIComponent(options.hl)}`;
    const config = {
      headless: options.headless,
      kernelApiKey: options.kernelApiKey,
      kernelProxyId: options.kernelProxyId,
      viewport: { width: 1280, height: 900 },
      locale: `${options.hl}-${options.gl.toUpperCase()}`
    };
    try {
      await this.driver.launch(config);
      const page = this.driver.getPage();
      await page.goto(searchUrl, { waitUntil: "domcontentloaded", timeout: 6e4 });
      // Fixed settle delay before inspecting the page.
      await page.waitForTimeout(3e3);
      const blocked = await this.detectBlock(page);
      if (blocked) throw new CaptchaError(RECAPTCHA_INSTRUCTIONS);
      const results = await this.collectResults(page, options.maxResults);
      return {
        query: options.query,
        location: options.location ?? null,
        searchQuery,
        searchUrl,
        extractedAt: new Date().toISOString(),
        requestedMaxResults: options.maxResults,
        resultCount: results.length,
        results,
        durationMs: Date.now() - startMs
      };
    } finally {
      await this.driver.close();
    }
  }
  /**
   * Checks whether the current page looks like a block page: the first 2000
   * characters of body text mention unusual traffic / CAPTCHA, or the URL
   * contains "/sorry/".
   *
   * @param {object} page - Page handle exposing `evaluate`.
   * @returns {Promise<boolean>}
   */
  async detectBlock(page) {
    return page.evaluate(() => {
      const text = document.body.innerText.slice(0, 2e3);
      return /unusual traffic|captcha|recaptcha|about this page/i.test(text) || /\/sorry\//.test(location.href);
    });
  }
  /**
   * Repeatedly scrapes the visible results and scrolls the list until
   * `maxResults` unique results are collected, the time budget is spent, or
   * several consecutive rounds add nothing new.
   *
   * @param {object} page - Page handle exposing `evaluate` and `waitForTimeout`.
   * @param {number} maxResults - Maximum number of results to return.
   * @returns {Promise<object[]>} Unique results; `position` is the 1-based
   *   order in which each result was first seen.
   */
  async collectResults(page, maxResults) {
    const seen = /* @__PURE__ */ new Map();
    const started = Date.now();
    let noGrowthRounds = 0;
    while (Date.now() - started < MAPS_SEARCH_SCROLL_BUDGET_MS) {
      const before = seen.size;
      const batch = await this.extractVisibleResults(page);
      for (const result of batch) {
        const key = this.resultKey(result);
        if (!seen.has(key)) seen.set(key, { ...result, position: seen.size + 1 });
        if (seen.size >= maxResults) break;
      }
      if (seen.size >= maxResults) break;
      if (seen.size === before) noGrowthRounds += 1;
      else noGrowthRounds = 0;
      if (noGrowthRounds >= MAPS_SEARCH_MAX_NO_GROWTH_ROUNDS) break;
      await page.evaluate(() => {
        // Scroll the results feed when present, otherwise the whole window.
        const feed = document.querySelector('[role="feed"]');
        if (feed) {
          feed.scrollTop = feed.scrollHeight;
        } else {
          window.scrollTo(0, document.body.scrollHeight);
        }
      });
      await page.waitForTimeout(MAPS_SEARCH_SCROLL_STEP_MS);
    }
    return [...seen.values()].slice(0, maxResults);
  }
  /**
   * Returns the de-duplication key for a result: the decimal CID when
   * available, else the place URL with everything from the first "?" or "&"
   * removed, else the result name.
   *
   * The name fallback applies when the place URL is missing or reduces to an
   * empty string, so such results are not all collapsed onto one key.
   *
   * @param {{cidDecimal?: string|null, placeUrl?: string|null, name: string}} result
   * @returns {string}
   */
  resultKey(result) {
    const stableUrl = result.placeUrl?.replace(/[?&].*$/, "");
    return result.cidDecimal ?? (stableUrl || result.name);
  }
  /**
   * Scrapes every place card currently present in the DOM.
   *
   * The callback runs inside the page, so it must be self-contained and may
   * not reference anything from this module's scope.
   *
   * @param {object} page - Page handle exposing `evaluate`.
   * @returns {Promise<object[]>} One record per unique place link, in DOM order.
   */
  async extractVisibleResults(page) {
    return page.evaluate(() => {
      // Collapses whitespace; returns null for empty or missing text.
      function normalizeText(value) {
        const text = value?.replace(/\s+/g, " ").trim() ?? "";
        return text || null;
      }
      // Extracts the "0x…:0x…" feature id from a place URL; the second half is
      // also returned in decimal form.
      function cidFromUrl(url) {
        const fid = url.match(/!1s(0x[0-9a-f]+):(0x[0-9a-f]+)/i);
        if (!fid) return { cid: null, cidDecimal: null };
        let cidDecimal = null;
        try {
          cidDecimal = BigInt(fid[2]).toString();
        } catch {
          // Best-effort conversion: keep cidDecimal null when BigInt rejects the value.
        }
        return { cid: `${fid[1]}:${fid[2]}`, cidDecimal };
      }
      // Gathers the distinct direct text-node contents (2-139 chars per node)
      // of every div/span inside the card.
      function textParts(card) {
        if (!card) return [];
        const parts = [];
        card.querySelectorAll("div, span").forEach((el2) => {
          const text = Array.from(el2.childNodes).filter((node) => node.nodeType === 3).map((node) => node.textContent?.trim() ?? "").filter((text2) => text2.length > 1 && text2.length < 140).join(" ");
          if (text && !parts.includes(text)) parts.push(text);
        });
        return parts;
      }
      // First part matching the pattern, or null.
      function firstMatching(parts, pattern) {
        const value = parts.find((part) => pattern.test(part));
        return value ?? null;
      }
      const out = [];
      const seen = /* @__PURE__ */ new Set();
      const anchors = Array.from(document.querySelectorAll('a[href*="/maps/place/"]'));
      for (const anchor of anchors) {
        const placeUrl = anchor.href;
        const stableUrl = placeUrl.replace(/[?&].*$/, "");
        if (seen.has(stableUrl)) continue;
        seen.add(stableUrl);
        const card = anchor.closest('.Nv2PK, [role="article"], .bfdHYd') ?? anchor.parentElement;
        const parts = textParts(card);
        const aria = normalizeText(anchor.getAttribute("aria-label"));
        const heading = normalizeText(card?.querySelector('.qBF1Pd, .fontHeadlineSmall, [role="heading"]')?.textContent);
        const name = aria ?? heading ?? parts[0] ?? stableUrl;
        const links = Array.from(card?.querySelectorAll("a[href]") ?? []);
        const websiteUrl = links.find((link) => link.href.startsWith("http") && !link.href.includes("google."))?.href ?? null;
        const directionsUrl = links.find((link) => /google\.[^/]+\/maps\/dir|\/dir\//i.test(link.href))?.href ?? null;
        // The fields below are text heuristics over the card's text parts.
        const rating = firstMatching(parts, /^\d(?:\.\d)?$/);
        const reviewCountRaw = firstMatching(parts, /^\(?[\d,]+\)?$/);
        const category = parts.find((part) => !/^\d(?:\.\d)?$|^\(?[\d,]+\)?$|open|closed|directions|website/i.test(part)) ?? null;
        const address = parts.find((part) => /\b[A-Z]{2}\s+\d{5}\b|\b(?:St|Street|Ave|Avenue|Rd|Road|Blvd|Drive|Dr)\b/i.test(part)) ?? null;
        const { cid, cidDecimal } = cidFromUrl(placeUrl);
        out.push({
          position: out.length + 1,
          name,
          placeUrl,
          cid,
          cidDecimal,
          rating,
          reviewCount: reviewCountRaw ? reviewCountRaw.replace(/[()]/g, "") : null,
          category,
          address,
          websiteUrl,
          directionsUrl,
          metadata: parts.slice(0, 20)
        });
      }
      return out;
    });
  }
};
|
|
9876
|
+
|
|
9671
9877
|
// src/api/maps-routes.ts
|
|
9878
|
+
/**
 * Builds the JSON error response shared by the Maps routes.
 *
 * A message containing "CAPTCHA" or "blocked" (case-sensitive) is reported as
 * a retryable 503 with error_code "captcha_or_blocked"; anything else is a
 * non-retryable 500 carrying the caller-supplied error code. The message is
 * passed through sanitizeVendorName before being returned.
 *
 * @param {object} c - Request context exposing `json(body, status)`.
 * @param {string} msg - Raw error message.
 * @param {string} errorCode - Error code used when the failure is not a block.
 * @returns {*} Whatever `c.json` returns.
 */
function mapsErrorResponse(c, msg, errorCode) {
  const blocked = ["CAPTCHA", "blocked"].some((marker) => msg.includes(marker));
  const status = blocked ? 503 : 500;
  const payload = {
    error: sanitizeVendorName(msg),
    error_code: blocked ? "captcha_or_blocked" : errorCode,
    retryable: blocked
  };
  return c.json(payload, status);
}
|
|
9672
9886
|
var mapsApp = new Hono5();
|
|
9887
|
+
// POST "/search" on mapsApp: runs a Google Maps search for the authenticated user.
// Flow: validate the body -> debit MC_COSTS.maps_search -> extract -> log -> respond.
// On any failure after the debit, the charge is refunded and the failure is logged.
mapsApp.post("/search", createApiKeyAuth(), async (c) => {
  const user = c.get("user");
  // A missing or unparseable JSON body is treated as an empty object.
  const body = await c.req.json().catch(() => ({}));
  const parsed = MapsSearchOptionsSchema.safeParse({
    // Use the shared accessor like every other browser-backed route in this
    // module, rather than reading the environment variable directly.
    // Body fields are spread afterwards and therefore take precedence.
    kernelApiKey: browserServiceApiKey(),
    ...body
  });
  if (!parsed.success) {
    return c.json({ error: parsed.error.issues[0]?.message ?? "Invalid request" }, 400);
  }
  // Label used for the ledger entry and for the failure log.
  const searchLabel = [parsed.data.query, parsed.data.location].filter(Boolean).join(" ");
  const { ok, balance_mc } = await debitMc(
    user.id,
    MC_COSTS.maps_search,
    LedgerOperation.MAPS_SEARCH,
    searchLabel
  );
  if (!ok) return c.json(insufficientBalanceResponse(balance_mc, MC_COSTS.maps_search), 402);
  const driver = new BrowserDriver();
  const extractor = new MapsSearchExtractor(driver);
  try {
    const result = await extractor.extract(parsed.data);
    await logRequestEvent({
      userId: user.id,
      source: "maps_search",
      status: "done",
      query: result.searchQuery,
      location: parsed.data.location,
      resultCount: result.resultCount,
      result
    });
    return c.json(result);
  } catch (err) {
    // Refund first so the user is made whole even if logging fails afterwards.
    await creditMc(user.id, MC_COSTS.maps_search, LedgerOperation.REFUND, "failed maps_search call");
    const msg = err instanceof Error ? err.message : String(err);
    await logRequestEvent({
      userId: user.id,
      source: "maps_search",
      status: "failed",
      query: searchLabel,
      location: parsed.data.location,
      error: msg
    });
    return mapsErrorResponse(c, msg, "maps_search_failed");
  } finally {
    await driver.close();
  }
});
|
|
9673
9934
|
mapsApp.post("/place", createApiKeyAuth(), async (c) => {
|
|
9674
9935
|
const user = c.get("user");
|
|
9675
9936
|
const body = await c.req.json().catch(() => ({}));
|
|
@@ -9736,10 +9997,7 @@ mapsApp.post("/place", createApiKeyAuth(), async (c) => {
|
|
|
9736
9997
|
location: parsed.data.location,
|
|
9737
9998
|
error: msg
|
|
9738
9999
|
});
|
|
9739
|
-
|
|
9740
|
-
return c.json({ error: msg }, 503);
|
|
9741
|
-
}
|
|
9742
|
-
return c.json({ error: msg }, 500);
|
|
10000
|
+
return mapsErrorResponse(c, msg, "maps_place_failed");
|
|
9743
10001
|
} finally {
|
|
9744
10002
|
await driver.close();
|
|
9745
10003
|
}
|
|
@@ -10419,8 +10677,8 @@ async function captureSerpIntelligenceSnapshot(rawInput, runtimeOptions = {}) {
|
|
|
10419
10677
|
debug,
|
|
10420
10678
|
serpOnly: true,
|
|
10421
10679
|
headless: runtimeOptions.headless ?? true,
|
|
10422
|
-
kernelApiKey: runtimeOptions.kernelApiKey ??
|
|
10423
|
-
kernelProxyId: runtimeOptions.kernelProxyId ??
|
|
10680
|
+
kernelApiKey: runtimeOptions.kernelApiKey ?? browserServiceApiKey(),
|
|
10681
|
+
kernelProxyId: runtimeOptions.kernelProxyId ?? browserServiceProxyId(),
|
|
10424
10682
|
format: "json",
|
|
10425
10683
|
outputDir: runtimeOptions.outputDir ?? "/tmp/serp-intelligence-output",
|
|
10426
10684
|
signal: runtimeOptions.signal,
|
|
@@ -10431,7 +10689,7 @@ async function captureSerpIntelligenceSnapshot(rawInput, runtimeOptions = {}) {
|
|
|
10431
10689
|
const pageSnapshotLimit = normalizePageSnapshotLimit(parsedInput);
|
|
10432
10690
|
const pageSnapshotTargets = collectPageSnapshotTargets(harvestResult, pageSnapshotLimit);
|
|
10433
10691
|
const pageSnapshotArtifacts = pageSnapshotTargets.length > 0 ? (await capturePageSnapshotsFn(pageSnapshotTargets, {
|
|
10434
|
-
kernelApiKey: runtimeOptions.kernelApiKey ??
|
|
10692
|
+
kernelApiKey: runtimeOptions.kernelApiKey ?? browserServiceApiKey(),
|
|
10435
10693
|
timeoutMs: runtimeOptions.pageSnapshotTimeoutMs,
|
|
10436
10694
|
maxConcurrency: runtimeOptions.pageSnapshotMaxConcurrency,
|
|
10437
10695
|
debug,
|
|
@@ -10539,8 +10797,8 @@ serpIntelligenceApp.post("/capture", async (c) => {
|
|
|
10539
10797
|
if (!ok) return c.json(insufficientBalanceResponse(balance_mc, cost), 402);
|
|
10540
10798
|
try {
|
|
10541
10799
|
const result = await captureSerpIntelligenceSnapshot(parsed.data, {
|
|
10542
|
-
kernelApiKey:
|
|
10543
|
-
kernelProxyId:
|
|
10800
|
+
kernelApiKey: browserServiceApiKey(),
|
|
10801
|
+
kernelProxyId: browserServiceProxyId(),
|
|
10544
10802
|
signal: c.req.raw.signal,
|
|
10545
10803
|
billing: { creditsUsed: cost / 1e3 }
|
|
10546
10804
|
});
|
|
@@ -10595,7 +10853,7 @@ serpIntelligenceApp.post("/page-snapshots", async (c) => {
|
|
|
10595
10853
|
if (!ok) return c.json(insufficientBalanceResponse(balance_mc, cost), 402);
|
|
10596
10854
|
try {
|
|
10597
10855
|
const result = await capturePageSnapshots(targets, {
|
|
10598
|
-
kernelApiKey:
|
|
10856
|
+
kernelApiKey: browserServiceApiKey(),
|
|
10599
10857
|
timeoutMs: parsed.data.timeoutMs,
|
|
10600
10858
|
maxConcurrency: parsed.data.maxConcurrency,
|
|
10601
10859
|
debug: parsed.data.debug
|
|
@@ -10631,6 +10889,7 @@ serpIntelligenceApp.post("/page-snapshots", async (c) => {
|
|
|
10631
10889
|
// src/mcp/mcp-routes.ts
|
|
10632
10890
|
import { Hono as Hono7 } from "hono";
|
|
10633
10891
|
import { WebStandardStreamableHTTPServerTransport } from "@modelcontextprotocol/sdk/server/webStandardStreamableHttp.js";
|
|
10892
|
+
configureReportSaving(false);
|
|
10634
10893
|
function mcpAuthError() {
|
|
10635
10894
|
const body = JSON.stringify({
|
|
10636
10895
|
jsonrpc: "2.0",
|
|
@@ -10657,15 +10916,18 @@ async function requireMcpCallerKey(c) {
|
|
|
10657
10916
|
}
|
|
10658
10917
|
var mcpApp = new Hono7();
|
|
10659
10918
|
/**
 * Registers the two SERP Intelligence capture tools on an MCP server.
 *
 * Each tool forwards its input to the matching executor method
 * (`captureSerpSnapshot` / `captureSerpPageSnapshots`).
 *
 * @param {object} server - MCP server exposing `registerTool(name, config, handler)`.
 * @param {object} executor - Executor providing the two capture methods.
 * @returns {void}
 */
function registerSerpIntelligenceCaptureTools(server, executor) {
  const snapshotTitle = "SERP Intelligence Snapshot";
  const snapshotConfig = {
    title: snapshotTitle,
    description: "Capture a structured SERP Intelligence Google snapshot through POST /serp-intelligence/capture, the same product capture path used by Phoenix. Split query from location, infer gl/hl, use proxyMode location for localized residential proxy evidence, configured for the static residential proxy, and none only for direct-network debugging. Set debug true when investigating location evidence, proxy behavior, CAPTCHA, or capture reliability.",
    inputSchema: CaptureSerpSnapshotInputSchema,
    annotations: liveWebToolAnnotations(snapshotTitle)
  };
  server.registerTool(
    "capture_serp_snapshot",
    snapshotConfig,
    async (input) => executor.captureSerpSnapshot(input)
  );

  const pageSnapshotsTitle = "SERP Intelligence Page Snapshots";
  const pageSnapshotsConfig = {
    title: pageSnapshotsTitle,
    description: "Capture public ranking-page evidence through POST /serp-intelligence/page-snapshots, the same product page snapshot path used by Phoenix. Provide urls for simple captures or targets when preserving organic, AI citation, local-pack, configured target, or site-subject source metadata. Private IPs, localhost, file URLs, and internal URLs are rejected by the service. Use timeoutMs for slow pages and debug true for sanitized proxy/browser diagnostics.",
    inputSchema: CaptureSerpPageSnapshotsInputSchema,
    annotations: liveWebToolAnnotations(pageSnapshotsTitle)
  };
  server.registerTool(
    "capture_serp_page_snapshots",
    pageSnapshotsConfig,
    async (input) => executor.captureSerpPageSnapshots(input)
  );
}
|
|
10670
10932
|
mcpApp.all("/", async (c) => {
|
|
10671
10933
|
try {
|
|
@@ -10678,7 +10940,7 @@ mcpApp.all("/", async (c) => {
|
|
|
10678
10940
|
sessionIdGenerator: void 0,
|
|
10679
10941
|
enableJsonResponse: true
|
|
10680
10942
|
});
|
|
10681
|
-
const server = buildPaaExtractorMcpServer(executor);
|
|
10943
|
+
const server = buildPaaExtractorMcpServer(executor, { savesReportsLocally: false });
|
|
10682
10944
|
registerSerpIntelligenceCaptureTools(server, executor);
|
|
10683
10945
|
await server.connect(transport);
|
|
10684
10946
|
return transport.handleRequest(c.req.raw);
|
|
@@ -11304,7 +11566,7 @@ app.post("/harvest/sync", auth, async (c) => {
|
|
|
11304
11566
|
try {
|
|
11305
11567
|
const result = await harvest({
|
|
11306
11568
|
...options,
|
|
11307
|
-
kernelApiKey:
|
|
11569
|
+
kernelApiKey: browserServiceApiKey(),
|
|
11308
11570
|
headless: true,
|
|
11309
11571
|
format: "json",
|
|
11310
11572
|
outputDir: "/tmp/paa-output-api",
|
|
@@ -11319,7 +11581,7 @@ app.post("/harvest/sync", auth, async (c) => {
|
|
|
11319
11581
|
if (diff > 0) await creditMc(user.id, diff, LedgerOperation.PAA_REFUND, "overestimate refund");
|
|
11320
11582
|
else if (diff < 0) await debitMc(user.id, -diff, LedgerOperation.PAA, options.query);
|
|
11321
11583
|
}
|
|
11322
|
-
return c.json({ job_id: jobId, status: "done", result, attempts });
|
|
11584
|
+
return c.json({ job_id: jobId, status: "done", result: sanitizeHarvestResult(result), attempts: sanitizeAttempts(attempts) });
|
|
11323
11585
|
} catch (err) {
|
|
11324
11586
|
const problem = classifyHarvestProblem(err);
|
|
11325
11587
|
const response = harvestProblemResponse(problem);
|
|
@@ -11327,18 +11589,19 @@ app.post("/harvest/sync", auth, async (c) => {
|
|
|
11327
11589
|
if (problem.terminalStatus === "cancelled" || c.req.raw.signal.aborted) {
|
|
11328
11590
|
await cancelJob(jobId, serializeHarvestProblem(problem));
|
|
11329
11591
|
await creditMc(user.id, syncCost, LedgerOperation.REFUND, "cancelled call");
|
|
11330
|
-
return c.json({ job_id: jobId, status: "cancelled", ...response, attempts }, problem.httpStatus);
|
|
11592
|
+
return c.json({ job_id: jobId, status: "cancelled", ...response, attempts: sanitizeAttempts(attempts) }, problem.httpStatus);
|
|
11331
11593
|
}
|
|
11332
11594
|
await failJob(jobId, serializeHarvestProblem(problem));
|
|
11333
11595
|
await creditMc(user.id, syncCost, LedgerOperation.REFUND, "failed call");
|
|
11334
|
-
return c.json({ job_id: jobId, status: "failed", ...response, attempts }, problem.httpStatus);
|
|
11596
|
+
return c.json({ job_id: jobId, status: "failed", ...response, attempts: sanitizeAttempts(attempts) }, problem.httpStatus);
|
|
11335
11597
|
}
|
|
11336
11598
|
});
|
|
11337
11599
|
// GET "/jobs/:id": returns one of the caller's jobs together with its harvest
// attempts. The attempts, and the job result when it is an object, are
// sanitized before being returned. Responds 404 when getJob finds nothing.
app.get("/jobs/:id", auth, async (c) => {
  const job = await getJob(c.req.param("id"), c.get("user").id);
  if (!job) {
    return c.json({ error: "Job not found" }, 404);
  }
  const attempts = await listHarvestAttempts(job.id, c.get("user").id);
  // Only object results are sanitized; other values pass through untouched.
  let safeResult = job.result;
  if (safeResult && typeof safeResult === "object") {
    safeResult = sanitizeHarvestResult(safeResult);
  }
  return c.json({ ...job, result: safeResult, attempts: sanitizeAttempts(attempts) });
});
|
|
11343
11606
|
app.get("/jobs", auth, async (c) => {
|
|
11344
11607
|
return c.json(await listJobs(c.get("user").id));
|
|
@@ -11437,7 +11700,7 @@ app.post("/extract-url", auth, async (c) => {
|
|
|
11437
11700
|
const { ok: euOk, balance_mc: euBal } = await debitMc(user.id, MC_COSTS.page_scrape, LedgerOperation.EXTRACT_URL, new URL(canonicalUrl).hostname);
|
|
11438
11701
|
if (!euOk) return c.json(insufficientBalanceResponse(euBal, MC_COSTS.page_scrape), 402);
|
|
11439
11702
|
try {
|
|
11440
|
-
const kernelApiKey =
|
|
11703
|
+
const kernelApiKey = browserServiceApiKey();
|
|
11441
11704
|
const device = screenshotDevice === "mobile" ? "mobile" : "desktop";
|
|
11442
11705
|
const [result, pageData] = await Promise.all([
|
|
11443
11706
|
extractKpo({ url: canonicalUrl, kernelApiKey }),
|
|
@@ -11475,7 +11738,7 @@ app.post("/map-urls", auth, async (c) => {
|
|
|
11475
11738
|
startUrl: parsed.href,
|
|
11476
11739
|
maxUrls: Math.min(2e3, Math.max(1, body.maxUrls ?? 500)),
|
|
11477
11740
|
concurrency: Math.min(20, Math.max(1, body.concurrency ?? 12)),
|
|
11478
|
-
kernelApiKey: body.browserFallback ?? body.kernelFallback ?
|
|
11741
|
+
kernelApiKey: body.browserFallback ?? body.kernelFallback ? browserServiceApiKey() : void 0
|
|
11479
11742
|
});
|
|
11480
11743
|
await logRequestEvent({
|
|
11481
11744
|
userId: user.id,
|
|
@@ -11515,7 +11778,7 @@ app.post("/extract-site", auth, async (c) => {
|
|
|
11515
11778
|
const result = await extractSite({
|
|
11516
11779
|
startUrl: parsed.href,
|
|
11517
11780
|
maxPages: Math.min(200, Math.max(1, body.maxPages ?? 100)),
|
|
11518
|
-
kernelApiKey: body.browserFallback ?? body.kernelFallback ?
|
|
11781
|
+
kernelApiKey: body.browserFallback ?? body.kernelFallback ? browserServiceApiKey() : void 0
|
|
11519
11782
|
});
|
|
11520
11783
|
const pageCount = result.pages?.length ?? 1;
|
|
11521
11784
|
const actualSiteMc = pageCount * MC_COSTS.page_scrape;
|
|
@@ -11662,7 +11925,7 @@ app.get("/cron/tick", async (c) => {
|
|
|
11662
11925
|
if (!process.env.CRON_SECRET || secret2 !== `Bearer ${process.env.CRON_SECRET}`) {
|
|
11663
11926
|
return c.json({ error: "Unauthorized" }, 401);
|
|
11664
11927
|
}
|
|
11665
|
-
const { drainQueue } = await import("./worker-
|
|
11928
|
+
const { drainQueue } = await import("./worker-AUCXFHEL.js");
|
|
11666
11929
|
const budget = { maxJobs: 10, deadlineMs: Date.now() + 28e4 };
|
|
11667
11930
|
const [results, sweepResult] = await Promise.all([
|
|
11668
11931
|
drainQueue(budget),
|
|
@@ -11784,4 +12047,4 @@ app.get("/blog/:slug/", (c) => {
|
|
|
11784
12047
|
export {
|
|
11785
12048
|
app
|
|
11786
12049
|
};
|
|
11787
|
-
//# sourceMappingURL=server-
|
|
12050
|
+
//# sourceMappingURL=server-QXVVTKJP.js.map
|