mcp-scraper 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/bin/api-server.cjs +691 -570
- package/dist/bin/api-server.cjs.map +1 -1
- package/dist/bin/api-server.js +2 -2
- package/dist/bin/mcp-stdio-server.cjs +1 -1
- package/dist/bin/mcp-stdio-server.cjs.map +1 -1
- package/dist/bin/mcp-stdio-server.js +1 -1
- package/dist/bin/paa-harvest.js +1 -1
- package/dist/{chunk-Y74EXABN.js → chunk-7HB7NDOY.js} +2 -2
- package/dist/{chunk-JQKZWEON.js → chunk-DZY3XO3M.js} +2 -2
- package/dist/{chunk-JQKZWEON.js.map → chunk-DZY3XO3M.js.map} +1 -1
- package/dist/{chunk-HERFK7W6.js → chunk-W4P2U5VF.js} +2 -1
- package/dist/index.js +1 -1
- package/dist/{server-6CHHLOII.js → server-KUF3QJC7.js} +133 -19
- package/dist/server-KUF3QJC7.js.map +1 -0
- package/dist/{worker-D4D2YQTA.js → worker-UT4ZQU2T.js} +3 -3
- package/package.json +1 -1
- package/dist/server-6CHHLOII.js.map +0 -1
- /package/dist/{chunk-Y74EXABN.js.map → chunk-7HB7NDOY.js.map} +0 -0
- /package/dist/{chunk-HERFK7W6.js.map → chunk-W4P2U5VF.js.map} +0 -0
- /package/dist/{worker-D4D2YQTA.js.map → worker-UT4ZQU2T.js.map} +0 -0
|
@@ -3,7 +3,7 @@ import {
|
|
|
3
3
|
CaptureSerpSnapshotInputSchema,
|
|
4
4
|
HttpMcpToolExecutor,
|
|
5
5
|
buildPaaExtractorMcpServer
|
|
6
|
-
} from "./chunk-JQKZWEON.js";
|
|
6
|
+
} from "./chunk-DZY3XO3M.js";
|
|
7
7
|
import {
|
|
8
8
|
BALANCE_PACK_LABELS,
|
|
9
9
|
BALANCE_PRICE_IDS,
|
|
@@ -19,7 +19,7 @@ import {
|
|
|
19
19
|
harvestProblemResponse,
|
|
20
20
|
insufficientBalanceResponse,
|
|
21
21
|
serializeHarvestProblem
|
|
22
|
-
} from "./chunk-Y74EXABN.js";
|
|
22
|
+
} from "./chunk-7HB7NDOY.js";
|
|
23
23
|
import {
|
|
24
24
|
BrowserDriver,
|
|
25
25
|
CaptchaError,
|
|
@@ -30,8 +30,9 @@ import {
|
|
|
30
30
|
RawMapsOverviewSchema,
|
|
31
31
|
RawMapsReviewStatsSchema,
|
|
32
32
|
buildYouTubeChannelVideosUrl,
|
|
33
|
-
harvest
|
|
34
|
-
|
|
33
|
+
harvest,
|
|
34
|
+
resolveKernelProxyId
|
|
35
|
+
} from "./chunk-W4P2U5VF.js";
|
|
35
36
|
import {
|
|
36
37
|
SiteAuditJobRowSchema,
|
|
37
38
|
cancelJob,
|
|
@@ -8767,6 +8768,106 @@ var FacebookAdExtractor = class {
|
|
|
8767
8768
|
}
|
|
8768
8769
|
};
|
|
8769
8770
|
|
|
8771
|
+
// src/extractor/FacebookAdGraphql.ts
|
|
8772
|
+
// Value compared against the `fb_api_req_friendly_name` field of a GraphQL
// request's POST body to decide whether its response should be parsed.
var AD_LIBRARY_QUERY = "AdLibrarySearchPaginationQuery";
|
|
8773
|
+
/**
 * Parse the text body of a GraphQL response into a list of JSON payloads.
 *
 * The body may carry a leading `for (;;);` guard, and may be either a single
 * JSON document or several JSON documents separated by newlines. The guard is
 * stripped after trimming, so leading whitespace or a leading newline no
 * longer hides it.
 *
 * @param {string} text - Raw response body.
 * @returns {Array<unknown>} Parsed payloads in order of appearance; an empty
 *   array when nothing parses or `text` is not a string.
 */
function parseFbGraphqlJson(text) {
  if (typeof text !== "string") return [];
  // Trim first: the guard regex is anchored at the start of the string.
  const stripGuard = (s) => s.trim().replace(/^for\s*\(;;\);/, "").trim();
  const body = stripGuard(text);
  if (!body) return [];
  try {
    return [JSON.parse(body)];
  } catch {
    // Not a single document: fall back to one JSON document per line.
    const out = [];
    for (const line of body.split("\n")) {
      const candidate = stripGuard(line);
      if (!candidate) continue;
      try {
        out.push(JSON.parse(candidate));
      } catch {
        // Best effort: skip lines that are not valid JSON.
        continue;
      }
    }
    return out;
  }
}
|
|
8792
|
+
/**
 * Flatten the `collated_results` of every edge found at
 * `data.ad_library_main.search_results_connection.edges` into normalized rows.
 *
 * Entries without an `ad_archive_id` are skipped. Malformed input (missing
 * path, non-array `edges` / `collated_results`, `null` entries) is tolerated
 * instead of throwing, so one bad entry does not discard the whole payload.
 *
 * @param {unknown} payload - One parsed GraphQL payload.
 * @returns {Array<{ad_archive_id: string, page_id: string, page_name: string,
 *   is_active: boolean, collation_count: (number|null), snapshot: unknown}>}
 */
function extractCollatedResults(payload) {
  const edges = payload?.data?.ad_library_main?.search_results_connection?.edges;
  if (!Array.isArray(edges)) return [];
  const results = [];
  for (const edge of edges) {
    const collated = edge?.node?.collated_results;
    if (!Array.isArray(collated)) continue;
    for (const r of collated) {
      const id = r?.ad_archive_id;
      if (id === undefined || id === null) continue;
      const snapshot = r.snapshot ?? null;
      results.push({
        ad_archive_id: String(id),
        page_id: r.page_id != null ? String(r.page_id) : "",
        // Prefer the row's own page name, then the snapshot's.
        page_name: r.page_name ?? snapshot?.page_name ?? "",
        is_active: Boolean(r.is_active),
        collation_count: typeof r.collation_count === "number" ? r.collation_count : null,
        snapshot
      });
    }
  }
  return results;
}
|
|
8815
|
+
/**
 * Navigate `page` to `url` and collect Ad Library rows from the GraphQL
 * responses observed while repeatedly scrolling to the bottom of the page.
 *
 * A response is considered only when its URL contains `/api/graphql` and its
 * request POST body names `AD_LIBRARY_QUERY` as `fb_api_req_friendly_name`.
 * Rows are de-duplicated by `ad_archive_id`.
 *
 * Scrolling stops when `maxResults` rows are collected, when `captureMs` has
 * elapsed, or when the count has not changed for two consecutive rounds and at
 * least one row is present. Body reads still in flight at that point are given
 * up to `drainMs` to finish so their rows are not lost.
 *
 * @param {object} page - Browser page exposing `on`, `off`, `goto`, `evaluate`
 *   and `waitForTimeout` (presumably a Playwright `Page` — confirm with caller).
 * @param {string} url - Address to open.
 * @param {number} maxResults - Upper bound on returned rows.
 * @param {{captureMs?: number, drainMs?: number}} [opts] - `captureMs`
 *   (default 30000) bounds the scroll loop; `drainMs` (default 2000) bounds
 *   the final wait for in-flight body reads.
 * @returns {Promise<Array<object>>} At most `maxResults` rows, in arrival order.
 * @throws Whatever `page.goto` or `page.waitForTimeout` rejects with.
 */
async function collectAdLibraryResults(page, url, maxResults, opts = {}) {
  const captureMs = opts.captureMs ?? 3e4;
  const drainMs = opts.drainMs ?? 2e3;
  const collected = [];
  const seen = /* @__PURE__ */ new Set();
  const inFlight = /* @__PURE__ */ new Set();
  const handler = (resp) => {
    let bodyRead;
    try {
      if (!resp.url().includes("/api/graphql")) return;
      const friendlyName = (resp.request().postData() ?? "").match(/fb_api_req_friendly_name=([^&]+)/)?.[1];
      if (friendlyName !== AD_LIBRARY_QUERY) return;
      bodyRead = resp.text();
    } catch {
      // A listener must never throw into the emitter; ignore this response.
      return;
    }
    const task = bodyRead.then((text) => {
      for (const payload of parseFbGraphqlJson(text)) {
        for (const result of extractCollatedResults(payload)) {
          if (seen.has(result.ad_archive_id)) continue;
          seen.add(result.ad_archive_id);
          collected.push(result);
        }
      }
    }).catch(() => void 0).finally(() => {
      inFlight.delete(task);
    });
    inFlight.add(task);
  };
  page.on("response", handler);
  try {
    await page.goto(url, { waitUntil: "domcontentloaded", timeout: 45e3 });
    const deadline = Date.now() + captureMs;
    let lastCount = -1;
    let stableRounds = 0;
    while (Date.now() < deadline && collected.length < maxResults) {
      // Scrolling to the bottom is what triggers the next page of results.
      await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight)).catch(() => void 0);
      await page.waitForTimeout(2e3);
      if (collected.length === lastCount) {
        stableRounds++;
        if (stableRounds >= 2 && collected.length > 0) break;
      } else {
        stableRounds = 0;
      }
      lastCount = collected.length;
    }
    if (inFlight.size > 0 && collected.length < maxResults) {
      // Let already-started body reads finish, but never wait longer than drainMs.
      await Promise.race([
        Promise.allSettled([...inFlight]),
        page.waitForTimeout(drainMs)
      ]).catch(() => void 0);
    }
  } finally {
    page.off("response", handler);
  }
  return collected.slice(0, maxResults);
}
|
|
8855
|
+
/**
 * Group Ad Library rows by `page_id` into one advertiser summary per page.
 *
 * Rows lacking a `page_id` or `page_name` are ignored, as are `null` rows. For
 * each page the first row seen supplies `pageName` and `sampleLibraryId`.
 * `adCount` is the larger of the highest positive `collation_count` seen and
 * the number of rows for that page.
 *
 * @param {Array<object>|null|undefined} results - Rows carrying `page_id`,
 *   `page_name`, `ad_archive_id` and `collation_count`.
 * @param {number} maxResults - Maximum number of advertisers to return;
 *   negative values are treated as 0.
 * @returns {Array<{pageName: string, pageId: string, sampleLibraryId: string,
 *   adCount: number}>} Advertisers sorted by `adCount`, highest first.
 */
function advertisersFromResults(results, maxResults) {
  const byPage = /* @__PURE__ */ new Map();
  for (const r of results ?? []) {
    if (!r?.page_id || !r.page_name) continue;
    const collation = typeof r.collation_count === "number" && r.collation_count > 0 ? r.collation_count : 0;
    const existing = byPage.get(r.page_id);
    if (existing) {
      existing.resultCount++;
      existing.maxCollation = Math.max(existing.maxCollation, collation);
    } else {
      byPage.set(r.page_id, {
        pageName: r.page_name,
        pageId: r.page_id,
        sampleLibraryId: r.ad_archive_id,
        maxCollation: collation,
        resultCount: 1
      });
    }
  }
  // slice(0, -n) would drop rows from the end; a negative limit means "none".
  const limit = typeof maxResults === "number" ? Math.max(0, maxResults) : maxResults;
  return [...byPage.values()]
    .map((e) => ({
      pageName: e.pageName,
      pageId: e.pageId,
      sampleLibraryId: e.sampleLibraryId,
      adCount: Math.max(e.maxCollation, e.resultCount)
    }))
    .sort((a, b) => b.adCount - a.adCount)
    .slice(0, limit);
}
|
|
8870
|
+
|
|
8770
8871
|
// src/api/facebook-ad-routes.ts
|
|
8771
8872
|
import { fal as fal2 } from "@fal-ai/client";
|
|
8772
8873
|
var FacebookAdBodySchema = z13.object({
|
|
@@ -8811,6 +8912,22 @@ function buildPageIntelUrl(body, country) {
|
|
|
8811
8912
|
/**
 * Build the browser launch options from the environment.
 *
 * `KERNEL_API_KEY` and `KERNEL_PROXY_ID` are trimmed; a value that is unset or
 * blank after trimming is reported as `undefined` rather than `""`.
 *
 * @returns {{headless: boolean, kernelApiKey: (string|undefined),
 *   kernelProxyId: (string|undefined), viewport: {width: number, height: number},
 *   locale: string}}
 */
function kernelLaunchOpts() {
  // `||` on purpose: an empty string after trim means "not configured".
  const envValue = (name) => process.env[name]?.trim() || undefined;
  return {
    headless: true,
    kernelApiKey: envValue("KERNEL_API_KEY"),
    kernelProxyId: envValue("KERNEL_PROXY_ID"),
    viewport: { width: 1280, height: 900 },
    locale: "en-US"
  };
}
|
|
8915
|
+
/**
 * Build launch options like `kernelLaunchOpts()`, but with `kernelProxyId`
 * replaced by whatever `resolveKernelProxyId` returns for a location-mode
 * lookup (presumably a residential proxy — confirm against that helper).
 *
 * If the lookup throws or yields no proxy id, the options from
 * `kernelLaunchOpts()` are returned unchanged, so the configured
 * `KERNEL_PROXY_ID` (if any) is used.
 *
 * @param {{location?: string, gl?: string}} [target] - Location passed to the
 *   lookup; defaults to `"New York, NY"` / `"us"`.
 * @returns {Promise<object>} Launch options for `driver.launch`.
 */
async function kernelLaunchOptsResidential({ location = "New York, NY", gl = "us" } = {}) {
  const base = kernelLaunchOpts();
  try {
    const resolution = await resolveKernelProxyId({
      kernelApiKey: base.kernelApiKey,
      proxyMode: "location",
      configuredKernelProxyId: base.kernelProxyId,
      location,
      gl
    });
    if (resolution?.kernelProxyId) {
      return { ...base, kernelProxyId: resolution.kernelProxyId };
    }
  } catch (err) {
    // Best effort: a failed lookup must not block the launch, but leave a trace.
    console.warn("kernelLaunchOptsResidential: proxy resolution failed, using configured proxy", err?.message ?? err);
  }
  return base;
}
|
|
8814
8931
|
var facebookAdApp = new Hono4();
|
|
8815
8932
|
facebookAdApp.post("/ad", createApiKeyAuth(), async (c) => {
|
|
8816
8933
|
const raw = await c.req.json().catch(() => ({}));
|
|
@@ -8867,7 +8984,7 @@ facebookAdApp.post("/page-intel", createApiKeyAuth(), async (c) => {
|
|
|
8867
8984
|
const driver = new BrowserDriver();
|
|
8868
8985
|
let refunded = false;
|
|
8869
8986
|
try {
|
|
8870
|
-
await driver.launch(
|
|
8987
|
+
await driver.launch(await kernelLaunchOptsResidential());
|
|
8871
8988
|
await driver.navigateTo(listingUrl);
|
|
8872
8989
|
const extractor = new FacebookAdExtractor(driver);
|
|
8873
8990
|
const result = await extractor.extractPageIntel(listingUrl, maxAds);
|
|
@@ -8951,18 +9068,15 @@ facebookAdApp.post("/search", createApiKeyAuth(), async (c) => {
|
|
|
8951
9068
|
const driver = new BrowserDriver();
|
|
8952
9069
|
let searchRefunded = false;
|
|
8953
9070
|
try {
|
|
8954
|
-
await driver.launch(
|
|
9071
|
+
await driver.launch(await kernelLaunchOptsResidential());
|
|
8955
9072
|
const page = driver.getPage();
|
|
8956
|
-
await
|
|
8957
|
-
|
|
8958
|
-
|
|
8959
|
-
|
|
8960
|
-
|
|
8961
|
-
|
|
8962
|
-
|
|
8963
|
-
{ timeout: 2e4, polling: 500 }
|
|
8964
|
-
);
|
|
8965
|
-
} catch {
|
|
9073
|
+
const collated = await collectAdLibraryResults(page, searchUrl, Math.max(maxResults * 4, 40));
|
|
9074
|
+
const gqlAdvertisers = advertisersFromResults(collated, maxResults);
|
|
9075
|
+
if (gqlAdvertisers.length > 0) {
|
|
9076
|
+
const results2 = gqlAdvertisers.map((a) => ({ name: a.pageName, pageName: a.pageName, pageId: a.pageId, libraryId: a.sampleLibraryId, sampleLibraryId: a.sampleLibraryId, adCount: a.adCount }));
|
|
9077
|
+
const searchResult2 = { query: body.query.trim(), searchUrl, results: results2, via: "graphql" };
|
|
9078
|
+
await logRequestEvent({ userId: fbUser.id, source: "facebook_search", status: "done", query: body.query.trim(), resultCount: results2.length, result: searchResult2 });
|
|
9079
|
+
return c.json(searchResult2);
|
|
8966
9080
|
}
|
|
8967
9081
|
await page.waitForTimeout(1500);
|
|
8968
9082
|
for (let scroll = 0; scroll < 3; scroll++) {
|
|
@@ -9008,7 +9122,7 @@ facebookAdApp.post("/search", createApiKeyAuth(), async (c) => {
|
|
|
9008
9122
|
advertiserMap.set(pageName, { pageName, sampleLibraryId: libraryId, adCount: 1 });
|
|
9009
9123
|
}
|
|
9010
9124
|
}
|
|
9011
|
-
const results = [...advertiserMap.values()].sort((a, b) => b.adCount - a.adCount).slice(0, maxResults);
|
|
9125
|
+
const results = [...advertiserMap.values()].sort((a, b) => b.adCount - a.adCount).slice(0, maxResults).map((a) => ({ name: a.pageName, pageName: a.pageName, libraryId: a.sampleLibraryId, sampleLibraryId: a.sampleLibraryId, adCount: a.adCount }));
|
|
9012
9126
|
const searchResult = { query: body.query.trim(), searchUrl, results };
|
|
9013
9127
|
await logRequestEvent({ userId: fbUser.id, source: "facebook_search", status: "done", query: body.query.trim(), resultCount: results.length, result: searchResult });
|
|
9014
9128
|
return c.json(searchResult);
|
|
@@ -11500,7 +11614,7 @@ app.get("/cron/tick", async (c) => {
|
|
|
11500
11614
|
if (!process.env.CRON_SECRET || secret2 !== `Bearer ${process.env.CRON_SECRET}`) {
|
|
11501
11615
|
return c.json({ error: "Unauthorized" }, 401);
|
|
11502
11616
|
}
|
|
11503
|
-
const { drainQueue } = await import("./worker-D4D2YQTA.js");
|
|
11617
|
+
const { drainQueue } = await import("./worker-UT4ZQU2T.js");
|
|
11504
11618
|
const budget = { maxJobs: 10, deadlineMs: Date.now() + 28e4 };
|
|
11505
11619
|
const [results, sweepResult] = await Promise.all([
|
|
11506
11620
|
drainQueue(budget),
|
|
@@ -11622,4 +11736,4 @@ app.get("/blog/:slug/", (c) => {
|
|
|
11622
11736
|
export {
|
|
11623
11737
|
app
|
|
11624
11738
|
};
|
|
11625
|
-
//# sourceMappingURL=server-6CHHLOII.js.map
|
|
11739
|
+
//# sourceMappingURL=server-KUF3QJC7.js.map
|