mcp-scraper 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,7 +3,7 @@ import {
3
3
  CaptureSerpSnapshotInputSchema,
4
4
  HttpMcpToolExecutor,
5
5
  buildPaaExtractorMcpServer
6
- } from "./chunk-JQKZWEON.js";
6
+ } from "./chunk-DZY3XO3M.js";
7
7
  import {
8
8
  BALANCE_PACK_LABELS,
9
9
  BALANCE_PRICE_IDS,
@@ -19,7 +19,7 @@ import {
19
19
  harvestProblemResponse,
20
20
  insufficientBalanceResponse,
21
21
  serializeHarvestProblem
22
- } from "./chunk-Y74EXABN.js";
22
+ } from "./chunk-7HB7NDOY.js";
23
23
  import {
24
24
  BrowserDriver,
25
25
  CaptchaError,
@@ -30,8 +30,9 @@ import {
30
30
  RawMapsOverviewSchema,
31
31
  RawMapsReviewStatsSchema,
32
32
  buildYouTubeChannelVideosUrl,
33
- harvest
34
- } from "./chunk-HERFK7W6.js";
33
+ harvest,
34
+ resolveKernelProxyId
35
+ } from "./chunk-W4P2U5VF.js";
35
36
  import {
36
37
  SiteAuditJobRowSchema,
37
38
  cancelJob,
@@ -8767,6 +8768,106 @@ var FacebookAdExtractor = class {
8767
8768
  }
8768
8769
  };
8769
8770
 
8771
+ // src/extractor/FacebookAdGraphql.ts
8772
+ var AD_LIBRARY_QUERY = "AdLibrarySearchPaginationQuery";
8773
/**
 * Parse the text of a GraphQL response into a list of JSON payloads.
 *
 * The body is first parsed as a single JSON document (after removing a
 * leading `for (;;);` guard). If that fails, the body is treated as
 * newline-delimited JSON: every line that parses contributes one payload and
 * every line that does not parse is skipped.
 *
 * @param {string} text - Raw response body.
 * @returns {unknown[]} Parsed payloads in document order; empty when nothing
 *   parses or when `text` is not a string.
 */
function parseFbGraphqlJson(text) {
  // A missing or non-string body has nothing to parse; do not throw on `.replace`.
  if (typeof text !== "string") return [];

  const guardPrefix = /^for\s*\(;;\);/;
  const body = text.replace(guardPrefix, "").trim();
  if (!body) return [];

  try {
    return [JSON.parse(body)];
  } catch {
    // Not a single JSON document; fall through to line-by-line parsing.
  }

  const out = [];
  for (const line of body.split("\n")) {
    // The guard can only make a line unparseable, so stripping it per line
    // can recover payloads but never lose one.
    const candidate = line.trim().replace(guardPrefix, "").trim();
    if (!candidate) continue;
    try {
      out.push(JSON.parse(candidate));
    } catch {
      // Best-effort: ignore lines that are not valid JSON.
    }
  }
  return out;
}
8792
/**
 * Flatten the `collated_results` of every edge found at
 * `payload.data.ad_library_main.search_results_connection.edges` into a list
 * of normalized ad records.
 *
 * Entries without an `ad_archive_id` (including `null` or non-object
 * entries) are skipped instead of throwing, so one malformed row cannot
 * discard the rest of the payload.
 *
 * @param {unknown} payload - One parsed GraphQL payload.
 * @returns {Array<{ad_archive_id: string, page_id: string, page_name: string,
 *   is_active: boolean, collation_count: number|null, snapshot: unknown}>}
 *   Normalized records in payload order; empty when the expected path is
 *   absent or is not an array.
 */
function extractCollatedResults(payload) {
  const edges = payload?.data?.ad_library_main?.search_results_connection?.edges;
  if (!Array.isArray(edges)) return [];

  const results = [];
  for (const edge of edges) {
    const collated = edge?.node?.collated_results;
    if (!Array.isArray(collated)) continue;

    for (const entry of collated) {
      const id = entry?.ad_archive_id;
      if (id == null) continue;

      const snapshot = entry.snapshot ?? null;
      results.push({
        ad_archive_id: String(id),
        page_id: entry.page_id != null ? String(entry.page_id) : "",
        // Fall back to the snapshot's page name when the row has none.
        page_name: entry.page_name ?? snapshot?.page_name ?? "",
        is_active: Boolean(entry.is_active),
        collation_count: typeof entry.collation_count === "number" ? entry.collation_count : null,
        snapshot,
      });
    }
  }
  return results;
}
8815
/**
 * Navigate `page` to `url`, scroll it repeatedly, and collect ad records from
 * the `/api/graphql` responses whose request body names the
 * `AD_LIBRARY_QUERY` friendly name.
 *
 * Records are de-duplicated by `ad_archive_id`. Scrolling stops when the
 * capture deadline passes, when `maxResults` records have been collected, or
 * when the count has been unchanged for two consecutive rounds with at least
 * one record collected.
 *
 * Response bodies are read asynchronously. Reads still in flight when
 * scrolling stops are awaited for at most `opts.drainMs` so that late
 * results are not silently dropped.
 *
 * @param {object} page - Browser page exposing `on`/`off`, `goto`,
 *   `evaluate` and `waitForTimeout` (NOTE(review): looks like a Playwright
 *   page; confirm against BrowserDriver).
 * @param {string} url - Address to open.
 * @param {number} maxResults - Upper bound on returned records.
 * @param {{captureMs?: number, drainMs?: number}} [opts] - `captureMs`
 *   bounds the scroll loop (default 30000); `drainMs` bounds the wait for
 *   in-flight body reads (default 2000).
 * @returns {Promise<object[]>} At most `maxResults` records in arrival order.
 * @throws Whatever `page.goto` rejects with; the response listener is
 *   removed in that case too.
 */
async function collectAdLibraryResults(page, url, maxResults, opts = {}) {
  const captureMs = opts.captureMs ?? 3e4;
  const drainMs = opts.drainMs ?? 2e3;
  const collected = [];
  const seen = new Set();
  const inFlight = new Set();

  const handler = (resp) => {
    if (!resp.url().includes("/api/graphql")) return;
    const friendlyName = (resp.request().postData() ?? "").match(/fb_api_req_friendly_name=([^&]+)/)?.[1];
    if (friendlyName !== AD_LIBRARY_QUERY) return;

    // Deliberately not awaited here (event handlers are synchronous); the
    // promise is tracked so it can be drained before returning.
    const read = resp
      .text()
      .then((text) => {
        for (const payload of parseFbGraphqlJson(text)) {
          for (const result of extractCollatedResults(payload)) {
            if (seen.has(result.ad_archive_id)) continue;
            seen.add(result.ad_archive_id);
            collected.push(result);
          }
        }
      })
      // Best-effort capture: an unreadable body must not break the run.
      .catch(() => void 0)
      .finally(() => inFlight.delete(read));
    inFlight.add(read);
  };

  page.on("response", handler);
  try {
    await page.goto(url, { waitUntil: "domcontentloaded", timeout: 45e3 });
    const deadline = Date.now() + captureMs;
    let lastCount = -1;
    let stableRounds = 0;
    while (Date.now() < deadline && collected.length < maxResults) {
      await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight)).catch(() => void 0);
      await page.waitForTimeout(2e3);
      if (collected.length === lastCount) {
        stableRounds++;
        if (stableRounds >= 2 && collected.length > 0) break;
      } else {
        stableRounds = 0;
      }
      lastCount = collected.length;
    }
  } finally {
    page.off("response", handler);
  }

  // Give body reads that already started a bounded chance to finish.
  if (collected.length < maxResults && inFlight.size > 0) {
    let timer;
    const timeout = new Promise((resolve) => {
      timer = setTimeout(resolve, drainMs);
    });
    try {
      await Promise.race([Promise.allSettled([...inFlight]), timeout]);
    } finally {
      clearTimeout(timer);
    }
  }

  return collected.slice(0, maxResults);
}
8855
/**
 * Group ad records by `page_id` and return one advertiser entry per page,
 * sorted by descending `adCount` and limited to `maxResults` entries.
 *
 * `adCount` is the larger of (a) the highest positive `collation_count` seen
 * for the page and (b) the number of records seen for the page. The page
 * name and sample library id come from the first record seen for that page.
 *
 * @param {Array<object>|null|undefined} results - Records shaped like the
 *   output of `extractCollatedResults`; rows that are nullish or lack
 *   `page_id`/`page_name` are ignored.
 * @param {number} maxResults - Maximum number of advertisers to return; a
 *   negative value yields an empty list.
 * @returns {Array<{pageName: string, pageId: string, sampleLibraryId: string,
 *   adCount: number}>}
 */
function advertisersFromResults(results, maxResults) {
  const byPage = new Map();
  for (const row of results ?? []) {
    if (!row?.page_id || !row.page_name) continue;

    const collation = typeof row.collation_count === "number" && row.collation_count > 0 ? row.collation_count : 0;
    const existing = byPage.get(row.page_id);
    if (existing) {
      existing.resultCount++;
      existing.maxCollation = Math.max(existing.maxCollation, collation);
    } else {
      byPage.set(row.page_id, {
        pageName: row.page_name,
        pageId: row.page_id,
        sampleLibraryId: row.ad_archive_id,
        maxCollation: collation,
        resultCount: 1,
      });
    }
  }

  // A negative end index makes `slice` count from the tail; treat it as "none".
  const limit = typeof maxResults === "number" && maxResults < 0 ? 0 : maxResults;

  return [...byPage.values()]
    .map((e) => ({
      pageName: e.pageName,
      pageId: e.pageId,
      sampleLibraryId: e.sampleLibraryId,
      adCount: Math.max(e.maxCollation, e.resultCount),
    }))
    .sort((a, b) => b.adCount - a.adCount)
    .slice(0, limit);
}
8870
+
8770
8871
  // src/api/facebook-ad-routes.ts
8771
8872
  import { fal as fal2 } from "@fal-ai/client";
8772
8873
  var FacebookAdBodySchema = z13.object({
@@ -8811,6 +8912,22 @@ function buildPageIntelUrl(body, country) {
8811
8912
  function kernelLaunchOpts() {
8812
8913
  return { headless: true, kernelApiKey: process.env.KERNEL_API_KEY?.trim(), kernelProxyId: process.env.KERNEL_PROXY_ID?.trim(), viewport: { width: 1280, height: 900 }, locale: "en-US" };
8813
8914
  }
8915
/**
 * Build browser launch options whose proxy id is resolved through
 * `resolveKernelProxyId` in "location" proxy mode.
 *
 * The proxy id from `KERNEL_PROXY_ID` is the fallback: it is used when the
 * resolver throws or returns no `kernelProxyId`. All other options match the
 * fixed headless 1280x900 "en-US" configuration.
 *
 * @param {{location?: string, gl?: string}} [target] - Location passed to
 *   the resolver. Defaults to "New York, NY" / "us", the values that were
 *   previously hard-coded.
 * @returns {Promise<{headless: boolean, kernelApiKey: (string|undefined),
 *   kernelProxyId: (string|undefined),
 *   viewport: {width: number, height: number}, locale: string}>}
 */
async function kernelLaunchOptsResidential({ location = "New York, NY", gl = "us" } = {}) {
  const kernelApiKey = process.env.KERNEL_API_KEY?.trim();
  const configuredKernelProxyId = process.env.KERNEL_PROXY_ID?.trim();

  let proxyId = configuredKernelProxyId;
  try {
    const resolution = await resolveKernelProxyId({
      kernelApiKey,
      proxyMode: "location",
      configuredKernelProxyId,
      location,
      gl,
    });
    if (resolution.kernelProxyId) proxyId = resolution.kernelProxyId;
  } catch {
    // Deliberate best-effort: a failed resolution keeps the configured proxy
    // id rather than failing the launch.
  }

  return {
    headless: true,
    kernelApiKey,
    kernelProxyId: proxyId,
    viewport: { width: 1280, height: 900 },
    locale: "en-US",
  };
}
8814
8931
  var facebookAdApp = new Hono4();
8815
8932
  facebookAdApp.post("/ad", createApiKeyAuth(), async (c) => {
8816
8933
  const raw = await c.req.json().catch(() => ({}));
@@ -8867,7 +8984,7 @@ facebookAdApp.post("/page-intel", createApiKeyAuth(), async (c) => {
8867
8984
  const driver = new BrowserDriver();
8868
8985
  let refunded = false;
8869
8986
  try {
8870
- await driver.launch(kernelLaunchOpts());
8987
+ await driver.launch(await kernelLaunchOptsResidential());
8871
8988
  await driver.navigateTo(listingUrl);
8872
8989
  const extractor = new FacebookAdExtractor(driver);
8873
8990
  const result = await extractor.extractPageIntel(listingUrl, maxAds);
@@ -8951,18 +9068,15 @@ facebookAdApp.post("/search", createApiKeyAuth(), async (c) => {
8951
9068
  const driver = new BrowserDriver();
8952
9069
  let searchRefunded = false;
8953
9070
  try {
8954
- await driver.launch(kernelLaunchOpts());
9071
+ await driver.launch(await kernelLaunchOptsResidential());
8955
9072
  const page = driver.getPage();
8956
- await driver.navigateTo(searchUrl);
8957
- try {
8958
- await page.waitForFunction(
8959
- () => {
8960
- const bt = document.body ? document.body.innerText ?? "" : "";
8961
- return bt.includes("Library ID") || bt.includes("No results");
8962
- },
8963
- { timeout: 2e4, polling: 500 }
8964
- );
8965
- } catch {
9073
+ const collated = await collectAdLibraryResults(page, searchUrl, Math.max(maxResults * 4, 40));
9074
+ const gqlAdvertisers = advertisersFromResults(collated, maxResults);
9075
+ if (gqlAdvertisers.length > 0) {
9076
+ const results2 = gqlAdvertisers.map((a) => ({ name: a.pageName, pageName: a.pageName, pageId: a.pageId, libraryId: a.sampleLibraryId, sampleLibraryId: a.sampleLibraryId, adCount: a.adCount }));
9077
+ const searchResult2 = { query: body.query.trim(), searchUrl, results: results2, via: "graphql" };
9078
+ await logRequestEvent({ userId: fbUser.id, source: "facebook_search", status: "done", query: body.query.trim(), resultCount: results2.length, result: searchResult2 });
9079
+ return c.json(searchResult2);
8966
9080
  }
8967
9081
  await page.waitForTimeout(1500);
8968
9082
  for (let scroll = 0; scroll < 3; scroll++) {
@@ -9008,7 +9122,7 @@ facebookAdApp.post("/search", createApiKeyAuth(), async (c) => {
9008
9122
  advertiserMap.set(pageName, { pageName, sampleLibraryId: libraryId, adCount: 1 });
9009
9123
  }
9010
9124
  }
9011
- const results = [...advertiserMap.values()].sort((a, b) => b.adCount - a.adCount).slice(0, maxResults);
9125
+ const results = [...advertiserMap.values()].sort((a, b) => b.adCount - a.adCount).slice(0, maxResults).map((a) => ({ name: a.pageName, pageName: a.pageName, libraryId: a.sampleLibraryId, sampleLibraryId: a.sampleLibraryId, adCount: a.adCount }));
9012
9126
  const searchResult = { query: body.query.trim(), searchUrl, results };
9013
9127
  await logRequestEvent({ userId: fbUser.id, source: "facebook_search", status: "done", query: body.query.trim(), resultCount: results.length, result: searchResult });
9014
9128
  return c.json(searchResult);
@@ -11500,7 +11614,7 @@ app.get("/cron/tick", async (c) => {
11500
11614
  if (!process.env.CRON_SECRET || secret2 !== `Bearer ${process.env.CRON_SECRET}`) {
11501
11615
  return c.json({ error: "Unauthorized" }, 401);
11502
11616
  }
11503
- const { drainQueue } = await import("./worker-D4D2YQTA.js");
11617
+ const { drainQueue } = await import("./worker-UT4ZQU2T.js");
11504
11618
  const budget = { maxJobs: 10, deadlineMs: Date.now() + 28e4 };
11505
11619
  const [results, sweepResult] = await Promise.all([
11506
11620
  drainQueue(budget),
@@ -11622,4 +11736,4 @@ app.get("/blog/:slug/", (c) => {
11622
11736
  export {
11623
11737
  app
11624
11738
  };
11625
- //# sourceMappingURL=server-6CHHLOII.js.map
11739
+ //# sourceMappingURL=server-KUF3QJC7.js.map