mcp-scraper 0.2.0 → 0.2.1

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. The information in this diff is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. package/README.md +74 -8
  2. package/dist/bin/api-server.cjs +4691 -3614
  3. package/dist/bin/api-server.cjs.map +1 -1
  4. package/dist/bin/api-server.js +2 -2
  5. package/dist/bin/browser-agent-stdio-server.cjs +85 -8
  6. package/dist/bin/browser-agent-stdio-server.cjs.map +1 -1
  7. package/dist/bin/browser-agent-stdio-server.js +83 -6
  8. package/dist/bin/browser-agent-stdio-server.js.map +1 -1
  9. package/dist/bin/mcp-stdio-server.cjs +170 -12
  10. package/dist/bin/mcp-stdio-server.cjs.map +1 -1
  11. package/dist/bin/mcp-stdio-server.js +3 -3
  12. package/dist/bin/paa-harvest.cjs +223 -74
  13. package/dist/bin/paa-harvest.cjs.map +1 -1
  14. package/dist/bin/paa-harvest.js +2 -2
  15. package/dist/{chunk-GXBT5CDU.js → chunk-IQOCZGJJ.js} +39 -2
  16. package/dist/chunk-IQOCZGJJ.js.map +1 -0
  17. package/dist/{chunk-ZMOWIBMK.js → chunk-M2S27J6Z.js} +9 -2
  18. package/dist/{chunk-ZMOWIBMK.js.map → chunk-M2S27J6Z.js.map} +1 -1
  19. package/dist/{chunk-TM22BLWP.js → chunk-MY3S7EX7.js} +221 -76
  20. package/dist/chunk-MY3S7EX7.js.map +1 -0
  21. package/dist/{chunk-BMVQB3WN.js → chunk-OR7DLLH2.js} +173 -14
  22. package/dist/chunk-OR7DLLH2.js.map +1 -0
  23. package/dist/chunk-XR65SANX.js +7 -0
  24. package/dist/chunk-XR65SANX.js.map +1 -0
  25. package/dist/index.cjs +223 -74
  26. package/dist/index.cjs.map +1 -1
  27. package/dist/index.d.cts +1 -0
  28. package/dist/index.d.ts +1 -0
  29. package/dist/index.js +2 -2
  30. package/dist/{server-ASCMKUQ5.js → server-CJMX2QUM.js} +880 -181
  31. package/dist/server-CJMX2QUM.js.map +1 -0
  32. package/dist/{worker-KJ4A7WIR.js → worker-NAKGTIF5.js} +4 -4
  33. package/package.json +1 -1
  34. package/dist/chunk-2BS7BUEE.js +0 -7
  35. package/dist/chunk-2BS7BUEE.js.map +0 -1
  36. package/dist/chunk-BMVQB3WN.js.map +0 -1
  37. package/dist/chunk-GXBT5CDU.js.map +0 -1
  38. package/dist/chunk-TM22BLWP.js.map +0 -1
  39. package/dist/server-ASCMKUQ5.js.map +0 -1
  40. package/dist/{worker-KJ4A7WIR.js.map → worker-NAKGTIF5.js.map} +0 -0
@@ -2,9 +2,9 @@
2
2
  import {
3
3
  HttpMcpToolExecutor,
4
4
  buildPaaExtractorMcpServer
5
- } from "../chunk-BMVQB3WN.js";
6
- import "../chunk-2BS7BUEE.js";
7
- import "../chunk-ZMOWIBMK.js";
5
+ } from "../chunk-OR7DLLH2.js";
6
+ import "../chunk-XR65SANX.js";
7
+ import "../chunk-M2S27J6Z.js";
8
8
 
9
9
  // bin/mcp-stdio-server.ts
10
10
  import { readFileSync } from "fs";
@@ -77,8 +77,12 @@ var MapsSearchOptionsSchema = import_zod.z.object({
77
77
  gl: import_zod.z.string().length(2).default("us"),
78
78
  hl: import_zod.z.string().length(2).default("en"),
79
79
  maxResults: import_zod.z.number().int().min(1).max(50).default(10),
80
+ proxyMode: import_zod.z.enum(["location", "configured", "none"]).default("location"),
81
+ proxyZip: import_zod.z.string().regex(/^\d{5}$/).optional(),
82
+ debug: import_zod.z.boolean().default(false),
80
83
  kernelApiKey: import_zod.z.string().optional(),
81
84
  kernelProxyId: import_zod.z.string().optional(),
85
+ kernelProxyResolution: import_zod.z.unknown().optional(),
82
86
  headless: import_zod.z.boolean().default(true)
83
87
  });
84
88
  var RawPAAItemSchema = import_zod.z.object({
@@ -244,6 +248,12 @@ var RequestAbortedError = class extends Error {
244
248
  super(message);
245
249
  }
246
250
  };
251
+ var LocationMismatchError = class extends Error {
252
+ name = "LocationMismatchError";
253
+ constructor(message = "Google returned results for a different location than requested") {
254
+ super(message);
255
+ }
256
+ };
247
257
 
248
258
  // src/driver/BrowserDriver.ts
249
259
  import_playwright_extra.chromium.use((0, import_puppeteer_extra_plugin_stealth.default)());
@@ -2264,16 +2274,18 @@ var US_CITY_CENTER_ZIPS = {
2264
2274
  function proxyIdSuffix2(proxyId) {
2265
2275
  return proxyId ? proxyId.slice(-6) : null;
2266
2276
  }
2267
- function resolution(source, proxyMode, proxyId, target, error) {
2277
+ function resolution(source, proxyMode, proxyId, target, error, disposable = false) {
2268
2278
  return {
2269
2279
  kernelProxyId: proxyId,
2280
+ ...disposable && proxyId ? { disposableProxyId: proxyId } : {},
2270
2281
  resolution: {
2271
2282
  source,
2272
2283
  proxyMode,
2273
2284
  proxyIdPresent: Boolean(proxyId),
2274
2285
  proxyIdSuffix: proxyIdSuffix2(proxyId),
2275
2286
  target,
2276
- error
2287
+ error,
2288
+ disposable
2277
2289
  }
2278
2290
  };
2279
2291
  }
@@ -2303,6 +2315,10 @@ function kernelCityIdentifierCandidates(city) {
2303
2315
  function proxyName(country, state, city) {
2304
2316
  return city ? `mcp-serp-residential-${country.toLowerCase()}-${state.toLowerCase()}-${city}` : `mcp-serp-residential-${country.toLowerCase()}-${state.toLowerCase()}`;
2305
2317
  }
2318
+ function freshProxyName(baseName, attemptIndex) {
2319
+ const stamp = `${Date.now()}-${attemptIndex ?? 0}-${Math.random().toString(36).slice(2, 8)}`;
2320
+ return `${baseName}-fresh-${stamp}`;
2321
+ }
2306
2322
  function zipProxyName(zip) {
2307
2323
  return `mcp-serp-residential-us-zip-${zip}`;
2308
2324
  }
@@ -2372,6 +2388,12 @@ function zipTarget(target, zip) {
2372
2388
  }
2373
2389
  };
2374
2390
  }
2391
+ function withProxyName(target, name) {
2392
+ return {
2393
+ ...target,
2394
+ proxyName: name
2395
+ };
2396
+ }
2375
2397
  function configMatches(config, target, city) {
2376
2398
  if (target.level === "zip") {
2377
2399
  return config?.country?.toUpperCase() === target.country && config?.zip === target.zip;
@@ -2410,6 +2432,55 @@ function escalatedTargetLevel(target, attemptIndex) {
2410
2432
  function errorText2(err) {
2411
2433
  return err instanceof Error ? err.message : String(err);
2412
2434
  }
2435
+ function freshTargetCandidates(target, explicitZip, attemptIndex) {
2436
+ const out = [];
2437
+ const zip = knownZipFor(target, explicitZip);
2438
+ if (zip) {
2439
+ const targetZip = zipTarget(target, zip);
2440
+ out.push(withProxyName(targetZip, freshProxyName(targetZip.proxyName, attemptIndex)));
2441
+ }
2442
+ for (const city of target.cityCandidates) {
2443
+ const cityTarget = {
2444
+ ...target,
2445
+ level: "city",
2446
+ city,
2447
+ proxyName: proxyName(target.country, target.state, city),
2448
+ config: {
2449
+ country: target.country,
2450
+ state: target.state,
2451
+ city
2452
+ }
2453
+ };
2454
+ out.push(withProxyName(cityTarget, freshProxyName(cityTarget.proxyName, attemptIndex)));
2455
+ }
2456
+ const fallbackTarget = stateTarget(target);
2457
+ out.push(withProxyName(fallbackTarget, freshProxyName(fallbackTarget.proxyName, attemptIndex)));
2458
+ return out;
2459
+ }
2460
+ async function createFreshLocationProxy(kernel, options, target) {
2461
+ const createErrors = [];
2462
+ for (const candidate of freshTargetCandidates(target, options.proxyZip, options.attemptIndex)) {
2463
+ try {
2464
+ const created = await kernel.proxies.create({
2465
+ type: "residential",
2466
+ name: candidate.proxyName,
2467
+ config: candidate.level === "zip" ? { country: candidate.country, zip: candidate.zip } : candidate.config
2468
+ });
2469
+ if (created.id) {
2470
+ return resolution("location_created", options.proxyMode, created.id, candidate, null, true);
2471
+ }
2472
+ createErrors.push(`${candidate.proxyName}: Kernel did not return a proxy id`);
2473
+ } catch (err) {
2474
+ createErrors.push(`${candidate.proxyName}: ${errorText2(err)}`);
2475
+ }
2476
+ }
2477
+ return resolution("configured_fallback", options.proxyMode, options.configuredKernelProxyId, target, createErrors.join(" | "));
2478
+ }
2479
+ async function deleteKernelProxyId(kernelApiKey, proxyId) {
2480
+ if (!kernelApiKey || !proxyId) return;
2481
+ const kernel = new import_sdk2.default({ apiKey: kernelApiKey });
2482
+ await kernel.proxies.delete(proxyId);
2483
+ }
2413
2484
  async function resolveKernelProxyId(options) {
2414
2485
  if (options.proxyMode === "none") {
2415
2486
  return resolution("disabled", options.proxyMode, void 0, null, null);
@@ -2424,6 +2495,9 @@ async function resolveKernelProxyId(options) {
2424
2495
  const kernel = new import_sdk2.default({ apiKey: options.kernelApiKey });
2425
2496
  try {
2426
2497
  const attemptIndex = options.attemptIndex ?? 0;
2498
+ if (options.fresh) {
2499
+ return await createFreshLocationProxy(kernel, options, target);
2500
+ }
2427
2501
  if (attemptIndex >= 1) {
2428
2502
  const escalatedTarget = escalatedTargetLevel(target, attemptIndex);
2429
2503
  const createErrors2 = [];
@@ -2527,6 +2601,7 @@ async function resolveKernelProxyId(options) {
2527
2601
 
2528
2602
  // src/harvest.ts
2529
2603
  var MAX_ATTEMPTS = 3;
2604
+ var LOCATION_PROXY_MAX_ATTEMPTS = 5;
2530
2605
  function abortReason(signal) {
2531
2606
  if (signal.reason instanceof DOMException && signal.reason.name === "TimeoutError") return signal.reason;
2532
2607
  return new RequestAbortedError();
@@ -2556,9 +2631,12 @@ async function emitAttemptEvent(sink, event) {
2556
2631
  }
2557
2632
  function classifyAttemptError(err) {
2558
2633
  if (err instanceof CaptchaError) return "captcha";
2634
+ if (err instanceof LocationMismatchError) return "location_mismatch";
2559
2635
  if (err instanceof RequestAbortedError) return "request_aborted";
2560
2636
  if (err instanceof DOMException && (err.name === "TimeoutError" || err.name === "AbortError")) return "timeout";
2561
2637
  const message = err instanceof Error ? err.message : String(err);
2638
+ if (looksLikeProxyTunnelFailure(message)) return "proxy_tunnel_failed";
2639
+ if (looksLikeProxyUnavailable(message)) return "proxy_unavailable";
2562
2640
  return /timeout|timed out|Timeout \d+ms exceeded|deadline/i.test(message) ? "timeout" : "error";
2563
2641
  }
2564
2642
  function classifyAttemptResult(result) {
@@ -2567,6 +2645,49 @@ function classifyAttemptResult(result) {
2567
2645
  function errorMessage(err) {
2568
2646
  return err instanceof Error ? err.message : String(err);
2569
2647
  }
2648
+ function maxAttemptsForProxyMode(proxyMode) {
2649
+ return proxyMode === "location" ? LOCATION_PROXY_MAX_ATTEMPTS : MAX_ATTEMPTS;
2650
+ }
2651
+ function looksLikeProxyTunnelFailure(message) {
2652
+ return /ERR_TUNNEL_CONNECTION_FAILED|ERR_PROXY_CONNECTION_FAILED|ERR_SOCKS_CONNECTION_FAILED|tunnel connection failed|proxy connection failed|transport error: proxy/i.test(message);
2653
+ }
2654
+ function looksLikeProxyUnavailable(message) {
2655
+ return /proxy unavailable|proxy_unavailable|connection_test_failed|did not return a proxy id|configured fallback/i.test(message);
2656
+ }
2657
+ function retryableLocationProxyError(outcome) {
2658
+ return outcome === "captcha" || outcome === "proxy_tunnel_failed" || outcome === "proxy_unavailable";
2659
+ }
2660
+ function locationMismatchMessage(result) {
2661
+ const evidence = result.diagnostics.debug?.locationEvidence;
2662
+ const expected = evidence?.expected?.canonicalLocation ?? result.location ?? "requested location";
2663
+ const candidates = evidence?.candidates.slice(0, 3).map((candidate) => `${candidate.city}, ${candidate.regionCode}`).join("; ");
2664
+ return candidates ? `Google returned results for ${candidates}, not ${expected}` : `Google returned results for a different location than ${expected}`;
2665
+ }
2666
+ function shouldRetryLocationMismatch(result, proxyMode) {
2667
+ return proxyMode === "location" && result.diagnostics.debug?.locationEvidence?.status === "mismatch";
2668
+ }
2669
+ function stripInternalDebug(result, keepDebug) {
2670
+ if (keepDebug || !result.diagnostics.debug) return result;
2671
+ const diagnostics = { ...result.diagnostics };
2672
+ delete diagnostics.debug;
2673
+ return { ...result, diagnostics };
2674
+ }
2675
+ async function cleanupDisposableProxy(kernelApiKey, proxyId) {
2676
+ if (!kernelApiKey || !proxyId) return;
2677
+ try {
2678
+ await deleteKernelProxyId(kernelApiKey, proxyId);
2679
+ console.info(JSON.stringify({
2680
+ event: "kernel_proxy_deleted",
2681
+ proxy_id_suffix: proxyId.slice(-6)
2682
+ }));
2683
+ } catch (err) {
2684
+ console.warn(JSON.stringify({
2685
+ event: "kernel_proxy_delete_failed",
2686
+ proxy_id_suffix: proxyId.slice(-6),
2687
+ message: errorMessage(err)
2688
+ }));
2689
+ }
2690
+ }
2570
2691
  async function extractOnce(options, signal) {
2571
2692
  const driver = new BrowserDriver();
2572
2693
  const reporter = new ProgressReporter();
@@ -2634,26 +2755,35 @@ async function harvest(rawOptions) {
2634
2755
  proxyZip: typeof raw.proxyZip === "string" ? raw.proxyZip : void 0,
2635
2756
  gl: typeof raw.gl === "string" ? raw.gl : "us"
2636
2757
  };
2758
+ const requestedDebug = typeof raw.debug === "boolean" ? raw.debug : false;
2759
+ const needsLocationEvidence = proxyMode === "location" && Boolean(proxyOpts.location);
2760
+ const maxAttempts = maxAttemptsForProxyMode(proxyMode);
2637
2761
  const serializer = new OutputSerializer();
2638
- for (let i = 0; i < MAX_ATTEMPTS; i++) {
2762
+ let lastError = null;
2763
+ for (let i = 0; i < maxAttempts; i++) {
2639
2764
  const attemptNumber = i + 1;
2640
2765
  const startedAtMs = Date.now();
2641
2766
  try {
2642
2767
  if (signal?.aborted) throw abortReason(signal);
2643
- const resolution2 = await resolveKernelProxyId({ ...proxyOpts, attemptIndex: i });
2768
+ const resolution2 = await resolveKernelProxyId({
2769
+ ...proxyOpts,
2770
+ attemptIndex: i,
2771
+ fresh: proxyMode === "location"
2772
+ });
2644
2773
  const mergedAttempt = {
2645
2774
  ...raw,
2646
2775
  kernelApiKey,
2647
2776
  kernelProxyId: resolution2.kernelProxyId,
2648
2777
  kernelProxyResolution: resolution2.resolution,
2649
- proxyMode
2778
+ proxyMode,
2779
+ debug: requestedDebug || needsLocationEvidence
2650
2780
  };
2651
2781
  if (proxyMode === "none") mergedAttempt.kernelProxyId = void 0;
2652
2782
  const attemptOptions = HarvestOptionsSchema.parse(mergedAttempt);
2653
2783
  await emitAttemptEvent(onAttemptEvent, {
2654
2784
  type: "started",
2655
2785
  attemptNumber,
2656
- maxAttempts: MAX_ATTEMPTS,
2786
+ maxAttempts,
2657
2787
  query: attemptOptions.query,
2658
2788
  location: attemptOptions.location ?? null,
2659
2789
  maxQuestions: attemptOptions.maxQuestions,
@@ -2662,7 +2792,7 @@ async function harvest(rawOptions) {
2662
2792
  console.info(JSON.stringify({
2663
2793
  event: "harvest_attempt_started",
2664
2794
  attempt_number: attemptNumber,
2665
- max_attempts: MAX_ATTEMPTS,
2795
+ max_attempts: maxAttempts,
2666
2796
  query: attemptOptions.query,
2667
2797
  location: attemptOptions.location ?? null,
2668
2798
  max_questions: attemptOptions.maxQuestions
@@ -2670,57 +2800,84 @@ async function harvest(rawOptions) {
2670
2800
  const attempt = await extractOnce(attemptOptions, signal);
2671
2801
  if (attempt.error) {
2672
2802
  const err = attempt.error;
2673
- if (err instanceof CaptchaError) {
2674
- const willRetry = i < MAX_ATTEMPTS - 1;
2803
+ const outcome = classifyAttemptError(err);
2804
+ const willRetry = i < maxAttempts - 1 && (outcome === "captcha" || proxyMode === "location" && retryableLocationProxyError(outcome));
2805
+ if (outcome === "captcha") {
2675
2806
  console.warn(JSON.stringify({
2676
2807
  event: "harvest_attempt_captcha",
2677
2808
  attempt_number: attemptNumber,
2678
- max_attempts: MAX_ATTEMPTS,
2679
- message: err.message,
2809
+ max_attempts: maxAttempts,
2810
+ message: errorMessage(err),
2811
+ will_retry: willRetry
2812
+ }));
2813
+ } else if (willRetry) {
2814
+ console.warn(JSON.stringify({
2815
+ event: "harvest_attempt_proxy_retry",
2816
+ attempt_number: attemptNumber,
2817
+ max_attempts: maxAttempts,
2818
+ outcome,
2819
+ message: errorMessage(err),
2680
2820
  will_retry: willRetry
2681
2821
  }));
2682
- await emitAttemptEvent(onAttemptEvent, {
2683
- type: "finished",
2684
- attemptNumber,
2685
- maxAttempts: MAX_ATTEMPTS,
2686
- outcome: "captcha",
2687
- kernelSessionId: attempt.cleanup.kernelSessionId,
2688
- questionCount: 0,
2689
- durationMs: Date.now() - startedAtMs,
2690
- error: err.message,
2691
- willRetry,
2692
- cleanup: attempt.cleanup,
2693
- debug: attempt.debug,
2694
- completedAt: (/* @__PURE__ */ new Date()).toISOString()
2695
- });
2696
- if (willRetry) continue;
2697
- break;
2698
2822
  }
2699
2823
  await emitAttemptEvent(onAttemptEvent, {
2700
2824
  type: "finished",
2701
2825
  attemptNumber,
2702
- maxAttempts: MAX_ATTEMPTS,
2703
- outcome: classifyAttemptError(err),
2826
+ maxAttempts,
2827
+ outcome,
2704
2828
  kernelSessionId: attempt.cleanup.kernelSessionId,
2705
2829
  questionCount: 0,
2706
2830
  durationMs: Date.now() - startedAtMs,
2707
2831
  error: errorMessage(err),
2708
- willRetry: false,
2832
+ willRetry,
2709
2833
  cleanup: attempt.cleanup,
2710
2834
  debug: attempt.debug,
2711
2835
  completedAt: (/* @__PURE__ */ new Date()).toISOString()
2712
2836
  });
2713
- throw err;
2837
+ await cleanupDisposableProxy(kernelApiKey, resolution2.disposableProxyId);
2838
+ lastError = err;
2839
+ if (willRetry) continue;
2840
+ break;
2714
2841
  }
2715
2842
  const result = attempt.result;
2716
2843
  if (!result) throw new Error("Harvest attempt completed without a result");
2844
+ if (shouldRetryLocationMismatch(result, proxyMode)) {
2845
+ const err = new LocationMismatchError(locationMismatchMessage(result));
2846
+ const willRetry = i < maxAttempts - 1;
2847
+ console.warn(JSON.stringify({
2848
+ event: "harvest_attempt_location_mismatch",
2849
+ attempt_number: attemptNumber,
2850
+ max_attempts: maxAttempts,
2851
+ message: err.message,
2852
+ will_retry: willRetry
2853
+ }));
2854
+ await emitAttemptEvent(onAttemptEvent, {
2855
+ type: "finished",
2856
+ attemptNumber,
2857
+ maxAttempts,
2858
+ outcome: "location_mismatch",
2859
+ kernelSessionId: attempt.cleanup.kernelSessionId,
2860
+ questionCount: result.totalQuestions,
2861
+ durationMs: Date.now() - startedAtMs,
2862
+ error: err.message,
2863
+ willRetry,
2864
+ cleanup: attempt.cleanup,
2865
+ debug: attempt.debug,
2866
+ completedAt: (/* @__PURE__ */ new Date()).toISOString()
2867
+ });
2868
+ await cleanupDisposableProxy(kernelApiKey, resolution2.disposableProxyId);
2869
+ lastError = err;
2870
+ if (willRetry) continue;
2871
+ break;
2872
+ }
2873
+ const finalResult = stripInternalDebug(result, requestedDebug);
2717
2874
  await emitAttemptEvent(onAttemptEvent, {
2718
2875
  type: "finished",
2719
2876
  attemptNumber,
2720
- maxAttempts: MAX_ATTEMPTS,
2721
- outcome: classifyAttemptResult(result),
2877
+ maxAttempts,
2878
+ outcome: classifyAttemptResult(finalResult),
2722
2879
  kernelSessionId: attempt.cleanup.kernelSessionId,
2723
- questionCount: result.totalQuestions,
2880
+ questionCount: finalResult.totalQuestions,
2724
2881
  durationMs: Date.now() - startedAtMs,
2725
2882
  error: null,
2726
2883
  willRetry: false,
@@ -2728,64 +2885,52 @@ async function harvest(rawOptions) {
2728
2885
  debug: attempt.debug,
2729
2886
  completedAt: (/* @__PURE__ */ new Date()).toISOString()
2730
2887
  });
2888
+ await cleanupDisposableProxy(kernelApiKey, resolution2.disposableProxyId);
2731
2889
  if (attemptOptions.format === "json" || attemptOptions.format === "both") {
2732
- await serializer.writeJSON(result, attemptOptions.outputDir);
2890
+ await serializer.writeJSON(finalResult, attemptOptions.outputDir);
2733
2891
  }
2734
2892
  if (attemptOptions.format === "csv" || attemptOptions.format === "both") {
2735
2893
  await Promise.all([
2736
- serializer.writeCSV(result.flat, attemptOptions.outputDir),
2737
- result.videos.length > 0 ? serializer.writeVideoCSV(result.videos, result.seed, attemptOptions.outputDir) : Promise.resolve(""),
2738
- result.forums.length > 0 ? serializer.writeForumCSV(result.forums, result.seed, attemptOptions.outputDir) : Promise.resolve(""),
2739
- result.aiOverview.detected ? serializer.writeAIOverviewCSV(result.aiOverview.citations, result.aiOverview.text, result.seed, attemptOptions.outputDir) : Promise.resolve(""),
2740
- result.aiMode.detected ? serializer.writeAIModeCSV(result.aiMode.citations, result.aiMode.text, result.seed, attemptOptions.outputDir) : Promise.resolve(""),
2741
- result.whatPeopleSaying.length > 0 ? serializer.writeWhatPeopleSayingCSV(result.whatPeopleSaying, result.seed, attemptOptions.outputDir) : Promise.resolve("")
2894
+ serializer.writeCSV(finalResult.flat, attemptOptions.outputDir),
2895
+ finalResult.videos.length > 0 ? serializer.writeVideoCSV(finalResult.videos, finalResult.seed, attemptOptions.outputDir) : Promise.resolve(""),
2896
+ finalResult.forums.length > 0 ? serializer.writeForumCSV(finalResult.forums, finalResult.seed, attemptOptions.outputDir) : Promise.resolve(""),
2897
+ finalResult.aiOverview.detected ? serializer.writeAIOverviewCSV(finalResult.aiOverview.citations, finalResult.aiOverview.text, finalResult.seed, attemptOptions.outputDir) : Promise.resolve(""),
2898
+ finalResult.aiMode.detected ? serializer.writeAIModeCSV(finalResult.aiMode.citations, finalResult.aiMode.text, finalResult.seed, attemptOptions.outputDir) : Promise.resolve(""),
2899
+ finalResult.whatPeopleSaying.length > 0 ? serializer.writeWhatPeopleSayingCSV(finalResult.whatPeopleSaying, finalResult.seed, attemptOptions.outputDir) : Promise.resolve("")
2742
2900
  ]);
2743
2901
  }
2744
- return result;
2902
+ return finalResult;
2745
2903
  } catch (err) {
2746
- if (err instanceof CaptchaError) {
2747
- const willRetry = i < MAX_ATTEMPTS - 1;
2904
+ const outcome = classifyAttemptError(err);
2905
+ const willRetry = i < maxAttempts - 1 && (outcome === "captcha" || proxyMode === "location" && retryableLocationProxyError(outcome));
2906
+ if (outcome === "captcha") {
2748
2907
  console.warn(JSON.stringify({
2749
2908
  event: "harvest_attempt_captcha",
2750
2909
  attempt_number: attemptNumber,
2751
- max_attempts: MAX_ATTEMPTS,
2752
- message: err.message,
2910
+ max_attempts: maxAttempts,
2911
+ message: errorMessage(err),
2912
+ will_retry: willRetry
2913
+ }));
2914
+ } else if (willRetry) {
2915
+ console.warn(JSON.stringify({
2916
+ event: "harvest_attempt_proxy_retry",
2917
+ attempt_number: attemptNumber,
2918
+ max_attempts: maxAttempts,
2919
+ outcome,
2920
+ message: errorMessage(err),
2753
2921
  will_retry: willRetry
2754
2922
  }));
2755
- await emitAttemptEvent(onAttemptEvent, {
2756
- type: "finished",
2757
- attemptNumber,
2758
- maxAttempts: MAX_ATTEMPTS,
2759
- outcome: "captcha",
2760
- kernelSessionId: null,
2761
- questionCount: 0,
2762
- durationMs: Date.now() - startedAtMs,
2763
- error: err.message,
2764
- willRetry,
2765
- cleanup: {
2766
- kernelSessionId: null,
2767
- kernelDeleteStarted: false,
2768
- kernelDeleteSucceeded: null,
2769
- kernelDeleteError: null,
2770
- browserCloseSucceeded: null,
2771
- browserCloseError: null
2772
- },
2773
- debug: null,
2774
- completedAt: (/* @__PURE__ */ new Date()).toISOString()
2775
- });
2776
- if (willRetry) continue;
2777
- break;
2778
2923
  }
2779
2924
  await emitAttemptEvent(onAttemptEvent, {
2780
2925
  type: "finished",
2781
2926
  attemptNumber,
2782
- maxAttempts: MAX_ATTEMPTS,
2783
- outcome: classifyAttemptError(err),
2927
+ maxAttempts,
2928
+ outcome,
2784
2929
  kernelSessionId: null,
2785
2930
  questionCount: 0,
2786
2931
  durationMs: Date.now() - startedAtMs,
2787
2932
  error: errorMessage(err),
2788
- willRetry: false,
2933
+ willRetry,
2789
2934
  cleanup: {
2790
2935
  kernelSessionId: null,
2791
2936
  kernelDeleteStarted: false,
@@ -2797,15 +2942,19 @@ async function harvest(rawOptions) {
2797
2942
  debug: null,
2798
2943
  completedAt: (/* @__PURE__ */ new Date()).toISOString()
2799
2944
  });
2945
+ lastError = err;
2946
+ if (willRetry) continue;
2947
+ if (outcome === "captcha") break;
2800
2948
  throw err;
2801
2949
  }
2802
2950
  }
2951
+ if (lastError && !(lastError instanceof CaptchaError)) throw lastError;
2803
2952
  console.warn(JSON.stringify({
2804
2953
  event: "harvest_captcha_exhausted",
2805
- max_attempts: MAX_ATTEMPTS,
2954
+ max_attempts: maxAttempts,
2806
2955
  session_kind: kernelApiKey ? "kernel" : "local"
2807
2956
  }));
2808
- throw new CaptchaError(sanitizeVendorName(`CAPTCHA on all ${MAX_ATTEMPTS} fresh sessions. Try again in a few minutes.`));
2957
+ throw new CaptchaError(sanitizeVendorName(`CAPTCHA on all ${maxAttempts} fresh sessions. Try again in a few minutes.`));
2809
2958
  }
2810
2959
 
2811
2960
  // src/cli.ts