mcp-scraper 0.2.0 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +74 -8
- package/dist/bin/api-server.cjs +4691 -3614
- package/dist/bin/api-server.cjs.map +1 -1
- package/dist/bin/api-server.js +2 -2
- package/dist/bin/browser-agent-stdio-server.cjs +85 -8
- package/dist/bin/browser-agent-stdio-server.cjs.map +1 -1
- package/dist/bin/browser-agent-stdio-server.js +83 -6
- package/dist/bin/browser-agent-stdio-server.js.map +1 -1
- package/dist/bin/mcp-stdio-server.cjs +170 -12
- package/dist/bin/mcp-stdio-server.cjs.map +1 -1
- package/dist/bin/mcp-stdio-server.js +3 -3
- package/dist/bin/paa-harvest.cjs +223 -74
- package/dist/bin/paa-harvest.cjs.map +1 -1
- package/dist/bin/paa-harvest.js +2 -2
- package/dist/{chunk-GXBT5CDU.js → chunk-IQOCZGJJ.js} +39 -2
- package/dist/chunk-IQOCZGJJ.js.map +1 -0
- package/dist/{chunk-BMVQB3WN.js → chunk-KIF4PKFZ.js} +173 -14
- package/dist/chunk-KIF4PKFZ.js.map +1 -0
- package/dist/{chunk-ZMOWIBMK.js → chunk-M2S27J6Z.js} +9 -2
- package/dist/{chunk-ZMOWIBMK.js.map → chunk-M2S27J6Z.js.map} +1 -1
- package/dist/{chunk-TM22BLWP.js → chunk-MY3S7EX7.js} +221 -76
- package/dist/chunk-MY3S7EX7.js.map +1 -0
- package/dist/chunk-PYBMZ346.js +7 -0
- package/dist/chunk-PYBMZ346.js.map +1 -0
- package/dist/index.cjs +223 -74
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +1 -0
- package/dist/index.d.ts +1 -0
- package/dist/index.js +2 -2
- package/dist/{server-ASCMKUQ5.js → server-3QMDOEOS.js} +880 -181
- package/dist/server-3QMDOEOS.js.map +1 -0
- package/dist/{worker-KJ4A7WIR.js → worker-NAKGTIF5.js} +4 -4
- package/package.json +1 -1
- package/dist/chunk-2BS7BUEE.js +0 -7
- package/dist/chunk-2BS7BUEE.js.map +0 -1
- package/dist/chunk-BMVQB3WN.js.map +0 -1
- package/dist/chunk-GXBT5CDU.js.map +0 -1
- package/dist/chunk-TM22BLWP.js.map +0 -1
- package/dist/server-ASCMKUQ5.js.map +0 -1
- package/dist/{worker-KJ4A7WIR.js.map → worker-NAKGTIF5.js.map} +0 -0
package/dist/index.cjs
CHANGED
|
@@ -77,8 +77,12 @@ var MapsSearchOptionsSchema = import_zod.z.object({
|
|
|
77
77
|
gl: import_zod.z.string().length(2).default("us"),
|
|
78
78
|
hl: import_zod.z.string().length(2).default("en"),
|
|
79
79
|
maxResults: import_zod.z.number().int().min(1).max(50).default(10),
|
|
80
|
+
proxyMode: import_zod.z.enum(["location", "configured", "none"]).default("location"),
|
|
81
|
+
proxyZip: import_zod.z.string().regex(/^\d{5}$/).optional(),
|
|
82
|
+
debug: import_zod.z.boolean().default(false),
|
|
80
83
|
kernelApiKey: import_zod.z.string().optional(),
|
|
81
84
|
kernelProxyId: import_zod.z.string().optional(),
|
|
85
|
+
kernelProxyResolution: import_zod.z.unknown().optional(),
|
|
82
86
|
headless: import_zod.z.boolean().default(true)
|
|
83
87
|
});
|
|
84
88
|
var RawPAAItemSchema = import_zod.z.object({
|
|
@@ -254,6 +258,12 @@ var RequestAbortedError = class extends Error {
|
|
|
254
258
|
super(message);
|
|
255
259
|
}
|
|
256
260
|
};
|
|
261
|
+
var LocationMismatchError = class extends Error {
|
|
262
|
+
name = "LocationMismatchError";
|
|
263
|
+
constructor(message = "Google returned results for a different location than requested") {
|
|
264
|
+
super(message);
|
|
265
|
+
}
|
|
266
|
+
};
|
|
257
267
|
|
|
258
268
|
// src/driver/BrowserDriver.ts
|
|
259
269
|
import_playwright_extra.chromium.use((0, import_puppeteer_extra_plugin_stealth.default)());
|
|
@@ -2274,16 +2284,18 @@ var US_CITY_CENTER_ZIPS = {
|
|
|
2274
2284
|
function proxyIdSuffix2(proxyId) {
|
|
2275
2285
|
return proxyId ? proxyId.slice(-6) : null;
|
|
2276
2286
|
}
|
|
2277
|
-
function resolution(source, proxyMode, proxyId, target, error) {
|
|
2287
|
+
function resolution(source, proxyMode, proxyId, target, error, disposable = false) {
|
|
2278
2288
|
return {
|
|
2279
2289
|
kernelProxyId: proxyId,
|
|
2290
|
+
...disposable && proxyId ? { disposableProxyId: proxyId } : {},
|
|
2280
2291
|
resolution: {
|
|
2281
2292
|
source,
|
|
2282
2293
|
proxyMode,
|
|
2283
2294
|
proxyIdPresent: Boolean(proxyId),
|
|
2284
2295
|
proxyIdSuffix: proxyIdSuffix2(proxyId),
|
|
2285
2296
|
target,
|
|
2286
|
-
error
|
|
2297
|
+
error,
|
|
2298
|
+
disposable
|
|
2287
2299
|
}
|
|
2288
2300
|
};
|
|
2289
2301
|
}
|
|
@@ -2313,6 +2325,10 @@ function kernelCityIdentifierCandidates(city) {
|
|
|
2313
2325
|
function proxyName(country, state, city) {
|
|
2314
2326
|
return city ? `mcp-serp-residential-${country.toLowerCase()}-${state.toLowerCase()}-${city}` : `mcp-serp-residential-${country.toLowerCase()}-${state.toLowerCase()}`;
|
|
2315
2327
|
}
|
|
2328
|
+
function freshProxyName(baseName, attemptIndex) {
|
|
2329
|
+
const stamp = `${Date.now()}-${attemptIndex ?? 0}-${Math.random().toString(36).slice(2, 8)}`;
|
|
2330
|
+
return `${baseName}-fresh-${stamp}`;
|
|
2331
|
+
}
|
|
2316
2332
|
function zipProxyName(zip) {
|
|
2317
2333
|
return `mcp-serp-residential-us-zip-${zip}`;
|
|
2318
2334
|
}
|
|
@@ -2382,6 +2398,12 @@ function zipTarget(target, zip) {
|
|
|
2382
2398
|
}
|
|
2383
2399
|
};
|
|
2384
2400
|
}
|
|
2401
|
+
function withProxyName(target, name) {
|
|
2402
|
+
return {
|
|
2403
|
+
...target,
|
|
2404
|
+
proxyName: name
|
|
2405
|
+
};
|
|
2406
|
+
}
|
|
2385
2407
|
function configMatches(config, target, city) {
|
|
2386
2408
|
if (target.level === "zip") {
|
|
2387
2409
|
return config?.country?.toUpperCase() === target.country && config?.zip === target.zip;
|
|
@@ -2420,6 +2442,55 @@ function escalatedTargetLevel(target, attemptIndex) {
|
|
|
2420
2442
|
function errorText2(err) {
|
|
2421
2443
|
return err instanceof Error ? err.message : String(err);
|
|
2422
2444
|
}
|
|
2445
|
+
function freshTargetCandidates(target, explicitZip, attemptIndex) {
|
|
2446
|
+
const out = [];
|
|
2447
|
+
const zip = knownZipFor(target, explicitZip);
|
|
2448
|
+
if (zip) {
|
|
2449
|
+
const targetZip = zipTarget(target, zip);
|
|
2450
|
+
out.push(withProxyName(targetZip, freshProxyName(targetZip.proxyName, attemptIndex)));
|
|
2451
|
+
}
|
|
2452
|
+
for (const city of target.cityCandidates) {
|
|
2453
|
+
const cityTarget = {
|
|
2454
|
+
...target,
|
|
2455
|
+
level: "city",
|
|
2456
|
+
city,
|
|
2457
|
+
proxyName: proxyName(target.country, target.state, city),
|
|
2458
|
+
config: {
|
|
2459
|
+
country: target.country,
|
|
2460
|
+
state: target.state,
|
|
2461
|
+
city
|
|
2462
|
+
}
|
|
2463
|
+
};
|
|
2464
|
+
out.push(withProxyName(cityTarget, freshProxyName(cityTarget.proxyName, attemptIndex)));
|
|
2465
|
+
}
|
|
2466
|
+
const fallbackTarget = stateTarget(target);
|
|
2467
|
+
out.push(withProxyName(fallbackTarget, freshProxyName(fallbackTarget.proxyName, attemptIndex)));
|
|
2468
|
+
return out;
|
|
2469
|
+
}
|
|
2470
|
+
async function createFreshLocationProxy(kernel, options, target) {
|
|
2471
|
+
const createErrors = [];
|
|
2472
|
+
for (const candidate of freshTargetCandidates(target, options.proxyZip, options.attemptIndex)) {
|
|
2473
|
+
try {
|
|
2474
|
+
const created = await kernel.proxies.create({
|
|
2475
|
+
type: "residential",
|
|
2476
|
+
name: candidate.proxyName,
|
|
2477
|
+
config: candidate.level === "zip" ? { country: candidate.country, zip: candidate.zip } : candidate.config
|
|
2478
|
+
});
|
|
2479
|
+
if (created.id) {
|
|
2480
|
+
return resolution("location_created", options.proxyMode, created.id, candidate, null, true);
|
|
2481
|
+
}
|
|
2482
|
+
createErrors.push(`${candidate.proxyName}: Kernel did not return a proxy id`);
|
|
2483
|
+
} catch (err) {
|
|
2484
|
+
createErrors.push(`${candidate.proxyName}: ${errorText2(err)}`);
|
|
2485
|
+
}
|
|
2486
|
+
}
|
|
2487
|
+
return resolution("configured_fallback", options.proxyMode, options.configuredKernelProxyId, target, createErrors.join(" | "));
|
|
2488
|
+
}
|
|
2489
|
+
async function deleteKernelProxyId(kernelApiKey, proxyId) {
|
|
2490
|
+
if (!kernelApiKey || !proxyId) return;
|
|
2491
|
+
const kernel = new import_sdk2.default({ apiKey: kernelApiKey });
|
|
2492
|
+
await kernel.proxies.delete(proxyId);
|
|
2493
|
+
}
|
|
2423
2494
|
async function resolveKernelProxyId(options) {
|
|
2424
2495
|
if (options.proxyMode === "none") {
|
|
2425
2496
|
return resolution("disabled", options.proxyMode, void 0, null, null);
|
|
@@ -2434,6 +2505,9 @@ async function resolveKernelProxyId(options) {
|
|
|
2434
2505
|
const kernel = new import_sdk2.default({ apiKey: options.kernelApiKey });
|
|
2435
2506
|
try {
|
|
2436
2507
|
const attemptIndex = options.attemptIndex ?? 0;
|
|
2508
|
+
if (options.fresh) {
|
|
2509
|
+
return await createFreshLocationProxy(kernel, options, target);
|
|
2510
|
+
}
|
|
2437
2511
|
if (attemptIndex >= 1) {
|
|
2438
2512
|
const escalatedTarget = escalatedTargetLevel(target, attemptIndex);
|
|
2439
2513
|
const createErrors2 = [];
|
|
@@ -2537,6 +2611,7 @@ async function resolveKernelProxyId(options) {
|
|
|
2537
2611
|
|
|
2538
2612
|
// src/harvest.ts
|
|
2539
2613
|
var MAX_ATTEMPTS = 3;
|
|
2614
|
+
var LOCATION_PROXY_MAX_ATTEMPTS = 5;
|
|
2540
2615
|
function abortReason(signal) {
|
|
2541
2616
|
if (signal.reason instanceof DOMException && signal.reason.name === "TimeoutError") return signal.reason;
|
|
2542
2617
|
return new RequestAbortedError();
|
|
@@ -2566,9 +2641,12 @@ async function emitAttemptEvent(sink, event) {
|
|
|
2566
2641
|
}
|
|
2567
2642
|
function classifyAttemptError(err) {
|
|
2568
2643
|
if (err instanceof CaptchaError) return "captcha";
|
|
2644
|
+
if (err instanceof LocationMismatchError) return "location_mismatch";
|
|
2569
2645
|
if (err instanceof RequestAbortedError) return "request_aborted";
|
|
2570
2646
|
if (err instanceof DOMException && (err.name === "TimeoutError" || err.name === "AbortError")) return "timeout";
|
|
2571
2647
|
const message = err instanceof Error ? err.message : String(err);
|
|
2648
|
+
if (looksLikeProxyTunnelFailure(message)) return "proxy_tunnel_failed";
|
|
2649
|
+
if (looksLikeProxyUnavailable(message)) return "proxy_unavailable";
|
|
2572
2650
|
return /timeout|timed out|Timeout \d+ms exceeded|deadline/i.test(message) ? "timeout" : "error";
|
|
2573
2651
|
}
|
|
2574
2652
|
function classifyAttemptResult(result) {
|
|
@@ -2577,6 +2655,49 @@ function classifyAttemptResult(result) {
|
|
|
2577
2655
|
function errorMessage(err) {
|
|
2578
2656
|
return err instanceof Error ? err.message : String(err);
|
|
2579
2657
|
}
|
|
2658
|
+
function maxAttemptsForProxyMode(proxyMode) {
|
|
2659
|
+
return proxyMode === "location" ? LOCATION_PROXY_MAX_ATTEMPTS : MAX_ATTEMPTS;
|
|
2660
|
+
}
|
|
2661
|
+
function looksLikeProxyTunnelFailure(message) {
|
|
2662
|
+
return /ERR_TUNNEL_CONNECTION_FAILED|ERR_PROXY_CONNECTION_FAILED|ERR_SOCKS_CONNECTION_FAILED|tunnel connection failed|proxy connection failed|transport error: proxy/i.test(message);
|
|
2663
|
+
}
|
|
2664
|
+
function looksLikeProxyUnavailable(message) {
|
|
2665
|
+
return /proxy unavailable|proxy_unavailable|connection_test_failed|did not return a proxy id|configured fallback/i.test(message);
|
|
2666
|
+
}
|
|
2667
|
+
function retryableLocationProxyError(outcome) {
|
|
2668
|
+
return outcome === "captcha" || outcome === "proxy_tunnel_failed" || outcome === "proxy_unavailable";
|
|
2669
|
+
}
|
|
2670
|
+
function locationMismatchMessage(result) {
|
|
2671
|
+
const evidence = result.diagnostics.debug?.locationEvidence;
|
|
2672
|
+
const expected = evidence?.expected?.canonicalLocation ?? result.location ?? "requested location";
|
|
2673
|
+
const candidates = evidence?.candidates.slice(0, 3).map((candidate) => `${candidate.city}, ${candidate.regionCode}`).join("; ");
|
|
2674
|
+
return candidates ? `Google returned results for ${candidates}, not ${expected}` : `Google returned results for a different location than ${expected}`;
|
|
2675
|
+
}
|
|
2676
|
+
function shouldRetryLocationMismatch(result, proxyMode) {
|
|
2677
|
+
return proxyMode === "location" && result.diagnostics.debug?.locationEvidence?.status === "mismatch";
|
|
2678
|
+
}
|
|
2679
|
+
function stripInternalDebug(result, keepDebug) {
|
|
2680
|
+
if (keepDebug || !result.diagnostics.debug) return result;
|
|
2681
|
+
const diagnostics = { ...result.diagnostics };
|
|
2682
|
+
delete diagnostics.debug;
|
|
2683
|
+
return { ...result, diagnostics };
|
|
2684
|
+
}
|
|
2685
|
+
async function cleanupDisposableProxy(kernelApiKey, proxyId) {
|
|
2686
|
+
if (!kernelApiKey || !proxyId) return;
|
|
2687
|
+
try {
|
|
2688
|
+
await deleteKernelProxyId(kernelApiKey, proxyId);
|
|
2689
|
+
console.info(JSON.stringify({
|
|
2690
|
+
event: "kernel_proxy_deleted",
|
|
2691
|
+
proxy_id_suffix: proxyId.slice(-6)
|
|
2692
|
+
}));
|
|
2693
|
+
} catch (err) {
|
|
2694
|
+
console.warn(JSON.stringify({
|
|
2695
|
+
event: "kernel_proxy_delete_failed",
|
|
2696
|
+
proxy_id_suffix: proxyId.slice(-6),
|
|
2697
|
+
message: errorMessage(err)
|
|
2698
|
+
}));
|
|
2699
|
+
}
|
|
2700
|
+
}
|
|
2580
2701
|
async function extractOnce(options, signal) {
|
|
2581
2702
|
const driver = new BrowserDriver();
|
|
2582
2703
|
const reporter = new ProgressReporter();
|
|
@@ -2644,26 +2765,35 @@ async function harvest(rawOptions) {
|
|
|
2644
2765
|
proxyZip: typeof raw.proxyZip === "string" ? raw.proxyZip : void 0,
|
|
2645
2766
|
gl: typeof raw.gl === "string" ? raw.gl : "us"
|
|
2646
2767
|
};
|
|
2768
|
+
const requestedDebug = typeof raw.debug === "boolean" ? raw.debug : false;
|
|
2769
|
+
const needsLocationEvidence = proxyMode === "location" && Boolean(proxyOpts.location);
|
|
2770
|
+
const maxAttempts = maxAttemptsForProxyMode(proxyMode);
|
|
2647
2771
|
const serializer = new OutputSerializer();
|
|
2648
|
-
|
|
2772
|
+
let lastError = null;
|
|
2773
|
+
for (let i = 0; i < maxAttempts; i++) {
|
|
2649
2774
|
const attemptNumber = i + 1;
|
|
2650
2775
|
const startedAtMs = Date.now();
|
|
2651
2776
|
try {
|
|
2652
2777
|
if (signal?.aborted) throw abortReason(signal);
|
|
2653
|
-
const resolution2 = await resolveKernelProxyId({
|
|
2778
|
+
const resolution2 = await resolveKernelProxyId({
|
|
2779
|
+
...proxyOpts,
|
|
2780
|
+
attemptIndex: i,
|
|
2781
|
+
fresh: proxyMode === "location"
|
|
2782
|
+
});
|
|
2654
2783
|
const mergedAttempt = {
|
|
2655
2784
|
...raw,
|
|
2656
2785
|
kernelApiKey,
|
|
2657
2786
|
kernelProxyId: resolution2.kernelProxyId,
|
|
2658
2787
|
kernelProxyResolution: resolution2.resolution,
|
|
2659
|
-
proxyMode
|
|
2788
|
+
proxyMode,
|
|
2789
|
+
debug: requestedDebug || needsLocationEvidence
|
|
2660
2790
|
};
|
|
2661
2791
|
if (proxyMode === "none") mergedAttempt.kernelProxyId = void 0;
|
|
2662
2792
|
const attemptOptions = HarvestOptionsSchema.parse(mergedAttempt);
|
|
2663
2793
|
await emitAttemptEvent(onAttemptEvent, {
|
|
2664
2794
|
type: "started",
|
|
2665
2795
|
attemptNumber,
|
|
2666
|
-
maxAttempts
|
|
2796
|
+
maxAttempts,
|
|
2667
2797
|
query: attemptOptions.query,
|
|
2668
2798
|
location: attemptOptions.location ?? null,
|
|
2669
2799
|
maxQuestions: attemptOptions.maxQuestions,
|
|
@@ -2672,7 +2802,7 @@ async function harvest(rawOptions) {
|
|
|
2672
2802
|
console.info(JSON.stringify({
|
|
2673
2803
|
event: "harvest_attempt_started",
|
|
2674
2804
|
attempt_number: attemptNumber,
|
|
2675
|
-
max_attempts:
|
|
2805
|
+
max_attempts: maxAttempts,
|
|
2676
2806
|
query: attemptOptions.query,
|
|
2677
2807
|
location: attemptOptions.location ?? null,
|
|
2678
2808
|
max_questions: attemptOptions.maxQuestions
|
|
@@ -2680,57 +2810,84 @@ async function harvest(rawOptions) {
|
|
|
2680
2810
|
const attempt = await extractOnce(attemptOptions, signal);
|
|
2681
2811
|
if (attempt.error) {
|
|
2682
2812
|
const err = attempt.error;
|
|
2683
|
-
|
|
2684
|
-
|
|
2813
|
+
const outcome = classifyAttemptError(err);
|
|
2814
|
+
const willRetry = i < maxAttempts - 1 && (outcome === "captcha" || proxyMode === "location" && retryableLocationProxyError(outcome));
|
|
2815
|
+
if (outcome === "captcha") {
|
|
2685
2816
|
console.warn(JSON.stringify({
|
|
2686
2817
|
event: "harvest_attempt_captcha",
|
|
2687
2818
|
attempt_number: attemptNumber,
|
|
2688
|
-
max_attempts:
|
|
2689
|
-
message: err
|
|
2819
|
+
max_attempts: maxAttempts,
|
|
2820
|
+
message: errorMessage(err),
|
|
2821
|
+
will_retry: willRetry
|
|
2822
|
+
}));
|
|
2823
|
+
} else if (willRetry) {
|
|
2824
|
+
console.warn(JSON.stringify({
|
|
2825
|
+
event: "harvest_attempt_proxy_retry",
|
|
2826
|
+
attempt_number: attemptNumber,
|
|
2827
|
+
max_attempts: maxAttempts,
|
|
2828
|
+
outcome,
|
|
2829
|
+
message: errorMessage(err),
|
|
2690
2830
|
will_retry: willRetry
|
|
2691
2831
|
}));
|
|
2692
|
-
await emitAttemptEvent(onAttemptEvent, {
|
|
2693
|
-
type: "finished",
|
|
2694
|
-
attemptNumber,
|
|
2695
|
-
maxAttempts: MAX_ATTEMPTS,
|
|
2696
|
-
outcome: "captcha",
|
|
2697
|
-
kernelSessionId: attempt.cleanup.kernelSessionId,
|
|
2698
|
-
questionCount: 0,
|
|
2699
|
-
durationMs: Date.now() - startedAtMs,
|
|
2700
|
-
error: err.message,
|
|
2701
|
-
willRetry,
|
|
2702
|
-
cleanup: attempt.cleanup,
|
|
2703
|
-
debug: attempt.debug,
|
|
2704
|
-
completedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
2705
|
-
});
|
|
2706
|
-
if (willRetry) continue;
|
|
2707
|
-
break;
|
|
2708
2832
|
}
|
|
2709
2833
|
await emitAttemptEvent(onAttemptEvent, {
|
|
2710
2834
|
type: "finished",
|
|
2711
2835
|
attemptNumber,
|
|
2712
|
-
maxAttempts
|
|
2713
|
-
outcome
|
|
2836
|
+
maxAttempts,
|
|
2837
|
+
outcome,
|
|
2714
2838
|
kernelSessionId: attempt.cleanup.kernelSessionId,
|
|
2715
2839
|
questionCount: 0,
|
|
2716
2840
|
durationMs: Date.now() - startedAtMs,
|
|
2717
2841
|
error: errorMessage(err),
|
|
2718
|
-
willRetry
|
|
2842
|
+
willRetry,
|
|
2719
2843
|
cleanup: attempt.cleanup,
|
|
2720
2844
|
debug: attempt.debug,
|
|
2721
2845
|
completedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
2722
2846
|
});
|
|
2723
|
-
|
|
2847
|
+
await cleanupDisposableProxy(kernelApiKey, resolution2.disposableProxyId);
|
|
2848
|
+
lastError = err;
|
|
2849
|
+
if (willRetry) continue;
|
|
2850
|
+
break;
|
|
2724
2851
|
}
|
|
2725
2852
|
const result = attempt.result;
|
|
2726
2853
|
if (!result) throw new Error("Harvest attempt completed without a result");
|
|
2854
|
+
if (shouldRetryLocationMismatch(result, proxyMode)) {
|
|
2855
|
+
const err = new LocationMismatchError(locationMismatchMessage(result));
|
|
2856
|
+
const willRetry = i < maxAttempts - 1;
|
|
2857
|
+
console.warn(JSON.stringify({
|
|
2858
|
+
event: "harvest_attempt_location_mismatch",
|
|
2859
|
+
attempt_number: attemptNumber,
|
|
2860
|
+
max_attempts: maxAttempts,
|
|
2861
|
+
message: err.message,
|
|
2862
|
+
will_retry: willRetry
|
|
2863
|
+
}));
|
|
2864
|
+
await emitAttemptEvent(onAttemptEvent, {
|
|
2865
|
+
type: "finished",
|
|
2866
|
+
attemptNumber,
|
|
2867
|
+
maxAttempts,
|
|
2868
|
+
outcome: "location_mismatch",
|
|
2869
|
+
kernelSessionId: attempt.cleanup.kernelSessionId,
|
|
2870
|
+
questionCount: result.totalQuestions,
|
|
2871
|
+
durationMs: Date.now() - startedAtMs,
|
|
2872
|
+
error: err.message,
|
|
2873
|
+
willRetry,
|
|
2874
|
+
cleanup: attempt.cleanup,
|
|
2875
|
+
debug: attempt.debug,
|
|
2876
|
+
completedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
2877
|
+
});
|
|
2878
|
+
await cleanupDisposableProxy(kernelApiKey, resolution2.disposableProxyId);
|
|
2879
|
+
lastError = err;
|
|
2880
|
+
if (willRetry) continue;
|
|
2881
|
+
break;
|
|
2882
|
+
}
|
|
2883
|
+
const finalResult = stripInternalDebug(result, requestedDebug);
|
|
2727
2884
|
await emitAttemptEvent(onAttemptEvent, {
|
|
2728
2885
|
type: "finished",
|
|
2729
2886
|
attemptNumber,
|
|
2730
|
-
maxAttempts
|
|
2731
|
-
outcome: classifyAttemptResult(
|
|
2887
|
+
maxAttempts,
|
|
2888
|
+
outcome: classifyAttemptResult(finalResult),
|
|
2732
2889
|
kernelSessionId: attempt.cleanup.kernelSessionId,
|
|
2733
|
-
questionCount:
|
|
2890
|
+
questionCount: finalResult.totalQuestions,
|
|
2734
2891
|
durationMs: Date.now() - startedAtMs,
|
|
2735
2892
|
error: null,
|
|
2736
2893
|
willRetry: false,
|
|
@@ -2738,64 +2895,52 @@ async function harvest(rawOptions) {
|
|
|
2738
2895
|
debug: attempt.debug,
|
|
2739
2896
|
completedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
2740
2897
|
});
|
|
2898
|
+
await cleanupDisposableProxy(kernelApiKey, resolution2.disposableProxyId);
|
|
2741
2899
|
if (attemptOptions.format === "json" || attemptOptions.format === "both") {
|
|
2742
|
-
await serializer.writeJSON(
|
|
2900
|
+
await serializer.writeJSON(finalResult, attemptOptions.outputDir);
|
|
2743
2901
|
}
|
|
2744
2902
|
if (attemptOptions.format === "csv" || attemptOptions.format === "both") {
|
|
2745
2903
|
await Promise.all([
|
|
2746
|
-
serializer.writeCSV(
|
|
2747
|
-
|
|
2748
|
-
|
|
2749
|
-
|
|
2750
|
-
|
|
2751
|
-
|
|
2904
|
+
serializer.writeCSV(finalResult.flat, attemptOptions.outputDir),
|
|
2905
|
+
finalResult.videos.length > 0 ? serializer.writeVideoCSV(finalResult.videos, finalResult.seed, attemptOptions.outputDir) : Promise.resolve(""),
|
|
2906
|
+
finalResult.forums.length > 0 ? serializer.writeForumCSV(finalResult.forums, finalResult.seed, attemptOptions.outputDir) : Promise.resolve(""),
|
|
2907
|
+
finalResult.aiOverview.detected ? serializer.writeAIOverviewCSV(finalResult.aiOverview.citations, finalResult.aiOverview.text, finalResult.seed, attemptOptions.outputDir) : Promise.resolve(""),
|
|
2908
|
+
finalResult.aiMode.detected ? serializer.writeAIModeCSV(finalResult.aiMode.citations, finalResult.aiMode.text, finalResult.seed, attemptOptions.outputDir) : Promise.resolve(""),
|
|
2909
|
+
finalResult.whatPeopleSaying.length > 0 ? serializer.writeWhatPeopleSayingCSV(finalResult.whatPeopleSaying, finalResult.seed, attemptOptions.outputDir) : Promise.resolve("")
|
|
2752
2910
|
]);
|
|
2753
2911
|
}
|
|
2754
|
-
return
|
|
2912
|
+
return finalResult;
|
|
2755
2913
|
} catch (err) {
|
|
2756
|
-
|
|
2757
|
-
|
|
2914
|
+
const outcome = classifyAttemptError(err);
|
|
2915
|
+
const willRetry = i < maxAttempts - 1 && (outcome === "captcha" || proxyMode === "location" && retryableLocationProxyError(outcome));
|
|
2916
|
+
if (outcome === "captcha") {
|
|
2758
2917
|
console.warn(JSON.stringify({
|
|
2759
2918
|
event: "harvest_attempt_captcha",
|
|
2760
2919
|
attempt_number: attemptNumber,
|
|
2761
|
-
max_attempts:
|
|
2762
|
-
message: err
|
|
2920
|
+
max_attempts: maxAttempts,
|
|
2921
|
+
message: errorMessage(err),
|
|
2922
|
+
will_retry: willRetry
|
|
2923
|
+
}));
|
|
2924
|
+
} else if (willRetry) {
|
|
2925
|
+
console.warn(JSON.stringify({
|
|
2926
|
+
event: "harvest_attempt_proxy_retry",
|
|
2927
|
+
attempt_number: attemptNumber,
|
|
2928
|
+
max_attempts: maxAttempts,
|
|
2929
|
+
outcome,
|
|
2930
|
+
message: errorMessage(err),
|
|
2763
2931
|
will_retry: willRetry
|
|
2764
2932
|
}));
|
|
2765
|
-
await emitAttemptEvent(onAttemptEvent, {
|
|
2766
|
-
type: "finished",
|
|
2767
|
-
attemptNumber,
|
|
2768
|
-
maxAttempts: MAX_ATTEMPTS,
|
|
2769
|
-
outcome: "captcha",
|
|
2770
|
-
kernelSessionId: null,
|
|
2771
|
-
questionCount: 0,
|
|
2772
|
-
durationMs: Date.now() - startedAtMs,
|
|
2773
|
-
error: err.message,
|
|
2774
|
-
willRetry,
|
|
2775
|
-
cleanup: {
|
|
2776
|
-
kernelSessionId: null,
|
|
2777
|
-
kernelDeleteStarted: false,
|
|
2778
|
-
kernelDeleteSucceeded: null,
|
|
2779
|
-
kernelDeleteError: null,
|
|
2780
|
-
browserCloseSucceeded: null,
|
|
2781
|
-
browserCloseError: null
|
|
2782
|
-
},
|
|
2783
|
-
debug: null,
|
|
2784
|
-
completedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
2785
|
-
});
|
|
2786
|
-
if (willRetry) continue;
|
|
2787
|
-
break;
|
|
2788
2933
|
}
|
|
2789
2934
|
await emitAttemptEvent(onAttemptEvent, {
|
|
2790
2935
|
type: "finished",
|
|
2791
2936
|
attemptNumber,
|
|
2792
|
-
maxAttempts
|
|
2793
|
-
outcome
|
|
2937
|
+
maxAttempts,
|
|
2938
|
+
outcome,
|
|
2794
2939
|
kernelSessionId: null,
|
|
2795
2940
|
questionCount: 0,
|
|
2796
2941
|
durationMs: Date.now() - startedAtMs,
|
|
2797
2942
|
error: errorMessage(err),
|
|
2798
|
-
willRetry
|
|
2943
|
+
willRetry,
|
|
2799
2944
|
cleanup: {
|
|
2800
2945
|
kernelSessionId: null,
|
|
2801
2946
|
kernelDeleteStarted: false,
|
|
@@ -2807,15 +2952,19 @@ async function harvest(rawOptions) {
|
|
|
2807
2952
|
debug: null,
|
|
2808
2953
|
completedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
2809
2954
|
});
|
|
2955
|
+
lastError = err;
|
|
2956
|
+
if (willRetry) continue;
|
|
2957
|
+
if (outcome === "captcha") break;
|
|
2810
2958
|
throw err;
|
|
2811
2959
|
}
|
|
2812
2960
|
}
|
|
2961
|
+
if (lastError && !(lastError instanceof CaptchaError)) throw lastError;
|
|
2813
2962
|
console.warn(JSON.stringify({
|
|
2814
2963
|
event: "harvest_captcha_exhausted",
|
|
2815
|
-
max_attempts:
|
|
2964
|
+
max_attempts: maxAttempts,
|
|
2816
2965
|
session_kind: kernelApiKey ? "kernel" : "local"
|
|
2817
2966
|
}));
|
|
2818
|
-
throw new CaptchaError(sanitizeVendorName(`CAPTCHA on all ${
|
|
2967
|
+
throw new CaptchaError(sanitizeVendorName(`CAPTCHA on all ${maxAttempts} fresh sessions. Try again in a few minutes.`));
|
|
2819
2968
|
}
|
|
2820
2969
|
|
|
2821
2970
|
// src/video/VideoGenerator.ts
|