mcp-scraper 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +74 -8
- package/dist/bin/api-server.cjs +4691 -3614
- package/dist/bin/api-server.cjs.map +1 -1
- package/dist/bin/api-server.js +2 -2
- package/dist/bin/browser-agent-stdio-server.cjs +85 -8
- package/dist/bin/browser-agent-stdio-server.cjs.map +1 -1
- package/dist/bin/browser-agent-stdio-server.js +83 -6
- package/dist/bin/browser-agent-stdio-server.js.map +1 -1
- package/dist/bin/mcp-stdio-server.cjs +170 -12
- package/dist/bin/mcp-stdio-server.cjs.map +1 -1
- package/dist/bin/mcp-stdio-server.js +3 -3
- package/dist/bin/paa-harvest.cjs +223 -74
- package/dist/bin/paa-harvest.cjs.map +1 -1
- package/dist/bin/paa-harvest.js +2 -2
- package/dist/{chunk-GXBT5CDU.js → chunk-IQOCZGJJ.js} +39 -2
- package/dist/chunk-IQOCZGJJ.js.map +1 -0
- package/dist/{chunk-ZMOWIBMK.js → chunk-M2S27J6Z.js} +9 -2
- package/dist/{chunk-ZMOWIBMK.js.map → chunk-M2S27J6Z.js.map} +1 -1
- package/dist/{chunk-TM22BLWP.js → chunk-MY3S7EX7.js} +221 -76
- package/dist/chunk-MY3S7EX7.js.map +1 -0
- package/dist/{chunk-BMVQB3WN.js → chunk-OR7DLLH2.js} +173 -14
- package/dist/chunk-OR7DLLH2.js.map +1 -0
- package/dist/chunk-XR65SANX.js +7 -0
- package/dist/chunk-XR65SANX.js.map +1 -0
- package/dist/index.cjs +223 -74
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +1 -0
- package/dist/index.d.ts +1 -0
- package/dist/index.js +2 -2
- package/dist/{server-ASCMKUQ5.js → server-CJMX2QUM.js} +880 -181
- package/dist/server-CJMX2QUM.js.map +1 -0
- package/dist/{worker-KJ4A7WIR.js → worker-NAKGTIF5.js} +4 -4
- package/package.json +1 -1
- package/dist/chunk-2BS7BUEE.js +0 -7
- package/dist/chunk-2BS7BUEE.js.map +0 -1
- package/dist/chunk-BMVQB3WN.js.map +0 -1
- package/dist/chunk-GXBT5CDU.js.map +0 -1
- package/dist/chunk-TM22BLWP.js.map +0 -1
- package/dist/server-ASCMKUQ5.js.map +0 -1
- package/dist/{worker-KJ4A7WIR.js.map → worker-NAKGTIF5.js.map} +0 -0
|
@@ -2,9 +2,9 @@
|
|
|
2
2
|
import {
|
|
3
3
|
HttpMcpToolExecutor,
|
|
4
4
|
buildPaaExtractorMcpServer
|
|
5
|
-
} from "../chunk-BMVQB3WN.js";
|
|
6
|
-
import "../chunk-2BS7BUEE.js";
|
|
7
|
-
import "../chunk-ZMOWIBMK.js";
|
|
5
|
+
} from "../chunk-OR7DLLH2.js";
|
|
6
|
+
import "../chunk-XR65SANX.js";
|
|
7
|
+
import "../chunk-M2S27J6Z.js";
|
|
8
8
|
|
|
9
9
|
// bin/mcp-stdio-server.ts
|
|
10
10
|
import { readFileSync } from "fs";
|
package/dist/bin/paa-harvest.cjs
CHANGED
|
@@ -77,8 +77,12 @@ var MapsSearchOptionsSchema = import_zod.z.object({
|
|
|
77
77
|
gl: import_zod.z.string().length(2).default("us"),
|
|
78
78
|
hl: import_zod.z.string().length(2).default("en"),
|
|
79
79
|
maxResults: import_zod.z.number().int().min(1).max(50).default(10),
|
|
80
|
+
proxyMode: import_zod.z.enum(["location", "configured", "none"]).default("location"),
|
|
81
|
+
proxyZip: import_zod.z.string().regex(/^\d{5}$/).optional(),
|
|
82
|
+
debug: import_zod.z.boolean().default(false),
|
|
80
83
|
kernelApiKey: import_zod.z.string().optional(),
|
|
81
84
|
kernelProxyId: import_zod.z.string().optional(),
|
|
85
|
+
kernelProxyResolution: import_zod.z.unknown().optional(),
|
|
82
86
|
headless: import_zod.z.boolean().default(true)
|
|
83
87
|
});
|
|
84
88
|
var RawPAAItemSchema = import_zod.z.object({
|
|
@@ -244,6 +248,12 @@ var RequestAbortedError = class extends Error {
|
|
|
244
248
|
super(message);
|
|
245
249
|
}
|
|
246
250
|
};
|
|
251
|
+
var LocationMismatchError = class extends Error {
|
|
252
|
+
name = "LocationMismatchError";
|
|
253
|
+
constructor(message = "Google returned results for a different location than requested") {
|
|
254
|
+
super(message);
|
|
255
|
+
}
|
|
256
|
+
};
|
|
247
257
|
|
|
248
258
|
// src/driver/BrowserDriver.ts
|
|
249
259
|
import_playwright_extra.chromium.use((0, import_puppeteer_extra_plugin_stealth.default)());
|
|
@@ -2264,16 +2274,18 @@ var US_CITY_CENTER_ZIPS = {
|
|
|
2264
2274
|
function proxyIdSuffix2(proxyId) {
|
|
2265
2275
|
return proxyId ? proxyId.slice(-6) : null;
|
|
2266
2276
|
}
|
|
2267
|
-
function resolution(source, proxyMode, proxyId, target, error) {
|
|
2277
|
+
function resolution(source, proxyMode, proxyId, target, error, disposable = false) {
|
|
2268
2278
|
return {
|
|
2269
2279
|
kernelProxyId: proxyId,
|
|
2280
|
+
...disposable && proxyId ? { disposableProxyId: proxyId } : {},
|
|
2270
2281
|
resolution: {
|
|
2271
2282
|
source,
|
|
2272
2283
|
proxyMode,
|
|
2273
2284
|
proxyIdPresent: Boolean(proxyId),
|
|
2274
2285
|
proxyIdSuffix: proxyIdSuffix2(proxyId),
|
|
2275
2286
|
target,
|
|
2276
|
-
error
|
|
2287
|
+
error,
|
|
2288
|
+
disposable
|
|
2277
2289
|
}
|
|
2278
2290
|
};
|
|
2279
2291
|
}
|
|
@@ -2303,6 +2315,10 @@ function kernelCityIdentifierCandidates(city) {
|
|
|
2303
2315
|
function proxyName(country, state, city) {
|
|
2304
2316
|
return city ? `mcp-serp-residential-${country.toLowerCase()}-${state.toLowerCase()}-${city}` : `mcp-serp-residential-${country.toLowerCase()}-${state.toLowerCase()}`;
|
|
2305
2317
|
}
|
|
2318
|
+
function freshProxyName(baseName, attemptIndex) {
|
|
2319
|
+
const stamp = `${Date.now()}-${attemptIndex ?? 0}-${Math.random().toString(36).slice(2, 8)}`;
|
|
2320
|
+
return `${baseName}-fresh-${stamp}`;
|
|
2321
|
+
}
|
|
2306
2322
|
function zipProxyName(zip) {
|
|
2307
2323
|
return `mcp-serp-residential-us-zip-${zip}`;
|
|
2308
2324
|
}
|
|
@@ -2372,6 +2388,12 @@ function zipTarget(target, zip) {
|
|
|
2372
2388
|
}
|
|
2373
2389
|
};
|
|
2374
2390
|
}
|
|
2391
|
+
function withProxyName(target, name) {
|
|
2392
|
+
return {
|
|
2393
|
+
...target,
|
|
2394
|
+
proxyName: name
|
|
2395
|
+
};
|
|
2396
|
+
}
|
|
2375
2397
|
function configMatches(config, target, city) {
|
|
2376
2398
|
if (target.level === "zip") {
|
|
2377
2399
|
return config?.country?.toUpperCase() === target.country && config?.zip === target.zip;
|
|
@@ -2410,6 +2432,55 @@ function escalatedTargetLevel(target, attemptIndex) {
|
|
|
2410
2432
|
function errorText2(err) {
|
|
2411
2433
|
return err instanceof Error ? err.message : String(err);
|
|
2412
2434
|
}
|
|
2435
|
+
function freshTargetCandidates(target, explicitZip, attemptIndex) {
|
|
2436
|
+
const out = [];
|
|
2437
|
+
const zip = knownZipFor(target, explicitZip);
|
|
2438
|
+
if (zip) {
|
|
2439
|
+
const targetZip = zipTarget(target, zip);
|
|
2440
|
+
out.push(withProxyName(targetZip, freshProxyName(targetZip.proxyName, attemptIndex)));
|
|
2441
|
+
}
|
|
2442
|
+
for (const city of target.cityCandidates) {
|
|
2443
|
+
const cityTarget = {
|
|
2444
|
+
...target,
|
|
2445
|
+
level: "city",
|
|
2446
|
+
city,
|
|
2447
|
+
proxyName: proxyName(target.country, target.state, city),
|
|
2448
|
+
config: {
|
|
2449
|
+
country: target.country,
|
|
2450
|
+
state: target.state,
|
|
2451
|
+
city
|
|
2452
|
+
}
|
|
2453
|
+
};
|
|
2454
|
+
out.push(withProxyName(cityTarget, freshProxyName(cityTarget.proxyName, attemptIndex)));
|
|
2455
|
+
}
|
|
2456
|
+
const fallbackTarget = stateTarget(target);
|
|
2457
|
+
out.push(withProxyName(fallbackTarget, freshProxyName(fallbackTarget.proxyName, attemptIndex)));
|
|
2458
|
+
return out;
|
|
2459
|
+
}
|
|
2460
|
+
async function createFreshLocationProxy(kernel, options, target) {
|
|
2461
|
+
const createErrors = [];
|
|
2462
|
+
for (const candidate of freshTargetCandidates(target, options.proxyZip, options.attemptIndex)) {
|
|
2463
|
+
try {
|
|
2464
|
+
const created = await kernel.proxies.create({
|
|
2465
|
+
type: "residential",
|
|
2466
|
+
name: candidate.proxyName,
|
|
2467
|
+
config: candidate.level === "zip" ? { country: candidate.country, zip: candidate.zip } : candidate.config
|
|
2468
|
+
});
|
|
2469
|
+
if (created.id) {
|
|
2470
|
+
return resolution("location_created", options.proxyMode, created.id, candidate, null, true);
|
|
2471
|
+
}
|
|
2472
|
+
createErrors.push(`${candidate.proxyName}: Kernel did not return a proxy id`);
|
|
2473
|
+
} catch (err) {
|
|
2474
|
+
createErrors.push(`${candidate.proxyName}: ${errorText2(err)}`);
|
|
2475
|
+
}
|
|
2476
|
+
}
|
|
2477
|
+
return resolution("configured_fallback", options.proxyMode, options.configuredKernelProxyId, target, createErrors.join(" | "));
|
|
2478
|
+
}
|
|
2479
|
+
async function deleteKernelProxyId(kernelApiKey, proxyId) {
|
|
2480
|
+
if (!kernelApiKey || !proxyId) return;
|
|
2481
|
+
const kernel = new import_sdk2.default({ apiKey: kernelApiKey });
|
|
2482
|
+
await kernel.proxies.delete(proxyId);
|
|
2483
|
+
}
|
|
2413
2484
|
async function resolveKernelProxyId(options) {
|
|
2414
2485
|
if (options.proxyMode === "none") {
|
|
2415
2486
|
return resolution("disabled", options.proxyMode, void 0, null, null);
|
|
@@ -2424,6 +2495,9 @@ async function resolveKernelProxyId(options) {
|
|
|
2424
2495
|
const kernel = new import_sdk2.default({ apiKey: options.kernelApiKey });
|
|
2425
2496
|
try {
|
|
2426
2497
|
const attemptIndex = options.attemptIndex ?? 0;
|
|
2498
|
+
if (options.fresh) {
|
|
2499
|
+
return await createFreshLocationProxy(kernel, options, target);
|
|
2500
|
+
}
|
|
2427
2501
|
if (attemptIndex >= 1) {
|
|
2428
2502
|
const escalatedTarget = escalatedTargetLevel(target, attemptIndex);
|
|
2429
2503
|
const createErrors2 = [];
|
|
@@ -2527,6 +2601,7 @@ async function resolveKernelProxyId(options) {
|
|
|
2527
2601
|
|
|
2528
2602
|
// src/harvest.ts
|
|
2529
2603
|
var MAX_ATTEMPTS = 3;
|
|
2604
|
+
var LOCATION_PROXY_MAX_ATTEMPTS = 5;
|
|
2530
2605
|
function abortReason(signal) {
|
|
2531
2606
|
if (signal.reason instanceof DOMException && signal.reason.name === "TimeoutError") return signal.reason;
|
|
2532
2607
|
return new RequestAbortedError();
|
|
@@ -2556,9 +2631,12 @@ async function emitAttemptEvent(sink, event) {
|
|
|
2556
2631
|
}
|
|
2557
2632
|
function classifyAttemptError(err) {
|
|
2558
2633
|
if (err instanceof CaptchaError) return "captcha";
|
|
2634
|
+
if (err instanceof LocationMismatchError) return "location_mismatch";
|
|
2559
2635
|
if (err instanceof RequestAbortedError) return "request_aborted";
|
|
2560
2636
|
if (err instanceof DOMException && (err.name === "TimeoutError" || err.name === "AbortError")) return "timeout";
|
|
2561
2637
|
const message = err instanceof Error ? err.message : String(err);
|
|
2638
|
+
if (looksLikeProxyTunnelFailure(message)) return "proxy_tunnel_failed";
|
|
2639
|
+
if (looksLikeProxyUnavailable(message)) return "proxy_unavailable";
|
|
2562
2640
|
return /timeout|timed out|Timeout \d+ms exceeded|deadline/i.test(message) ? "timeout" : "error";
|
|
2563
2641
|
}
|
|
2564
2642
|
function classifyAttemptResult(result) {
|
|
@@ -2567,6 +2645,49 @@ function classifyAttemptResult(result) {
|
|
|
2567
2645
|
function errorMessage(err) {
|
|
2568
2646
|
return err instanceof Error ? err.message : String(err);
|
|
2569
2647
|
}
|
|
2648
|
+
function maxAttemptsForProxyMode(proxyMode) {
|
|
2649
|
+
return proxyMode === "location" ? LOCATION_PROXY_MAX_ATTEMPTS : MAX_ATTEMPTS;
|
|
2650
|
+
}
|
|
2651
|
+
function looksLikeProxyTunnelFailure(message) {
|
|
2652
|
+
return /ERR_TUNNEL_CONNECTION_FAILED|ERR_PROXY_CONNECTION_FAILED|ERR_SOCKS_CONNECTION_FAILED|tunnel connection failed|proxy connection failed|transport error: proxy/i.test(message);
|
|
2653
|
+
}
|
|
2654
|
+
function looksLikeProxyUnavailable(message) {
|
|
2655
|
+
return /proxy unavailable|proxy_unavailable|connection_test_failed|did not return a proxy id|configured fallback/i.test(message);
|
|
2656
|
+
}
|
|
2657
|
+
function retryableLocationProxyError(outcome) {
|
|
2658
|
+
return outcome === "captcha" || outcome === "proxy_tunnel_failed" || outcome === "proxy_unavailable";
|
|
2659
|
+
}
|
|
2660
|
+
function locationMismatchMessage(result) {
|
|
2661
|
+
const evidence = result.diagnostics.debug?.locationEvidence;
|
|
2662
|
+
const expected = evidence?.expected?.canonicalLocation ?? result.location ?? "requested location";
|
|
2663
|
+
const candidates = evidence?.candidates.slice(0, 3).map((candidate) => `${candidate.city}, ${candidate.regionCode}`).join("; ");
|
|
2664
|
+
return candidates ? `Google returned results for ${candidates}, not ${expected}` : `Google returned results for a different location than ${expected}`;
|
|
2665
|
+
}
|
|
2666
|
+
function shouldRetryLocationMismatch(result, proxyMode) {
|
|
2667
|
+
return proxyMode === "location" && result.diagnostics.debug?.locationEvidence?.status === "mismatch";
|
|
2668
|
+
}
|
|
2669
|
+
function stripInternalDebug(result, keepDebug) {
|
|
2670
|
+
if (keepDebug || !result.diagnostics.debug) return result;
|
|
2671
|
+
const diagnostics = { ...result.diagnostics };
|
|
2672
|
+
delete diagnostics.debug;
|
|
2673
|
+
return { ...result, diagnostics };
|
|
2674
|
+
}
|
|
2675
|
+
async function cleanupDisposableProxy(kernelApiKey, proxyId) {
|
|
2676
|
+
if (!kernelApiKey || !proxyId) return;
|
|
2677
|
+
try {
|
|
2678
|
+
await deleteKernelProxyId(kernelApiKey, proxyId);
|
|
2679
|
+
console.info(JSON.stringify({
|
|
2680
|
+
event: "kernel_proxy_deleted",
|
|
2681
|
+
proxy_id_suffix: proxyId.slice(-6)
|
|
2682
|
+
}));
|
|
2683
|
+
} catch (err) {
|
|
2684
|
+
console.warn(JSON.stringify({
|
|
2685
|
+
event: "kernel_proxy_delete_failed",
|
|
2686
|
+
proxy_id_suffix: proxyId.slice(-6),
|
|
2687
|
+
message: errorMessage(err)
|
|
2688
|
+
}));
|
|
2689
|
+
}
|
|
2690
|
+
}
|
|
2570
2691
|
async function extractOnce(options, signal) {
|
|
2571
2692
|
const driver = new BrowserDriver();
|
|
2572
2693
|
const reporter = new ProgressReporter();
|
|
@@ -2634,26 +2755,35 @@ async function harvest(rawOptions) {
|
|
|
2634
2755
|
proxyZip: typeof raw.proxyZip === "string" ? raw.proxyZip : void 0,
|
|
2635
2756
|
gl: typeof raw.gl === "string" ? raw.gl : "us"
|
|
2636
2757
|
};
|
|
2758
|
+
const requestedDebug = typeof raw.debug === "boolean" ? raw.debug : false;
|
|
2759
|
+
const needsLocationEvidence = proxyMode === "location" && Boolean(proxyOpts.location);
|
|
2760
|
+
const maxAttempts = maxAttemptsForProxyMode(proxyMode);
|
|
2637
2761
|
const serializer = new OutputSerializer();
|
|
2638
|
-
|
|
2762
|
+
let lastError = null;
|
|
2763
|
+
for (let i = 0; i < maxAttempts; i++) {
|
|
2639
2764
|
const attemptNumber = i + 1;
|
|
2640
2765
|
const startedAtMs = Date.now();
|
|
2641
2766
|
try {
|
|
2642
2767
|
if (signal?.aborted) throw abortReason(signal);
|
|
2643
|
-
const resolution2 = await resolveKernelProxyId({
|
|
2768
|
+
const resolution2 = await resolveKernelProxyId({
|
|
2769
|
+
...proxyOpts,
|
|
2770
|
+
attemptIndex: i,
|
|
2771
|
+
fresh: proxyMode === "location"
|
|
2772
|
+
});
|
|
2644
2773
|
const mergedAttempt = {
|
|
2645
2774
|
...raw,
|
|
2646
2775
|
kernelApiKey,
|
|
2647
2776
|
kernelProxyId: resolution2.kernelProxyId,
|
|
2648
2777
|
kernelProxyResolution: resolution2.resolution,
|
|
2649
|
-
proxyMode
|
|
2778
|
+
proxyMode,
|
|
2779
|
+
debug: requestedDebug || needsLocationEvidence
|
|
2650
2780
|
};
|
|
2651
2781
|
if (proxyMode === "none") mergedAttempt.kernelProxyId = void 0;
|
|
2652
2782
|
const attemptOptions = HarvestOptionsSchema.parse(mergedAttempt);
|
|
2653
2783
|
await emitAttemptEvent(onAttemptEvent, {
|
|
2654
2784
|
type: "started",
|
|
2655
2785
|
attemptNumber,
|
|
2656
|
-
maxAttempts: MAX_ATTEMPTS,
|
|
2786
|
+
maxAttempts,
|
|
2657
2787
|
query: attemptOptions.query,
|
|
2658
2788
|
location: attemptOptions.location ?? null,
|
|
2659
2789
|
maxQuestions: attemptOptions.maxQuestions,
|
|
@@ -2662,7 +2792,7 @@ async function harvest(rawOptions) {
|
|
|
2662
2792
|
console.info(JSON.stringify({
|
|
2663
2793
|
event: "harvest_attempt_started",
|
|
2664
2794
|
attempt_number: attemptNumber,
|
|
2665
|
-
max_attempts: MAX_ATTEMPTS,
|
|
2795
|
+
max_attempts: maxAttempts,
|
|
2666
2796
|
query: attemptOptions.query,
|
|
2667
2797
|
location: attemptOptions.location ?? null,
|
|
2668
2798
|
max_questions: attemptOptions.maxQuestions
|
|
@@ -2670,57 +2800,84 @@ async function harvest(rawOptions) {
|
|
|
2670
2800
|
const attempt = await extractOnce(attemptOptions, signal);
|
|
2671
2801
|
if (attempt.error) {
|
|
2672
2802
|
const err = attempt.error;
|
|
2673
|
-
|
|
2674
|
-
|
|
2803
|
+
const outcome = classifyAttemptError(err);
|
|
2804
|
+
const willRetry = i < maxAttempts - 1 && (outcome === "captcha" || proxyMode === "location" && retryableLocationProxyError(outcome));
|
|
2805
|
+
if (outcome === "captcha") {
|
|
2675
2806
|
console.warn(JSON.stringify({
|
|
2676
2807
|
event: "harvest_attempt_captcha",
|
|
2677
2808
|
attempt_number: attemptNumber,
|
|
2678
|
-
max_attempts: MAX_ATTEMPTS,
|
|
2679
|
-
message: err.message,
|
|
2809
|
+
max_attempts: maxAttempts,
|
|
2810
|
+
message: errorMessage(err),
|
|
2811
|
+
will_retry: willRetry
|
|
2812
|
+
}));
|
|
2813
|
+
} else if (willRetry) {
|
|
2814
|
+
console.warn(JSON.stringify({
|
|
2815
|
+
event: "harvest_attempt_proxy_retry",
|
|
2816
|
+
attempt_number: attemptNumber,
|
|
2817
|
+
max_attempts: maxAttempts,
|
|
2818
|
+
outcome,
|
|
2819
|
+
message: errorMessage(err),
|
|
2680
2820
|
will_retry: willRetry
|
|
2681
2821
|
}));
|
|
2682
|
-
await emitAttemptEvent(onAttemptEvent, {
|
|
2683
|
-
type: "finished",
|
|
2684
|
-
attemptNumber,
|
|
2685
|
-
maxAttempts: MAX_ATTEMPTS,
|
|
2686
|
-
outcome: "captcha",
|
|
2687
|
-
kernelSessionId: attempt.cleanup.kernelSessionId,
|
|
2688
|
-
questionCount: 0,
|
|
2689
|
-
durationMs: Date.now() - startedAtMs,
|
|
2690
|
-
error: err.message,
|
|
2691
|
-
willRetry,
|
|
2692
|
-
cleanup: attempt.cleanup,
|
|
2693
|
-
debug: attempt.debug,
|
|
2694
|
-
completedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
2695
|
-
});
|
|
2696
|
-
if (willRetry) continue;
|
|
2697
|
-
break;
|
|
2698
2822
|
}
|
|
2699
2823
|
await emitAttemptEvent(onAttemptEvent, {
|
|
2700
2824
|
type: "finished",
|
|
2701
2825
|
attemptNumber,
|
|
2702
|
-
maxAttempts: MAX_ATTEMPTS,
|
|
2703
|
-
outcome
|
|
2826
|
+
maxAttempts,
|
|
2827
|
+
outcome,
|
|
2704
2828
|
kernelSessionId: attempt.cleanup.kernelSessionId,
|
|
2705
2829
|
questionCount: 0,
|
|
2706
2830
|
durationMs: Date.now() - startedAtMs,
|
|
2707
2831
|
error: errorMessage(err),
|
|
2708
|
-
willRetry
|
|
2832
|
+
willRetry,
|
|
2709
2833
|
cleanup: attempt.cleanup,
|
|
2710
2834
|
debug: attempt.debug,
|
|
2711
2835
|
completedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
2712
2836
|
});
|
|
2713
|
-
|
|
2837
|
+
await cleanupDisposableProxy(kernelApiKey, resolution2.disposableProxyId);
|
|
2838
|
+
lastError = err;
|
|
2839
|
+
if (willRetry) continue;
|
|
2840
|
+
break;
|
|
2714
2841
|
}
|
|
2715
2842
|
const result = attempt.result;
|
|
2716
2843
|
if (!result) throw new Error("Harvest attempt completed without a result");
|
|
2844
|
+
if (shouldRetryLocationMismatch(result, proxyMode)) {
|
|
2845
|
+
const err = new LocationMismatchError(locationMismatchMessage(result));
|
|
2846
|
+
const willRetry = i < maxAttempts - 1;
|
|
2847
|
+
console.warn(JSON.stringify({
|
|
2848
|
+
event: "harvest_attempt_location_mismatch",
|
|
2849
|
+
attempt_number: attemptNumber,
|
|
2850
|
+
max_attempts: maxAttempts,
|
|
2851
|
+
message: err.message,
|
|
2852
|
+
will_retry: willRetry
|
|
2853
|
+
}));
|
|
2854
|
+
await emitAttemptEvent(onAttemptEvent, {
|
|
2855
|
+
type: "finished",
|
|
2856
|
+
attemptNumber,
|
|
2857
|
+
maxAttempts,
|
|
2858
|
+
outcome: "location_mismatch",
|
|
2859
|
+
kernelSessionId: attempt.cleanup.kernelSessionId,
|
|
2860
|
+
questionCount: result.totalQuestions,
|
|
2861
|
+
durationMs: Date.now() - startedAtMs,
|
|
2862
|
+
error: err.message,
|
|
2863
|
+
willRetry,
|
|
2864
|
+
cleanup: attempt.cleanup,
|
|
2865
|
+
debug: attempt.debug,
|
|
2866
|
+
completedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
2867
|
+
});
|
|
2868
|
+
await cleanupDisposableProxy(kernelApiKey, resolution2.disposableProxyId);
|
|
2869
|
+
lastError = err;
|
|
2870
|
+
if (willRetry) continue;
|
|
2871
|
+
break;
|
|
2872
|
+
}
|
|
2873
|
+
const finalResult = stripInternalDebug(result, requestedDebug);
|
|
2717
2874
|
await emitAttemptEvent(onAttemptEvent, {
|
|
2718
2875
|
type: "finished",
|
|
2719
2876
|
attemptNumber,
|
|
2720
|
-
maxAttempts: MAX_ATTEMPTS,
|
|
2721
|
-
outcome: classifyAttemptResult(result),
|
|
2877
|
+
maxAttempts,
|
|
2878
|
+
outcome: classifyAttemptResult(finalResult),
|
|
2722
2879
|
kernelSessionId: attempt.cleanup.kernelSessionId,
|
|
2723
|
-
questionCount: result.totalQuestions,
|
|
2880
|
+
questionCount: finalResult.totalQuestions,
|
|
2724
2881
|
durationMs: Date.now() - startedAtMs,
|
|
2725
2882
|
error: null,
|
|
2726
2883
|
willRetry: false,
|
|
@@ -2728,64 +2885,52 @@ async function harvest(rawOptions) {
|
|
|
2728
2885
|
debug: attempt.debug,
|
|
2729
2886
|
completedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
2730
2887
|
});
|
|
2888
|
+
await cleanupDisposableProxy(kernelApiKey, resolution2.disposableProxyId);
|
|
2731
2889
|
if (attemptOptions.format === "json" || attemptOptions.format === "both") {
|
|
2732
|
-
await serializer.writeJSON(result, attemptOptions.outputDir);
|
|
2890
|
+
await serializer.writeJSON(finalResult, attemptOptions.outputDir);
|
|
2733
2891
|
}
|
|
2734
2892
|
if (attemptOptions.format === "csv" || attemptOptions.format === "both") {
|
|
2735
2893
|
await Promise.all([
|
|
2736
|
-
serializer.writeCSV(
|
|
2737
|
-
|
|
2738
|
-
|
|
2739
|
-
|
|
2740
|
-
|
|
2741
|
-
|
|
2894
|
+
serializer.writeCSV(finalResult.flat, attemptOptions.outputDir),
|
|
2895
|
+
finalResult.videos.length > 0 ? serializer.writeVideoCSV(finalResult.videos, finalResult.seed, attemptOptions.outputDir) : Promise.resolve(""),
|
|
2896
|
+
finalResult.forums.length > 0 ? serializer.writeForumCSV(finalResult.forums, finalResult.seed, attemptOptions.outputDir) : Promise.resolve(""),
|
|
2897
|
+
finalResult.aiOverview.detected ? serializer.writeAIOverviewCSV(finalResult.aiOverview.citations, finalResult.aiOverview.text, finalResult.seed, attemptOptions.outputDir) : Promise.resolve(""),
|
|
2898
|
+
finalResult.aiMode.detected ? serializer.writeAIModeCSV(finalResult.aiMode.citations, finalResult.aiMode.text, finalResult.seed, attemptOptions.outputDir) : Promise.resolve(""),
|
|
2899
|
+
finalResult.whatPeopleSaying.length > 0 ? serializer.writeWhatPeopleSayingCSV(finalResult.whatPeopleSaying, finalResult.seed, attemptOptions.outputDir) : Promise.resolve("")
|
|
2742
2900
|
]);
|
|
2743
2901
|
}
|
|
2744
|
-
return result;
|
|
2902
|
+
return finalResult;
|
|
2745
2903
|
} catch (err) {
|
|
2746
|
-
|
|
2747
|
-
|
|
2904
|
+
const outcome = classifyAttemptError(err);
|
|
2905
|
+
const willRetry = i < maxAttempts - 1 && (outcome === "captcha" || proxyMode === "location" && retryableLocationProxyError(outcome));
|
|
2906
|
+
if (outcome === "captcha") {
|
|
2748
2907
|
console.warn(JSON.stringify({
|
|
2749
2908
|
event: "harvest_attempt_captcha",
|
|
2750
2909
|
attempt_number: attemptNumber,
|
|
2751
|
-
max_attempts: MAX_ATTEMPTS,
|
|
2752
|
-
message: err.message,
|
|
2910
|
+
max_attempts: maxAttempts,
|
|
2911
|
+
message: errorMessage(err),
|
|
2912
|
+
will_retry: willRetry
|
|
2913
|
+
}));
|
|
2914
|
+
} else if (willRetry) {
|
|
2915
|
+
console.warn(JSON.stringify({
|
|
2916
|
+
event: "harvest_attempt_proxy_retry",
|
|
2917
|
+
attempt_number: attemptNumber,
|
|
2918
|
+
max_attempts: maxAttempts,
|
|
2919
|
+
outcome,
|
|
2920
|
+
message: errorMessage(err),
|
|
2753
2921
|
will_retry: willRetry
|
|
2754
2922
|
}));
|
|
2755
|
-
await emitAttemptEvent(onAttemptEvent, {
|
|
2756
|
-
type: "finished",
|
|
2757
|
-
attemptNumber,
|
|
2758
|
-
maxAttempts: MAX_ATTEMPTS,
|
|
2759
|
-
outcome: "captcha",
|
|
2760
|
-
kernelSessionId: null,
|
|
2761
|
-
questionCount: 0,
|
|
2762
|
-
durationMs: Date.now() - startedAtMs,
|
|
2763
|
-
error: err.message,
|
|
2764
|
-
willRetry,
|
|
2765
|
-
cleanup: {
|
|
2766
|
-
kernelSessionId: null,
|
|
2767
|
-
kernelDeleteStarted: false,
|
|
2768
|
-
kernelDeleteSucceeded: null,
|
|
2769
|
-
kernelDeleteError: null,
|
|
2770
|
-
browserCloseSucceeded: null,
|
|
2771
|
-
browserCloseError: null
|
|
2772
|
-
},
|
|
2773
|
-
debug: null,
|
|
2774
|
-
completedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
2775
|
-
});
|
|
2776
|
-
if (willRetry) continue;
|
|
2777
|
-
break;
|
|
2778
2923
|
}
|
|
2779
2924
|
await emitAttemptEvent(onAttemptEvent, {
|
|
2780
2925
|
type: "finished",
|
|
2781
2926
|
attemptNumber,
|
|
2782
|
-
maxAttempts: MAX_ATTEMPTS,
|
|
2783
|
-
outcome
|
|
2927
|
+
maxAttempts,
|
|
2928
|
+
outcome,
|
|
2784
2929
|
kernelSessionId: null,
|
|
2785
2930
|
questionCount: 0,
|
|
2786
2931
|
durationMs: Date.now() - startedAtMs,
|
|
2787
2932
|
error: errorMessage(err),
|
|
2788
|
-
willRetry
|
|
2933
|
+
willRetry,
|
|
2789
2934
|
cleanup: {
|
|
2790
2935
|
kernelSessionId: null,
|
|
2791
2936
|
kernelDeleteStarted: false,
|
|
@@ -2797,15 +2942,19 @@ async function harvest(rawOptions) {
|
|
|
2797
2942
|
debug: null,
|
|
2798
2943
|
completedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
2799
2944
|
});
|
|
2945
|
+
lastError = err;
|
|
2946
|
+
if (willRetry) continue;
|
|
2947
|
+
if (outcome === "captcha") break;
|
|
2800
2948
|
throw err;
|
|
2801
2949
|
}
|
|
2802
2950
|
}
|
|
2951
|
+
if (lastError && !(lastError instanceof CaptchaError)) throw lastError;
|
|
2803
2952
|
console.warn(JSON.stringify({
|
|
2804
2953
|
event: "harvest_captcha_exhausted",
|
|
2805
|
-
max_attempts: MAX_ATTEMPTS,
|
|
2954
|
+
max_attempts: maxAttempts,
|
|
2806
2955
|
session_kind: kernelApiKey ? "kernel" : "local"
|
|
2807
2956
|
}));
|
|
2808
|
-
throw new CaptchaError(sanitizeVendorName(`CAPTCHA on all ${MAX_ATTEMPTS} fresh sessions. Try again in a few minutes.`));
|
|
2957
|
+
throw new CaptchaError(sanitizeVendorName(`CAPTCHA on all ${maxAttempts} fresh sessions. Try again in a few minutes.`));
|
|
2809
2958
|
}
|
|
2810
2959
|
|
|
2811
2960
|
// src/cli.ts
|