mcp-scraper 0.1.9 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +74 -8
- package/dist/bin/api-server.cjs +5615 -3733
- package/dist/bin/api-server.cjs.map +1 -1
- package/dist/bin/api-server.js +2 -2
- package/dist/bin/browser-agent-stdio-server.cjs +391 -0
- package/dist/bin/browser-agent-stdio-server.cjs.map +1 -0
- package/dist/bin/browser-agent-stdio-server.d.cts +1 -0
- package/dist/bin/browser-agent-stdio-server.d.ts +1 -0
- package/dist/bin/browser-agent-stdio-server.js +390 -0
- package/dist/bin/browser-agent-stdio-server.js.map +1 -0
- package/dist/bin/mcp-stdio-server.cjs +170 -12
- package/dist/bin/mcp-stdio-server.cjs.map +1 -1
- package/dist/bin/mcp-stdio-server.js +3 -2
- package/dist/bin/mcp-stdio-server.js.map +1 -1
- package/dist/bin/paa-harvest.cjs +223 -74
- package/dist/bin/paa-harvest.cjs.map +1 -1
- package/dist/bin/paa-harvest.js +2 -2
- package/dist/{chunk-ZK456YXN.js → chunk-IQOCZGJJ.js} +58 -4
- package/dist/chunk-IQOCZGJJ.js.map +1 -0
- package/dist/{chunk-ZMOWIBMK.js → chunk-M2S27J6Z.js} +9 -2
- package/dist/{chunk-ZMOWIBMK.js.map → chunk-M2S27J6Z.js.map} +1 -1
- package/dist/{chunk-TM22BLWP.js → chunk-MY3S7EX7.js} +221 -76
- package/dist/chunk-MY3S7EX7.js.map +1 -0
- package/dist/{chunk-JNC32DMS.js → chunk-OR7DLLH2.js} +175 -16
- package/dist/chunk-OR7DLLH2.js.map +1 -0
- package/dist/chunk-XR65SANX.js +7 -0
- package/dist/chunk-XR65SANX.js.map +1 -0
- package/dist/index.cjs +223 -74
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +1 -0
- package/dist/index.d.ts +1 -0
- package/dist/index.js +2 -2
- package/dist/{server-MTXAJG5J.js → server-CJMX2QUM.js} +1655 -194
- package/dist/server-CJMX2QUM.js.map +1 -0
- package/dist/{worker-AUCXFHEL.js → worker-NAKGTIF5.js} +4 -4
- package/docs/specs/api-forge-spec.md +234 -0
- package/docs/specs/deferred-work-spec.md +74 -0
- package/docs/specs/oauth-mcp-spec.md +213 -0
- package/package.json +3 -2
- package/dist/chunk-JNC32DMS.js.map +0 -1
- package/dist/chunk-TM22BLWP.js.map +0 -1
- package/dist/chunk-ZK456YXN.js.map +0 -1
- package/dist/server-MTXAJG5J.js.map +0 -1
- /package/dist/{worker-AUCXFHEL.js.map → worker-NAKGTIF5.js.map} +0 -0
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
import {
|
|
2
2
|
CaptchaError,
|
|
3
3
|
ExtractionError,
|
|
4
|
+
LocationMismatchError,
|
|
4
5
|
RECAPTCHA_INSTRUCTIONS,
|
|
5
6
|
RequestAbortedError,
|
|
6
7
|
sanitizeVendorName
|
|
7
|
-
} from "./chunk-
|
|
8
|
+
} from "./chunk-M2S27J6Z.js";
|
|
8
9
|
|
|
9
10
|
// src/lib/browser-service-env.ts
|
|
10
11
|
function browserServiceApiKey() {
|
|
@@ -57,8 +58,12 @@ var MapsSearchOptionsSchema = z.object({
|
|
|
57
58
|
gl: z.string().length(2).default("us"),
|
|
58
59
|
hl: z.string().length(2).default("en"),
|
|
59
60
|
maxResults: z.number().int().min(1).max(50).default(10),
|
|
61
|
+
proxyMode: z.enum(["location", "configured", "none"]).default("location"),
|
|
62
|
+
proxyZip: z.string().regex(/^\d{5}$/).optional(),
|
|
63
|
+
debug: z.boolean().default(false),
|
|
60
64
|
kernelApiKey: z.string().optional(),
|
|
61
65
|
kernelProxyId: z.string().optional(),
|
|
66
|
+
kernelProxyResolution: z.unknown().optional(),
|
|
62
67
|
headless: z.boolean().default(true)
|
|
63
68
|
});
|
|
64
69
|
var RawPAAItemSchema = z.object({
|
|
@@ -2232,16 +2237,18 @@ var US_CITY_CENTER_ZIPS = {
|
|
|
2232
2237
|
/**
 * Returns the trailing six characters of a proxy id.
 *
 * @param {string | null | undefined} proxyId - Proxy identifier, may be absent.
 * @returns {string | null} The last six characters, or `null` when the id is falsy.
 */
function proxyIdSuffix2(proxyId) {
  if (!proxyId) {
    return null;
  }
  return proxyId.slice(-6);
}
|
|
2235
|
-
/**
 * Builds the proxy-resolution record returned by the proxy resolver.
 *
 * @param {string} source - Label describing how the proxy was obtained.
 * @param {string} proxyMode - Proxy mode that was requested.
 * @param {string | undefined} proxyId - Resolved proxy id, if any.
 * @param {object | null} target - Location target the proxy was resolved for.
 * @param {string | null} error - Accumulated error text, if any.
 * @param {boolean} [disposable=false] - Whether the proxy was flagged as disposable by the caller.
 * @returns {object} `{ kernelProxyId, disposableProxyId?, resolution }`; `disposableProxyId`
 *   is only present when `disposable` is truthy and a proxy id exists.
 */
function resolution(source, proxyMode, proxyId, target, error, disposable = false) {
  const record = { kernelProxyId: proxyId };
  if (disposable && proxyId) {
    record.disposableProxyId = proxyId;
  }
  record.resolution = {
    source,
    proxyMode,
    proxyIdPresent: Boolean(proxyId),
    proxyIdSuffix: proxyIdSuffix2(proxyId),
    target,
    error,
    disposable
  };
  return record;
}
|
|
@@ -2271,6 +2278,10 @@ function kernelCityIdentifierCandidates(city) {
|
|
|
2271
2278
|
/**
 * Builds the name of a residential proxy for a country/state, optionally
 * suffixed with a city identifier.
 *
 * @param {string} country - Country code; lower-cased in the name.
 * @param {string} state - State code; lower-cased in the name.
 * @param {string} [city] - City identifier, appended verbatim when truthy.
 * @returns {string} The proxy name.
 */
function proxyName(country, state, city) {
  const stateLevelName = `mcp-serp-residential-${country.toLowerCase()}-${state.toLowerCase()}`;
  if (city) {
    return `${stateLevelName}-${city}`;
  }
  return stateLevelName;
}
|
|
2281
|
+
/**
 * Derives a one-off proxy name from a base name by appending a
 * `-fresh-<timestamp>-<attempt>-<random>` suffix.
 *
 * @param {string} baseName - Name to extend.
 * @param {number} [attemptIndex] - Attempt index; nullish values are rendered as 0.
 * @returns {string} The suffixed name.
 */
function freshProxyName(baseName, attemptIndex) {
  const issuedAt = Date.now();
  const attempt = attemptIndex ?? 0;
  // Base-36 fragment of a random number; may be shorter than six characters.
  const nonce = Math.random().toString(36).slice(2, 8);
  return `${baseName}-fresh-${issuedAt}-${attempt}-${nonce}`;
}
|
|
2274
2285
|
/**
 * Builds the proxy name used for a ZIP-code target.
 *
 * @param {string} zip - ZIP code, appended verbatim.
 * @returns {string} The proxy name.
 */
function zipProxyName(zip) {
  const prefix = "mcp-serp-residential-us-zip-";
  return `${prefix}${zip}`;
}
|
|
@@ -2340,6 +2351,12 @@ function zipTarget(target, zip) {
|
|
|
2340
2351
|
}
|
|
2341
2352
|
};
|
|
2342
2353
|
}
|
|
2354
|
+
/**
 * Returns a shallow copy of `target` whose `proxyName` is replaced by `name`.
 * The input object is not modified.
 *
 * @param {object} target - Target descriptor to copy.
 * @param {string} name - Proxy name to set on the copy.
 * @returns {object} The renamed copy.
 */
function withProxyName(target, name) {
  const renamed = { ...target };
  renamed.proxyName = name;
  return renamed;
}
|
|
2343
2360
|
function configMatches(config, target, city) {
|
|
2344
2361
|
if (target.level === "zip") {
|
|
2345
2362
|
return config?.country?.toUpperCase() === target.country && config?.zip === target.zip;
|
|
@@ -2378,6 +2395,55 @@ function escalatedTargetLevel(target, attemptIndex) {
|
|
|
2378
2395
|
/**
 * Converts a thrown value into display text.
 *
 * @param {unknown} err - Caught value.
 * @returns {string} `err.message` for `Error` instances, otherwise `String(err)`.
 */
function errorText2(err) {
  if (err instanceof Error) {
    return err.message;
  }
  return String(err);
}
|
|
2398
|
+
/**
 * Lists the location targets to try when creating a fresh proxy, in order:
 * a ZIP-level target (when a ZIP is known), one city-level target per entry
 * in `target.cityCandidates`, then a state-level fallback. Every candidate
 * gets a freshly generated proxy name.
 *
 * @param {object} target - Base location target (reads `country`, `state`, `cityCandidates`).
 * @param {string} [explicitZip] - ZIP passed through to `knownZipFor`.
 * @param {number} [attemptIndex] - Attempt index embedded in the fresh names.
 * @returns {object[]} Ordered candidate targets.
 */
function freshTargetCandidates(target, explicitZip, attemptIndex) {
  // Re-labels a candidate with a fresh name derived from its current one.
  const refreshed = (candidate) => withProxyName(candidate, freshProxyName(candidate.proxyName, attemptIndex));
  const candidates = [];
  const zip = knownZipFor(target, explicitZip);
  if (zip) {
    candidates.push(refreshed(zipTarget(target, zip)));
  }
  for (const city of target.cityCandidates) {
    candidates.push(refreshed({
      ...target,
      level: "city",
      city,
      proxyName: proxyName(target.country, target.state, city),
      config: {
        country: target.country,
        state: target.state,
        city
      }
    }));
  }
  candidates.push(refreshed(stateTarget(target)));
  return candidates;
}
|
|
2423
|
+
/**
 * Tries each fresh target candidate in turn and creates a residential proxy
 * for the first one the Kernel client accepts.
 *
 * @param {object} kernel - Kernel client exposing `proxies.create`.
 * @param {object} options - Reads `proxyZip`, `attemptIndex`, `proxyMode`, `configuredKernelProxyId`.
 * @param {object} target - Base location target.
 * @returns {Promise<object>} A `location_created` resolution flagged disposable on
 *   success; otherwise a `configured_fallback` resolution carrying the joined
 *   per-candidate error text.
 */
async function createFreshLocationProxy(kernel, options, target) {
  const failures = [];
  const candidates = freshTargetCandidates(target, options.proxyZip, options.attemptIndex);
  for (const candidate of candidates) {
    try {
      const created = await kernel.proxies.create({
        type: "residential",
        name: candidate.proxyName,
        config: candidate.level === "zip" ? { country: candidate.country, zip: candidate.zip } : candidate.config
      });
      if (!created.id) {
        failures.push(`${candidate.proxyName}: Kernel did not return a proxy id`);
        continue;
      }
      return resolution("location_created", options.proxyMode, created.id, candidate, null, true);
    } catch (err) {
      // A failed candidate is recorded and the next one is tried.
      failures.push(`${candidate.proxyName}: ${errorText2(err)}`);
    }
  }
  return resolution("configured_fallback", options.proxyMode, options.configuredKernelProxyId, target, failures.join(" | "));
}
|
|
2442
|
+
/**
 * Deletes a proxy through a Kernel client built from the given API key.
 * Does nothing when either argument is falsy.
 *
 * @param {string | undefined} kernelApiKey - API key used to construct the client.
 * @param {string | undefined} proxyId - Id of the proxy to delete.
 * @returns {Promise<void>}
 */
async function deleteKernelProxyId(kernelApiKey, proxyId) {
  const canDelete = Boolean(kernelApiKey) && Boolean(proxyId);
  if (canDelete) {
    const client = new Kernel2({ apiKey: kernelApiKey });
    await client.proxies.delete(proxyId);
  }
}
|
|
2381
2447
|
async function resolveKernelProxyId(options) {
|
|
2382
2448
|
if (options.proxyMode === "none") {
|
|
2383
2449
|
return resolution("disabled", options.proxyMode, void 0, null, null);
|
|
@@ -2392,6 +2458,9 @@ async function resolveKernelProxyId(options) {
|
|
|
2392
2458
|
const kernel = new Kernel2({ apiKey: options.kernelApiKey });
|
|
2393
2459
|
try {
|
|
2394
2460
|
const attemptIndex = options.attemptIndex ?? 0;
|
|
2461
|
+
if (options.fresh) {
|
|
2462
|
+
return await createFreshLocationProxy(kernel, options, target);
|
|
2463
|
+
}
|
|
2395
2464
|
if (attemptIndex >= 1) {
|
|
2396
2465
|
const escalatedTarget = escalatedTargetLevel(target, attemptIndex);
|
|
2397
2466
|
const createErrors2 = [];
|
|
@@ -2495,6 +2564,7 @@ async function resolveKernelProxyId(options) {
|
|
|
2495
2564
|
|
|
2496
2565
|
// src/harvest.ts
|
|
2497
2566
|
var MAX_ATTEMPTS = 3;
|
|
2567
|
+
var LOCATION_PROXY_MAX_ATTEMPTS = 5;
|
|
2498
2568
|
function abortReason(signal) {
|
|
2499
2569
|
if (signal.reason instanceof DOMException && signal.reason.name === "TimeoutError") return signal.reason;
|
|
2500
2570
|
return new RequestAbortedError();
|
|
@@ -2524,9 +2594,12 @@ async function emitAttemptEvent(sink, event) {
|
|
|
2524
2594
|
}
|
|
2525
2595
|
/**
 * Maps a thrown value to an attempt-outcome label.
 *
 * Typed errors are checked first, then DOM timeout/abort exceptions, then the
 * message text is matched against the proxy-failure patterns and finally the
 * generic timeout pattern.
 *
 * @param {unknown} err - Caught value.
 * @returns {string} One of "captcha", "location_mismatch", "request_aborted",
 *   "timeout", "proxy_tunnel_failed", "proxy_unavailable" or "error".
 */
function classifyAttemptError(err) {
  if (err instanceof CaptchaError) {
    return "captcha";
  }
  if (err instanceof LocationMismatchError) {
    return "location_mismatch";
  }
  if (err instanceof RequestAbortedError) {
    return "request_aborted";
  }
  if (err instanceof DOMException && (err.name === "TimeoutError" || err.name === "AbortError")) {
    return "timeout";
  }
  const text = err instanceof Error ? err.message : String(err);
  if (looksLikeProxyTunnelFailure(text)) {
    return "proxy_tunnel_failed";
  }
  if (looksLikeProxyUnavailable(text)) {
    return "proxy_unavailable";
  }
  if (/timeout|timed out|Timeout \d+ms exceeded|deadline/i.test(text)) {
    return "timeout";
  }
  return "error";
}
|
|
2532
2605
|
function classifyAttemptResult(result) {
|
|
@@ -2535,6 +2608,49 @@ function classifyAttemptResult(result) {
|
|
|
2535
2608
|
/**
 * Converts a thrown value into a message string.
 *
 * @param {unknown} err - Caught value.
 * @returns {string} `err.message` for `Error` instances, otherwise `String(err)`.
 */
function errorMessage(err) {
  if (err instanceof Error) {
    return err.message;
  }
  return String(err);
}
|
|
2611
|
+
/**
 * Picks the attempt limit for a proxy mode.
 *
 * @param {string} proxyMode - Requested proxy mode.
 * @returns {number} `LOCATION_PROXY_MAX_ATTEMPTS` for "location", otherwise `MAX_ATTEMPTS`.
 */
function maxAttemptsForProxyMode(proxyMode) {
  if (proxyMode === "location") {
    return LOCATION_PROXY_MAX_ATTEMPTS;
  }
  return MAX_ATTEMPTS;
}
|
|
2614
|
+
/**
 * Tests whether an error message matches one of the proxy tunnel/connection
 * failure patterns (case-insensitive).
 *
 * @param {string} message - Error text to inspect.
 * @returns {boolean} True when any pattern matches.
 */
function looksLikeProxyTunnelFailure(message) {
  const tunnelFailurePattern = /ERR_TUNNEL_CONNECTION_FAILED|ERR_PROXY_CONNECTION_FAILED|ERR_SOCKS_CONNECTION_FAILED|tunnel connection failed|proxy connection failed|transport error: proxy/i;
  return tunnelFailurePattern.test(message);
}
|
|
2617
|
+
/**
 * Tests whether an error message matches one of the proxy-unavailable
 * patterns (case-insensitive).
 *
 * @param {string} message - Error text to inspect.
 * @returns {boolean} True when any pattern matches.
 */
function looksLikeProxyUnavailable(message) {
  const unavailablePattern = /proxy unavailable|proxy_unavailable|connection_test_failed|did not return a proxy id|configured fallback/i;
  return unavailablePattern.test(message);
}
|
|
2620
|
+
/**
 * Tells whether an attempt outcome is one of the outcomes treated as
 * retryable: "captcha", "proxy_tunnel_failed" or "proxy_unavailable".
 *
 * @param {string} outcome - Outcome label.
 * @returns {boolean} True for the three retryable outcomes.
 */
function retryableLocationProxyError(outcome) {
  switch (outcome) {
    case "captcha":
    case "proxy_tunnel_failed":
    case "proxy_unavailable":
      return true;
    default:
      return false;
  }
}
|
|
2623
|
+
/**
 * Builds the human-readable message for a location mismatch.
 *
 * The expected location is taken from the location evidence, then from
 * `result.location`, then falls back to the literal "requested location".
 * Up to three evidence candidates are listed as "city, regionCode".
 *
 * @param {object} result - Harvest result; reads `diagnostics.debug.locationEvidence` and `location`.
 * @returns {string} A message naming the observed candidates when there are
 *   any, otherwise a generic "different location" message.
 */
function locationMismatchMessage(result) {
  const evidence = result.diagnostics.debug?.locationEvidence;
  const expected = evidence?.expected?.canonicalLocation ?? result.location ?? "requested location";
  // Evidence may be present without a candidate list; treat that as "no
  // candidates" instead of throwing while formatting an error message.
  const observed = Array.isArray(evidence?.candidates) ? evidence.candidates : [];
  const candidates = observed.slice(0, 3).map((candidate) => `${candidate.city}, ${candidate.regionCode}`).join("; ");
  return candidates ? `Google returned results for ${candidates}, not ${expected}` : `Google returned results for a different location than ${expected}`;
}
|
|
2629
|
+
/**
 * Tells whether a result should be treated as a location mismatch.
 *
 * @param {object} result - Harvest result; reads `diagnostics.debug.locationEvidence.status`.
 * @param {string} proxyMode - Requested proxy mode.
 * @returns {boolean} True only in "location" mode when the evidence status is "mismatch".
 */
function shouldRetryLocationMismatch(result, proxyMode) {
  if (proxyMode !== "location") {
    return false;
  }
  const evidenceStatus = result.diagnostics.debug?.locationEvidence?.status;
  return evidenceStatus === "mismatch";
}
|
|
2632
|
+
/**
 * Removes `diagnostics.debug` from a result unless the caller asked to keep it.
 *
 * @param {object} result - Harvest result with a `diagnostics` object.
 * @param {boolean} keepDebug - When truthy the result is returned untouched.
 * @returns {object} The same object when nothing needs stripping, otherwise a
 *   shallow copy whose `diagnostics` copy has no `debug` key.
 */
function stripInternalDebug(result, keepDebug) {
  if (keepDebug) {
    return result;
  }
  if (!result.diagnostics.debug) {
    return result;
  }
  const { debug: _omitted, ...diagnostics } = result.diagnostics;
  return { ...result, diagnostics };
}
|
|
2638
|
+
/**
 * Best-effort deletion of a disposable proxy. Logs a JSON line on success
 * (`console.info`) or failure (`console.warn`); a failed delete is not
 * rethrown. Does nothing when either argument is falsy.
 *
 * @param {string | undefined} kernelApiKey - API key passed to `deleteKernelProxyId`.
 * @param {string | undefined} proxyId - Id of the proxy to delete.
 * @returns {Promise<void>}
 */
async function cleanupDisposableProxy(kernelApiKey, proxyId) {
  const nothingToDelete = !kernelApiKey || !proxyId;
  if (nothingToDelete) {
    return;
  }
  // Only the id's tail is written to the logs.
  const suffixOf = (id) => id.slice(-6);
  try {
    await deleteKernelProxyId(kernelApiKey, proxyId);
    const deleted = {
      event: "kernel_proxy_deleted",
      proxy_id_suffix: suffixOf(proxyId)
    };
    console.info(JSON.stringify(deleted));
  } catch (cause) {
    const failure = {
      event: "kernel_proxy_delete_failed",
      proxy_id_suffix: suffixOf(proxyId),
      message: errorMessage(cause)
    };
    console.warn(JSON.stringify(failure));
  }
}
|
|
2538
2654
|
async function extractOnce(options, signal) {
|
|
2539
2655
|
const driver = new BrowserDriver();
|
|
2540
2656
|
const reporter = new ProgressReporter();
|
|
@@ -2602,26 +2718,35 @@ async function harvest(rawOptions) {
|
|
|
2602
2718
|
proxyZip: typeof raw.proxyZip === "string" ? raw.proxyZip : void 0,
|
|
2603
2719
|
gl: typeof raw.gl === "string" ? raw.gl : "us"
|
|
2604
2720
|
};
|
|
2721
|
+
const requestedDebug = typeof raw.debug === "boolean" ? raw.debug : false;
|
|
2722
|
+
const needsLocationEvidence = proxyMode === "location" && Boolean(proxyOpts.location);
|
|
2723
|
+
const maxAttempts = maxAttemptsForProxyMode(proxyMode);
|
|
2605
2724
|
const serializer = new OutputSerializer();
|
|
2606
|
-
|
|
2725
|
+
let lastError = null;
|
|
2726
|
+
for (let i = 0; i < maxAttempts; i++) {
|
|
2607
2727
|
const attemptNumber = i + 1;
|
|
2608
2728
|
const startedAtMs = Date.now();
|
|
2609
2729
|
try {
|
|
2610
2730
|
if (signal?.aborted) throw abortReason(signal);
|
|
2611
|
-
const resolution2 = await resolveKernelProxyId({
|
|
2731
|
+
const resolution2 = await resolveKernelProxyId({
|
|
2732
|
+
...proxyOpts,
|
|
2733
|
+
attemptIndex: i,
|
|
2734
|
+
fresh: proxyMode === "location"
|
|
2735
|
+
});
|
|
2612
2736
|
const mergedAttempt = {
|
|
2613
2737
|
...raw,
|
|
2614
2738
|
kernelApiKey,
|
|
2615
2739
|
kernelProxyId: resolution2.kernelProxyId,
|
|
2616
2740
|
kernelProxyResolution: resolution2.resolution,
|
|
2617
|
-
proxyMode
|
|
2741
|
+
proxyMode,
|
|
2742
|
+
debug: requestedDebug || needsLocationEvidence
|
|
2618
2743
|
};
|
|
2619
2744
|
if (proxyMode === "none") mergedAttempt.kernelProxyId = void 0;
|
|
2620
2745
|
const attemptOptions = HarvestOptionsSchema.parse(mergedAttempt);
|
|
2621
2746
|
await emitAttemptEvent(onAttemptEvent, {
|
|
2622
2747
|
type: "started",
|
|
2623
2748
|
attemptNumber,
|
|
2624
|
-
maxAttempts
|
|
2749
|
+
maxAttempts,
|
|
2625
2750
|
query: attemptOptions.query,
|
|
2626
2751
|
location: attemptOptions.location ?? null,
|
|
2627
2752
|
maxQuestions: attemptOptions.maxQuestions,
|
|
@@ -2630,7 +2755,7 @@ async function harvest(rawOptions) {
|
|
|
2630
2755
|
console.info(JSON.stringify({
|
|
2631
2756
|
event: "harvest_attempt_started",
|
|
2632
2757
|
attempt_number: attemptNumber,
|
|
2633
|
-
max_attempts:
|
|
2758
|
+
max_attempts: maxAttempts,
|
|
2634
2759
|
query: attemptOptions.query,
|
|
2635
2760
|
location: attemptOptions.location ?? null,
|
|
2636
2761
|
max_questions: attemptOptions.maxQuestions
|
|
@@ -2638,57 +2763,84 @@ async function harvest(rawOptions) {
|
|
|
2638
2763
|
const attempt = await extractOnce(attemptOptions, signal);
|
|
2639
2764
|
if (attempt.error) {
|
|
2640
2765
|
const err = attempt.error;
|
|
2641
|
-
|
|
2642
|
-
|
|
2766
|
+
const outcome = classifyAttemptError(err);
|
|
2767
|
+
const willRetry = i < maxAttempts - 1 && (outcome === "captcha" || proxyMode === "location" && retryableLocationProxyError(outcome));
|
|
2768
|
+
if (outcome === "captcha") {
|
|
2643
2769
|
console.warn(JSON.stringify({
|
|
2644
2770
|
event: "harvest_attempt_captcha",
|
|
2645
2771
|
attempt_number: attemptNumber,
|
|
2646
|
-
max_attempts:
|
|
2647
|
-
message: err
|
|
2772
|
+
max_attempts: maxAttempts,
|
|
2773
|
+
message: errorMessage(err),
|
|
2774
|
+
will_retry: willRetry
|
|
2775
|
+
}));
|
|
2776
|
+
} else if (willRetry) {
|
|
2777
|
+
console.warn(JSON.stringify({
|
|
2778
|
+
event: "harvest_attempt_proxy_retry",
|
|
2779
|
+
attempt_number: attemptNumber,
|
|
2780
|
+
max_attempts: maxAttempts,
|
|
2781
|
+
outcome,
|
|
2782
|
+
message: errorMessage(err),
|
|
2648
2783
|
will_retry: willRetry
|
|
2649
2784
|
}));
|
|
2650
|
-
await emitAttemptEvent(onAttemptEvent, {
|
|
2651
|
-
type: "finished",
|
|
2652
|
-
attemptNumber,
|
|
2653
|
-
maxAttempts: MAX_ATTEMPTS,
|
|
2654
|
-
outcome: "captcha",
|
|
2655
|
-
kernelSessionId: attempt.cleanup.kernelSessionId,
|
|
2656
|
-
questionCount: 0,
|
|
2657
|
-
durationMs: Date.now() - startedAtMs,
|
|
2658
|
-
error: err.message,
|
|
2659
|
-
willRetry,
|
|
2660
|
-
cleanup: attempt.cleanup,
|
|
2661
|
-
debug: attempt.debug,
|
|
2662
|
-
completedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
2663
|
-
});
|
|
2664
|
-
if (willRetry) continue;
|
|
2665
|
-
break;
|
|
2666
2785
|
}
|
|
2667
2786
|
await emitAttemptEvent(onAttemptEvent, {
|
|
2668
2787
|
type: "finished",
|
|
2669
2788
|
attemptNumber,
|
|
2670
|
-
maxAttempts
|
|
2671
|
-
outcome
|
|
2789
|
+
maxAttempts,
|
|
2790
|
+
outcome,
|
|
2672
2791
|
kernelSessionId: attempt.cleanup.kernelSessionId,
|
|
2673
2792
|
questionCount: 0,
|
|
2674
2793
|
durationMs: Date.now() - startedAtMs,
|
|
2675
2794
|
error: errorMessage(err),
|
|
2676
|
-
willRetry
|
|
2795
|
+
willRetry,
|
|
2677
2796
|
cleanup: attempt.cleanup,
|
|
2678
2797
|
debug: attempt.debug,
|
|
2679
2798
|
completedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
2680
2799
|
});
|
|
2681
|
-
|
|
2800
|
+
await cleanupDisposableProxy(kernelApiKey, resolution2.disposableProxyId);
|
|
2801
|
+
lastError = err;
|
|
2802
|
+
if (willRetry) continue;
|
|
2803
|
+
break;
|
|
2682
2804
|
}
|
|
2683
2805
|
const result = attempt.result;
|
|
2684
2806
|
if (!result) throw new Error("Harvest attempt completed without a result");
|
|
2807
|
+
if (shouldRetryLocationMismatch(result, proxyMode)) {
|
|
2808
|
+
const err = new LocationMismatchError(locationMismatchMessage(result));
|
|
2809
|
+
const willRetry = i < maxAttempts - 1;
|
|
2810
|
+
console.warn(JSON.stringify({
|
|
2811
|
+
event: "harvest_attempt_location_mismatch",
|
|
2812
|
+
attempt_number: attemptNumber,
|
|
2813
|
+
max_attempts: maxAttempts,
|
|
2814
|
+
message: err.message,
|
|
2815
|
+
will_retry: willRetry
|
|
2816
|
+
}));
|
|
2817
|
+
await emitAttemptEvent(onAttemptEvent, {
|
|
2818
|
+
type: "finished",
|
|
2819
|
+
attemptNumber,
|
|
2820
|
+
maxAttempts,
|
|
2821
|
+
outcome: "location_mismatch",
|
|
2822
|
+
kernelSessionId: attempt.cleanup.kernelSessionId,
|
|
2823
|
+
questionCount: result.totalQuestions,
|
|
2824
|
+
durationMs: Date.now() - startedAtMs,
|
|
2825
|
+
error: err.message,
|
|
2826
|
+
willRetry,
|
|
2827
|
+
cleanup: attempt.cleanup,
|
|
2828
|
+
debug: attempt.debug,
|
|
2829
|
+
completedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
2830
|
+
});
|
|
2831
|
+
await cleanupDisposableProxy(kernelApiKey, resolution2.disposableProxyId);
|
|
2832
|
+
lastError = err;
|
|
2833
|
+
if (willRetry) continue;
|
|
2834
|
+
break;
|
|
2835
|
+
}
|
|
2836
|
+
const finalResult = stripInternalDebug(result, requestedDebug);
|
|
2685
2837
|
await emitAttemptEvent(onAttemptEvent, {
|
|
2686
2838
|
type: "finished",
|
|
2687
2839
|
attemptNumber,
|
|
2688
|
-
maxAttempts
|
|
2689
|
-
outcome: classifyAttemptResult(
|
|
2840
|
+
maxAttempts,
|
|
2841
|
+
outcome: classifyAttemptResult(finalResult),
|
|
2690
2842
|
kernelSessionId: attempt.cleanup.kernelSessionId,
|
|
2691
|
-
questionCount:
|
|
2843
|
+
questionCount: finalResult.totalQuestions,
|
|
2692
2844
|
durationMs: Date.now() - startedAtMs,
|
|
2693
2845
|
error: null,
|
|
2694
2846
|
willRetry: false,
|
|
@@ -2696,64 +2848,52 @@ async function harvest(rawOptions) {
|
|
|
2696
2848
|
debug: attempt.debug,
|
|
2697
2849
|
completedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
2698
2850
|
});
|
|
2851
|
+
await cleanupDisposableProxy(kernelApiKey, resolution2.disposableProxyId);
|
|
2699
2852
|
if (attemptOptions.format === "json" || attemptOptions.format === "both") {
|
|
2700
|
-
await serializer.writeJSON(
|
|
2853
|
+
await serializer.writeJSON(finalResult, attemptOptions.outputDir);
|
|
2701
2854
|
}
|
|
2702
2855
|
if (attemptOptions.format === "csv" || attemptOptions.format === "both") {
|
|
2703
2856
|
await Promise.all([
|
|
2704
|
-
serializer.writeCSV(
|
|
2705
|
-
|
|
2706
|
-
|
|
2707
|
-
|
|
2708
|
-
|
|
2709
|
-
|
|
2857
|
+
serializer.writeCSV(finalResult.flat, attemptOptions.outputDir),
|
|
2858
|
+
finalResult.videos.length > 0 ? serializer.writeVideoCSV(finalResult.videos, finalResult.seed, attemptOptions.outputDir) : Promise.resolve(""),
|
|
2859
|
+
finalResult.forums.length > 0 ? serializer.writeForumCSV(finalResult.forums, finalResult.seed, attemptOptions.outputDir) : Promise.resolve(""),
|
|
2860
|
+
finalResult.aiOverview.detected ? serializer.writeAIOverviewCSV(finalResult.aiOverview.citations, finalResult.aiOverview.text, finalResult.seed, attemptOptions.outputDir) : Promise.resolve(""),
|
|
2861
|
+
finalResult.aiMode.detected ? serializer.writeAIModeCSV(finalResult.aiMode.citations, finalResult.aiMode.text, finalResult.seed, attemptOptions.outputDir) : Promise.resolve(""),
|
|
2862
|
+
finalResult.whatPeopleSaying.length > 0 ? serializer.writeWhatPeopleSayingCSV(finalResult.whatPeopleSaying, finalResult.seed, attemptOptions.outputDir) : Promise.resolve("")
|
|
2710
2863
|
]);
|
|
2711
2864
|
}
|
|
2712
|
-
return
|
|
2865
|
+
return finalResult;
|
|
2713
2866
|
} catch (err) {
|
|
2714
|
-
|
|
2715
|
-
|
|
2867
|
+
const outcome = classifyAttemptError(err);
|
|
2868
|
+
const willRetry = i < maxAttempts - 1 && (outcome === "captcha" || proxyMode === "location" && retryableLocationProxyError(outcome));
|
|
2869
|
+
if (outcome === "captcha") {
|
|
2716
2870
|
console.warn(JSON.stringify({
|
|
2717
2871
|
event: "harvest_attempt_captcha",
|
|
2718
2872
|
attempt_number: attemptNumber,
|
|
2719
|
-
max_attempts:
|
|
2720
|
-
message: err
|
|
2873
|
+
max_attempts: maxAttempts,
|
|
2874
|
+
message: errorMessage(err),
|
|
2875
|
+
will_retry: willRetry
|
|
2876
|
+
}));
|
|
2877
|
+
} else if (willRetry) {
|
|
2878
|
+
console.warn(JSON.stringify({
|
|
2879
|
+
event: "harvest_attempt_proxy_retry",
|
|
2880
|
+
attempt_number: attemptNumber,
|
|
2881
|
+
max_attempts: maxAttempts,
|
|
2882
|
+
outcome,
|
|
2883
|
+
message: errorMessage(err),
|
|
2721
2884
|
will_retry: willRetry
|
|
2722
2885
|
}));
|
|
2723
|
-
await emitAttemptEvent(onAttemptEvent, {
|
|
2724
|
-
type: "finished",
|
|
2725
|
-
attemptNumber,
|
|
2726
|
-
maxAttempts: MAX_ATTEMPTS,
|
|
2727
|
-
outcome: "captcha",
|
|
2728
|
-
kernelSessionId: null,
|
|
2729
|
-
questionCount: 0,
|
|
2730
|
-
durationMs: Date.now() - startedAtMs,
|
|
2731
|
-
error: err.message,
|
|
2732
|
-
willRetry,
|
|
2733
|
-
cleanup: {
|
|
2734
|
-
kernelSessionId: null,
|
|
2735
|
-
kernelDeleteStarted: false,
|
|
2736
|
-
kernelDeleteSucceeded: null,
|
|
2737
|
-
kernelDeleteError: null,
|
|
2738
|
-
browserCloseSucceeded: null,
|
|
2739
|
-
browserCloseError: null
|
|
2740
|
-
},
|
|
2741
|
-
debug: null,
|
|
2742
|
-
completedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
2743
|
-
});
|
|
2744
|
-
if (willRetry) continue;
|
|
2745
|
-
break;
|
|
2746
2886
|
}
|
|
2747
2887
|
await emitAttemptEvent(onAttemptEvent, {
|
|
2748
2888
|
type: "finished",
|
|
2749
2889
|
attemptNumber,
|
|
2750
|
-
maxAttempts
|
|
2751
|
-
outcome
|
|
2890
|
+
maxAttempts,
|
|
2891
|
+
outcome,
|
|
2752
2892
|
kernelSessionId: null,
|
|
2753
2893
|
questionCount: 0,
|
|
2754
2894
|
durationMs: Date.now() - startedAtMs,
|
|
2755
2895
|
error: errorMessage(err),
|
|
2756
|
-
willRetry
|
|
2896
|
+
willRetry,
|
|
2757
2897
|
cleanup: {
|
|
2758
2898
|
kernelSessionId: null,
|
|
2759
2899
|
kernelDeleteStarted: false,
|
|
@@ -2765,15 +2905,19 @@ async function harvest(rawOptions) {
|
|
|
2765
2905
|
debug: null,
|
|
2766
2906
|
completedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
2767
2907
|
});
|
|
2908
|
+
lastError = err;
|
|
2909
|
+
if (willRetry) continue;
|
|
2910
|
+
if (outcome === "captcha") break;
|
|
2768
2911
|
throw err;
|
|
2769
2912
|
}
|
|
2770
2913
|
}
|
|
2914
|
+
if (lastError && !(lastError instanceof CaptchaError)) throw lastError;
|
|
2771
2915
|
console.warn(JSON.stringify({
|
|
2772
2916
|
event: "harvest_captcha_exhausted",
|
|
2773
|
-
max_attempts:
|
|
2917
|
+
max_attempts: maxAttempts,
|
|
2774
2918
|
session_kind: kernelApiKey ? "kernel" : "local"
|
|
2775
2919
|
}));
|
|
2776
|
-
throw new CaptchaError(sanitizeVendorName(`CAPTCHA on all ${
|
|
2920
|
+
throw new CaptchaError(sanitizeVendorName(`CAPTCHA on all ${maxAttempts} fresh sessions. Try again in a few minutes.`));
|
|
2777
2921
|
}
|
|
2778
2922
|
|
|
2779
2923
|
export {
|
|
@@ -2788,7 +2932,8 @@ export {
|
|
|
2788
2932
|
MapsSelectors,
|
|
2789
2933
|
buildYouTubeChannelVideosUrl,
|
|
2790
2934
|
BrowserDriver,
|
|
2935
|
+
deleteKernelProxyId,
|
|
2791
2936
|
resolveKernelProxyId,
|
|
2792
2937
|
harvest
|
|
2793
2938
|
};
|
|
2794
|
-
//# sourceMappingURL=chunk-
|
|
2939
|
+
//# sourceMappingURL=chunk-MY3S7EX7.js.map
|