@grabbit-labs/dynafetch 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +94 -49
- package/dist/index.js.map +4 -4
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -9,6 +9,14 @@ var __require = /* @__PURE__ */ ((x) => typeof require !== "undefined" ? require
|
|
|
9
9
|
// ../dynafetch-core/src/index.ts
|
|
10
10
|
import * as net from "node:net";
|
|
11
11
|
|
|
12
|
+
// ../../src/phantom/log.ts
|
|
13
|
+
var enabled = process.env.DYNAFETCH_DEBUG === "1";
|
|
14
|
+
var log = enabled ? console.log.bind(console) : () => {
|
|
15
|
+
};
|
|
16
|
+
var warn = enabled ? console.warn.bind(console) : () => {
|
|
17
|
+
};
|
|
18
|
+
var error = console.error.bind(console);
|
|
19
|
+
|
|
12
20
|
// ../../src/phantom/execute.ts
|
|
13
21
|
import { JSDOM, VirtualConsole, CookieJar } from "jsdom";
|
|
14
22
|
import WebSocket from "ws";
|
|
@@ -161,11 +169,18 @@ function createWorkerCommand() {
|
|
|
161
169
|
}
|
|
162
170
|
function createWorkerTransport() {
|
|
163
171
|
const { command, args, cwd } = createWorkerCommand();
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
172
|
+
let child;
|
|
173
|
+
try {
|
|
174
|
+
child = spawn(command, args, {
|
|
175
|
+
cwd,
|
|
176
|
+
stdio: ["pipe", "pipe", "pipe"],
|
|
177
|
+
env: process.env
|
|
178
|
+
});
|
|
179
|
+
} catch (err) {
|
|
180
|
+
return Promise.reject(
|
|
181
|
+
new Error(`Failed to start dynafetch-net TLS proxy: ${err instanceof Error ? err.message : String(err)}. Binary: ${command}`)
|
|
182
|
+
);
|
|
183
|
+
}
|
|
169
184
|
const pending = /* @__PURE__ */ new Map();
|
|
170
185
|
const rl = readline.createInterface({ input: child.stdout });
|
|
171
186
|
rl.on("line", (line) => {
|
|
@@ -174,9 +189,9 @@ function createWorkerTransport() {
|
|
|
174
189
|
let payload;
|
|
175
190
|
try {
|
|
176
191
|
payload = JSON.parse(trimmed);
|
|
177
|
-
} catch (
|
|
192
|
+
} catch (error2) {
|
|
178
193
|
for (const entry of pending.values()) {
|
|
179
|
-
entry.reject(new Error(`Invalid dynafetch-net response: ${String(
|
|
194
|
+
entry.reject(new Error(`Invalid dynafetch-net response: ${String(error2)}`));
|
|
180
195
|
}
|
|
181
196
|
pending.clear();
|
|
182
197
|
return;
|
|
@@ -193,7 +208,7 @@ function createWorkerTransport() {
|
|
|
193
208
|
child.stderr.on("data", (chunk) => {
|
|
194
209
|
const message = chunk.toString().trim();
|
|
195
210
|
if (message) {
|
|
196
|
-
console.warn(`[dynafetch-net] ${message}`);
|
|
211
|
+
if (process.env.DYNAFETCH_DEBUG === "1") console.warn(`[dynafetch-net] ${message}`);
|
|
197
212
|
}
|
|
198
213
|
});
|
|
199
214
|
const onExit = (code, signal) => {
|
|
@@ -204,15 +219,29 @@ function createWorkerTransport() {
|
|
|
204
219
|
pending.clear();
|
|
205
220
|
transportPromise = null;
|
|
206
221
|
};
|
|
207
|
-
child.once("error", (
|
|
222
|
+
child.once("error", (error2) => {
|
|
208
223
|
for (const entry of pending.values()) {
|
|
209
|
-
entry.reject(
|
|
224
|
+
entry.reject(error2);
|
|
210
225
|
}
|
|
211
226
|
pending.clear();
|
|
212
227
|
transportPromise = null;
|
|
213
228
|
});
|
|
214
229
|
child.once("exit", onExit);
|
|
215
|
-
return Promise
|
|
230
|
+
return new Promise((resolve, reject) => {
|
|
231
|
+
let settled = false;
|
|
232
|
+
child.once("error", (err) => {
|
|
233
|
+
if (!settled) {
|
|
234
|
+
settled = true;
|
|
235
|
+
reject(new Error(`Failed to start dynafetch-net TLS proxy: ${err.message}. Binary: ${command}`));
|
|
236
|
+
}
|
|
237
|
+
});
|
|
238
|
+
child.once("spawn", () => {
|
|
239
|
+
if (!settled) {
|
|
240
|
+
settled = true;
|
|
241
|
+
resolve({ child, pending });
|
|
242
|
+
}
|
|
243
|
+
});
|
|
244
|
+
});
|
|
216
245
|
}
|
|
217
246
|
async function getWorkerTransport() {
|
|
218
247
|
if (!transportPromise) {
|
|
@@ -220,17 +249,31 @@ async function getWorkerTransport() {
|
|
|
220
249
|
}
|
|
221
250
|
return transportPromise;
|
|
222
251
|
}
|
|
223
|
-
async function callWorker(method, params) {
|
|
252
|
+
async function callWorker(method, params, timeoutMs = 3e4) {
|
|
224
253
|
const transport = await getWorkerTransport();
|
|
225
254
|
const id = randomUUID();
|
|
226
255
|
const payload = JSON.stringify({ id, method, params });
|
|
227
256
|
return await new Promise((resolve, reject) => {
|
|
228
|
-
|
|
257
|
+
const timer = setTimeout(() => {
|
|
258
|
+
transport.pending.delete(id);
|
|
259
|
+
reject(new Error(`dynafetch-net request timed out after ${timeoutMs}ms (method: ${method})`));
|
|
260
|
+
}, timeoutMs);
|
|
261
|
+
transport.pending.set(id, {
|
|
262
|
+
resolve: (value) => {
|
|
263
|
+
clearTimeout(timer);
|
|
264
|
+
resolve(value);
|
|
265
|
+
},
|
|
266
|
+
reject: (err) => {
|
|
267
|
+
clearTimeout(timer);
|
|
268
|
+
reject(err);
|
|
269
|
+
}
|
|
270
|
+
});
|
|
229
271
|
transport.child.stdin.write(`${payload}
|
|
230
|
-
`, (
|
|
231
|
-
if (!
|
|
272
|
+
`, (error2) => {
|
|
273
|
+
if (!error2) return;
|
|
274
|
+
clearTimeout(timer);
|
|
232
275
|
transport.pending.delete(id);
|
|
233
|
-
reject(
|
|
276
|
+
reject(error2);
|
|
234
277
|
});
|
|
235
278
|
});
|
|
236
279
|
}
|
|
@@ -313,9 +356,9 @@ async function phantomFetch(payload) {
|
|
|
313
356
|
}
|
|
314
357
|
try {
|
|
315
358
|
return await dynafetchWorkerFetch(payload);
|
|
316
|
-
} catch (
|
|
359
|
+
} catch (error2) {
|
|
317
360
|
if (process.env.DYNAFETCH_DISABLE_DIRECT_FALLBACK === "1") {
|
|
318
|
-
throw
|
|
361
|
+
throw error2;
|
|
319
362
|
}
|
|
320
363
|
return await directFetch(payload);
|
|
321
364
|
}
|
|
@@ -337,9 +380,9 @@ async function phantomBatchFetch(payloads) {
|
|
|
337
380
|
finalUrl: r.finalUrl,
|
|
338
381
|
error: r.error
|
|
339
382
|
}));
|
|
340
|
-
} catch (
|
|
383
|
+
} catch (error2) {
|
|
341
384
|
if (process.env.DYNAFETCH_DISABLE_DIRECT_FALLBACK === "1") {
|
|
342
|
-
throw
|
|
385
|
+
throw error2;
|
|
343
386
|
}
|
|
344
387
|
return Promise.all(payloads.map((p) => directFetch(p)));
|
|
345
388
|
}
|
|
@@ -975,7 +1018,7 @@ var Executor = class {
|
|
|
975
1018
|
const taskId = this.trackTaskStart("module_bundle", cacheKey, this.moduleWaitMs);
|
|
976
1019
|
try {
|
|
977
1020
|
if (process.env.PHANTOM_DEBUG_MODULES === "1") {
|
|
978
|
-
|
|
1021
|
+
log("[Executor] Bundling module entry:", cacheKey);
|
|
979
1022
|
}
|
|
980
1023
|
const cached = this.moduleBundleCache.get(cacheKey);
|
|
981
1024
|
if (cached) {
|
|
@@ -1100,7 +1143,7 @@ var Executor = class {
|
|
|
1100
1143
|
this.moduleBundleCache.set(cacheKey, transformed);
|
|
1101
1144
|
if (!this.windowClosed) window.eval(transformed);
|
|
1102
1145
|
if (process.env.PHANTOM_DEBUG_MODULES === "1") {
|
|
1103
|
-
|
|
1146
|
+
log("[Executor] Module bundle eval complete:", cacheKey);
|
|
1104
1147
|
}
|
|
1105
1148
|
} catch (e) {
|
|
1106
1149
|
this.recordExecutionError(e, "unhandledRejection");
|
|
@@ -1437,9 +1480,9 @@ var Executor = class {
|
|
|
1437
1480
|
process.on("unhandledRejection", onNodeUnhandled);
|
|
1438
1481
|
try {
|
|
1439
1482
|
const virtualConsole = new VirtualConsole();
|
|
1440
|
-
virtualConsole.on("log", (...args) =>
|
|
1483
|
+
virtualConsole.on("log", (...args) => log("[JSDOM Log]", ...args));
|
|
1441
1484
|
virtualConsole.on("error", (...args) => console.error("[JSDOM Error]", ...args));
|
|
1442
|
-
virtualConsole.on("warn", (...args) =>
|
|
1485
|
+
virtualConsole.on("warn", (...args) => warn("[JSDOM Warn]", ...args));
|
|
1443
1486
|
const cookieJar = new CookieJar();
|
|
1444
1487
|
this.harvestData.cookies.forEach((c) => {
|
|
1445
1488
|
try {
|
|
@@ -1619,7 +1662,7 @@ var Executor = class {
|
|
|
1619
1662
|
if (that.handledModuleScriptUrls.has(abs)) return;
|
|
1620
1663
|
that.handledModuleScriptUrls.add(abs);
|
|
1621
1664
|
if (process.env.PHANTOM_DEBUG_MODULES === "1") {
|
|
1622
|
-
|
|
1665
|
+
log("[Executor] Detected module script:", abs);
|
|
1623
1666
|
}
|
|
1624
1667
|
void that.handleModuleScript(abs, window2);
|
|
1625
1668
|
return;
|
|
@@ -1778,7 +1821,7 @@ var Executor = class {
|
|
|
1778
1821
|
});
|
|
1779
1822
|
}
|
|
1780
1823
|
} catch (e) {
|
|
1781
|
-
|
|
1824
|
+
warn(`[Executor] Module script ${script.id} failed:`, e);
|
|
1782
1825
|
} finally {
|
|
1783
1826
|
currentScriptState.value = prevCurrentScript;
|
|
1784
1827
|
}
|
|
@@ -1794,7 +1837,7 @@ var Executor = class {
|
|
|
1794
1837
|
try {
|
|
1795
1838
|
window.eval(code);
|
|
1796
1839
|
} catch (e) {
|
|
1797
|
-
|
|
1840
|
+
warn(`[Executor] Script ${script.id} failed:`, e);
|
|
1798
1841
|
} finally {
|
|
1799
1842
|
currentScriptState.value = prevCurrentScript;
|
|
1800
1843
|
}
|
|
@@ -1834,16 +1877,16 @@ var Executor = class {
|
|
|
1834
1877
|
if (this.moduleInFlight.size > 0) {
|
|
1835
1878
|
await this.waitForModuleWork(this.moduleWaitMs);
|
|
1836
1879
|
}
|
|
1837
|
-
|
|
1880
|
+
log("[Executor] Waiting for network quiescence...");
|
|
1838
1881
|
const quiescenceStart = Date.now();
|
|
1839
1882
|
try {
|
|
1840
1883
|
await this.waitForQuiescence();
|
|
1841
1884
|
} catch (e) {
|
|
1842
|
-
|
|
1885
|
+
warn("[Executor] Quiescence wait failed:", e);
|
|
1843
1886
|
}
|
|
1844
1887
|
this.timings.quiescence_ms = Date.now() - quiescenceStart;
|
|
1845
1888
|
const reason = this.matchFound && !this.findAll ? "(early exit on match)" : "";
|
|
1846
|
-
|
|
1889
|
+
log(`[Executor] Quiescence reached in ${Date.now() - quiescenceStart}ms ${reason}`);
|
|
1847
1890
|
const renderedHtml = this.serializeDocument(window);
|
|
1848
1891
|
this.windowClosed = true;
|
|
1849
1892
|
try {
|
|
@@ -2106,7 +2149,7 @@ var Executor = class {
|
|
|
2106
2149
|
this.asyncFlag = async !== false;
|
|
2107
2150
|
this.aborted = false;
|
|
2108
2151
|
if (process.env.PHANTOM_DEBUG_XHR === "1") {
|
|
2109
|
-
|
|
2152
|
+
log("[XHR open]", this.method, this.url);
|
|
2110
2153
|
}
|
|
2111
2154
|
this.readyState = 1;
|
|
2112
2155
|
this.responseURL = this.url;
|
|
@@ -2159,7 +2202,7 @@ var Executor = class {
|
|
|
2159
2202
|
};
|
|
2160
2203
|
that.logRequest(logEntry);
|
|
2161
2204
|
if (process.env.PHANTOM_DEBUG_XHR === "1") {
|
|
2162
|
-
|
|
2205
|
+
log("[XHR send]", this.method, this.url, {
|
|
2163
2206
|
hasBody: body != null,
|
|
2164
2207
|
headers
|
|
2165
2208
|
});
|
|
@@ -2326,7 +2369,9 @@ async function prefetchModuleGraph(rootUrls, cache, pageUrl, opts) {
|
|
|
2326
2369
|
}
|
|
2327
2370
|
for (let round = 0; round < maxRounds && toFetch.length > 0; round++) {
|
|
2328
2371
|
if (process.env.PHANTOM_DEBUG_MODULES === "1") {
|
|
2329
|
-
|
|
2372
|
+
if (process.env.DYNAFETCH_DEBUG === "1") {
|
|
2373
|
+
console.log(`[prefetch] Round ${round}: ${toFetch.length} modules`);
|
|
2374
|
+
}
|
|
2330
2375
|
}
|
|
2331
2376
|
const payloads = toFetch.map((u) => ({
|
|
2332
2377
|
method: "GET",
|
|
@@ -2429,7 +2474,7 @@ var Harvester = class {
|
|
|
2429
2474
|
const location = data.headers["Location"] || data.headers["location"];
|
|
2430
2475
|
if (location) {
|
|
2431
2476
|
currentUrl = new URL(location, currentUrl).toString();
|
|
2432
|
-
|
|
2477
|
+
log(`[Harvest] Following redirect to: ${currentUrl}`);
|
|
2433
2478
|
redirectCount++;
|
|
2434
2479
|
if (data.status === 302 || data.status === 303) {
|
|
2435
2480
|
currentMethod = "GET";
|
|
@@ -2500,10 +2545,10 @@ var Harvester = class {
|
|
|
2500
2545
|
return b.includes("just a moment") || b.includes("challenge-platform") || b.includes("__cf_chl") || b.includes("cf-browser-verification") || b.includes("enable javascript and cookies to continue") || b.includes("security verification") || b.includes("captcha") || b.includes("trkcode=") || b.includes("trkinfo=");
|
|
2501
2546
|
}
|
|
2502
2547
|
async harvest() {
|
|
2503
|
-
|
|
2548
|
+
log(`[Harvest] Fetching ${this.targetUrl} via TLS Proxy...`);
|
|
2504
2549
|
let response = await this.fetchViaProxy(this.targetUrl, { ...DEFAULT_HEADERS, ...this.requestHeaders }, true);
|
|
2505
2550
|
if (response.status >= 400) {
|
|
2506
|
-
|
|
2551
|
+
log(`[Harvest] Response Body on Error:`, response.body.substring(0, 500));
|
|
2507
2552
|
if (this.looksBlocked(response.status, response.body || "")) {
|
|
2508
2553
|
throw new BlockedByBotProtectionError(
|
|
2509
2554
|
this.targetUrl,
|
|
@@ -2515,7 +2560,7 @@ var Harvester = class {
|
|
|
2515
2560
|
let finalUrl = response.finalUrl;
|
|
2516
2561
|
let html = response.body;
|
|
2517
2562
|
if (this.isConsentWall(finalUrl, html)) {
|
|
2518
|
-
|
|
2563
|
+
log(`[Harvest] Consent wall detected at ${finalUrl}, attempting bypass...`);
|
|
2519
2564
|
const form = this.parseConsentForm(html, finalUrl);
|
|
2520
2565
|
if (form) {
|
|
2521
2566
|
try {
|
|
@@ -2529,32 +2574,32 @@ var Harvester = class {
|
|
|
2529
2574
|
};
|
|
2530
2575
|
const consentResp = await this.fetchViaProxy(form.action, postHeaders, true, 10, "POST", formBody);
|
|
2531
2576
|
if (consentResp.status < 400) {
|
|
2532
|
-
|
|
2577
|
+
log(`[Harvest] Consent POST succeeded (${consentResp.status}), final URL: ${consentResp.finalUrl}`);
|
|
2533
2578
|
if (!this.isConsentWall(consentResp.finalUrl, consentResp.body)) {
|
|
2534
2579
|
response = consentResp;
|
|
2535
2580
|
finalUrl = consentResp.finalUrl;
|
|
2536
2581
|
html = consentResp.body;
|
|
2537
|
-
|
|
2582
|
+
log(`[Harvest] Consent bypass successful (from redirect), got real page at ${finalUrl}`);
|
|
2538
2583
|
} else {
|
|
2539
|
-
|
|
2584
|
+
log(`[Harvest] Consent redirect still on consent page, re-fetching original URL...`);
|
|
2540
2585
|
const retryResp = await this.fetchViaProxy(this.targetUrl, { ...DEFAULT_HEADERS, ...this.requestHeaders }, true);
|
|
2541
2586
|
if (retryResp.status < 400 && !this.isConsentWall(retryResp.finalUrl, retryResp.body)) {
|
|
2542
2587
|
response = retryResp;
|
|
2543
2588
|
finalUrl = retryResp.finalUrl;
|
|
2544
2589
|
html = retryResp.body;
|
|
2545
|
-
|
|
2590
|
+
log(`[Harvest] Consent bypass successful (re-fetch), got real page at ${finalUrl}`);
|
|
2546
2591
|
} else {
|
|
2547
|
-
|
|
2592
|
+
warn(`[Harvest] Re-fetch after consent still returned consent wall, proceeding with original`);
|
|
2548
2593
|
}
|
|
2549
2594
|
}
|
|
2550
2595
|
} else {
|
|
2551
|
-
|
|
2596
|
+
warn(`[Harvest] Consent POST returned ${consentResp.status}, proceeding with consent page`);
|
|
2552
2597
|
}
|
|
2553
2598
|
} catch (e) {
|
|
2554
|
-
|
|
2599
|
+
warn(`[Harvest] Consent bypass failed, proceeding with consent page:`, e);
|
|
2555
2600
|
}
|
|
2556
2601
|
} else {
|
|
2557
|
-
|
|
2602
|
+
warn(`[Harvest] Could not parse consent form, proceeding with consent page`);
|
|
2558
2603
|
}
|
|
2559
2604
|
}
|
|
2560
2605
|
const $ = cheerio.load(html);
|
|
@@ -2663,7 +2708,7 @@ var Harvester = class {
|
|
|
2663
2708
|
}));
|
|
2664
2709
|
const allPayloads = [...scriptPayloads, ...preloadPayloads];
|
|
2665
2710
|
if (allPayloads.length > 0) {
|
|
2666
|
-
|
|
2711
|
+
log(`[Harvest] Batch-fetching ${scriptPayloads.length} scripts + ${preloadPayloads.length} modulepreloads...`);
|
|
2667
2712
|
const allResponses = await phantomBatchFetch(allPayloads);
|
|
2668
2713
|
for (let i = 0; i < batchScriptMeta.length; i++) {
|
|
2669
2714
|
const meta = batchScriptMeta[i];
|
|
@@ -2690,7 +2735,7 @@ var Harvester = class {
|
|
|
2690
2735
|
execution: meta.execution
|
|
2691
2736
|
});
|
|
2692
2737
|
} else {
|
|
2693
|
-
|
|
2738
|
+
warn(`[Harvest] Failed to fetch script ${meta.absoluteUrl}: status ${resp.status}`);
|
|
2694
2739
|
}
|
|
2695
2740
|
}
|
|
2696
2741
|
for (let i = 0; i < modulePreloadUrls.length; i++) {
|
|
@@ -2933,8 +2978,8 @@ function toWarnings(plan, errors, options) {
|
|
|
2933
2978
|
warnings.push("non-critical third-party scripts are skipped on the critical render path");
|
|
2934
2979
|
}
|
|
2935
2980
|
if (errors?.length) {
|
|
2936
|
-
for (const
|
|
2937
|
-
warnings.push(`${
|
|
2981
|
+
for (const error2 of errors.slice(0, 3)) {
|
|
2982
|
+
warnings.push(`${error2.source}: ${error2.message}`);
|
|
2938
2983
|
}
|
|
2939
2984
|
}
|
|
2940
2985
|
return warnings;
|