webpeel 0.7.1 → 0.8.0
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
- package/LICENSE +661 -21
- package/README.md +13 -2
- package/dist/core/strategies.d.ts +16 -22
- package/dist/core/strategies.d.ts.map +1 -1
- package/dist/core/strategies.js +156 -222
- package/dist/core/strategies.js.map +1 -1
- package/dist/core/strategy-hooks.d.ts +76 -0
- package/dist/core/strategy-hooks.d.ts.map +1 -0
- package/dist/core/strategy-hooks.js +33 -0
- package/dist/core/strategy-hooks.js.map +1 -0
- package/package.json +2 -2
package/README.md
CHANGED
|
@@ -11,7 +11,7 @@
|
|
|
11
11
|
<a href="https://github.com/webpeel/webpeel/stargazers"><img src="https://img.shields.io/github/stars/webpeel/webpeel.svg" alt="GitHub stars"></a>
|
|
12
12
|
<a href="https://github.com/webpeel/webpeel/actions/workflows/ci.yml"><img src="https://github.com/webpeel/webpeel/actions/workflows/ci.yml/badge.svg" alt="CI"></a>
|
|
13
13
|
<a href="https://www.typescriptlang.org/"><img src="https://img.shields.io/badge/TypeScript-5.6-blue.svg" alt="TypeScript"></a>
|
|
14
|
-
<a href="https://
|
|
14
|
+
<a href="https://www.gnu.org/licenses/agpl-3.0"><img src="https://img.shields.io/badge/License-AGPL%20v3-blue.svg" alt="AGPL v3 License"></a>
|
|
15
15
|
</p>
|
|
16
16
|
|
|
17
17
|
<p align="center">
|
|
@@ -251,7 +251,18 @@ See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
|
|
|
251
251
|
|
|
252
252
|
## License
|
|
253
253
|
|
|
254
|
-
|
|
254
|
+
This project is licensed under the [GNU Affero General Public License v3.0 (AGPL-3.0)](https://www.gnu.org/licenses/agpl-3.0.html).
|
|
255
|
+
|
|
256
|
+
**What this means:**
|
|
257
|
+
- ✅ Free to use, modify, and distribute
|
|
258
|
+
- ✅ Free for personal and commercial use
|
|
259
|
+
- ⚠️ If you run a modified version as a network service, you must release your source code under AGPL-3.0
|
|
260
|
+
|
|
261
|
+
**Need a commercial license?** Contact us at [support@webpeel.dev](mailto:support@webpeel.dev) for proprietary/enterprise licensing.
|
|
262
|
+
|
|
263
|
+
> **Note:** Versions 0.7.1 and earlier were released under MIT. Those releases remain MIT-licensed.
|
|
264
|
+
|
|
265
|
+
© [WebPeel](https://github.com/webpeel)
|
|
255
266
|
|
|
256
267
|
---
|
|
257
268
|
|
|
package/dist/core/strategies.d.ts
CHANGED
|
@@ -1,28 +1,23 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* Smart escalation strategy: try simple fetch first, escalate to browser if needed
|
|
2
|
+
* Smart escalation strategy: try simple fetch first, escalate to browser if needed.
|
|
3
|
+
*
|
|
4
|
+
* Premium server-side optimisations (SWR cache, domain intelligence, parallel
|
|
5
|
+
* race) are injected via the hook system in `strategy-hooks.ts`. When no hooks
|
|
6
|
+
* are registered the strategy degrades gracefully to a simple escalation path
|
|
7
|
+
* that works great for CLI / npm library usage.
|
|
3
8
|
*/
|
|
4
|
-
import { type
|
|
5
|
-
export
|
|
9
|
+
import { type StrategyResult } from './strategy-hooks.js';
|
|
10
|
+
export type { StrategyResult } from './strategy-hooks.js';
|
|
6
11
|
export interface StrategyOptions {
|
|
7
|
-
/** Force browser mode (skip simple fetch) */
|
|
8
12
|
forceBrowser?: boolean;
|
|
9
|
-
/** Use stealth mode to bypass bot detection */
|
|
10
13
|
stealth?: boolean;
|
|
11
|
-
/** Wait time after page load in browser mode (ms) */
|
|
12
14
|
waitMs?: number;
|
|
13
|
-
/** Custom user agent */
|
|
14
15
|
userAgent?: string;
|
|
15
|
-
/** Request timeout (ms) */
|
|
16
16
|
timeoutMs?: number;
|
|
17
|
-
/** Capture a screenshot of the page */
|
|
18
17
|
screenshot?: boolean;
|
|
19
|
-
/** Full-page screenshot (default: viewport only) */
|
|
20
18
|
screenshotFullPage?: boolean;
|
|
21
|
-
/** Custom HTTP headers to send */
|
|
22
19
|
headers?: Record<string, string>;
|
|
23
|
-
/** Cookies to set (key=value pairs) */
|
|
24
20
|
cookies?: string[];
|
|
25
|
-
/** Page actions to execute before extraction */
|
|
26
21
|
actions?: Array<{
|
|
27
22
|
type: 'wait' | 'click' | 'scroll' | 'type' | 'fill' | 'select' | 'press' | 'hover' | 'waitForSelector' | 'screenshot';
|
|
28
23
|
selector?: string;
|
|
@@ -32,24 +27,23 @@ export interface StrategyOptions {
|
|
|
32
27
|
to?: 'top' | 'bottom' | number;
|
|
33
28
|
timeout?: number;
|
|
34
29
|
}>;
|
|
35
|
-
/** Keep browser page open for reuse (caller must close) */
|
|
36
30
|
keepPageOpen?: boolean;
|
|
37
|
-
/** Disable response cache for this request */
|
|
38
31
|
noCache?: boolean;
|
|
39
|
-
/** Time to wait before launching browser in parallel with simple fetch */
|
|
40
32
|
raceTimeoutMs?: number;
|
|
41
|
-
/** Location/language for geo-targeted scraping */
|
|
42
33
|
location?: {
|
|
43
34
|
country?: string;
|
|
44
35
|
languages?: string[];
|
|
45
36
|
};
|
|
46
37
|
}
|
|
47
|
-
export interface StrategyResult extends FetchResult {
|
|
48
|
-
/** Which strategy succeeded: 'simple' | 'browser' | 'stealth' | 'cached' */
|
|
49
|
-
method: 'simple' | 'browser' | 'stealth' | 'cached';
|
|
50
|
-
}
|
|
51
38
|
/**
|
|
52
|
-
* Smart fetch with automatic escalation
|
|
39
|
+
* Smart fetch with automatic escalation.
|
|
40
|
+
*
|
|
41
|
+
* Without hooks: simple fetch → browser → stealth escalation.
|
|
42
|
+
* With premium hooks: SWR cache → domain intel → parallel race → escalation.
|
|
53
43
|
*/
|
|
54
44
|
export declare function smartFetch(url: string, options?: StrategyOptions): Promise<StrategyResult>;
|
|
45
|
+
/**
|
|
46
|
+
* @deprecated Use `clearStrategyHooks()` from strategy-hooks.ts instead.
|
|
47
|
+
*/
|
|
48
|
+
export { clearStrategyHooks as clearDomainIntel } from './strategy-hooks.js';
|
|
55
49
|
//# sourceMappingURL=strategies.d.ts.map
|
|
package/dist/core/strategies.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"strategies.d.ts","sourceRoot":"","sources":["../../src/core/strategies.ts"],"names":[],"mappings":"AAAA
|
|
1
|
+
{"version":3,"file":"strategies.d.ts","sourceRoot":"","sources":["../../src/core/strategies.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAMH,OAAO,EAEL,KAAK,cAAc,EAEpB,MAAM,qBAAqB,CAAC;AAG7B,YAAY,EAAE,cAAc,EAAE,MAAM,qBAAqB,CAAC;AAoE1D,MAAM,WAAW,eAAe;IAC9B,YAAY,CAAC,EAAE,OAAO,CAAC;IACvB,OAAO,CAAC,EAAE,OAAO,CAAC;IAClB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,UAAU,CAAC,EAAE,OAAO,CAAC;IACrB,kBAAkB,CAAC,EAAE,OAAO,CAAC;IAC7B,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACjC,OAAO,CAAC,EAAE,MAAM,EAAE,CAAC;IACnB,OAAO,CAAC,EAAE,KAAK,CAAC;QACd,IAAI,EACA,MAAM,GACN,OAAO,GACP,QAAQ,GACR,MAAM,GACN,MAAM,GACN,QAAQ,GACR,OAAO,GACP,OAAO,GACP,iBAAiB,GACjB,YAAY,CAAC;QACjB,QAAQ,CAAC,EAAE,MAAM,CAAC;QAClB,KAAK,CAAC,EAAE,MAAM,CAAC;QACf,GAAG,CAAC,EAAE,MAAM,CAAC;QACb,EAAE,CAAC,EAAE,MAAM,CAAC;QACZ,EAAE,CAAC,EAAE,KAAK,GAAG,QAAQ,GAAG,MAAM,CAAC;QAC/B,OAAO,CAAC,EAAE,MAAM,CAAC;KAClB,CAAC,CAAC;IACH,YAAY,CAAC,EAAE,OAAO,CAAC;IACvB,OAAO,CAAC,EAAE,OAAO,CAAC;IAClB,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,QAAQ,CAAC,EAAE;QACT,OAAO,CAAC,EAAE,MAAM,CAAC;QACjB,SAAS,CAAC,EAAE,MAAM,EAAE,CAAC;KACtB,CAAC;CACH;AAuGD;;;;;GAKG;AACH,wBAAsB,UAAU,CAC9B,GAAG,EAAE,MAAM,EACX,OAAO,GAAE,eAAoB,GAC5B,OAAO,CAAC,cAAc,CAAC,CA6QzB;AAID;;GAEG;AACH,OAAO,EAAE,kBAAkB,IAAI,gBAAgB,EAAE,MAAM,qBAAqB,CAAC"}
|
package/dist/core/strategies.js
CHANGED
|
@@ -1,116 +1,21 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* Smart escalation strategy: try simple fetch first, escalate to browser if needed
|
|
2
|
+
* Smart escalation strategy: try simple fetch first, escalate to browser if needed.
|
|
3
|
+
*
|
|
4
|
+
* Premium server-side optimisations (SWR cache, domain intelligence, parallel
|
|
5
|
+
* race) are injected via the hook system in `strategy-hooks.ts`. When no hooks
|
|
6
|
+
* are registered the strategy degrades gracefully to a simple escalation path
|
|
7
|
+
* that works great for CLI / npm library usage.
|
|
3
8
|
*/
|
|
4
9
|
import { simpleFetch, browserFetch, retryFetch } from './fetcher.js';
|
|
5
|
-
import {
|
|
10
|
+
import { getCached, setCached as setBasicCache } from './cache.js';
|
|
6
11
|
import { resolveAndCache } from './dns-cache.js';
|
|
7
12
|
import { BlockedError, NetworkError } from '../types.js';
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
const DOMAIN_INTEL_EMA_ALPHA = 0.3;
|
|
11
|
-
const domainIntel = new Map();
|
|
12
|
-
const domainMethodCounts = new Map();
|
|
13
|
-
function getDomainKey(url) {
|
|
14
|
-
try {
|
|
15
|
-
return new URL(url).hostname.toLowerCase();
|
|
16
|
-
}
|
|
17
|
-
catch {
|
|
18
|
-
return '';
|
|
19
|
-
}
|
|
20
|
-
}
|
|
21
|
-
function pruneDomainIntel(now) {
|
|
22
|
-
for (const [key, intel] of domainIntel) {
|
|
23
|
-
if (now - intel.lastSeen > DOMAIN_INTEL_TTL_MS) {
|
|
24
|
-
domainIntel.delete(key);
|
|
25
|
-
domainMethodCounts.delete(key);
|
|
26
|
-
}
|
|
27
|
-
}
|
|
28
|
-
}
|
|
29
|
-
function recordDomainResult(url, method, latencyMs) {
|
|
30
|
-
const key = getDomainKey(url);
|
|
31
|
-
if (!key) {
|
|
32
|
-
return;
|
|
33
|
-
}
|
|
34
|
-
const now = Date.now();
|
|
35
|
-
pruneDomainIntel(now);
|
|
36
|
-
const existing = domainIntel.get(key);
|
|
37
|
-
const sanitizedLatency = Number.isFinite(latencyMs) && latencyMs > 0
|
|
38
|
-
? latencyMs
|
|
39
|
-
: (existing?.avgLatencyMs ?? 0);
|
|
40
|
-
const next = existing
|
|
41
|
-
? {
|
|
42
|
-
needsBrowser: existing.needsBrowser || method === 'browser' || method === 'stealth',
|
|
43
|
-
needsStealth: existing.needsStealth || method === 'stealth',
|
|
44
|
-
avgLatencyMs: existing.avgLatencyMs === 0
|
|
45
|
-
? sanitizedLatency
|
|
46
|
-
: (existing.avgLatencyMs * (1 - DOMAIN_INTEL_EMA_ALPHA)) + (sanitizedLatency * DOMAIN_INTEL_EMA_ALPHA),
|
|
47
|
-
lastSeen: now,
|
|
48
|
-
sampleCount: existing.sampleCount + 1,
|
|
49
|
-
}
|
|
50
|
-
: {
|
|
51
|
-
needsBrowser: method === 'browser' || method === 'stealth',
|
|
52
|
-
needsStealth: method === 'stealth',
|
|
53
|
-
avgLatencyMs: sanitizedLatency,
|
|
54
|
-
lastSeen: now,
|
|
55
|
-
sampleCount: 1,
|
|
56
|
-
};
|
|
57
|
-
const existingCounts = domainMethodCounts.get(key) ?? { simple: 0, browser: 0, stealth: 0 };
|
|
58
|
-
existingCounts[method] += 1;
|
|
59
|
-
domainIntel.delete(key);
|
|
60
|
-
domainIntel.set(key, next);
|
|
61
|
-
domainMethodCounts.set(key, existingCounts);
|
|
62
|
-
while (domainIntel.size > DOMAIN_INTEL_MAX) {
|
|
63
|
-
const oldestKey = domainIntel.keys().next().value;
|
|
64
|
-
if (!oldestKey) {
|
|
65
|
-
break;
|
|
66
|
-
}
|
|
67
|
-
domainIntel.delete(oldestKey);
|
|
68
|
-
domainMethodCounts.delete(oldestKey);
|
|
69
|
-
}
|
|
70
|
-
}
|
|
71
|
-
function getDomainRecommendation(url) {
|
|
72
|
-
const key = getDomainKey(url);
|
|
73
|
-
if (!key) {
|
|
74
|
-
return null;
|
|
75
|
-
}
|
|
76
|
-
const intel = domainIntel.get(key);
|
|
77
|
-
if (!intel) {
|
|
78
|
-
return null;
|
|
79
|
-
}
|
|
80
|
-
const now = Date.now();
|
|
81
|
-
if (now - intel.lastSeen > DOMAIN_INTEL_TTL_MS) {
|
|
82
|
-
domainIntel.delete(key);
|
|
83
|
-
domainMethodCounts.delete(key);
|
|
84
|
-
return null;
|
|
85
|
-
}
|
|
86
|
-
if (intel.sampleCount <= 2) {
|
|
87
|
-
return null;
|
|
88
|
-
}
|
|
89
|
-
const counts = domainMethodCounts.get(key);
|
|
90
|
-
if (!counts) {
|
|
91
|
-
return null;
|
|
92
|
-
}
|
|
93
|
-
// LRU touch
|
|
94
|
-
domainIntel.delete(key);
|
|
95
|
-
domainIntel.set(key, intel);
|
|
96
|
-
const allStealth = counts.stealth === intel.sampleCount;
|
|
97
|
-
if (allStealth && intel.needsStealth) {
|
|
98
|
-
return { mode: 'stealth' };
|
|
99
|
-
}
|
|
100
|
-
const allBrowser = counts.simple === 0 && (counts.browser + counts.stealth === intel.sampleCount);
|
|
101
|
-
if (allBrowser && intel.needsBrowser) {
|
|
102
|
-
return { mode: 'browser' };
|
|
103
|
-
}
|
|
104
|
-
return null;
|
|
105
|
-
}
|
|
106
|
-
export function clearDomainIntel() {
|
|
107
|
-
domainIntel.clear();
|
|
108
|
-
domainMethodCounts.clear();
|
|
109
|
-
}
|
|
13
|
+
import { getStrategyHooks, } from './strategy-hooks.js';
|
|
14
|
+
/* ---------- hardcoded domain rules -------------------------------------- */
|
|
110
15
|
function shouldForceBrowser(url) {
|
|
111
16
|
try {
|
|
112
17
|
const hostname = new URL(url).hostname.toLowerCase();
|
|
113
|
-
// Reddit often returns an HTML shell via simple fetch
|
|
18
|
+
// Reddit often returns an HTML shell via simple fetch
|
|
114
19
|
if (hostname === 'reddit.com' || hostname.endsWith('.reddit.com')) {
|
|
115
20
|
return { mode: 'browser' };
|
|
116
21
|
}
|
|
@@ -120,52 +25,45 @@ function shouldForceBrowser(url) {
|
|
|
120
25
|
hostname.endsWith('.npmjs.com')) {
|
|
121
26
|
return { mode: 'browser' };
|
|
122
27
|
}
|
|
123
|
-
//
|
|
124
|
-
// Note: NOT forced — let the shell-page detector escalate naturally
|
|
125
|
-
// since SO needs extra wait time that the escalation path handles better
|
|
126
|
-
// These are known to aggressively block automation; go straight to stealth
|
|
28
|
+
// These are known to aggressively block automation
|
|
127
29
|
if (hostname === 'glassdoor.com' || hostname.endsWith('.glassdoor.com')) {
|
|
128
30
|
return { mode: 'stealth' };
|
|
129
31
|
}
|
|
130
32
|
if (hostname === 'bloomberg.com' || hostname.endsWith('.bloomberg.com')) {
|
|
131
33
|
return { mode: 'stealth' };
|
|
132
34
|
}
|
|
133
|
-
// Indeed uses Cloudflare aggressively on job detail pages
|
|
134
35
|
if (hostname === 'indeed.com' || hostname.endsWith('.indeed.com')) {
|
|
135
36
|
return { mode: 'stealth' };
|
|
136
37
|
}
|
|
137
38
|
}
|
|
138
39
|
catch {
|
|
139
|
-
// Ignore URL parsing errors
|
|
40
|
+
// Ignore URL parsing errors; validation happens inside fetchers.
|
|
140
41
|
}
|
|
141
42
|
return null;
|
|
142
43
|
}
|
|
44
|
+
/* ---------- helpers ------------------------------------------------------ */
|
|
143
45
|
function isAbortError(error) {
|
|
144
46
|
return error instanceof Error && error.name === 'AbortError';
|
|
145
47
|
}
|
|
146
48
|
function shouldEscalateSimpleError(error) {
|
|
147
|
-
if (error instanceof BlockedError)
|
|
49
|
+
if (error instanceof BlockedError)
|
|
148
50
|
return true;
|
|
149
|
-
}
|
|
150
51
|
return error instanceof NetworkError && error.message.includes('TLS/SSL');
|
|
151
52
|
}
|
|
152
53
|
function looksLikeShellPage(result) {
|
|
153
|
-
const
|
|
154
|
-
if (!
|
|
54
|
+
const ct = (result.contentType || '').toLowerCase();
|
|
55
|
+
if (!ct.includes('html'))
|
|
155
56
|
return false;
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
return textContent.length < 500 && result.html.length > 1000;
|
|
57
|
+
const text = result.html.replace(/<[^>]*>/g, '').trim();
|
|
58
|
+
return text.length < 500 && result.html.length > 1000;
|
|
159
59
|
}
|
|
160
60
|
function prefetchDns(url) {
|
|
161
61
|
try {
|
|
162
62
|
const hostname = new URL(url).hostname;
|
|
163
|
-
void resolveAndCache(hostname).catch(() => {
|
|
164
|
-
// Best-effort optimization only.
|
|
165
|
-
});
|
|
63
|
+
void resolveAndCache(hostname).catch(() => { });
|
|
166
64
|
}
|
|
167
65
|
catch {
|
|
168
|
-
// Ignore invalid URL
|
|
66
|
+
// Ignore invalid URL.
|
|
169
67
|
}
|
|
170
68
|
}
|
|
171
69
|
async function fetchWithBrowserStrategy(url, options) {
|
|
@@ -190,10 +88,9 @@ async function fetchWithBrowserStrategy(url, options) {
|
|
|
190
88
|
};
|
|
191
89
|
}
|
|
192
90
|
catch (error) {
|
|
193
|
-
if (isAbortError(error))
|
|
91
|
+
if (isAbortError(error))
|
|
194
92
|
throw error;
|
|
195
|
-
|
|
196
|
-
// Strategy 3: If browser gets blocked, try stealth mode as fallback (unless already using stealth)
|
|
93
|
+
// If browser gets blocked, try stealth as fallback (unless already stealth)
|
|
197
94
|
if (!effectiveStealth && error instanceof BlockedError) {
|
|
198
95
|
const result = await browserFetch(url, {
|
|
199
96
|
userAgent,
|
|
@@ -208,13 +105,11 @@ async function fetchWithBrowserStrategy(url, options) {
|
|
|
208
105
|
keepPageOpen,
|
|
209
106
|
signal,
|
|
210
107
|
});
|
|
211
|
-
return {
|
|
212
|
-
...result,
|
|
213
|
-
method: 'stealth',
|
|
214
|
-
};
|
|
108
|
+
return { ...result, method: 'stealth' };
|
|
215
109
|
}
|
|
216
|
-
// If
|
|
217
|
-
if (error instanceof NetworkError &&
|
|
110
|
+
// If Cloudflare detected, retry with extra wait time
|
|
111
|
+
if (error instanceof NetworkError &&
|
|
112
|
+
error.message.toLowerCase().includes('cloudflare')) {
|
|
218
113
|
const result = await browserFetch(url, {
|
|
219
114
|
userAgent,
|
|
220
115
|
waitMs: 5000,
|
|
@@ -228,40 +123,41 @@ async function fetchWithBrowserStrategy(url, options) {
|
|
|
228
123
|
keepPageOpen,
|
|
229
124
|
signal,
|
|
230
125
|
});
|
|
231
|
-
return {
|
|
232
|
-
...result,
|
|
233
|
-
method: effectiveStealth ? 'stealth' : 'browser',
|
|
234
|
-
};
|
|
126
|
+
return { ...result, method: effectiveStealth ? 'stealth' : 'browser' };
|
|
235
127
|
}
|
|
236
128
|
throw error;
|
|
237
129
|
}
|
|
238
130
|
}
|
|
131
|
+
/* ---------- main entry point -------------------------------------------- */
|
|
239
132
|
/**
|
|
240
|
-
* Smart fetch with automatic escalation
|
|
133
|
+
* Smart fetch with automatic escalation.
|
|
134
|
+
*
|
|
135
|
+
* Without hooks: simple fetch → browser → stealth escalation.
|
|
136
|
+
* With premium hooks: SWR cache → domain intel → parallel race → escalation.
|
|
241
137
|
*/
|
|
242
138
|
export async function smartFetch(url, options = {}) {
|
|
243
139
|
const { forceBrowser = false, stealth = false, waitMs = 0, userAgent, timeoutMs = 30000, screenshot = false, screenshotFullPage = false, headers, cookies, actions, keepPageOpen = false, noCache = false, raceTimeoutMs = 2000, } = options;
|
|
140
|
+
const hooks = getStrategyHooks();
|
|
244
141
|
const fetchStartMs = Date.now();
|
|
245
|
-
const
|
|
246
|
-
if (method === 'cached')
|
|
142
|
+
const recordMethod = (method) => {
|
|
143
|
+
if (method === 'cached')
|
|
247
144
|
return;
|
|
248
|
-
|
|
249
|
-
recordDomainResult(url, method, Date.now() - fetchStartMs);
|
|
145
|
+
hooks.recordDomainResult?.(url, method, Date.now() - fetchStartMs);
|
|
250
146
|
};
|
|
251
|
-
|
|
252
|
-
// Hardcoded rules take priority
|
|
147
|
+
/* ---- determine effective mode ---------------------------------------- */
|
|
148
|
+
// Hardcoded rules always take priority, then hook-based domain intelligence.
|
|
253
149
|
const forced = shouldForceBrowser(url);
|
|
254
|
-
const recommended = getDomainRecommendation(url);
|
|
255
|
-
const
|
|
150
|
+
const recommended = hooks.getDomainRecommendation?.(url) ?? null;
|
|
151
|
+
const selected = forced ?? recommended;
|
|
256
152
|
let effectiveForceBrowser = forceBrowser;
|
|
257
153
|
let effectiveStealth = stealth;
|
|
258
|
-
if (
|
|
154
|
+
if (selected) {
|
|
259
155
|
effectiveForceBrowser = true;
|
|
260
|
-
if (
|
|
156
|
+
if (selected.mode === 'stealth')
|
|
261
157
|
effectiveStealth = true;
|
|
262
|
-
}
|
|
263
158
|
}
|
|
264
159
|
prefetchDns(url);
|
|
160
|
+
/* ---- cache eligibility ----------------------------------------------- */
|
|
265
161
|
const canUseCache = !noCache &&
|
|
266
162
|
!effectiveForceBrowser &&
|
|
267
163
|
!effectiveStealth &&
|
|
@@ -272,33 +168,35 @@ export async function smartFetch(url, options = {}) {
|
|
|
272
168
|
!cookies &&
|
|
273
169
|
waitMs === 0 &&
|
|
274
170
|
!userAgent;
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
setCached(url, { ...freshResult, method: 'simple' });
|
|
287
|
-
}
|
|
171
|
+
/* ---- hook-based cache check (premium) -------------------------------- */
|
|
172
|
+
if (canUseCache && hooks.checkCache) {
|
|
173
|
+
const cached = hooks.checkCache(url);
|
|
174
|
+
if (cached) {
|
|
175
|
+
if (cached.stale && hooks.markRevalidating?.(url)) {
|
|
176
|
+
// Background revalidation — fire-and-forget
|
|
177
|
+
void (async () => {
|
|
178
|
+
try {
|
|
179
|
+
const fresh = await simpleFetch(url, userAgent, timeoutMs);
|
|
180
|
+
if (!looksLikeShellPage(fresh)) {
|
|
181
|
+
hooks.setCache?.(url, { ...fresh, method: 'simple' });
|
|
288
182
|
}
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
}
|
|
293
|
-
}
|
|
183
|
+
}
|
|
184
|
+
catch {
|
|
185
|
+
// Stale entry continues serving.
|
|
186
|
+
}
|
|
187
|
+
})();
|
|
294
188
|
}
|
|
295
|
-
return {
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
189
|
+
return { ...cached.value, method: 'cached' };
|
|
190
|
+
}
|
|
191
|
+
}
|
|
192
|
+
/* ---- basic cache check (non-premium fallback) ------------------------ */
|
|
193
|
+
if (canUseCache && !hooks.checkCache) {
|
|
194
|
+
const basicCached = getCached(url);
|
|
195
|
+
if (basicCached) {
|
|
196
|
+
return { ...basicCached, method: 'cached' };
|
|
299
197
|
}
|
|
300
198
|
}
|
|
301
|
-
|
|
199
|
+
/* ---- browser-level options ------------------------------------------- */
|
|
302
200
|
let shouldUseBrowser = effectiveForceBrowser || screenshot || effectiveStealth;
|
|
303
201
|
const browserOptions = {
|
|
304
202
|
userAgent,
|
|
@@ -312,7 +210,7 @@ export async function smartFetch(url, options = {}) {
|
|
|
312
210
|
keepPageOpen,
|
|
313
211
|
effectiveStealth,
|
|
314
212
|
};
|
|
315
|
-
|
|
213
|
+
/* ---- Strategy: simple fetch (with optional race) --------------------- */
|
|
316
214
|
if (!shouldUseBrowser) {
|
|
317
215
|
const simpleAbortController = new AbortController();
|
|
318
216
|
const simplePromise = retryFetch(() => simpleFetch(url, userAgent, timeoutMs, headers, simpleAbortController.signal), 3).then((result) => {
|
|
@@ -321,27 +219,31 @@ export async function smartFetch(url, options = {}) {
|
|
|
321
219
|
}
|
|
322
220
|
return result;
|
|
323
221
|
});
|
|
222
|
+
// Determine race timeout — hooks can override
|
|
223
|
+
const useRace = hooks.shouldRace?.() ?? false;
|
|
224
|
+
const effectiveRaceTimeout = useRace
|
|
225
|
+
? (hooks.getRaceTimeoutMs?.() ?? raceTimeoutMs)
|
|
226
|
+
: raceTimeoutMs;
|
|
324
227
|
let raceTimer;
|
|
325
228
|
const simpleOrTimeout = await Promise.race([
|
|
326
229
|
simplePromise
|
|
327
230
|
.then((result) => ({ type: 'simple-success', result }))
|
|
328
231
|
.catch((error) => ({ type: 'simple-error', error })),
|
|
329
232
|
new Promise((resolve) => {
|
|
330
|
-
raceTimer = setTimeout(() => resolve({ type: 'race-timeout' }), Math.max(
|
|
233
|
+
raceTimer = setTimeout(() => resolve({ type: 'race-timeout' }), Math.max(effectiveRaceTimeout, 0));
|
|
331
234
|
}),
|
|
332
235
|
]);
|
|
333
|
-
if (raceTimer)
|
|
236
|
+
if (raceTimer)
|
|
334
237
|
clearTimeout(raceTimer);
|
|
335
|
-
}
|
|
336
238
|
if (simpleOrTimeout.type === 'simple-success') {
|
|
337
239
|
const strategyResult = {
|
|
338
240
|
...simpleOrTimeout.result,
|
|
339
241
|
method: 'simple',
|
|
340
242
|
};
|
|
341
243
|
if (canUseCache) {
|
|
342
|
-
|
|
244
|
+
hooks.setCache?.(url, strategyResult) ?? setBasicCache(url, strategyResult);
|
|
343
245
|
}
|
|
344
|
-
|
|
246
|
+
recordMethod('simple');
|
|
345
247
|
return strategyResult;
|
|
346
248
|
}
|
|
347
249
|
if (simpleOrTimeout.type === 'simple-error') {
|
|
@@ -351,66 +253,98 @@ export async function smartFetch(url, options = {}) {
|
|
|
351
253
|
shouldUseBrowser = true;
|
|
352
254
|
}
|
|
353
255
|
else {
|
|
354
|
-
//
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
256
|
+
// Race timeout — only start parallel browser if hooks say to race
|
|
257
|
+
if (useRace) {
|
|
258
|
+
// Parallel race: simple still running, start browser too
|
|
259
|
+
const browserAbortController = new AbortController();
|
|
260
|
+
let simpleError;
|
|
261
|
+
let browserError;
|
|
262
|
+
const simpleCandidate = simplePromise
|
|
263
|
+
.then((result) => ({ source: 'simple', result }))
|
|
264
|
+
.catch((error) => {
|
|
265
|
+
simpleError = error;
|
|
266
|
+
throw error;
|
|
267
|
+
});
|
|
268
|
+
const browserCandidate = fetchWithBrowserStrategy(url, {
|
|
269
|
+
...browserOptions,
|
|
270
|
+
signal: browserAbortController.signal,
|
|
271
|
+
})
|
|
272
|
+
.then((result) => ({ source: 'browser', result }))
|
|
273
|
+
.catch((error) => {
|
|
274
|
+
browserError = error;
|
|
275
|
+
throw error;
|
|
276
|
+
});
|
|
277
|
+
try {
|
|
278
|
+
const winner = await Promise.any([
|
|
279
|
+
simpleCandidate,
|
|
280
|
+
browserCandidate,
|
|
281
|
+
]);
|
|
282
|
+
if (winner.source === 'simple') {
|
|
283
|
+
browserAbortController.abort();
|
|
284
|
+
const strategyResult = {
|
|
285
|
+
...winner.result,
|
|
286
|
+
method: 'simple',
|
|
287
|
+
};
|
|
288
|
+
if (canUseCache) {
|
|
289
|
+
hooks.setCache?.(url, strategyResult) ?? setBasicCache(url, strategyResult);
|
|
290
|
+
}
|
|
291
|
+
recordMethod('simple');
|
|
292
|
+
return strategyResult;
|
|
293
|
+
}
|
|
294
|
+
simpleAbortController.abort();
|
|
295
|
+
if (canUseCache) {
|
|
296
|
+
hooks.setCache?.(url, winner.result) ?? setBasicCache(url, winner.result);
|
|
297
|
+
}
|
|
298
|
+
recordMethod(winner.result.method);
|
|
299
|
+
return winner.result;
|
|
300
|
+
}
|
|
301
|
+
catch {
|
|
302
|
+
if (simpleError &&
|
|
303
|
+
!shouldEscalateSimpleError(simpleError) &&
|
|
304
|
+
!isAbortError(simpleError)) {
|
|
305
|
+
throw simpleError;
|
|
306
|
+
}
|
|
307
|
+
if (browserError)
|
|
308
|
+
throw browserError;
|
|
309
|
+
if (simpleError)
|
|
310
|
+
throw simpleError;
|
|
311
|
+
throw new Error('Both simple and browser fetch attempts failed');
|
|
312
|
+
}
|
|
313
|
+
}
|
|
314
|
+
else {
|
|
315
|
+
// No race — just wait for the simple fetch to finish
|
|
316
|
+
const simpleResult = await simplePromise
|
|
317
|
+
.then((result) => ({ type: 'simple-success', result }))
|
|
318
|
+
.catch((error) => ({ type: 'simple-error', error }));
|
|
319
|
+
if (simpleResult.type === 'simple-success') {
|
|
377
320
|
const strategyResult = {
|
|
378
|
-
...
|
|
321
|
+
...simpleResult.result,
|
|
379
322
|
method: 'simple',
|
|
380
323
|
};
|
|
381
324
|
if (canUseCache) {
|
|
382
|
-
|
|
325
|
+
hooks.setCache?.(url, strategyResult) ?? setBasicCache(url, strategyResult);
|
|
383
326
|
}
|
|
384
|
-
|
|
327
|
+
recordMethod('simple');
|
|
385
328
|
return strategyResult;
|
|
386
329
|
}
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
setCached(url, winner.result);
|
|
330
|
+
if (!shouldEscalateSimpleError(simpleResult.error)) {
|
|
331
|
+
throw simpleResult.error;
|
|
390
332
|
}
|
|
391
|
-
|
|
392
|
-
return winner.result;
|
|
393
|
-
}
|
|
394
|
-
catch {
|
|
395
|
-
// Both failed: prefer non-escalation simple errors, otherwise return browser-side error.
|
|
396
|
-
if (simpleError && !shouldEscalateSimpleError(simpleError) && !isAbortError(simpleError)) {
|
|
397
|
-
throw simpleError;
|
|
398
|
-
}
|
|
399
|
-
if (browserError) {
|
|
400
|
-
throw browserError;
|
|
401
|
-
}
|
|
402
|
-
if (simpleError) {
|
|
403
|
-
throw simpleError;
|
|
404
|
-
}
|
|
405
|
-
throw new Error('Both simple and browser fetch attempts failed');
|
|
333
|
+
shouldUseBrowser = true;
|
|
406
334
|
}
|
|
407
335
|
}
|
|
408
336
|
}
|
|
337
|
+
/* ---- browser / stealth fallback -------------------------------------- */
|
|
409
338
|
const browserResult = await fetchWithBrowserStrategy(url, browserOptions);
|
|
410
339
|
if (canUseCache) {
|
|
411
|
-
|
|
340
|
+
hooks.setCache?.(url, browserResult) ?? setBasicCache(url, browserResult);
|
|
412
341
|
}
|
|
413
|
-
|
|
342
|
+
recordMethod(browserResult.method);
|
|
414
343
|
return browserResult;
|
|
415
344
|
}
|
|
345
|
+
/* ---------- legacy export for tests ------------------------------------- */
|
|
346
|
+
/**
|
|
347
|
+
* @deprecated Use `clearStrategyHooks()` from strategy-hooks.ts instead.
|
|
348
|
+
*/
|
|
349
|
+
export { clearStrategyHooks as clearDomainIntel } from './strategy-hooks.js';
|
|
416
350
|
//# sourceMappingURL=strategies.js.map
|