webpeel 0.7.1 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -11,7 +11,7 @@
11
11
  <a href="https://github.com/webpeel/webpeel/stargazers"><img src="https://img.shields.io/github/stars/webpeel/webpeel.svg" alt="GitHub stars"></a>
12
12
  <a href="https://github.com/webpeel/webpeel/actions/workflows/ci.yml"><img src="https://github.com/webpeel/webpeel/actions/workflows/ci.yml/badge.svg" alt="CI"></a>
13
13
  <a href="https://www.typescriptlang.org/"><img src="https://img.shields.io/badge/TypeScript-5.6-blue.svg" alt="TypeScript"></a>
14
- <a href="https://opensource.org/licenses/MIT"><img src="https://img.shields.io/badge/License-MIT-blue.svg" alt="MIT License"></a>
14
+ <a href="https://www.gnu.org/licenses/agpl-3.0"><img src="https://img.shields.io/badge/License-AGPL%20v3-blue.svg" alt="AGPL v3 License"></a>
15
15
  </p>
16
16
 
17
17
  <p align="center">
@@ -251,7 +251,18 @@ See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
251
251
 
252
252
  ## License
253
253
 
254
- MIT © [WebPeel](https://github.com/webpeel)
254
+ This project is licensed under the [GNU Affero General Public License v3.0 (AGPL-3.0)](https://www.gnu.org/licenses/agpl-3.0.html).
255
+
256
+ **What this means:**
257
+ - ✅ Free to use, modify, and distribute
258
+ - ✅ Free for personal and commercial use
259
+ - ⚠️ If you run a modified version as a network service, you must release your source code under AGPL-3.0
260
+
261
+ **Need a commercial license?** Contact us at [support@webpeel.dev](mailto:support@webpeel.dev) for proprietary/enterprise licensing.
262
+
263
+ > **Note:** Versions 0.7.1 and earlier were released under MIT. Those releases remain MIT-licensed.
264
+
265
+ © [WebPeel](https://github.com/webpeel)
255
266
 
256
267
  ---
257
268
 
@@ -1,28 +1,23 @@
1
1
  /**
2
- * Smart escalation strategy: try simple fetch first, escalate to browser if needed
2
+ * Smart escalation strategy: try simple fetch first, escalate to browser if needed.
3
+ *
4
+ * Premium server-side optimisations (SWR cache, domain intelligence, parallel
5
+ * race) are injected via the hook system in `strategy-hooks.ts`. When no hooks
6
+ * are registered the strategy degrades gracefully to a simple escalation path
7
+ * that works great for CLI / npm library usage.
3
8
  */
4
- import { type FetchResult } from './fetcher.js';
5
- export declare function clearDomainIntel(): void;
9
+ import { type StrategyResult } from './strategy-hooks.js';
10
+ export type { StrategyResult } from './strategy-hooks.js';
6
11
  export interface StrategyOptions {
7
- /** Force browser mode (skip simple fetch) */
8
12
  forceBrowser?: boolean;
9
- /** Use stealth mode to bypass bot detection */
10
13
  stealth?: boolean;
11
- /** Wait time after page load in browser mode (ms) */
12
14
  waitMs?: number;
13
- /** Custom user agent */
14
15
  userAgent?: string;
15
- /** Request timeout (ms) */
16
16
  timeoutMs?: number;
17
- /** Capture a screenshot of the page */
18
17
  screenshot?: boolean;
19
- /** Full-page screenshot (default: viewport only) */
20
18
  screenshotFullPage?: boolean;
21
- /** Custom HTTP headers to send */
22
19
  headers?: Record<string, string>;
23
- /** Cookies to set (key=value pairs) */
24
20
  cookies?: string[];
25
- /** Page actions to execute before extraction */
26
21
  actions?: Array<{
27
22
  type: 'wait' | 'click' | 'scroll' | 'type' | 'fill' | 'select' | 'press' | 'hover' | 'waitForSelector' | 'screenshot';
28
23
  selector?: string;
@@ -32,24 +27,23 @@ export interface StrategyOptions {
32
27
  to?: 'top' | 'bottom' | number;
33
28
  timeout?: number;
34
29
  }>;
35
- /** Keep browser page open for reuse (caller must close) */
36
30
  keepPageOpen?: boolean;
37
- /** Disable response cache for this request */
38
31
  noCache?: boolean;
39
- /** Time to wait before launching browser in parallel with simple fetch */
40
32
  raceTimeoutMs?: number;
41
- /** Location/language for geo-targeted scraping */
42
33
  location?: {
43
34
  country?: string;
44
35
  languages?: string[];
45
36
  };
46
37
  }
47
- export interface StrategyResult extends FetchResult {
48
- /** Which strategy succeeded: 'simple' | 'browser' | 'stealth' | 'cached' */
49
- method: 'simple' | 'browser' | 'stealth' | 'cached';
50
- }
51
38
  /**
52
- * Smart fetch with automatic escalation
39
+ * Smart fetch with automatic escalation.
40
+ *
41
+ * Without hooks: simple fetch → browser → stealth escalation.
42
+ * With premium hooks: SWR cache → domain intel → parallel race → escalation.
53
43
  */
54
44
  export declare function smartFetch(url: string, options?: StrategyOptions): Promise<StrategyResult>;
45
+ /**
46
+ * @deprecated Use `clearStrategyHooks()` from strategy-hooks.ts instead.
47
+ */
48
+ export { clearStrategyHooks as clearDomainIntel } from './strategy-hooks.js';
55
49
  //# sourceMappingURL=strategies.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"strategies.d.ts","sourceRoot":"","sources":["../../src/core/strategies.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAyC,KAAK,WAAW,EAAE,MAAM,cAAc,CAAC;AAoIvF,wBAAgB,gBAAgB,IAAI,IAAI,CAGvC;AA6ED,MAAM,WAAW,eAAe;IAC9B,6CAA6C;IAC7C,YAAY,CAAC,EAAE,OAAO,CAAC;IACvB,+CAA+C;IAC/C,OAAO,CAAC,EAAE,OAAO,CAAC;IAClB,qDAAqD;IACrD,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,wBAAwB;IACxB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,2BAA2B;IAC3B,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,uCAAuC;IACvC,UAAU,CAAC,EAAE,OAAO,CAAC;IACrB,oDAAoD;IACpD,kBAAkB,CAAC,EAAE,OAAO,CAAC;IAC7B,kCAAkC;IAClC,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACjC,uCAAuC;IACvC,OAAO,CAAC,EAAE,MAAM,EAAE,CAAC;IACnB,gDAAgD;IAChD,OAAO,CAAC,EAAE,KAAK,CAAC;QACd,IAAI,EAAE,MAAM,GAAG,OAAO,GAAG,QAAQ,GAAG,MAAM,GAAG,MAAM,GAAG,QAAQ,GAAG,OAAO,GAAG,OAAO,GAAG,iBAAiB,GAAG,YAAY,CAAC;QACtH,QAAQ,CAAC,EAAE,MAAM,CAAC;QAClB,KAAK,CAAC,EAAE,MAAM,CAAC;QACf,GAAG,CAAC,EAAE,MAAM,CAAC;QACb,EAAE,CAAC,EAAE,MAAM,CAAC;QACZ,EAAE,CAAC,EAAE,KAAK,GAAG,QAAQ,GAAG,MAAM,CAAC;QAC/B,OAAO,CAAC,EAAE,MAAM,CAAC;KAClB,CAAC,CAAC;IACH,2DAA2D;IAC3D,YAAY,CAAC,EAAE,OAAO,CAAC;IACvB,8CAA8C;IAC9C,OAAO,CAAC,EAAE,OAAO,CAAC;IAClB,0EAA0E;IAC1E,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,kDAAkD;IAClD,QAAQ,CAAC,EAAE;QACT,OAAO,CAAC,EAAE,MAAM,CAAC;QACjB,SAAS,CAAC,EAAE,MAAM,EAAE,CAAC;KACtB,CAAC;CACH;AAED,MAAM,WAAW,cAAe,SAAQ,WAAW;IACjD,4EAA4E;IAC5E,MAAM,EAAE,QAAQ,GAAG,SAAS,GAAG,SAAS,GAAG,QAAQ,CAAC;CACrD;AAuGD;;GAEG;AACH,wBAAsB,UAAU,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,GAAE,eAAoB,GAAG,OAAO,CAAC,cAAc,CAAC,CAqNpG"}
1
+ {"version":3,"file":"strategies.d.ts","sourceRoot":"","sources":["../../src/core/strategies.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAMH,OAAO,EAEL,KAAK,cAAc,EAEpB,MAAM,qBAAqB,CAAC;AAG7B,YAAY,EAAE,cAAc,EAAE,MAAM,qBAAqB,CAAC;AAoE1D,MAAM,WAAW,eAAe;IAC9B,YAAY,CAAC,EAAE,OAAO,CAAC;IACvB,OAAO,CAAC,EAAE,OAAO,CAAC;IAClB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,UAAU,CAAC,EAAE,OAAO,CAAC;IACrB,kBAAkB,CAAC,EAAE,OAAO,CAAC;IAC7B,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACjC,OAAO,CAAC,EAAE,MAAM,EAAE,CAAC;IACnB,OAAO,CAAC,EAAE,KAAK,CAAC;QACd,IAAI,EACA,MAAM,GACN,OAAO,GACP,QAAQ,GACR,MAAM,GACN,MAAM,GACN,QAAQ,GACR,OAAO,GACP,OAAO,GACP,iBAAiB,GACjB,YAAY,CAAC;QACjB,QAAQ,CAAC,EAAE,MAAM,CAAC;QAClB,KAAK,CAAC,EAAE,MAAM,CAAC;QACf,GAAG,CAAC,EAAE,MAAM,CAAC;QACb,EAAE,CAAC,EAAE,MAAM,CAAC;QACZ,EAAE,CAAC,EAAE,KAAK,GAAG,QAAQ,GAAG,MAAM,CAAC;QAC/B,OAAO,CAAC,EAAE,MAAM,CAAC;KAClB,CAAC,CAAC;IACH,YAAY,CAAC,EAAE,OAAO,CAAC;IACvB,OAAO,CAAC,EAAE,OAAO,CAAC;IAClB,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,QAAQ,CAAC,EAAE;QACT,OAAO,CAAC,EAAE,MAAM,CAAC;QACjB,SAAS,CAAC,EAAE,MAAM,EAAE,CAAC;KACtB,CAAC;CACH;AAuGD;;;;;GAKG;AACH,wBAAsB,UAAU,CAC9B,GAAG,EAAE,MAAM,EACX,OAAO,GAAE,eAAoB,GAC5B,OAAO,CAAC,cAAc,CAAC,CA6QzB;AAID;;GAEG;AACH,OAAO,EAAE,kBAAkB,IAAI,gBAAgB,EAAE,MAAM,qBAAqB,CAAC"}
@@ -1,116 +1,21 @@
1
1
  /**
2
- * Smart escalation strategy: try simple fetch first, escalate to browser if needed
2
+ * Smart escalation strategy: try simple fetch first, escalate to browser if needed.
3
+ *
4
+ * Premium server-side optimisations (SWR cache, domain intelligence, parallel
5
+ * race) are injected via the hook system in `strategy-hooks.ts`. When no hooks
6
+ * are registered the strategy degrades gracefully to a simple escalation path
7
+ * that works great for CLI / npm library usage.
3
8
  */
4
9
  import { simpleFetch, browserFetch, retryFetch } from './fetcher.js';
5
- import { getCachedWithSWR, setCached, markRevalidating } from './cache.js';
10
+ import { getCached, setCached as setBasicCache } from './cache.js';
6
11
  import { resolveAndCache } from './dns-cache.js';
7
12
  import { BlockedError, NetworkError } from '../types.js';
8
- const DOMAIN_INTEL_MAX = 500;
9
- const DOMAIN_INTEL_TTL_MS = 60 * 60 * 1000; // 1 hour
10
- const DOMAIN_INTEL_EMA_ALPHA = 0.3;
11
- const domainIntel = new Map();
12
- const domainMethodCounts = new Map();
13
- function getDomainKey(url) {
14
- try {
15
- return new URL(url).hostname.toLowerCase();
16
- }
17
- catch {
18
- return '';
19
- }
20
- }
21
- function pruneDomainIntel(now) {
22
- for (const [key, intel] of domainIntel) {
23
- if (now - intel.lastSeen > DOMAIN_INTEL_TTL_MS) {
24
- domainIntel.delete(key);
25
- domainMethodCounts.delete(key);
26
- }
27
- }
28
- }
29
- function recordDomainResult(url, method, latencyMs) {
30
- const key = getDomainKey(url);
31
- if (!key) {
32
- return;
33
- }
34
- const now = Date.now();
35
- pruneDomainIntel(now);
36
- const existing = domainIntel.get(key);
37
- const sanitizedLatency = Number.isFinite(latencyMs) && latencyMs > 0
38
- ? latencyMs
39
- : (existing?.avgLatencyMs ?? 0);
40
- const next = existing
41
- ? {
42
- needsBrowser: existing.needsBrowser || method === 'browser' || method === 'stealth',
43
- needsStealth: existing.needsStealth || method === 'stealth',
44
- avgLatencyMs: existing.avgLatencyMs === 0
45
- ? sanitizedLatency
46
- : (existing.avgLatencyMs * (1 - DOMAIN_INTEL_EMA_ALPHA)) + (sanitizedLatency * DOMAIN_INTEL_EMA_ALPHA),
47
- lastSeen: now,
48
- sampleCount: existing.sampleCount + 1,
49
- }
50
- : {
51
- needsBrowser: method === 'browser' || method === 'stealth',
52
- needsStealth: method === 'stealth',
53
- avgLatencyMs: sanitizedLatency,
54
- lastSeen: now,
55
- sampleCount: 1,
56
- };
57
- const existingCounts = domainMethodCounts.get(key) ?? { simple: 0, browser: 0, stealth: 0 };
58
- existingCounts[method] += 1;
59
- domainIntel.delete(key);
60
- domainIntel.set(key, next);
61
- domainMethodCounts.set(key, existingCounts);
62
- while (domainIntel.size > DOMAIN_INTEL_MAX) {
63
- const oldestKey = domainIntel.keys().next().value;
64
- if (!oldestKey) {
65
- break;
66
- }
67
- domainIntel.delete(oldestKey);
68
- domainMethodCounts.delete(oldestKey);
69
- }
70
- }
71
- function getDomainRecommendation(url) {
72
- const key = getDomainKey(url);
73
- if (!key) {
74
- return null;
75
- }
76
- const intel = domainIntel.get(key);
77
- if (!intel) {
78
- return null;
79
- }
80
- const now = Date.now();
81
- if (now - intel.lastSeen > DOMAIN_INTEL_TTL_MS) {
82
- domainIntel.delete(key);
83
- domainMethodCounts.delete(key);
84
- return null;
85
- }
86
- if (intel.sampleCount <= 2) {
87
- return null;
88
- }
89
- const counts = domainMethodCounts.get(key);
90
- if (!counts) {
91
- return null;
92
- }
93
- // LRU touch
94
- domainIntel.delete(key);
95
- domainIntel.set(key, intel);
96
- const allStealth = counts.stealth === intel.sampleCount;
97
- if (allStealth && intel.needsStealth) {
98
- return { mode: 'stealth' };
99
- }
100
- const allBrowser = counts.simple === 0 && (counts.browser + counts.stealth === intel.sampleCount);
101
- if (allBrowser && intel.needsBrowser) {
102
- return { mode: 'browser' };
103
- }
104
- return null;
105
- }
106
- export function clearDomainIntel() {
107
- domainIntel.clear();
108
- domainMethodCounts.clear();
109
- }
13
+ import { getStrategyHooks, } from './strategy-hooks.js';
14
+ /* ---------- hardcoded domain rules -------------------------------------- */
110
15
  function shouldForceBrowser(url) {
111
16
  try {
112
17
  const hostname = new URL(url).hostname.toLowerCase();
113
- // Reddit often returns an HTML shell via simple fetch; browser rendering is needed for real content
18
+ // Reddit often returns an HTML shell via simple fetch
114
19
  if (hostname === 'reddit.com' || hostname.endsWith('.reddit.com')) {
115
20
  return { mode: 'browser' };
116
21
  }
@@ -120,52 +25,45 @@ function shouldForceBrowser(url) {
120
25
  hostname.endsWith('.npmjs.com')) {
121
26
  return { mode: 'browser' };
122
27
  }
123
- // StackOverflow commonly serves shell-like content to simple fetch clients
124
- // Note: NOT forced — let the shell-page detector escalate naturally
125
- // since SO needs extra wait time that the escalation path handles better
126
- // These are known to aggressively block automation; go straight to stealth
28
+ // These are known to aggressively block automation
127
29
  if (hostname === 'glassdoor.com' || hostname.endsWith('.glassdoor.com')) {
128
30
  return { mode: 'stealth' };
129
31
  }
130
32
  if (hostname === 'bloomberg.com' || hostname.endsWith('.bloomberg.com')) {
131
33
  return { mode: 'stealth' };
132
34
  }
133
- // Indeed uses Cloudflare aggressively on job detail pages
134
35
  if (hostname === 'indeed.com' || hostname.endsWith('.indeed.com')) {
135
36
  return { mode: 'stealth' };
136
37
  }
137
38
  }
138
39
  catch {
139
- // Ignore URL parsing errors here; validation happens inside fetchers
40
+ // Ignore URL parsing errors; validation happens inside fetchers.
140
41
  }
141
42
  return null;
142
43
  }
44
+ /* ---------- helpers ------------------------------------------------------ */
143
45
  function isAbortError(error) {
144
46
  return error instanceof Error && error.name === 'AbortError';
145
47
  }
146
48
  function shouldEscalateSimpleError(error) {
147
- if (error instanceof BlockedError) {
49
+ if (error instanceof BlockedError)
148
50
  return true;
149
- }
150
51
  return error instanceof NetworkError && error.message.includes('TLS/SSL');
151
52
  }
152
53
  function looksLikeShellPage(result) {
153
- const contentTypeLower = (result.contentType || '').toLowerCase();
154
- if (!contentTypeLower.includes('html')) {
54
+ const ct = (result.contentType || '').toLowerCase();
55
+ if (!ct.includes('html'))
155
56
  return false;
156
- }
157
- const textContent = result.html.replace(/<[^>]*>/g, '').trim();
158
- return textContent.length < 500 && result.html.length > 1000;
57
+ const text = result.html.replace(/<[^>]*>/g, '').trim();
58
+ return text.length < 500 && result.html.length > 1000;
159
59
  }
160
60
  function prefetchDns(url) {
161
61
  try {
162
62
  const hostname = new URL(url).hostname;
163
- void resolveAndCache(hostname).catch(() => {
164
- // Best-effort optimization only.
165
- });
63
+ void resolveAndCache(hostname).catch(() => { });
166
64
  }
167
65
  catch {
168
- // Ignore invalid URL here; fetchers handle validation.
66
+ // Ignore invalid URL.
169
67
  }
170
68
  }
171
69
  async function fetchWithBrowserStrategy(url, options) {
@@ -190,10 +88,9 @@ async function fetchWithBrowserStrategy(url, options) {
190
88
  };
191
89
  }
192
90
  catch (error) {
193
- if (isAbortError(error)) {
91
+ if (isAbortError(error))
194
92
  throw error;
195
- }
196
- // Strategy 3: If browser gets blocked, try stealth mode as fallback (unless already using stealth)
93
+ // If browser gets blocked, try stealth as fallback (unless already stealth)
197
94
  if (!effectiveStealth && error instanceof BlockedError) {
198
95
  const result = await browserFetch(url, {
199
96
  userAgent,
@@ -208,13 +105,11 @@ async function fetchWithBrowserStrategy(url, options) {
208
105
  keepPageOpen,
209
106
  signal,
210
107
  });
211
- return {
212
- ...result,
213
- method: 'stealth',
214
- };
108
+ return { ...result, method: 'stealth' };
215
109
  }
216
- // If browser encounters Cloudflare, retry with extra wait time
217
- if (error instanceof NetworkError && error.message.toLowerCase().includes('cloudflare')) {
110
+ // If Cloudflare detected, retry with extra wait time
111
+ if (error instanceof NetworkError &&
112
+ error.message.toLowerCase().includes('cloudflare')) {
218
113
  const result = await browserFetch(url, {
219
114
  userAgent,
220
115
  waitMs: 5000,
@@ -228,40 +123,41 @@ async function fetchWithBrowserStrategy(url, options) {
228
123
  keepPageOpen,
229
124
  signal,
230
125
  });
231
- return {
232
- ...result,
233
- method: effectiveStealth ? 'stealth' : 'browser',
234
- };
126
+ return { ...result, method: effectiveStealth ? 'stealth' : 'browser' };
235
127
  }
236
128
  throw error;
237
129
  }
238
130
  }
131
+ /* ---------- main entry point -------------------------------------------- */
239
132
  /**
240
- * Smart fetch with automatic escalation
133
+ * Smart fetch with automatic escalation.
134
+ *
135
+ * Without hooks: simple fetch → browser → stealth escalation.
136
+ * With premium hooks: SWR cache → domain intel → parallel race → escalation.
241
137
  */
242
138
  export async function smartFetch(url, options = {}) {
243
139
  const { forceBrowser = false, stealth = false, waitMs = 0, userAgent, timeoutMs = 30000, screenshot = false, screenshotFullPage = false, headers, cookies, actions, keepPageOpen = false, noCache = false, raceTimeoutMs = 2000, } = options;
140
+ const hooks = getStrategyHooks();
244
141
  const fetchStartMs = Date.now();
245
- const recordSuccessfulMethod = (method) => {
246
- if (method === 'cached') {
142
+ const recordMethod = (method) => {
143
+ if (method === 'cached')
247
144
  return;
248
- }
249
- recordDomainResult(url, method, Date.now() - fetchStartMs);
145
+ hooks.recordDomainResult?.(url, method, Date.now() - fetchStartMs);
250
146
  };
251
- // Site-specific escalation overrides
252
- // Hardcoded rules take priority (manually verified), domain intel is fallback
147
+ /* ---- determine effective mode ---------------------------------------- */
148
+ // Hardcoded rules always take priority, then hook-based domain intelligence.
253
149
  const forced = shouldForceBrowser(url);
254
- const recommended = getDomainRecommendation(url);
255
- const selectedRecommendation = forced ?? recommended;
150
+ const recommended = hooks.getDomainRecommendation?.(url) ?? null;
151
+ const selected = forced ?? recommended;
256
152
  let effectiveForceBrowser = forceBrowser;
257
153
  let effectiveStealth = stealth;
258
- if (selectedRecommendation) {
154
+ if (selected) {
259
155
  effectiveForceBrowser = true;
260
- if (selectedRecommendation.mode === 'stealth') {
156
+ if (selected.mode === 'stealth')
261
157
  effectiveStealth = true;
262
- }
263
158
  }
264
159
  prefetchDns(url);
160
+ /* ---- cache eligibility ----------------------------------------------- */
265
161
  const canUseCache = !noCache &&
266
162
  !effectiveForceBrowser &&
267
163
  !effectiveStealth &&
@@ -272,33 +168,35 @@ export async function smartFetch(url, options = {}) {
272
168
  !cookies &&
273
169
  waitMs === 0 &&
274
170
  !userAgent;
275
- if (canUseCache) {
276
- const cacheResult = getCachedWithSWR(url);
277
- if (cacheResult) {
278
- if (cacheResult.stale) {
279
- // Stale-while-revalidate: serve stale immediately, refresh in background
280
- if (markRevalidating(url)) {
281
- // Fire-and-forget background revalidation
282
- void (async () => {
283
- try {
284
- const freshResult = await simpleFetch(url, userAgent, timeoutMs);
285
- if (!looksLikeShellPage(freshResult)) {
286
- setCached(url, { ...freshResult, method: 'simple' });
287
- }
171
+ /* ---- hook-based cache check (premium) -------------------------------- */
172
+ if (canUseCache && hooks.checkCache) {
173
+ const cached = hooks.checkCache(url);
174
+ if (cached) {
175
+ if (cached.stale && hooks.markRevalidating?.(url)) {
176
+ // Background revalidation — fire-and-forget
177
+ void (async () => {
178
+ try {
179
+ const fresh = await simpleFetch(url, userAgent, timeoutMs);
180
+ if (!looksLikeShellPage(fresh)) {
181
+ hooks.setCache?.(url, { ...fresh, method: 'simple' });
288
182
  }
289
- catch {
290
- // Background revalidation failed — stale entry continues serving
291
- }
292
- })();
293
- }
183
+ }
184
+ catch {
185
+ // Stale entry continues serving.
186
+ }
187
+ })();
294
188
  }
295
- return {
296
- ...cacheResult.value,
297
- method: 'cached',
298
- };
189
+ return { ...cached.value, method: 'cached' };
190
+ }
191
+ }
192
+ /* ---- basic cache check (non-premium fallback) ------------------------ */
193
+ if (canUseCache && !hooks.checkCache) {
194
+ const basicCached = getCached(url);
195
+ if (basicCached) {
196
+ return { ...basicCached, method: 'cached' };
299
197
  }
300
198
  }
301
- // If stealth is requested, force browser mode (stealth requires browser)
199
+ /* ---- browser-level options ------------------------------------------- */
302
200
  let shouldUseBrowser = effectiveForceBrowser || screenshot || effectiveStealth;
303
201
  const browserOptions = {
304
202
  userAgent,
@@ -312,7 +210,7 @@ export async function smartFetch(url, options = {}) {
312
210
  keepPageOpen,
313
211
  effectiveStealth,
314
212
  };
315
- // Strategy 1: Simple fetch (unless browser is forced or screenshot is requested)
213
+ /* ---- Strategy: simple fetch (with optional race) --------------------- */
316
214
  if (!shouldUseBrowser) {
317
215
  const simpleAbortController = new AbortController();
318
216
  const simplePromise = retryFetch(() => simpleFetch(url, userAgent, timeoutMs, headers, simpleAbortController.signal), 3).then((result) => {
@@ -321,27 +219,31 @@ export async function smartFetch(url, options = {}) {
321
219
  }
322
220
  return result;
323
221
  });
222
+ // Determine race timeout — hooks can override
223
+ const useRace = hooks.shouldRace?.() ?? false;
224
+ const effectiveRaceTimeout = useRace
225
+ ? (hooks.getRaceTimeoutMs?.() ?? raceTimeoutMs)
226
+ : raceTimeoutMs;
324
227
  let raceTimer;
325
228
  const simpleOrTimeout = await Promise.race([
326
229
  simplePromise
327
230
  .then((result) => ({ type: 'simple-success', result }))
328
231
  .catch((error) => ({ type: 'simple-error', error })),
329
232
  new Promise((resolve) => {
330
- raceTimer = setTimeout(() => resolve({ type: 'race-timeout' }), Math.max(raceTimeoutMs, 0));
233
+ raceTimer = setTimeout(() => resolve({ type: 'race-timeout' }), Math.max(effectiveRaceTimeout, 0));
331
234
  }),
332
235
  ]);
333
- if (raceTimer) {
236
+ if (raceTimer)
334
237
  clearTimeout(raceTimer);
335
- }
336
238
  if (simpleOrTimeout.type === 'simple-success') {
337
239
  const strategyResult = {
338
240
  ...simpleOrTimeout.result,
339
241
  method: 'simple',
340
242
  };
341
243
  if (canUseCache) {
342
- setCached(url, strategyResult);
244
+ hooks.setCache?.(url, strategyResult) ?? setBasicCache(url, strategyResult);
343
245
  }
344
- recordSuccessfulMethod('simple');
246
+ recordMethod('simple');
345
247
  return strategyResult;
346
248
  }
347
249
  if (simpleOrTimeout.type === 'simple-error') {
@@ -351,66 +253,98 @@ export async function smartFetch(url, options = {}) {
351
253
  shouldUseBrowser = true;
352
254
  }
353
255
  else {
354
- // Simple fetch is slow - start browser in parallel and return whichever succeeds first.
355
- const browserAbortController = new AbortController();
356
- let simpleError;
357
- let browserError;
358
- const simpleCandidate = simplePromise
359
- .then((result) => ({ source: 'simple', result }))
360
- .catch((error) => {
361
- simpleError = error;
362
- throw error;
363
- });
364
- const browserCandidate = fetchWithBrowserStrategy(url, {
365
- ...browserOptions,
366
- signal: browserAbortController.signal,
367
- })
368
- .then((result) => ({ source: 'browser', result }))
369
- .catch((error) => {
370
- browserError = error;
371
- throw error;
372
- });
373
- try {
374
- const winner = await Promise.any([simpleCandidate, browserCandidate]);
375
- if (winner.source === 'simple') {
376
- browserAbortController.abort();
256
+ // Race timeout: only start the parallel browser if hooks say to race
257
+ if (useRace) {
258
+ // Parallel race: simple still running, start browser too
259
+ const browserAbortController = new AbortController();
260
+ let simpleError;
261
+ let browserError;
262
+ const simpleCandidate = simplePromise
263
+ .then((result) => ({ source: 'simple', result }))
264
+ .catch((error) => {
265
+ simpleError = error;
266
+ throw error;
267
+ });
268
+ const browserCandidate = fetchWithBrowserStrategy(url, {
269
+ ...browserOptions,
270
+ signal: browserAbortController.signal,
271
+ })
272
+ .then((result) => ({ source: 'browser', result }))
273
+ .catch((error) => {
274
+ browserError = error;
275
+ throw error;
276
+ });
277
+ try {
278
+ const winner = await Promise.any([
279
+ simpleCandidate,
280
+ browserCandidate,
281
+ ]);
282
+ if (winner.source === 'simple') {
283
+ browserAbortController.abort();
284
+ const strategyResult = {
285
+ ...winner.result,
286
+ method: 'simple',
287
+ };
288
+ if (canUseCache) {
289
+ hooks.setCache?.(url, strategyResult) ?? setBasicCache(url, strategyResult);
290
+ }
291
+ recordMethod('simple');
292
+ return strategyResult;
293
+ }
294
+ simpleAbortController.abort();
295
+ if (canUseCache) {
296
+ hooks.setCache?.(url, winner.result) ?? setBasicCache(url, winner.result);
297
+ }
298
+ recordMethod(winner.result.method);
299
+ return winner.result;
300
+ }
301
+ catch {
302
+ if (simpleError &&
303
+ !shouldEscalateSimpleError(simpleError) &&
304
+ !isAbortError(simpleError)) {
305
+ throw simpleError;
306
+ }
307
+ if (browserError)
308
+ throw browserError;
309
+ if (simpleError)
310
+ throw simpleError;
311
+ throw new Error('Both simple and browser fetch attempts failed');
312
+ }
313
+ }
314
+ else {
315
+ // No race — just wait for the simple fetch to finish
316
+ const simpleResult = await simplePromise
317
+ .then((result) => ({ type: 'simple-success', result }))
318
+ .catch((error) => ({ type: 'simple-error', error }));
319
+ if (simpleResult.type === 'simple-success') {
377
320
  const strategyResult = {
378
- ...winner.result,
321
+ ...simpleResult.result,
379
322
  method: 'simple',
380
323
  };
381
324
  if (canUseCache) {
382
- setCached(url, strategyResult);
325
+ hooks.setCache?.(url, strategyResult) ?? setBasicCache(url, strategyResult);
383
326
  }
384
- recordSuccessfulMethod('simple');
327
+ recordMethod('simple');
385
328
  return strategyResult;
386
329
  }
387
- simpleAbortController.abort();
388
- if (canUseCache) {
389
- setCached(url, winner.result);
330
+ if (!shouldEscalateSimpleError(simpleResult.error)) {
331
+ throw simpleResult.error;
390
332
  }
391
- recordSuccessfulMethod(winner.result.method);
392
- return winner.result;
393
- }
394
- catch {
395
- // Both failed: prefer non-escalation simple errors, otherwise return browser-side error.
396
- if (simpleError && !shouldEscalateSimpleError(simpleError) && !isAbortError(simpleError)) {
397
- throw simpleError;
398
- }
399
- if (browserError) {
400
- throw browserError;
401
- }
402
- if (simpleError) {
403
- throw simpleError;
404
- }
405
- throw new Error('Both simple and browser fetch attempts failed');
333
+ shouldUseBrowser = true;
406
334
  }
407
335
  }
408
336
  }
337
+ /* ---- browser / stealth fallback -------------------------------------- */
409
338
  const browserResult = await fetchWithBrowserStrategy(url, browserOptions);
410
339
  if (canUseCache) {
411
- setCached(url, browserResult);
340
+ hooks.setCache?.(url, browserResult) ?? setBasicCache(url, browserResult);
412
341
  }
413
- recordSuccessfulMethod(browserResult.method);
342
+ recordMethod(browserResult.method);
414
343
  return browserResult;
415
344
  }
345
+ /* ---------- legacy export for tests ------------------------------------- */
346
+ /**
347
+ * @deprecated Use `clearStrategyHooks()` from strategy-hooks.ts instead.
348
+ */
349
+ export { clearStrategyHooks as clearDomainIntel } from './strategy-hooks.js';
416
350
  //# sourceMappingURL=strategies.js.map