webpeel 0.8.1 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +39 -5
- package/dist/cli.js +1299 -85
- package/dist/cli.js.map +1 -1
- package/dist/core/application-tracker.d.ts +85 -0
- package/dist/core/application-tracker.d.ts.map +1 -0
- package/dist/core/application-tracker.js +184 -0
- package/dist/core/application-tracker.js.map +1 -0
- package/dist/core/apply.d.ts +163 -0
- package/dist/core/apply.d.ts.map +1 -0
- package/dist/core/apply.js +817 -0
- package/dist/core/apply.js.map +1 -0
- package/dist/core/branding.d.ts +1 -1
- package/dist/core/branding.d.ts.map +1 -1
- package/dist/core/budget.d.ts +43 -0
- package/dist/core/budget.d.ts.map +1 -0
- package/dist/core/budget.js +325 -0
- package/dist/core/budget.js.map +1 -0
- package/dist/core/challenge-detection.d.ts +27 -0
- package/dist/core/challenge-detection.d.ts.map +1 -0
- package/dist/core/challenge-detection.js +436 -0
- package/dist/core/challenge-detection.js.map +1 -0
- package/dist/core/change-tracking.d.ts.map +1 -1
- package/dist/core/change-tracking.js +10 -1
- package/dist/core/change-tracking.js.map +1 -1
- package/dist/core/crawler.d.ts.map +1 -1
- package/dist/core/crawler.js +17 -4
- package/dist/core/crawler.js.map +1 -1
- package/dist/core/diff.d.ts +62 -0
- package/dist/core/diff.d.ts.map +1 -0
- package/dist/core/diff.js +289 -0
- package/dist/core/diff.js.map +1 -0
- package/dist/core/extract-listings.d.ts +39 -0
- package/dist/core/extract-listings.d.ts.map +1 -0
- package/dist/core/extract-listings.js +331 -0
- package/dist/core/extract-listings.js.map +1 -0
- package/dist/core/extract.d.ts.map +1 -1
- package/dist/core/extract.js +15 -2
- package/dist/core/extract.js.map +1 -1
- package/dist/core/fetcher.d.ts +29 -3
- package/dist/core/fetcher.d.ts.map +1 -1
- package/dist/core/fetcher.js +158 -20
- package/dist/core/fetcher.js.map +1 -1
- package/dist/core/human.d.ts +176 -0
- package/dist/core/human.d.ts.map +1 -0
- package/dist/core/human.js +681 -0
- package/dist/core/human.js.map +1 -0
- package/dist/core/jobs.d.ts +12 -2
- package/dist/core/jobs.d.ts.map +1 -1
- package/dist/core/jobs.js +124 -2
- package/dist/core/jobs.js.map +1 -1
- package/dist/core/map.d.ts.map +1 -1
- package/dist/core/map.js +14 -2
- package/dist/core/map.js.map +1 -1
- package/dist/core/paginate.d.ts +32 -0
- package/dist/core/paginate.d.ts.map +1 -0
- package/dist/core/paginate.js +107 -0
- package/dist/core/paginate.js.map +1 -0
- package/dist/core/rate-governor.d.ts +81 -0
- package/dist/core/rate-governor.d.ts.map +1 -0
- package/dist/core/rate-governor.js +238 -0
- package/dist/core/rate-governor.js.map +1 -0
- package/dist/core/search-provider.d.ts +5 -0
- package/dist/core/search-provider.d.ts.map +1 -1
- package/dist/core/search-provider.js +81 -2
- package/dist/core/search-provider.js.map +1 -1
- package/dist/core/site-search.d.ts +45 -0
- package/dist/core/site-search.d.ts.map +1 -0
- package/dist/core/site-search.js +253 -0
- package/dist/core/site-search.js.map +1 -0
- package/dist/core/strategies.d.ts +8 -0
- package/dist/core/strategies.d.ts.map +1 -1
- package/dist/core/strategies.js +185 -45
- package/dist/core/strategies.js.map +1 -1
- package/dist/core/strategy-hooks.d.ts +6 -0
- package/dist/core/strategy-hooks.d.ts.map +1 -1
- package/dist/core/strategy-hooks.js.map +1 -1
- package/dist/core/table-format.d.ts +31 -0
- package/dist/core/table-format.d.ts.map +1 -0
- package/dist/core/table-format.js +147 -0
- package/dist/core/table-format.js.map +1 -0
- package/dist/core/user-agents.d.ts +58 -0
- package/dist/core/user-agents.d.ts.map +1 -0
- package/dist/core/user-agents.js +159 -0
- package/dist/core/user-agents.js.map +1 -0
- package/dist/core/watch.d.ts +100 -0
- package/dist/core/watch.d.ts.map +1 -0
- package/dist/core/watch.js +368 -0
- package/dist/core/watch.js.map +1 -0
- package/dist/index.d.ts +13 -2
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +41 -4
- package/dist/index.js.map +1 -1
- package/dist/mcp/server.js +3 -0
- package/dist/mcp/server.js.map +1 -1
- package/dist/types.d.ts +73 -0
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js.map +1 -1
- package/llms.txt +1 -1
- package/package.json +3 -3
|
@@ -0,0 +1,253 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Site-Aware Search URL Builders
|
|
3
|
+
*
|
|
4
|
+
* Provides URL templates for popular websites so AI agents can search them
|
|
5
|
+
* without needing to know site-specific URL structures.
|
|
6
|
+
*
|
|
7
|
+
* @module site-search
|
|
8
|
+
*/
|
|
9
|
+
/**
 * Create a search-URL builder for a site.
 *
 * The returned function URL-encodes the query with encodeURIComponent and
 * places it between `prefix` and `suffix`.
 *
 * @param {string} prefix Everything in the URL that precedes the query.
 * @param {string} [suffix] Everything in the URL that follows the query.
 * @returns {(q: string) => string}
 */
const searchUrlFor = (prefix, suffix = '') => (q) => `${prefix}${encodeURIComponent(q)}${suffix}`;

/**
 * Search templates for popular sites, keyed by canonical site ID.
 *
 * Each entry carries a display `name`, a `category`, and a `searchUrl(q)`
 * function that returns the site's search URL with the query URL-encoded.
 * Key order is the order in which sites are listed.
 */
export const SITE_TEMPLATES = {
  // ── Shopping ──────────────────────────────────────────────────────────────
  ebay: { name: 'eBay', category: 'shopping', searchUrl: searchUrlFor('https://www.ebay.com/sch/i.html?_nkw=') },
  amazon: { name: 'Amazon', category: 'shopping', searchUrl: searchUrlFor('https://www.amazon.com/s?k=') },
  walmart: { name: 'Walmart', category: 'shopping', searchUrl: searchUrlFor('https://www.walmart.com/search?q=') },
  target: { name: 'Target', category: 'shopping', searchUrl: searchUrlFor('https://www.target.com/s?searchTerm=') },
  bestbuy: { name: 'Best Buy', category: 'shopping', searchUrl: searchUrlFor('https://www.bestbuy.com/site/searchpage.jsp?st=') },
  etsy: { name: 'Etsy', category: 'shopping', searchUrl: searchUrlFor('https://www.etsy.com/search?q=') },
  aliexpress: { name: 'AliExpress', category: 'shopping', searchUrl: searchUrlFor('https://www.aliexpress.com/wholesale?SearchText=') },
  newegg: { name: 'Newegg', category: 'shopping', searchUrl: searchUrlFor('https://www.newegg.com/p/pl?d=') },
  // ── General ───────────────────────────────────────────────────────────────
  google: { name: 'Google', category: 'general', searchUrl: searchUrlFor('https://www.google.com/search?q=') },
  bing: { name: 'Bing', category: 'general', searchUrl: searchUrlFor('https://www.bing.com/search?q=') },
  duckduckgo: { name: 'DuckDuckGo', category: 'general', searchUrl: searchUrlFor('https://html.duckduckgo.com/html/?q=') },
  // ── Social / Content ──────────────────────────────────────────────────────
  reddit: { name: 'Reddit', category: 'social', searchUrl: searchUrlFor('https://www.reddit.com/search/?q=') },
  youtube: { name: 'YouTube', category: 'social', searchUrl: searchUrlFor('https://www.youtube.com/results?search_query=') },
  twitter: { name: 'X (Twitter)', category: 'social', searchUrl: searchUrlFor('https://x.com/search?q=') },
  linkedin: { name: 'LinkedIn', category: 'social', searchUrl: searchUrlFor('https://www.linkedin.com/search/results/all/?keywords=') },
  // ── Tech ──────────────────────────────────────────────────────────────────
  github: { name: 'GitHub', category: 'tech', searchUrl: searchUrlFor('https://github.com/search?q=') },
  stackoverflow: { name: 'Stack Overflow', category: 'tech', searchUrl: searchUrlFor('https://stackoverflow.com/search?q=') },
  npm: { name: 'npm', category: 'tech', searchUrl: searchUrlFor('https://www.npmjs.com/search?q=') },
  pypi: { name: 'PyPI', category: 'tech', searchUrl: searchUrlFor('https://pypi.org/search/?q=') },
  // ── Real Estate ───────────────────────────────────────────────────────────
  zillow: { name: 'Zillow', category: 'real-estate', searchUrl: searchUrlFor('https://www.zillow.com/homes/', '_rb/') },
  realtor: { name: 'Realtor.com', category: 'real-estate', searchUrl: searchUrlFor('https://www.realtor.com/realestateandhomes-search/') },
  // ── Jobs ──────────────────────────────────────────────────────────────────
  indeed: { name: 'Indeed', category: 'jobs', searchUrl: searchUrlFor('https://www.indeed.com/jobs?q=') },
  glassdoor: { name: 'Glassdoor', category: 'jobs', searchUrl: searchUrlFor('https://www.glassdoor.com/Job/jobs.htm?sc.keyword=') },
  'linkedin-jobs': { name: 'LinkedIn Jobs', category: 'jobs', searchUrl: searchUrlFor('https://www.linkedin.com/jobs/search/?keywords=') },
  // ── Food ──────────────────────────────────────────────────────────────────
  yelp: { name: 'Yelp', category: 'food', searchUrl: searchUrlFor('https://www.yelp.com/search?find_desc=') },
  doordash: { name: 'DoorDash', category: 'food', searchUrl: searchUrlFor('https://www.doordash.com/search/store/') },
  ubereats: { name: 'Uber Eats', category: 'food', searchUrl: searchUrlFor('https://www.ubereats.com/search?q=') },
};
|
|
157
|
+
/**
 * Alternate spellings accepted for a site, mapped to the canonical site ID.
 * Keys are lower-case because lookups lower-case the requested site first.
 */
const SITE_ALIASES = Object.fromEntries([
  ['x', 'twitter'],
  ['best-buy', 'bestbuy'],
  ['ali-express', 'aliexpress'],
  ['stack-overflow', 'stackoverflow'],
  ['duck-duck-go', 'duckduckgo'],
]);
|
|
165
|
+
/**
 * Resolve a site ID (or alias) to its canonical key.
 *
 * Matching is case-insensitive. Only own properties of SITE_TEMPLATES and
 * SITE_ALIASES are considered, so inherited names such as "constructor" or
 * "__proto__" are never mistaken for a supported site.
 *
 * @param {string} site Site ID or alias as supplied by the caller.
 * @returns {string | null} Canonical site ID, or null if the site is not
 *   recognized (including when `site` is not a string).
 */
function resolveSiteId(site) {
  if (typeof site !== 'string') {
    return null;
  }
  const lower = site.toLowerCase();
  // The `in` operator would also match keys inherited from Object.prototype.
  const hasOwn = (obj, key) => Object.prototype.hasOwnProperty.call(obj, key);
  if (hasOwn(SITE_TEMPLATES, lower)) {
    return lower;
  }
  if (hasOwn(SITE_ALIASES, lower)) {
    return SITE_ALIASES[lower];
  }
  return null;
}
|
|
177
|
+
/**
 * Produce the search URL for a query on a supported site.
 *
 * @param site Site ID (e.g. "ebay", "amazon") or alias (e.g. "x")
 * @param query Search query string
 * @returns An object with the generated `url`, the canonical `site` ID and
 *   the original `query`.
 * @throws Error if the site is not recognized
 */
export function buildSiteSearchUrl(site, query) {
  const siteId = resolveSiteId(site);
  if (siteId) {
    return {
      url: SITE_TEMPLATES[siteId].searchUrl(query),
      site: siteId,
      query,
    };
  }
  // Unknown site: list every supported ID so the caller can self-correct.
  const supported = Object.keys(SITE_TEMPLATES).join(', ');
  throw new Error(`Unknown site: "${site}". Available sites: ${supported}. ` +
    `Run "webpeel sites" to list all supported sites.`);
}
|
|
198
|
+
/**
 * Return every supported site as a flat array of `{ id, name, category }`
 * records, in the order the templates are declared.
 */
export function listSites() {
  const sites = [];
  for (const id of Object.keys(SITE_TEMPLATES)) {
    const { name, category } = SITE_TEMPLATES[id];
    sites.push({ id, name, category });
  }
  return sites;
}
|
|
208
|
+
/**
 * Hostname (without a leading "www.") → canonical site ID.
 *
 * Kept in a Map so that lookups never see members inherited from
 * Object.prototype, and built once rather than on every call.
 * Note that linkedin.com maps to "linkedin" for both LinkedIn templates.
 */
const HOSTNAME_TO_SITE = new Map([
  ['ebay.com', 'ebay'],
  ['amazon.com', 'amazon'],
  ['walmart.com', 'walmart'],
  ['target.com', 'target'],
  ['bestbuy.com', 'bestbuy'],
  ['etsy.com', 'etsy'],
  ['aliexpress.com', 'aliexpress'],
  ['newegg.com', 'newegg'],
  ['google.com', 'google'],
  ['bing.com', 'bing'],
  ['html.duckduckgo.com', 'duckduckgo'],
  ['duckduckgo.com', 'duckduckgo'],
  ['reddit.com', 'reddit'],
  ['youtube.com', 'youtube'],
  ['x.com', 'twitter'],
  ['twitter.com', 'twitter'],
  ['linkedin.com', 'linkedin'],
  ['github.com', 'github'],
  ['stackoverflow.com', 'stackoverflow'],
  ['npmjs.com', 'npm'],
  ['pypi.org', 'pypi'],
  ['zillow.com', 'zillow'],
  ['realtor.com', 'realtor'],
  ['indeed.com', 'indeed'],
  ['glassdoor.com', 'glassdoor'],
  ['yelp.com', 'yelp'],
  ['doordash.com', 'doordash'],
  ['ubereats.com', 'ubereats'],
]);

/**
 * Find which site ID a given URL belongs to (reverse lookup).
 *
 * The URL's hostname, with a leading "www." removed, must match a known
 * hostname exactly; other subdomains are not matched.
 *
 * @param {string} url Absolute URL to classify.
 * @returns {string | null} The canonical site ID, or null if the URL cannot
 *   be parsed or its hostname is not a known site.
 */
export function findSiteByUrl(url) {
  let hostname;
  try {
    hostname = new URL(url).hostname.replace(/^www\./, '');
  }
  catch {
    // Not a parseable absolute URL — treat as "no matching site".
    return null;
  }
  return HOSTNAME_TO_SITE.get(hostname) ?? null;
}
|
|
253
|
+
//# sourceMappingURL=site-search.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"site-search.js","sourceRoot":"","sources":["../../src/core/site-search.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAcH;;;GAGG;AACH,MAAM,CAAC,MAAM,cAAc,GAAiC;IAC1D,6EAA6E;IAC7E,IAAI,EAAE;QACJ,IAAI,EAAE,MAAM;QACZ,QAAQ,EAAE,UAAU;QACpB,SAAS,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,wCAAwC,kBAAkB,CAAC,CAAC,CAAC,EAAE;KAClF;IACD,MAAM,EAAE;QACN,IAAI,EAAE,QAAQ;QACd,QAAQ,EAAE,UAAU;QACpB,SAAS,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,8BAA8B,kBAAkB,CAAC,CAAC,CAAC,EAAE;KACxE;IACD,OAAO,EAAE;QACP,IAAI,EAAE,SAAS;QACf,QAAQ,EAAE,UAAU;QACpB,SAAS,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,oCAAoC,kBAAkB,CAAC,CAAC,CAAC,EAAE;KAC9E;IACD,MAAM,EAAE;QACN,IAAI,EAAE,QAAQ;QACd,QAAQ,EAAE,UAAU;QACpB,SAAS,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,uCAAuC,kBAAkB,CAAC,CAAC,CAAC,EAAE;KACjF;IACD,OAAO,EAAE;QACP,IAAI,EAAE,UAAU;QAChB,QAAQ,EAAE,UAAU;QACpB,SAAS,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,kDAAkD,kBAAkB,CAAC,CAAC,CAAC,EAAE;KAC5F;IACD,IAAI,EAAE;QACJ,IAAI,EAAE,MAAM;QACZ,QAAQ,EAAE,UAAU;QACpB,SAAS,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,iCAAiC,kBAAkB,CAAC,CAAC,CAAC,EAAE;KAC3E;IACD,UAAU,EAAE;QACV,IAAI,EAAE,YAAY;QAClB,QAAQ,EAAE,UAAU;QACpB,SAAS,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,mDAAmD,kBAAkB,CAAC,CAAC,CAAC,EAAE;KAC7F;IACD,MAAM,EAAE;QACN,IAAI,EAAE,QAAQ;QACd,QAAQ,EAAE,UAAU;QACpB,SAAS,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,iCAAiC,kBAAkB,CAAC,CAAC,CAAC,EAAE;KAC3E;IAED,6EAA6E;IAC7E,MAAM,EAAE;QACN,IAAI,EAAE,QAAQ;QACd,QAAQ,EAAE,SAAS;QACnB,SAAS,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,mCAAmC,kBAAkB,CAAC,CAAC,CAAC,EAAE;KAC7E;IACD,IAAI,EAAE;QACJ,IAAI,EAAE,MAAM;QACZ,QAAQ,EAAE,SAAS;QACnB,SAAS,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,iCAAiC,kBAAkB,CAAC,CAAC,CAAC,EAAE;KAC3E;IACD,UAAU,EAAE;QACV,IAAI,EAAE,YAAY;QAClB,QAAQ,EAAE,SAAS;QACnB,SAAS,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,uCAAuC,kBAAkB,CAAC,CAAC,CAAC,EAAE;KACjF;IAED,6EAA6E;IAC7E,MAAM,EAAE;QACN,IAAI,EAAE,QAAQ;QACd,QAAQ,EAAE,QAAQ;QAClB,SAAS,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,oCAAoC,kBAAkB,CAAC,CAAC,CAAC,EAAE;KAC9E;IACD,OAAO,EAAE;QACP,IAAI,EAAE,SAAS;QACf,QAAQ,EAAE,QAAQ;QAClB,SAAS,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,gDAAgD,kBAAkB,CAAC,CAAC,CAAC,EAAE;KAC1F;IACD,OAAO,
EAAE;QACP,IAAI,EAAE,aAAa;QACnB,QAAQ,EAAE,QAAQ;QAClB,SAAS,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,0BAA0B,kBAAkB,CAAC,CAAC,CAAC,EAAE;KACpE;IACD,QAAQ,EAAE;QACR,IAAI,EAAE,UAAU;QAChB,QAAQ,EAAE,QAAQ;QAClB,SAAS,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,yDAAyD,kBAAkB,CAAC,CAAC,CAAC,EAAE;KACnG;IAED,6EAA6E;IAC7E,MAAM,EAAE;QACN,IAAI,EAAE,QAAQ;QACd,QAAQ,EAAE,MAAM;QAChB,SAAS,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,+BAA+B,kBAAkB,CAAC,CAAC,CAAC,EAAE;KACzE;IACD,aAAa,EAAE;QACb,IAAI,EAAE,gBAAgB;QACtB,QAAQ,EAAE,MAAM;QAChB,SAAS,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,sCAAsC,kBAAkB,CAAC,CAAC,CAAC,EAAE;KAChF;IACD,GAAG,EAAE;QACH,IAAI,EAAE,KAAK;QACX,QAAQ,EAAE,MAAM;QAChB,SAAS,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,kCAAkC,kBAAkB,CAAC,CAAC,CAAC,EAAE;KAC5E;IACD,IAAI,EAAE;QACJ,IAAI,EAAE,MAAM;QACZ,QAAQ,EAAE,MAAM;QAChB,SAAS,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,8BAA8B,kBAAkB,CAAC,CAAC,CAAC,EAAE;KACxE;IAED,6EAA6E;IAC7E,MAAM,EAAE;QACN,IAAI,EAAE,QAAQ;QACd,QAAQ,EAAE,aAAa;QACvB,SAAS,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,gCAAgC,kBAAkB,CAAC,CAAC,CAAC,MAAM;KAC9E;IACD,OAAO,EAAE;QACP,IAAI,EAAE,aAAa;QACnB,QAAQ,EAAE,aAAa;QACvB,SAAS,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,qDAAqD,kBAAkB,CAAC,CAAC,CAAC,EAAE;KAC/F;IAED,6EAA6E;IAC7E,MAAM,EAAE;QACN,IAAI,EAAE,QAAQ;QACd,QAAQ,EAAE,MAAM;QAChB,SAAS,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,iCAAiC,kBAAkB,CAAC,CAAC,CAAC,EAAE;KAC3E;IACD,SAAS,EAAE;QACT,IAAI,EAAE,WAAW;QACjB,QAAQ,EAAE,MAAM;QAChB,SAAS,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,qDAAqD,kBAAkB,CAAC,CAAC,CAAC,EAAE;KAC/F;IACD,eAAe,EAAE;QACf,IAAI,EAAE,eAAe;QACrB,QAAQ,EAAE,MAAM;QAChB,SAAS,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,kDAAkD,kBAAkB,CAAC,CAAC,CAAC,EAAE;KAC5F;IAED,6EAA6E;IAC7E,IAAI,EAAE;QACJ,IAAI,EAAE,MAAM;QACZ,QAAQ,EAAE,MAAM;QAChB,SAAS,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,yCAAyC,kBAAkB,CAAC,CAAC,CAAC,EAAE;KACnF;IACD,QAAQ,EAAE;QACR,IAAI,EAAE,UAAU;QAChB,QAAQ,EAAE,MAAM;QAChB,SAAS,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,yCAAyC,kBAAkB,CAAC,CAAC,CAAC,EAAE;KACnF;IACD,QAAQ,EAAE;QACR,IAAI,EAAE,WAAW;QACjB,QAAQ,EAAE,MAAM;QAChB,SAAS,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,qCAAqC,kBAAkB,CAAC,CAAC,CAAC,EAAE;KAC/E;CACF,CAAC;AAEF,6CAA6C;AAC7C,MAAM,YAAY,GAA2B;
IAC3C,CAAC,EAAE,SAAS;IACZ,UAAU,EAAE,SAAS;IACrB,aAAa,EAAE,YAAY;IAC3B,gBAAgB,EAAE,eAAe;IACjC,cAAc,EAAE,YAAY;CAC7B,CAAC;AAEF;;;GAGG;AACH,SAAS,aAAa,CAAC,IAAY;IACjC,MAAM,KAAK,GAAG,IAAI,CAAC,WAAW,EAAE,CAAC;IACjC,IAAI,KAAK,IAAI,cAAc;QAAE,OAAO,KAAK,CAAC;IAC1C,IAAI,KAAK,IAAI,YAAY;QAAE,OAAO,YAAY,CAAC,KAAK,CAAE,CAAC;IACvD,OAAO,IAAI,CAAC;AACd,CAAC;AAED;;;;;;GAMG;AACH,MAAM,UAAU,kBAAkB,CAAC,IAAY,EAAE,KAAa;IAC5D,MAAM,SAAS,GAAG,aAAa,CAAC,IAAI,CAAC,CAAC;IACtC,IAAI,CAAC,SAAS,EAAE,CAAC;QACf,MAAM,SAAS,GAAG,MAAM,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACzD,MAAM,IAAI,KAAK,CACb,kBAAkB,IAAI,uBAAuB,SAAS,IAAI;YAC1D,kDAAkD,CACnD,CAAC;IACJ,CAAC;IAED,MAAM,QAAQ,GAAG,cAAc,CAAC,SAAS,CAAE,CAAC;IAC5C,OAAO;QACL,GAAG,EAAE,QAAQ,CAAC,SAAS,CAAC,KAAK,CAAC;QAC9B,IAAI,EAAE,SAAS;QACf,KAAK;KACN,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,SAAS;IACvB,OAAO,MAAM,CAAC,OAAO,CAAC,cAAc,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,QAAQ,CAAC,EAAE,EAAE,CAAC,CAAC;QAC7D,EAAE;QACF,IAAI,EAAE,QAAQ,CAAC,IAAI;QACnB,QAAQ,EAAE,QAAQ,CAAC,QAAQ;KAC5B,CAAC,CAAC,CAAC;AACN,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,aAAa,CAAC,GAAW;IACvC,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;QAC5B,MAAM,QAAQ,GAAG,MAAM,CAAC,QAAQ,CAAC,OAAO,CAAC,QAAQ,EAAE,EAAE,CAAC,CAAC;QAEvD,+BAA+B;QAC/B,MAAM,WAAW,GAA2B;YAC1C,UAAU,EAAE,MAAM;YAClB,YAAY,EAAE,QAAQ;YACtB,aAAa,EAAE,SAAS;YACxB,YAAY,EAAE,QAAQ;YACtB,aAAa,EAAE,SAAS;YACxB,UAAU,EAAE,MAAM;YAClB,gBAAgB,EAAE,YAAY;YAC9B,YAAY,EAAE,QAAQ;YACtB,YAAY,EAAE,QAAQ;YACtB,UAAU,EAAE,MAAM;YAClB,qBAAqB,EAAE,YAAY;YACnC,gBAAgB,EAAE,YAAY;YAC9B,YAAY,EAAE,QAAQ;YACtB,aAAa,EAAE,SAAS;YACxB,OAAO,EAAE,SAAS;YAClB,aAAa,EAAE,SAAS;YACxB,cAAc,EAAE,UAAU;YAC1B,YAAY,EAAE,QAAQ;YACtB,mBAAmB,EAAE,eAAe;YACpC,WAAW,EAAE,KAAK;YAClB,UAAU,EAAE,MAAM;YAClB,YAAY,EAAE,QAAQ;YACtB,aAAa,EAAE,SAAS;YACxB,YAAY,EAAE,QAAQ;YACtB,eAAe,EAAE,WAAW;YAC5B,UAAU,EAAE,MAAM;YAClB,cAAc,EAAE,UAAU;YAC1B,cAAc,EAAE,UAAU;SAC3B,CAAC;QAEF,OAAO,WAAW,CAAC,QAAQ,CAAC,IAAI,IAAI,CAAC;IACvC,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC"}
|
|
@@ -34,6 +34,14 @@ export interface StrategyOptions {
|
|
|
34
34
|
country?: string;
|
|
35
35
|
languages?: string[];
|
|
36
36
|
};
|
|
37
|
+
/**
|
|
38
|
+
* Path to a persistent Chrome user-data-dir.
|
|
39
|
+
* When set, bypasses the shared browser pool so cookies/sessions survive
|
|
40
|
+
* between fetch calls in the same process.
|
|
41
|
+
*/
|
|
42
|
+
profileDir?: string;
|
|
43
|
+
/** Launch browser in headed (visible) mode — useful for debugging and profile setup. */
|
|
44
|
+
headed?: boolean;
|
|
37
45
|
}
|
|
38
46
|
/**
|
|
39
47
|
* Smart fetch with automatic escalation.
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"strategies.d.ts","sourceRoot":"","sources":["../../src/core/strategies.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;
|
|
1
|
+
{"version":3,"file":"strategies.d.ts","sourceRoot":"","sources":["../../src/core/strategies.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAOH,OAAO,EAEL,KAAK,cAAc,EAEpB,MAAM,qBAAqB,CAAC;AAG7B,YAAY,EAAE,cAAc,EAAE,MAAM,qBAAqB,CAAC;AA2H1D,MAAM,WAAW,eAAe;IAC9B,YAAY,CAAC,EAAE,OAAO,CAAC;IACvB,OAAO,CAAC,EAAE,OAAO,CAAC;IAClB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,UAAU,CAAC,EAAE,OAAO,CAAC;IACrB,kBAAkB,CAAC,EAAE,OAAO,CAAC;IAC7B,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACjC,OAAO,CAAC,EAAE,MAAM,EAAE,CAAC;IACnB,OAAO,CAAC,EAAE,KAAK,CAAC;QACd,IAAI,EACA,MAAM,GACN,OAAO,GACP,QAAQ,GACR,MAAM,GACN,MAAM,GACN,QAAQ,GACR,OAAO,GACP,OAAO,GACP,iBAAiB,GACjB,YAAY,CAAC;QACjB,QAAQ,CAAC,EAAE,MAAM,CAAC;QAClB,KAAK,CAAC,EAAE,MAAM,CAAC;QACf,GAAG,CAAC,EAAE,MAAM,CAAC;QACb,EAAE,CAAC,EAAE,MAAM,CAAC;QACZ,EAAE,CAAC,EAAE,KAAK,GAAG,QAAQ,GAAG,MAAM,CAAC;QAC/B,OAAO,CAAC,EAAE,MAAM,CAAC;KAClB,CAAC,CAAC;IACH,YAAY,CAAC,EAAE,OAAO,CAAC;IACvB,OAAO,CAAC,EAAE,OAAO,CAAC;IAClB,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,QAAQ,CAAC,EAAE;QACT,OAAO,CAAC,EAAE,MAAM,CAAC;QACjB,SAAS,CAAC,EAAE,MAAM,EAAE,CAAC;KACtB,CAAC;IACF;;;;OAIG;IACH,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,wFAAwF;IACxF,MAAM,CAAC,EAAE,OAAO,CAAC;CAClB;AAiHD;;;;;GAKG;AACH,wBAAsB,UAAU,CAC9B,GAAG,EAAE,MAAM,EACX,OAAO,GAAE,eAAoB,GAC5B,OAAO,CAAC,cAAc,CAAC,CAiWzB;AAID;;GAEG;AACH,OAAO,EAAE,kBAAkB,IAAI,gBAAgB,EAAE,MAAM,qBAAqB,CAAC"}
|
package/dist/core/strategies.js
CHANGED
|
@@ -10,30 +10,70 @@ import { simpleFetch, browserFetch, retryFetch } from './fetcher.js';
|
|
|
10
10
|
import { getCached, setCached as setBasicCache } from './cache.js';
|
|
11
11
|
import { resolveAndCache } from './dns-cache.js';
|
|
12
12
|
import { BlockedError, NetworkError } from '../types.js';
|
|
13
|
+
import { detectChallenge } from './challenge-detection.js';
|
|
13
14
|
import { getStrategyHooks, } from './strategy-hooks.js';
|
|
14
15
|
/* ---------- hardcoded domain rules -------------------------------------- */
|
|
15
16
|
function shouldForceBrowser(url) {
|
|
16
17
|
try {
|
|
17
18
|
const hostname = new URL(url).hostname.toLowerCase();
|
|
18
|
-
//
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
19
|
+
// Sites that return HTML shells / need JS rendering (browser mode)
|
|
20
|
+
const browserDomains = [
|
|
21
|
+
'reddit.com', // HTML shell via simple fetch
|
|
22
|
+
'npmjs.com', // 403 on simple fetch
|
|
23
|
+
'x.com', // SPA, login wall
|
|
24
|
+
'twitter.com', // SPA, login wall
|
|
25
|
+
'instagram.com', // SPA, login wall
|
|
26
|
+
'facebook.com', // SPA, heavy JS
|
|
27
|
+
'tiktok.com', // SPA, JS-rendered
|
|
28
|
+
'pinterest.com', // SPA, JS-rendered
|
|
29
|
+
'airbnb.com', // heavy SPA
|
|
30
|
+
'medium.com', // JS-rendered, sometimes login wall
|
|
31
|
+
'substack.com', // JS-rendered
|
|
32
|
+
'notion.so', // SPA
|
|
33
|
+
'figma.com', // SPA
|
|
34
|
+
'canva.com', // SPA
|
|
35
|
+
'vercel.app', // Could be any SPA
|
|
36
|
+
];
|
|
37
|
+
for (const domain of browserDomains) {
|
|
38
|
+
if (hostname === domain || hostname.endsWith(`.${domain}`)) {
|
|
39
|
+
return { mode: 'browser' };
|
|
40
|
+
}
|
|
34
41
|
}
|
|
35
|
-
|
|
36
|
-
|
|
42
|
+
// These are known to aggressively block automation — stealth mode required
|
|
43
|
+
const stealthDomains = [
|
|
44
|
+
'glassdoor.com',
|
|
45
|
+
'bloomberg.com',
|
|
46
|
+
'indeed.com',
|
|
47
|
+
'amazon.com', // captcha wall on simple/browser fetch
|
|
48
|
+
'zillow.com', // aggressive bot detection
|
|
49
|
+
'ticketmaster.com', // Distil Networks / PerimeterX
|
|
50
|
+
'stubhub.com', // PerimeterX / CAPTCHA
|
|
51
|
+
'walmart.com', // Akamai Bot Manager
|
|
52
|
+
'target.com', // Akamai Bot Manager
|
|
53
|
+
'bestbuy.com', // Akamai Bot Manager
|
|
54
|
+
'homedepot.com', // Akamai Bot Manager
|
|
55
|
+
'lowes.com', // Akamai Bot Manager
|
|
56
|
+
'costco.com', // Akamai Bot Manager
|
|
57
|
+
'nike.com', // Akamai / Shape Security
|
|
58
|
+
'footlocker.com', // PerimeterX / DataDome
|
|
59
|
+
'realtor.com', // aggressive bot detection
|
|
60
|
+
'redfin.com', // aggressive bot detection
|
|
61
|
+
'cloudflare.com', // Cloudflare challenge pages
|
|
62
|
+
'ebay.com', // challenge page on simple fetch
|
|
63
|
+
'linkedin.com', // aggressive bot detection + login walls
|
|
64
|
+
'craigslist.org', // occasionally blocks automated access
|
|
65
|
+
'etsy.com', // Akamai protection
|
|
66
|
+
'wayfair.com', // Akamai protection
|
|
67
|
+
'newegg.com', // bot detection
|
|
68
|
+
'zappos.com', // Amazon subsidiary, same protection
|
|
69
|
+
'chewy.com', // Amazon subsidiary
|
|
70
|
+
'aliexpress.com', // anti-bot
|
|
71
|
+
'wish.com', // anti-bot
|
|
72
|
+
];
|
|
73
|
+
for (const domain of stealthDomains) {
|
|
74
|
+
if (hostname === domain || hostname.endsWith(`.${domain}`)) {
|
|
75
|
+
return { mode: 'stealth' };
|
|
76
|
+
}
|
|
37
77
|
}
|
|
38
78
|
}
|
|
39
79
|
catch {
|
|
@@ -57,6 +97,25 @@ function looksLikeShellPage(result) {
|
|
|
57
97
|
const text = result.html.replace(/<[^>]*>/g, '').trim();
|
|
58
98
|
return text.length < 500 && result.html.length > 1000;
|
|
59
99
|
}
|
|
100
|
+
/**
 * Decide whether a simple-fetch result looks like a content-poor HTML shell.
 *
 * Returns true only when the response's content type mentions "html", the
 * raw HTML is longer than 1500 characters, and fewer than 200 characters of
 * text remain once script/style/noscript blocks and all other tags are
 * removed and whitespace is collapsed.
 *
 * @param result Fetch result; reads `contentType` and `html`.
 * @returns {boolean} true if the page has a sizeable payload but little text.
 */
function shouldEscalateForLowContent(result) {
  const contentType = (result.contentType || '').toLowerCase();
  const isHtml = contentType.includes('html');
  if (!isHtml || result.html.length <= 1500) {
    return false;
  }
  // Drop whole script/style/noscript blocks (contents included) before the
  // generic tag strip, so their source text is not counted as page text.
  const nonVisibleBlocks = [
    /<script[\s\S]*?<\/script>/gi,
    /<style[\s\S]*?<\/style>/gi,
    /<noscript[\s\S]*?<\/noscript>/gi,
  ];
  let markup = result.html;
  for (const pattern of nonVisibleBlocks) {
    markup = markup.replace(pattern, '');
  }
  const textOnly = markup.replace(/<[^>]*>/g, '');
  const collapsed = textOnly.replace(/\s+/g, ' ').trim();
  return collapsed.length < 200;
}
|
|
60
119
|
function prefetchDns(url) {
|
|
61
120
|
try {
|
|
62
121
|
const hostname = new URL(url).hostname;
|
|
@@ -67,7 +126,7 @@ function prefetchDns(url) {
|
|
|
67
126
|
}
|
|
68
127
|
}
|
|
69
128
|
async function fetchWithBrowserStrategy(url, options) {
|
|
70
|
-
const { userAgent, waitMs, timeoutMs, screenshot, screenshotFullPage, headers, cookies, actions, keepPageOpen, effectiveStealth, signal, } = options;
|
|
129
|
+
const { userAgent, waitMs, timeoutMs, screenshot, screenshotFullPage, headers, cookies, actions, keepPageOpen, effectiveStealth, signal, profileDir, headed, } = options;
|
|
71
130
|
try {
|
|
72
131
|
const result = await browserFetch(url, {
|
|
73
132
|
userAgent,
|
|
@@ -81,6 +140,8 @@ async function fetchWithBrowserStrategy(url, options) {
|
|
|
81
140
|
actions,
|
|
82
141
|
keepPageOpen,
|
|
83
142
|
signal,
|
|
143
|
+
profileDir,
|
|
144
|
+
headed,
|
|
84
145
|
});
|
|
85
146
|
return {
|
|
86
147
|
...result,
|
|
@@ -104,6 +165,8 @@ async function fetchWithBrowserStrategy(url, options) {
|
|
|
104
165
|
actions,
|
|
105
166
|
keepPageOpen,
|
|
106
167
|
signal,
|
|
168
|
+
profileDir,
|
|
169
|
+
headed,
|
|
107
170
|
});
|
|
108
171
|
return { ...result, method: 'stealth' };
|
|
109
172
|
}
|
|
@@ -122,6 +185,8 @@ async function fetchWithBrowserStrategy(url, options) {
|
|
|
122
185
|
actions,
|
|
123
186
|
keepPageOpen,
|
|
124
187
|
signal,
|
|
188
|
+
profileDir,
|
|
189
|
+
headed,
|
|
125
190
|
});
|
|
126
191
|
return { ...result, method: effectiveStealth ? 'stealth' : 'browser' };
|
|
127
192
|
}
|
|
@@ -136,7 +201,7 @@ async function fetchWithBrowserStrategy(url, options) {
|
|
|
136
201
|
* With premium hooks: SWR cache → domain intel → parallel race → escalation.
|
|
137
202
|
*/
|
|
138
203
|
export async function smartFetch(url, options = {}) {
|
|
139
|
-
const { forceBrowser = false, stealth = false, waitMs = 0, userAgent, timeoutMs = 30000, screenshot = false, screenshotFullPage = false, headers, cookies, actions, keepPageOpen = false, noCache = false, raceTimeoutMs = 2000, } = options;
|
|
204
|
+
const { forceBrowser = false, stealth = false, waitMs = 0, userAgent, timeoutMs = 30000, screenshot = false, screenshotFullPage = false, headers, cookies, actions, keepPageOpen = false, noCache = false, raceTimeoutMs = 2000, profileDir, headed = false, } = options;
|
|
140
205
|
const hooks = getStrategyHooks();
|
|
141
206
|
const fetchStartMs = Date.now();
|
|
142
207
|
const recordMethod = (method) => {
|
|
@@ -198,6 +263,10 @@ export async function smartFetch(url, options = {}) {
|
|
|
198
263
|
}
|
|
199
264
|
/* ---- browser-level options ------------------------------------------- */
|
|
200
265
|
let shouldUseBrowser = effectiveForceBrowser || screenshot || effectiveStealth;
|
|
266
|
+
// A profileDir always forces browser mode (profile sessions need a real browser)
|
|
267
|
+
if (profileDir) {
|
|
268
|
+
effectiveForceBrowser = true;
|
|
269
|
+
}
|
|
201
270
|
const browserOptions = {
|
|
202
271
|
userAgent,
|
|
203
272
|
waitMs,
|
|
@@ -209,6 +278,8 @@ export async function smartFetch(url, options = {}) {
|
|
|
209
278
|
actions,
|
|
210
279
|
keepPageOpen,
|
|
211
280
|
effectiveStealth,
|
|
281
|
+
profileDir,
|
|
282
|
+
headed,
|
|
212
283
|
};
|
|
213
284
|
/* ---- Strategy: simple fetch (with optional race) --------------------- */
|
|
214
285
|
if (!shouldUseBrowser) {
|
|
@@ -236,15 +307,29 @@ export async function smartFetch(url, options = {}) {
|
|
|
236
307
|
if (raceTimer)
|
|
237
308
|
clearTimeout(raceTimer);
|
|
238
309
|
if (simpleOrTimeout.type === 'simple-success') {
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
}
|
|
243
|
-
|
|
244
|
-
|
|
310
|
+
// Check if the content is suspiciously thin — escalate to browser if so
|
|
311
|
+
if (shouldEscalateForLowContent(simpleOrTimeout.result)) {
|
|
312
|
+
shouldUseBrowser = true;
|
|
313
|
+
}
|
|
314
|
+
else {
|
|
315
|
+
// Check whether the response is a bot-challenge page (e.g. Cloudflare, PerimeterX)
|
|
316
|
+
const challengeCheck = detectChallenge(simpleOrTimeout.result.html, simpleOrTimeout.result.statusCode);
|
|
317
|
+
if (challengeCheck.isChallenge && challengeCheck.confidence >= 0.7) {
|
|
318
|
+
// Escalate — the browser/stealth path will handle it below
|
|
319
|
+
shouldUseBrowser = true;
|
|
320
|
+
}
|
|
321
|
+
else {
|
|
322
|
+
const strategyResult = {
|
|
323
|
+
...simpleOrTimeout.result,
|
|
324
|
+
method: 'simple',
|
|
325
|
+
};
|
|
326
|
+
if (canUseCache) {
|
|
327
|
+
hooks.setCache?.(url, strategyResult) ?? setBasicCache(url, strategyResult);
|
|
328
|
+
}
|
|
329
|
+
recordMethod('simple');
|
|
330
|
+
return strategyResult;
|
|
331
|
+
}
|
|
245
332
|
}
|
|
246
|
-
recordMethod('simple');
|
|
247
|
-
return strategyResult;
|
|
248
333
|
}
|
|
249
334
|
if (simpleOrTimeout.type === 'simple-error') {
|
|
250
335
|
if (!shouldEscalateSimpleError(simpleOrTimeout.error)) {
|
|
@@ -317,30 +402,85 @@ export async function smartFetch(url, options = {}) {
|
|
|
317
402
|
.then((result) => ({ type: 'simple-success', result }))
|
|
318
403
|
.catch((error) => ({ type: 'simple-error', error }));
|
|
319
404
|
if (simpleResult.type === 'simple-success') {
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
}
|
|
324
|
-
|
|
325
|
-
|
|
405
|
+
// Check if the content is suspiciously thin — escalate to browser if so
|
|
406
|
+
if (shouldEscalateForLowContent(simpleResult.result)) {
|
|
407
|
+
shouldUseBrowser = true;
|
|
408
|
+
}
|
|
409
|
+
else {
|
|
410
|
+
// Check whether the response is a bot-challenge page
|
|
411
|
+
const challengeCheck = detectChallenge(simpleResult.result.html, simpleResult.result.statusCode);
|
|
412
|
+
if (challengeCheck.isChallenge && challengeCheck.confidence >= 0.7) {
|
|
413
|
+
shouldUseBrowser = true;
|
|
414
|
+
}
|
|
415
|
+
else {
|
|
416
|
+
const strategyResult = {
|
|
417
|
+
...simpleResult.result,
|
|
418
|
+
method: 'simple',
|
|
419
|
+
};
|
|
420
|
+
if (canUseCache) {
|
|
421
|
+
hooks.setCache?.(url, strategyResult) ?? setBasicCache(url, strategyResult);
|
|
422
|
+
}
|
|
423
|
+
recordMethod('simple');
|
|
424
|
+
return strategyResult;
|
|
425
|
+
}
|
|
326
426
|
}
|
|
327
|
-
recordMethod('simple');
|
|
328
|
-
return strategyResult;
|
|
329
427
|
}
|
|
330
|
-
|
|
331
|
-
|
|
428
|
+
else {
|
|
429
|
+
if (!shouldEscalateSimpleError(simpleResult.error)) {
|
|
430
|
+
throw simpleResult.error;
|
|
431
|
+
}
|
|
432
|
+
shouldUseBrowser = true;
|
|
332
433
|
}
|
|
333
|
-
shouldUseBrowser = true;
|
|
334
434
|
}
|
|
335
435
|
}
|
|
336
436
|
}
|
|
337
|
-
/* ---- browser / stealth fallback
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
437
|
+
/* ---- browser / stealth fallback with challenge-detection cascade ----- */
|
|
438
|
+
// Attempt 1: browser (or stealth, if already forced)
|
|
439
|
+
let finalResult = await fetchWithBrowserStrategy(url, browserOptions);
|
|
440
|
+
// Check if the browser result is itself a bot-challenge page
|
|
441
|
+
const browserChallengeCheck = detectChallenge(finalResult.html, finalResult.statusCode);
|
|
442
|
+
if (browserChallengeCheck.isChallenge && browserChallengeCheck.confidence >= 0.7) {
|
|
443
|
+
if (!browserOptions.effectiveStealth) {
|
|
444
|
+
// Attempt 2: escalate to stealth
|
|
445
|
+
const stealthOptions = {
|
|
446
|
+
...browserOptions,
|
|
447
|
+
effectiveStealth: true,
|
|
448
|
+
};
|
|
449
|
+
finalResult = await fetchWithBrowserStrategy(url, stealthOptions);
|
|
450
|
+
const stealthChallengeCheck = detectChallenge(finalResult.html, finalResult.statusCode);
|
|
451
|
+
if (stealthChallengeCheck.isChallenge && stealthChallengeCheck.confidence >= 0.7) {
|
|
452
|
+
// Attempt 3: stealth + 5s extra wait
|
|
453
|
+
const stealthExtraOptions = {
|
|
454
|
+
...stealthOptions,
|
|
455
|
+
waitMs: stealthOptions.waitMs + 5000,
|
|
456
|
+
};
|
|
457
|
+
finalResult = await fetchWithBrowserStrategy(url, stealthExtraOptions);
|
|
458
|
+
const finalChallengeCheck = detectChallenge(finalResult.html, finalResult.statusCode);
|
|
459
|
+
if (finalChallengeCheck.isChallenge && finalChallengeCheck.confidence >= 0.7) {
|
|
460
|
+
// Give up — return with warning flag
|
|
461
|
+
finalResult = { ...finalResult, challengeDetected: true };
|
|
462
|
+
}
|
|
463
|
+
}
|
|
464
|
+
}
|
|
465
|
+
else {
|
|
466
|
+
// Already in stealth mode; retry with 5s extra wait
|
|
467
|
+
const stealthExtraOptions = {
|
|
468
|
+
...browserOptions,
|
|
469
|
+
waitMs: browserOptions.waitMs + 5000,
|
|
470
|
+
};
|
|
471
|
+
finalResult = await fetchWithBrowserStrategy(url, stealthExtraOptions);
|
|
472
|
+
const finalChallengeCheck = detectChallenge(finalResult.html, finalResult.statusCode);
|
|
473
|
+
if (finalChallengeCheck.isChallenge && finalChallengeCheck.confidence >= 0.7) {
|
|
474
|
+
// Give up — return with warning flag
|
|
475
|
+
finalResult = { ...finalResult, challengeDetected: true };
|
|
476
|
+
}
|
|
477
|
+
}
|
|
478
|
+
}
|
|
479
|
+
if (canUseCache && !finalResult.challengeDetected) {
|
|
480
|
+
hooks.setCache?.(url, finalResult) ?? setBasicCache(url, finalResult);
|
|
341
481
|
}
|
|
342
|
-
recordMethod(
|
|
343
|
-
return
|
|
482
|
+
recordMethod(finalResult.method);
|
|
483
|
+
return finalResult;
|
|
344
484
|
}
|
|
345
485
|
/* ---------- legacy export for tests ------------------------------------- */
|
|
346
486
|
/**
|