webpeel 0.8.1 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99) hide show
  1. package/README.md +39 -5
  2. package/dist/cli.js +1299 -85
  3. package/dist/cli.js.map +1 -1
  4. package/dist/core/application-tracker.d.ts +85 -0
  5. package/dist/core/application-tracker.d.ts.map +1 -0
  6. package/dist/core/application-tracker.js +184 -0
  7. package/dist/core/application-tracker.js.map +1 -0
  8. package/dist/core/apply.d.ts +163 -0
  9. package/dist/core/apply.d.ts.map +1 -0
  10. package/dist/core/apply.js +817 -0
  11. package/dist/core/apply.js.map +1 -0
  12. package/dist/core/branding.d.ts +1 -1
  13. package/dist/core/branding.d.ts.map +1 -1
  14. package/dist/core/budget.d.ts +43 -0
  15. package/dist/core/budget.d.ts.map +1 -0
  16. package/dist/core/budget.js +325 -0
  17. package/dist/core/budget.js.map +1 -0
  18. package/dist/core/challenge-detection.d.ts +27 -0
  19. package/dist/core/challenge-detection.d.ts.map +1 -0
  20. package/dist/core/challenge-detection.js +436 -0
  21. package/dist/core/challenge-detection.js.map +1 -0
  22. package/dist/core/change-tracking.d.ts.map +1 -1
  23. package/dist/core/change-tracking.js +10 -1
  24. package/dist/core/change-tracking.js.map +1 -1
  25. package/dist/core/crawler.d.ts.map +1 -1
  26. package/dist/core/crawler.js +17 -4
  27. package/dist/core/crawler.js.map +1 -1
  28. package/dist/core/diff.d.ts +62 -0
  29. package/dist/core/diff.d.ts.map +1 -0
  30. package/dist/core/diff.js +289 -0
  31. package/dist/core/diff.js.map +1 -0
  32. package/dist/core/extract-listings.d.ts +39 -0
  33. package/dist/core/extract-listings.d.ts.map +1 -0
  34. package/dist/core/extract-listings.js +331 -0
  35. package/dist/core/extract-listings.js.map +1 -0
  36. package/dist/core/extract.d.ts.map +1 -1
  37. package/dist/core/extract.js +15 -2
  38. package/dist/core/extract.js.map +1 -1
  39. package/dist/core/fetcher.d.ts +29 -3
  40. package/dist/core/fetcher.d.ts.map +1 -1
  41. package/dist/core/fetcher.js +158 -20
  42. package/dist/core/fetcher.js.map +1 -1
  43. package/dist/core/human.d.ts +176 -0
  44. package/dist/core/human.d.ts.map +1 -0
  45. package/dist/core/human.js +681 -0
  46. package/dist/core/human.js.map +1 -0
  47. package/dist/core/jobs.d.ts +12 -2
  48. package/dist/core/jobs.d.ts.map +1 -1
  49. package/dist/core/jobs.js +124 -2
  50. package/dist/core/jobs.js.map +1 -1
  51. package/dist/core/map.d.ts.map +1 -1
  52. package/dist/core/map.js +14 -2
  53. package/dist/core/map.js.map +1 -1
  54. package/dist/core/paginate.d.ts +32 -0
  55. package/dist/core/paginate.d.ts.map +1 -0
  56. package/dist/core/paginate.js +107 -0
  57. package/dist/core/paginate.js.map +1 -0
  58. package/dist/core/rate-governor.d.ts +81 -0
  59. package/dist/core/rate-governor.d.ts.map +1 -0
  60. package/dist/core/rate-governor.js +238 -0
  61. package/dist/core/rate-governor.js.map +1 -0
  62. package/dist/core/search-provider.d.ts +5 -0
  63. package/dist/core/search-provider.d.ts.map +1 -1
  64. package/dist/core/search-provider.js +81 -2
  65. package/dist/core/search-provider.js.map +1 -1
  66. package/dist/core/site-search.d.ts +45 -0
  67. package/dist/core/site-search.d.ts.map +1 -0
  68. package/dist/core/site-search.js +253 -0
  69. package/dist/core/site-search.js.map +1 -0
  70. package/dist/core/strategies.d.ts +8 -0
  71. package/dist/core/strategies.d.ts.map +1 -1
  72. package/dist/core/strategies.js +185 -45
  73. package/dist/core/strategies.js.map +1 -1
  74. package/dist/core/strategy-hooks.d.ts +6 -0
  75. package/dist/core/strategy-hooks.d.ts.map +1 -1
  76. package/dist/core/strategy-hooks.js.map +1 -1
  77. package/dist/core/table-format.d.ts +31 -0
  78. package/dist/core/table-format.d.ts.map +1 -0
  79. package/dist/core/table-format.js +147 -0
  80. package/dist/core/table-format.js.map +1 -0
  81. package/dist/core/user-agents.d.ts +58 -0
  82. package/dist/core/user-agents.d.ts.map +1 -0
  83. package/dist/core/user-agents.js +159 -0
  84. package/dist/core/user-agents.js.map +1 -0
  85. package/dist/core/watch.d.ts +100 -0
  86. package/dist/core/watch.d.ts.map +1 -0
  87. package/dist/core/watch.js +368 -0
  88. package/dist/core/watch.js.map +1 -0
  89. package/dist/index.d.ts +13 -2
  90. package/dist/index.d.ts.map +1 -1
  91. package/dist/index.js +41 -4
  92. package/dist/index.js.map +1 -1
  93. package/dist/mcp/server.js +3 -0
  94. package/dist/mcp/server.js.map +1 -1
  95. package/dist/types.d.ts +73 -0
  96. package/dist/types.d.ts.map +1 -1
  97. package/dist/types.js.map +1 -1
  98. package/llms.txt +1 -1
  99. package/package.json +3 -3
@@ -0,0 +1,253 @@
1
+ /**
2
+ * Site-Aware Search URL Builders
3
+ *
4
+ * Provides URL templates for popular websites so AI agents can search them
5
+ * without needing to know site-specific URL structures.
6
+ *
7
+ * @module site-search
8
+ */
/**
 * Create a single site template.
 *
 * The resulting `searchUrl(q)` returns `prefix`, followed by the query passed
 * through encodeURIComponent, followed by `suffix`.
 *
 * @param {string} name      Human-readable site name.
 * @param {string} category  Category label for the site.
 * @param {string} prefix    URL text placed before the encoded query.
 * @param {string} [suffix]  URL text placed after the encoded query.
 */
function siteTemplate(name, category, prefix, suffix = '') {
    return {
        name,
        category,
        searchUrl: (q) => `${prefix}${encodeURIComponent(q)}${suffix}`,
    };
}
/**
 * Search URL templates for popular sites, keyed by site ID.
 * Every query value is URL-encoded with encodeURIComponent before it is
 * inserted into the URL.
 */
export const SITE_TEMPLATES = {
    // Shopping
    ebay: siteTemplate('eBay', 'shopping', 'https://www.ebay.com/sch/i.html?_nkw='),
    amazon: siteTemplate('Amazon', 'shopping', 'https://www.amazon.com/s?k='),
    walmart: siteTemplate('Walmart', 'shopping', 'https://www.walmart.com/search?q='),
    target: siteTemplate('Target', 'shopping', 'https://www.target.com/s?searchTerm='),
    bestbuy: siteTemplate('Best Buy', 'shopping', 'https://www.bestbuy.com/site/searchpage.jsp?st='),
    etsy: siteTemplate('Etsy', 'shopping', 'https://www.etsy.com/search?q='),
    aliexpress: siteTemplate('AliExpress', 'shopping', 'https://www.aliexpress.com/wholesale?SearchText='),
    newegg: siteTemplate('Newegg', 'shopping', 'https://www.newegg.com/p/pl?d='),
    // General
    google: siteTemplate('Google', 'general', 'https://www.google.com/search?q='),
    bing: siteTemplate('Bing', 'general', 'https://www.bing.com/search?q='),
    duckduckgo: siteTemplate('DuckDuckGo', 'general', 'https://html.duckduckgo.com/html/?q='),
    // Social / Content
    reddit: siteTemplate('Reddit', 'social', 'https://www.reddit.com/search/?q='),
    youtube: siteTemplate('YouTube', 'social', 'https://www.youtube.com/results?search_query='),
    twitter: siteTemplate('X (Twitter)', 'social', 'https://x.com/search?q='),
    linkedin: siteTemplate('LinkedIn', 'social', 'https://www.linkedin.com/search/results/all/?keywords='),
    // Tech
    github: siteTemplate('GitHub', 'tech', 'https://github.com/search?q='),
    stackoverflow: siteTemplate('Stack Overflow', 'tech', 'https://stackoverflow.com/search?q='),
    npm: siteTemplate('npm', 'tech', 'https://www.npmjs.com/search?q='),
    pypi: siteTemplate('PyPI', 'tech', 'https://pypi.org/search/?q='),
    // Real Estate (Zillow puts the query in the path and appends "_rb/")
    zillow: siteTemplate('Zillow', 'real-estate', 'https://www.zillow.com/homes/', '_rb/'),
    realtor: siteTemplate('Realtor.com', 'real-estate', 'https://www.realtor.com/realestateandhomes-search/'),
    // Jobs
    indeed: siteTemplate('Indeed', 'jobs', 'https://www.indeed.com/jobs?q='),
    glassdoor: siteTemplate('Glassdoor', 'jobs', 'https://www.glassdoor.com/Job/jobs.htm?sc.keyword='),
    'linkedin-jobs': siteTemplate('LinkedIn Jobs', 'jobs', 'https://www.linkedin.com/jobs/search/?keywords='),
    // Food
    yelp: siteTemplate('Yelp', 'food', 'https://www.yelp.com/search?find_desc='),
    doordash: siteTemplate('DoorDash', 'food', 'https://www.doordash.com/search/store/'),
    ubereats: siteTemplate('Uber Eats', 'food', 'https://www.ubereats.com/search?q='),
};
/** Aliases that map to canonical site IDs. */
const SITE_ALIASES = {
    x: 'twitter',
    'best-buy': 'bestbuy',
    'ali-express': 'aliexpress',
    'stack-overflow': 'stackoverflow',
    'duck-duck-go': 'duckduckgo',
};
/**
 * Resolve a site ID (or alias) to its canonical key.
 *
 * Matching is case-insensitive. Only keys defined directly on SITE_TEMPLATES
 * and SITE_ALIASES count; names inherited from Object.prototype (such as
 * "constructor" or "__proto__") are not treated as sites.
 *
 * @param site Site ID or alias
 * @returns The canonical site ID, or null if `site` is not a string or is
 *          not a known ID or alias.
 */
function resolveSiteId(site) {
    if (typeof site !== 'string') {
        return null;
    }
    // Own-property check: the `in` operator would also match inherited keys.
    const hasOwn = (table, key) => Object.prototype.hasOwnProperty.call(table, key);
    const lower = site.toLowerCase();
    if (hasOwn(SITE_TEMPLATES, lower)) {
        return lower;
    }
    if (hasOwn(SITE_ALIASES, lower)) {
        return SITE_ALIASES[lower];
    }
    return null;
}
/**
 * Produce the search URL for a query on one of the supported sites.
 *
 * @param site  Site ID (e.g. "ebay", "amazon") or alias (e.g. "x")
 * @param query Search query string
 * @returns An object holding the generated `url`, the canonical `site` ID
 *          and the `query` that was passed in.
 * @throws Error if the site is not recognized
 */
export function buildSiteSearchUrl(site, query) {
    const siteId = resolveSiteId(site);
    if (siteId) {
        const entry = SITE_TEMPLATES[siteId];
        return { url: entry.searchUrl(query), site: siteId, query };
    }
    // Unknown site: list every canonical ID in the error to help the caller.
    const available = Object.keys(SITE_TEMPLATES).join(', ');
    throw new Error(`Unknown site: "${site}". Available sites: ${available}. ` +
        `Run "webpeel sites" to list all supported sites.`);
}
/**
 * Return every site template as a flat array of `{ id, name, category }`
 * records, in the key order of SITE_TEMPLATES.
 */
export function listSites() {
    const sites = [];
    for (const [id, { name, category }] of Object.entries(SITE_TEMPLATES)) {
        sites.push({ id, name, category });
    }
    return sites;
}
/**
 * Hostname (with any leading "www." removed) → canonical site ID.
 * Built once at module load. A Map is used so that lookups can never hit
 * members inherited from Object.prototype.
 */
const HOSTNAME_TO_SITE_ID = new Map([
    ['ebay.com', 'ebay'],
    ['amazon.com', 'amazon'],
    ['walmart.com', 'walmart'],
    ['target.com', 'target'],
    ['bestbuy.com', 'bestbuy'],
    ['etsy.com', 'etsy'],
    ['aliexpress.com', 'aliexpress'],
    ['newegg.com', 'newegg'],
    ['google.com', 'google'],
    ['bing.com', 'bing'],
    ['html.duckduckgo.com', 'duckduckgo'],
    ['duckduckgo.com', 'duckduckgo'],
    ['reddit.com', 'reddit'],
    ['youtube.com', 'youtube'],
    ['x.com', 'twitter'],
    ['twitter.com', 'twitter'],
    ['linkedin.com', 'linkedin'],
    ['github.com', 'github'],
    ['stackoverflow.com', 'stackoverflow'],
    ['npmjs.com', 'npm'],
    ['pypi.org', 'pypi'],
    ['zillow.com', 'zillow'],
    ['realtor.com', 'realtor'],
    ['indeed.com', 'indeed'],
    ['glassdoor.com', 'glassdoor'],
    ['yelp.com', 'yelp'],
    ['doordash.com', 'doordash'],
    ['ubereats.com', 'ubereats'],
]);
/**
 * Find which site ID a given URL belongs to (reverse lookup).
 *
 * The URL's hostname, minus a leading "www.", must match a known hostname
 * exactly; other subdomains are not matched.
 *
 * @param url Absolute URL to inspect
 * @returns The canonical site ID, or null if the URL cannot be parsed or its
 *          hostname is not in the table.
 */
export function findSiteByUrl(url) {
    let hostname;
    try {
        hostname = new URL(url).hostname;
    }
    catch {
        // Not a parseable absolute URL.
        return null;
    }
    return HOSTNAME_TO_SITE_ID.get(hostname.replace(/^www\./, '')) ?? null;
}
253
+ //# sourceMappingURL=site-search.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"site-search.js","sourceRoot":"","sources":["../../src/core/site-search.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAcH;;;GAGG;AACH,MAAM,CAAC,MAAM,cAAc,GAAiC;IAC1D,6EAA6E;IAC7E,IAAI,EAAE;QACJ,IAAI,EAAE,MAAM;QACZ,QAAQ,EAAE,UAAU;QACpB,SAAS,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,wCAAwC,kBAAkB,CAAC,CAAC,CAAC,EAAE;KAClF;IACD,MAAM,EAAE;QACN,IAAI,EAAE,QAAQ;QACd,QAAQ,EAAE,UAAU;QACpB,SAAS,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,8BAA8B,kBAAkB,CAAC,CAAC,CAAC,EAAE;KACxE;IACD,OAAO,EAAE;QACP,IAAI,EAAE,SAAS;QACf,QAAQ,EAAE,UAAU;QACpB,SAAS,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,oCAAoC,kBAAkB,CAAC,CAAC,CAAC,EAAE;KAC9E;IACD,MAAM,EAAE;QACN,IAAI,EAAE,QAAQ;QACd,QAAQ,EAAE,UAAU;QACpB,SAAS,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,uCAAuC,kBAAkB,CAAC,CAAC,CAAC,EAAE;KACjF;IACD,OAAO,EAAE;QACP,IAAI,EAAE,UAAU;QAChB,QAAQ,EAAE,UAAU;QACpB,SAAS,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,kDAAkD,kBAAkB,CAAC,CAAC,CAAC,EAAE;KAC5F;IACD,IAAI,EAAE;QACJ,IAAI,EAAE,MAAM;QACZ,QAAQ,EAAE,UAAU;QACpB,SAAS,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,iCAAiC,kBAAkB,CAAC,CAAC,CAAC,EAAE;KAC3E;IACD,UAAU,EAAE;QACV,IAAI,EAAE,YAAY;QAClB,QAAQ,EAAE,UAAU;QACpB,SAAS,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,mDAAmD,kBAAkB,CAAC,CAAC,CAAC,EAAE;KAC7F;IACD,MAAM,EAAE;QACN,IAAI,EAAE,QAAQ;QACd,QAAQ,EAAE,UAAU;QACpB,SAAS,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,iCAAiC,kBAAkB,CAAC,CAAC,CAAC,EAAE;KAC3E;IAED,6EAA6E;IAC7E,MAAM,EAAE;QACN,IAAI,EAAE,QAAQ;QACd,QAAQ,EAAE,SAAS;QACnB,SAAS,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,mCAAmC,kBAAkB,CAAC,CAAC,CAAC,EAAE;KAC7E;IACD,IAAI,EAAE;QACJ,IAAI,EAAE,MAAM;QACZ,QAAQ,EAAE,SAAS;QACnB,SAAS,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,iCAAiC,kBAAkB,CAAC,CAAC,CAAC,EAAE;KAC3E;IACD,UAAU,EAAE;QACV,IAAI,EAAE,YAAY;QAClB,QAAQ,EAAE,SAAS;QACnB,SAAS,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,uCAAuC,kBAAkB,CAAC,CAAC,CAAC,EAAE;KACjF;IAED,6EAA6E;IAC7E,MAAM,EAAE;QACN,IAAI,EAAE,QAAQ;QACd,QAAQ,EAAE,QAAQ;QAClB,SAAS,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,oCAAoC,kBAAkB,CAAC,CAAC,CAAC,EAAE;KAC9E;IACD,OAAO,EAAE;QACP,IAAI,EAAE,SAAS;QACf,QAAQ,EAAE,QAAQ;QAClB,SAAS,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,gDAAgD,kBAAkB,CAAC,CAAC,CAAC,EAAE;KAC1F;IACD,OAA
O,EAAE;QACP,IAAI,EAAE,aAAa;QACnB,QAAQ,EAAE,QAAQ;QAClB,SAAS,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,0BAA0B,kBAAkB,CAAC,CAAC,CAAC,EAAE;KACpE;IACD,QAAQ,EAAE;QACR,IAAI,EAAE,UAAU;QAChB,QAAQ,EAAE,QAAQ;QAClB,SAAS,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,yDAAyD,kBAAkB,CAAC,CAAC,CAAC,EAAE;KACnG;IAED,6EAA6E;IAC7E,MAAM,EAAE;QACN,IAAI,EAAE,QAAQ;QACd,QAAQ,EAAE,MAAM;QAChB,SAAS,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,+BAA+B,kBAAkB,CAAC,CAAC,CAAC,EAAE;KACzE;IACD,aAAa,EAAE;QACb,IAAI,EAAE,gBAAgB;QACtB,QAAQ,EAAE,MAAM;QAChB,SAAS,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,sCAAsC,kBAAkB,CAAC,CAAC,CAAC,EAAE;KAChF;IACD,GAAG,EAAE;QACH,IAAI,EAAE,KAAK;QACX,QAAQ,EAAE,MAAM;QAChB,SAAS,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,kCAAkC,kBAAkB,CAAC,CAAC,CAAC,EAAE;KAC5E;IACD,IAAI,EAAE;QACJ,IAAI,EAAE,MAAM;QACZ,QAAQ,EAAE,MAAM;QAChB,SAAS,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,8BAA8B,kBAAkB,CAAC,CAAC,CAAC,EAAE;KACxE;IAED,6EAA6E;IAC7E,MAAM,EAAE;QACN,IAAI,EAAE,QAAQ;QACd,QAAQ,EAAE,aAAa;QACvB,SAAS,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,gCAAgC,kBAAkB,CAAC,CAAC,CAAC,MAAM;KAC9E;IACD,OAAO,EAAE;QACP,IAAI,EAAE,aAAa;QACnB,QAAQ,EAAE,aAAa;QACvB,SAAS,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,qDAAqD,kBAAkB,CAAC,CAAC,CAAC,EAAE;KAC/F;IAED,6EAA6E;IAC7E,MAAM,EAAE;QACN,IAAI,EAAE,QAAQ;QACd,QAAQ,EAAE,MAAM;QAChB,SAAS,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,iCAAiC,kBAAkB,CAAC,CAAC,CAAC,EAAE;KAC3E;IACD,SAAS,EAAE;QACT,IAAI,EAAE,WAAW;QACjB,QAAQ,EAAE,MAAM;QAChB,SAAS,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,qDAAqD,kBAAkB,CAAC,CAAC,CAAC,EAAE;KAC/F;IACD,eAAe,EAAE;QACf,IAAI,EAAE,eAAe;QACrB,QAAQ,EAAE,MAAM;QAChB,SAAS,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,kDAAkD,kBAAkB,CAAC,CAAC,CAAC,EAAE;KAC5F;IAED,6EAA6E;IAC7E,IAAI,EAAE;QACJ,IAAI,EAAE,MAAM;QACZ,QAAQ,EAAE,MAAM;QAChB,SAAS,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,yCAAyC,kBAAkB,CAAC,CAAC,CAAC,EAAE;KACnF;IACD,QAAQ,EAAE;QACR,IAAI,EAAE,UAAU;QAChB,QAAQ,EAAE,MAAM;QAChB,SAAS,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,yCAAyC,kBAAkB,CAAC,CAAC,CAAC,EAAE;KACnF;IACD,QAAQ,EAAE;QACR,IAAI,EAAE,WAAW;QACjB,QAAQ,EAAE,MAAM;QAChB,SAAS,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,qCAAqC,kBAAkB,CAAC,CAAC,CAAC,EAAE;KAC/E;CACF,CAAC;AAEF,6CAA6C;AAC7C,MAAM,YAAY,GAA2
B;IAC3C,CAAC,EAAE,SAAS;IACZ,UAAU,EAAE,SAAS;IACrB,aAAa,EAAE,YAAY;IAC3B,gBAAgB,EAAE,eAAe;IACjC,cAAc,EAAE,YAAY;CAC7B,CAAC;AAEF;;;GAGG;AACH,SAAS,aAAa,CAAC,IAAY;IACjC,MAAM,KAAK,GAAG,IAAI,CAAC,WAAW,EAAE,CAAC;IACjC,IAAI,KAAK,IAAI,cAAc;QAAE,OAAO,KAAK,CAAC;IAC1C,IAAI,KAAK,IAAI,YAAY;QAAE,OAAO,YAAY,CAAC,KAAK,CAAE,CAAC;IACvD,OAAO,IAAI,CAAC;AACd,CAAC;AAED;;;;;;GAMG;AACH,MAAM,UAAU,kBAAkB,CAAC,IAAY,EAAE,KAAa;IAC5D,MAAM,SAAS,GAAG,aAAa,CAAC,IAAI,CAAC,CAAC;IACtC,IAAI,CAAC,SAAS,EAAE,CAAC;QACf,MAAM,SAAS,GAAG,MAAM,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACzD,MAAM,IAAI,KAAK,CACb,kBAAkB,IAAI,uBAAuB,SAAS,IAAI;YAC1D,kDAAkD,CACnD,CAAC;IACJ,CAAC;IAED,MAAM,QAAQ,GAAG,cAAc,CAAC,SAAS,CAAE,CAAC;IAC5C,OAAO;QACL,GAAG,EAAE,QAAQ,CAAC,SAAS,CAAC,KAAK,CAAC;QAC9B,IAAI,EAAE,SAAS;QACf,KAAK;KACN,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,SAAS;IACvB,OAAO,MAAM,CAAC,OAAO,CAAC,cAAc,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,QAAQ,CAAC,EAAE,EAAE,CAAC,CAAC;QAC7D,EAAE;QACF,IAAI,EAAE,QAAQ,CAAC,IAAI;QACnB,QAAQ,EAAE,QAAQ,CAAC,QAAQ;KAC5B,CAAC,CAAC,CAAC;AACN,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,aAAa,CAAC,GAAW;IACvC,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;QAC5B,MAAM,QAAQ,GAAG,MAAM,CAAC,QAAQ,CAAC,OAAO,CAAC,QAAQ,EAAE,EAAE,CAAC,CAAC;QAEvD,+BAA+B;QAC/B,MAAM,WAAW,GAA2B;YAC1C,UAAU,EAAE,MAAM;YAClB,YAAY,EAAE,QAAQ;YACtB,aAAa,EAAE,SAAS;YACxB,YAAY,EAAE,QAAQ;YACtB,aAAa,EAAE,SAAS;YACxB,UAAU,EAAE,MAAM;YAClB,gBAAgB,EAAE,YAAY;YAC9B,YAAY,EAAE,QAAQ;YACtB,YAAY,EAAE,QAAQ;YACtB,UAAU,EAAE,MAAM;YAClB,qBAAqB,EAAE,YAAY;YACnC,gBAAgB,EAAE,YAAY;YAC9B,YAAY,EAAE,QAAQ;YACtB,aAAa,EAAE,SAAS;YACxB,OAAO,EAAE,SAAS;YAClB,aAAa,EAAE,SAAS;YACxB,cAAc,EAAE,UAAU;YAC1B,YAAY,EAAE,QAAQ;YACtB,mBAAmB,EAAE,eAAe;YACpC,WAAW,EAAE,KAAK;YAClB,UAAU,EAAE,MAAM;YAClB,YAAY,EAAE,QAAQ;YACtB,aAAa,EAAE,SAAS;YACxB,YAAY,EAAE,QAAQ;YACtB,eAAe,EAAE,WAAW;YAC5B,UAAU,EAAE,MAAM;YAClB,cAAc,EAAE,UAAU;YAC1B,cAAc,EAAE,UAAU;SAC3B,CAAC;QAEF,OAAO,WAAW,CAAC,QAAQ,CAAC,IAAI,IAAI,CAAC;IACvC,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC"}
@@ -34,6 +34,14 @@ export interface StrategyOptions {
34
34
  country?: string;
35
35
  languages?: string[];
36
36
  };
37
+ /**
38
+ * Path to a persistent Chrome user-data-dir.
39
+ * When set, bypasses the shared browser pool so cookies/sessions survive
40
+ * between fetch calls in the same process.
41
+ */
42
+ profileDir?: string;
43
+ /** Launch browser in headed (visible) mode — useful for debugging and profile setup. */
44
+ headed?: boolean;
37
45
  }
38
46
  /**
39
47
  * Smart fetch with automatic escalation.
@@ -1 +1 @@
1
- {"version":3,"file":"strategies.d.ts","sourceRoot":"","sources":["../../src/core/strategies.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAMH,OAAO,EAEL,KAAK,cAAc,EAEpB,MAAM,qBAAqB,CAAC;AAG7B,YAAY,EAAE,cAAc,EAAE,MAAM,qBAAqB,CAAC;AAoE1D,MAAM,WAAW,eAAe;IAC9B,YAAY,CAAC,EAAE,OAAO,CAAC;IACvB,OAAO,CAAC,EAAE,OAAO,CAAC;IAClB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,UAAU,CAAC,EAAE,OAAO,CAAC;IACrB,kBAAkB,CAAC,EAAE,OAAO,CAAC;IAC7B,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACjC,OAAO,CAAC,EAAE,MAAM,EAAE,CAAC;IACnB,OAAO,CAAC,EAAE,KAAK,CAAC;QACd,IAAI,EACA,MAAM,GACN,OAAO,GACP,QAAQ,GACR,MAAM,GACN,MAAM,GACN,QAAQ,GACR,OAAO,GACP,OAAO,GACP,iBAAiB,GACjB,YAAY,CAAC;QACjB,QAAQ,CAAC,EAAE,MAAM,CAAC;QAClB,KAAK,CAAC,EAAE,MAAM,CAAC;QACf,GAAG,CAAC,EAAE,MAAM,CAAC;QACb,EAAE,CAAC,EAAE,MAAM,CAAC;QACZ,EAAE,CAAC,EAAE,KAAK,GAAG,QAAQ,GAAG,MAAM,CAAC;QAC/B,OAAO,CAAC,EAAE,MAAM,CAAC;KAClB,CAAC,CAAC;IACH,YAAY,CAAC,EAAE,OAAO,CAAC;IACvB,OAAO,CAAC,EAAE,OAAO,CAAC;IAClB,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,QAAQ,CAAC,EAAE;QACT,OAAO,CAAC,EAAE,MAAM,CAAC;QACjB,SAAS,CAAC,EAAE,MAAM,EAAE,CAAC;KACtB,CAAC;CACH;AAuGD;;;;;GAKG;AACH,wBAAsB,UAAU,CAC9B,GAAG,EAAE,MAAM,EACX,OAAO,GAAE,eAAoB,GAC5B,OAAO,CAAC,cAAc,CAAC,CA6QzB;AAID;;GAEG;AACH,OAAO,EAAE,kBAAkB,IAAI,gBAAgB,EAAE,MAAM,qBAAqB,CAAC"}
1
+ {"version":3,"file":"strategies.d.ts","sourceRoot":"","sources":["../../src/core/strategies.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAOH,OAAO,EAEL,KAAK,cAAc,EAEpB,MAAM,qBAAqB,CAAC;AAG7B,YAAY,EAAE,cAAc,EAAE,MAAM,qBAAqB,CAAC;AA2H1D,MAAM,WAAW,eAAe;IAC9B,YAAY,CAAC,EAAE,OAAO,CAAC;IACvB,OAAO,CAAC,EAAE,OAAO,CAAC;IAClB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,UAAU,CAAC,EAAE,OAAO,CAAC;IACrB,kBAAkB,CAAC,EAAE,OAAO,CAAC;IAC7B,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACjC,OAAO,CAAC,EAAE,MAAM,EAAE,CAAC;IACnB,OAAO,CAAC,EAAE,KAAK,CAAC;QACd,IAAI,EACA,MAAM,GACN,OAAO,GACP,QAAQ,GACR,MAAM,GACN,MAAM,GACN,QAAQ,GACR,OAAO,GACP,OAAO,GACP,iBAAiB,GACjB,YAAY,CAAC;QACjB,QAAQ,CAAC,EAAE,MAAM,CAAC;QAClB,KAAK,CAAC,EAAE,MAAM,CAAC;QACf,GAAG,CAAC,EAAE,MAAM,CAAC;QACb,EAAE,CAAC,EAAE,MAAM,CAAC;QACZ,EAAE,CAAC,EAAE,KAAK,GAAG,QAAQ,GAAG,MAAM,CAAC;QAC/B,OAAO,CAAC,EAAE,MAAM,CAAC;KAClB,CAAC,CAAC;IACH,YAAY,CAAC,EAAE,OAAO,CAAC;IACvB,OAAO,CAAC,EAAE,OAAO,CAAC;IAClB,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,QAAQ,CAAC,EAAE;QACT,OAAO,CAAC,EAAE,MAAM,CAAC;QACjB,SAAS,CAAC,EAAE,MAAM,EAAE,CAAC;KACtB,CAAC;IACF;;;;OAIG;IACH,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,wFAAwF;IACxF,MAAM,CAAC,EAAE,OAAO,CAAC;CAClB;AAiHD;;;;;GAKG;AACH,wBAAsB,UAAU,CAC9B,GAAG,EAAE,MAAM,EACX,OAAO,GAAE,eAAoB,GAC5B,OAAO,CAAC,cAAc,CAAC,CAiWzB;AAID;;GAEG;AACH,OAAO,EAAE,kBAAkB,IAAI,gBAAgB,EAAE,MAAM,qBAAqB,CAAC"}
@@ -10,30 +10,70 @@ import { simpleFetch, browserFetch, retryFetch } from './fetcher.js';
10
10
  import { getCached, setCached as setBasicCache } from './cache.js';
11
11
  import { resolveAndCache } from './dns-cache.js';
12
12
  import { BlockedError, NetworkError } from '../types.js';
13
+ import { detectChallenge } from './challenge-detection.js';
13
14
  import { getStrategyHooks, } from './strategy-hooks.js';
14
15
  /* ---------- hardcoded domain rules -------------------------------------- */
15
16
  function shouldForceBrowser(url) {
16
17
  try {
17
18
  const hostname = new URL(url).hostname.toLowerCase();
18
- // Reddit often returns an HTML shell via simple fetch
19
- if (hostname === 'reddit.com' || hostname.endsWith('.reddit.com')) {
20
- return { mode: 'browser' };
21
- }
22
- // npmjs blocks simple fetch with 403 frequently
23
- if (hostname === 'npmjs.com' ||
24
- hostname === 'www.npmjs.com' ||
25
- hostname.endsWith('.npmjs.com')) {
26
- return { mode: 'browser' };
27
- }
28
- // These are known to aggressively block automation
29
- if (hostname === 'glassdoor.com' || hostname.endsWith('.glassdoor.com')) {
30
- return { mode: 'stealth' };
31
- }
32
- if (hostname === 'bloomberg.com' || hostname.endsWith('.bloomberg.com')) {
33
- return { mode: 'stealth' };
19
+ // Sites that return HTML shells / need JS rendering (browser mode)
20
+ const browserDomains = [
21
+ 'reddit.com', // HTML shell via simple fetch
22
+ 'npmjs.com', // 403 on simple fetch
23
+ 'x.com', // SPA, login wall
24
+ 'twitter.com', // SPA, login wall
25
+ 'instagram.com', // SPA, login wall
26
+ 'facebook.com', // SPA, heavy JS
27
+ 'tiktok.com', // SPA, JS-rendered
28
+ 'pinterest.com', // SPA, JS-rendered
29
+ 'airbnb.com', // heavy SPA
30
+ 'medium.com', // JS-rendered, sometimes login wall
31
+ 'substack.com', // JS-rendered
32
+ 'notion.so', // SPA
33
+ 'figma.com', // SPA
34
+ 'canva.com', // SPA
35
+ 'vercel.app', // Could be any SPA
36
+ ];
37
+ for (const domain of browserDomains) {
38
+ if (hostname === domain || hostname.endsWith(`.${domain}`)) {
39
+ return { mode: 'browser' };
40
+ }
34
41
  }
35
- if (hostname === 'indeed.com' || hostname.endsWith('.indeed.com')) {
36
- return { mode: 'stealth' };
42
+ // These are known to aggressively block automation — stealth mode required
43
+ const stealthDomains = [
44
+ 'glassdoor.com',
45
+ 'bloomberg.com',
46
+ 'indeed.com',
47
+ 'amazon.com', // captcha wall on simple/browser fetch
48
+ 'zillow.com', // aggressive bot detection
49
+ 'ticketmaster.com', // Distil Networks / PerimeterX
50
+ 'stubhub.com', // PerimeterX / CAPTCHA
51
+ 'walmart.com', // Akamai Bot Manager
52
+ 'target.com', // Akamai Bot Manager
53
+ 'bestbuy.com', // Akamai Bot Manager
54
+ 'homedepot.com', // Akamai Bot Manager
55
+ 'lowes.com', // Akamai Bot Manager
56
+ 'costco.com', // Akamai Bot Manager
57
+ 'nike.com', // Akamai / Shape Security
58
+ 'footlocker.com', // PerimeterX / DataDome
59
+ 'realtor.com', // aggressive bot detection
60
+ 'redfin.com', // aggressive bot detection
61
+ 'cloudflare.com', // Cloudflare challenge pages
62
+ 'ebay.com', // challenge page on simple fetch
63
+ 'linkedin.com', // aggressive bot detection + login walls
64
+ 'craigslist.org', // occasionally blocks automated access
65
+ 'etsy.com', // Akamai protection
66
+ 'wayfair.com', // Akamai protection
67
+ 'newegg.com', // bot detection
68
+ 'zappos.com', // Amazon subsidiary, same protection
69
+ 'chewy.com', // Amazon subsidiary
70
+ 'aliexpress.com', // anti-bot
71
+ 'wish.com', // anti-bot
72
+ ];
73
+ for (const domain of stealthDomains) {
74
+ if (hostname === domain || hostname.endsWith(`.${domain}`)) {
75
+ return { mode: 'stealth' };
76
+ }
37
77
  }
38
78
  }
39
79
  catch {
@@ -57,6 +97,25 @@ function looksLikeShellPage(result) {
57
97
  const text = result.html.replace(/<[^>]*>/g, '').trim();
58
98
  return text.length < 500 && result.html.length > 1000;
59
99
  }
/**
 * Detect pages that returned HTML but have very little actual text content.
 * This catches JS-rendered SPAs that return a shell page with a big HTML payload
 * (scripts, styles, framework boilerplate) but minimal visible text.
 *
 * @param result Fetch result; `contentType` and `html` are read.
 * @returns true when the content type mentions "html", the markup is longer
 *          than 1500 characters and fewer than 200 characters of visible text
 *          remain after stripping. Returns false otherwise, including when
 *          `html` is missing or not a string.
 */
function shouldEscalateForLowContent(result) {
    const contentType = (result?.contentType || '').toLowerCase();
    if (!contentType.includes('html')) {
        return false;
    }
    const html = typeof result.html === 'string' ? result.html : '';
    if (html.length <= 1500) {
        return false;
    }
    // Strip script/style/noscript blocks together with their contents. End tags
    // may carry whitespace before ">" (e.g. "</script >"), so allow for it;
    // otherwise the script body would be counted as visible text.
    // Comments are removed afterwards as whole units, because a comment that
    // contains ">" would only be partly removed by the generic tag strip.
    const withoutHidden = html
        .replace(/<script\b[\s\S]*?<\/script\s*>/gi, '')
        .replace(/<style\b[\s\S]*?<\/style\s*>/gi, '')
        .replace(/<noscript\b[\s\S]*?<\/noscript\s*>/gi, '')
        .replace(/<!--[\s\S]*?-->/g, '');
    const visibleText = withoutHidden.replace(/<[^>]*>/g, '').replace(/\s+/g, ' ').trim();
    return visibleText.length < 200;
}
60
119
  function prefetchDns(url) {
61
120
  try {
62
121
  const hostname = new URL(url).hostname;
@@ -67,7 +126,7 @@ function prefetchDns(url) {
67
126
  }
68
127
  }
69
128
  async function fetchWithBrowserStrategy(url, options) {
70
- const { userAgent, waitMs, timeoutMs, screenshot, screenshotFullPage, headers, cookies, actions, keepPageOpen, effectiveStealth, signal, } = options;
129
+ const { userAgent, waitMs, timeoutMs, screenshot, screenshotFullPage, headers, cookies, actions, keepPageOpen, effectiveStealth, signal, profileDir, headed, } = options;
71
130
  try {
72
131
  const result = await browserFetch(url, {
73
132
  userAgent,
@@ -81,6 +140,8 @@ async function fetchWithBrowserStrategy(url, options) {
81
140
  actions,
82
141
  keepPageOpen,
83
142
  signal,
143
+ profileDir,
144
+ headed,
84
145
  });
85
146
  return {
86
147
  ...result,
@@ -104,6 +165,8 @@ async function fetchWithBrowserStrategy(url, options) {
104
165
  actions,
105
166
  keepPageOpen,
106
167
  signal,
168
+ profileDir,
169
+ headed,
107
170
  });
108
171
  return { ...result, method: 'stealth' };
109
172
  }
@@ -122,6 +185,8 @@ async function fetchWithBrowserStrategy(url, options) {
122
185
  actions,
123
186
  keepPageOpen,
124
187
  signal,
188
+ profileDir,
189
+ headed,
125
190
  });
126
191
  return { ...result, method: effectiveStealth ? 'stealth' : 'browser' };
127
192
  }
@@ -136,7 +201,7 @@ async function fetchWithBrowserStrategy(url, options) {
136
201
  * With premium hooks: SWR cache → domain intel → parallel race → escalation.
137
202
  */
138
203
  export async function smartFetch(url, options = {}) {
139
- const { forceBrowser = false, stealth = false, waitMs = 0, userAgent, timeoutMs = 30000, screenshot = false, screenshotFullPage = false, headers, cookies, actions, keepPageOpen = false, noCache = false, raceTimeoutMs = 2000, } = options;
204
+ const { forceBrowser = false, stealth = false, waitMs = 0, userAgent, timeoutMs = 30000, screenshot = false, screenshotFullPage = false, headers, cookies, actions, keepPageOpen = false, noCache = false, raceTimeoutMs = 2000, profileDir, headed = false, } = options;
140
205
  const hooks = getStrategyHooks();
141
206
  const fetchStartMs = Date.now();
142
207
  const recordMethod = (method) => {
@@ -198,6 +263,10 @@ export async function smartFetch(url, options = {}) {
198
263
  }
199
264
  /* ---- browser-level options ------------------------------------------- */
200
265
  let shouldUseBrowser = effectiveForceBrowser || screenshot || effectiveStealth;
266
+ // A profileDir always forces browser mode (profile sessions need a real browser)
267
+ if (profileDir) {
268
+ effectiveForceBrowser = true;
269
+ }
201
270
  const browserOptions = {
202
271
  userAgent,
203
272
  waitMs,
@@ -209,6 +278,8 @@ export async function smartFetch(url, options = {}) {
209
278
  actions,
210
279
  keepPageOpen,
211
280
  effectiveStealth,
281
+ profileDir,
282
+ headed,
212
283
  };
213
284
  /* ---- Strategy: simple fetch (with optional race) --------------------- */
214
285
  if (!shouldUseBrowser) {
@@ -236,15 +307,29 @@ export async function smartFetch(url, options = {}) {
236
307
  if (raceTimer)
237
308
  clearTimeout(raceTimer);
238
309
  if (simpleOrTimeout.type === 'simple-success') {
239
- const strategyResult = {
240
- ...simpleOrTimeout.result,
241
- method: 'simple',
242
- };
243
- if (canUseCache) {
244
- hooks.setCache?.(url, strategyResult) ?? setBasicCache(url, strategyResult);
310
+ // Check if the content is suspiciously thin — escalate to browser if so
311
+ if (shouldEscalateForLowContent(simpleOrTimeout.result)) {
312
+ shouldUseBrowser = true;
313
+ }
314
+ else {
315
+ // Check whether the response is a bot-challenge page (e.g. Cloudflare, PerimeterX)
316
+ const challengeCheck = detectChallenge(simpleOrTimeout.result.html, simpleOrTimeout.result.statusCode);
317
+ if (challengeCheck.isChallenge && challengeCheck.confidence >= 0.7) {
318
+ // Escalate — the browser/stealth path will handle it below
319
+ shouldUseBrowser = true;
320
+ }
321
+ else {
322
+ const strategyResult = {
323
+ ...simpleOrTimeout.result,
324
+ method: 'simple',
325
+ };
326
+ if (canUseCache) {
327
+ hooks.setCache?.(url, strategyResult) ?? setBasicCache(url, strategyResult);
328
+ }
329
+ recordMethod('simple');
330
+ return strategyResult;
331
+ }
245
332
  }
246
- recordMethod('simple');
247
- return strategyResult;
248
333
  }
249
334
  if (simpleOrTimeout.type === 'simple-error') {
250
335
  if (!shouldEscalateSimpleError(simpleOrTimeout.error)) {
@@ -317,30 +402,85 @@ export async function smartFetch(url, options = {}) {
317
402
  .then((result) => ({ type: 'simple-success', result }))
318
403
  .catch((error) => ({ type: 'simple-error', error }));
319
404
  if (simpleResult.type === 'simple-success') {
320
- const strategyResult = {
321
- ...simpleResult.result,
322
- method: 'simple',
323
- };
324
- if (canUseCache) {
325
- hooks.setCache?.(url, strategyResult) ?? setBasicCache(url, strategyResult);
405
+ // Check if the content is suspiciously thin — escalate to browser if so
406
+ if (shouldEscalateForLowContent(simpleResult.result)) {
407
+ shouldUseBrowser = true;
408
+ }
409
+ else {
410
+ // Check whether the response is a bot-challenge page
411
+ const challengeCheck = detectChallenge(simpleResult.result.html, simpleResult.result.statusCode);
412
+ if (challengeCheck.isChallenge && challengeCheck.confidence >= 0.7) {
413
+ shouldUseBrowser = true;
414
+ }
415
+ else {
416
+ const strategyResult = {
417
+ ...simpleResult.result,
418
+ method: 'simple',
419
+ };
420
+ if (canUseCache) {
421
+ hooks.setCache?.(url, strategyResult) ?? setBasicCache(url, strategyResult);
422
+ }
423
+ recordMethod('simple');
424
+ return strategyResult;
425
+ }
326
426
  }
327
- recordMethod('simple');
328
- return strategyResult;
329
427
  }
330
- if (!shouldEscalateSimpleError(simpleResult.error)) {
331
- throw simpleResult.error;
428
+ else {
429
+ if (!shouldEscalateSimpleError(simpleResult.error)) {
430
+ throw simpleResult.error;
431
+ }
432
+ shouldUseBrowser = true;
332
433
  }
333
- shouldUseBrowser = true;
334
434
  }
335
435
  }
336
436
  }
337
- /* ---- browser / stealth fallback -------------------------------------- */
338
- const browserResult = await fetchWithBrowserStrategy(url, browserOptions);
339
- if (canUseCache) {
340
- hooks.setCache?.(url, browserResult) ?? setBasicCache(url, browserResult);
437
+ /* ---- browser / stealth fallback with challenge-detection cascade ----- */
438
+ // Attempt 1: browser (or stealth, if already forced)
439
+ let finalResult = await fetchWithBrowserStrategy(url, browserOptions);
440
+ // Check if the browser result is itself a bot-challenge page
441
+ const browserChallengeCheck = detectChallenge(finalResult.html, finalResult.statusCode);
442
+ if (browserChallengeCheck.isChallenge && browserChallengeCheck.confidence >= 0.7) {
443
+ if (!browserOptions.effectiveStealth) {
444
+ // Attempt 2: escalate to stealth
445
+ const stealthOptions = {
446
+ ...browserOptions,
447
+ effectiveStealth: true,
448
+ };
449
+ finalResult = await fetchWithBrowserStrategy(url, stealthOptions);
450
+ const stealthChallengeCheck = detectChallenge(finalResult.html, finalResult.statusCode);
451
+ if (stealthChallengeCheck.isChallenge && stealthChallengeCheck.confidence >= 0.7) {
452
+ // Attempt 3: stealth + 5s extra wait
453
+ const stealthExtraOptions = {
454
+ ...stealthOptions,
455
+ waitMs: stealthOptions.waitMs + 5000,
456
+ };
457
+ finalResult = await fetchWithBrowserStrategy(url, stealthExtraOptions);
458
+ const finalChallengeCheck = detectChallenge(finalResult.html, finalResult.statusCode);
459
+ if (finalChallengeCheck.isChallenge && finalChallengeCheck.confidence >= 0.7) {
460
+ // Give up — return with warning flag
461
+ finalResult = { ...finalResult, challengeDetected: true };
462
+ }
463
+ }
464
+ }
465
+ else {
466
+ // Already in stealth mode; retry with 5s extra wait
467
+ const stealthExtraOptions = {
468
+ ...browserOptions,
469
+ waitMs: browserOptions.waitMs + 5000,
470
+ };
471
+ finalResult = await fetchWithBrowserStrategy(url, stealthExtraOptions);
472
+ const finalChallengeCheck = detectChallenge(finalResult.html, finalResult.statusCode);
473
+ if (finalChallengeCheck.isChallenge && finalChallengeCheck.confidence >= 0.7) {
474
+ // Give up — return with warning flag
475
+ finalResult = { ...finalResult, challengeDetected: true };
476
+ }
477
+ }
478
+ }
479
+ if (canUseCache && !finalResult.challengeDetected) {
480
+ hooks.setCache?.(url, finalResult) ?? setBasicCache(url, finalResult);
341
481
  }
342
- recordMethod(browserResult.method);
343
- return browserResult;
482
+ recordMethod(finalResult.method);
483
+ return finalResult;
344
484
  }
345
485
  /* ---------- legacy export for tests ------------------------------------- */
346
486
  /**