webpeel 0.21.65 → 0.21.67

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -28,8 +28,12 @@ export type DomainExtractor = (html: string, url: string) => Promise<DomainExtra
28
28
  * Returns the domain extractor for a URL, or null if none matches.
29
29
  */
30
30
  export declare function getDomainExtractor(url: string): DomainExtractor | null;
31
+ /** Clear the extractor response cache (used in tests). */
32
+ export declare function clearExtractorCache(): void;
31
33
  /**
32
34
  * Convenience: run the extractor for the URL (if one exists).
33
- * Returns null when no extractor matches or extraction fails.
35
+ * Wraps _extractDomainDataImpl with a 5-minute TTL cache (FIFO-bounded) so that
36
+ * rate-limited API responses fall back to cached results instead of
37
+ * garbage browser rendering.
34
38
  */
35
39
  export declare function extractDomainData(html: string, url: string): Promise<DomainExtractResult | null>;
@@ -131,11 +131,40 @@ export function getDomainExtractor(url) {
131
131
  }
132
132
  return null;
133
133
  }
134
+ // ── Extractor Response Cache ──────────────────────────────────────────────
135
+ // Caches successful API responses for 5 minutes to survive rate limits.
136
+ // If the API rate-limits on the next request, we serve from cache instead
137
+ // of falling back to garbage browser rendering (cookie walls, "Loading…").
138
+ // Key: normalized URL (no query/hash), Value: { result, timestamp }
139
+ const EXTRACTOR_CACHE = new Map();
140
+ /** Clear the extractor response cache (used in tests). */
141
+ export function clearExtractorCache() { EXTRACTOR_CACHE.clear(); }
142
+ const CACHE_TTL_MS = 5 * 60 * 1000; // 5 minutes
143
+ function getCachedExtractorResult(url) {
144
+ const key = url.replace(/[?#].*$/, '').toLowerCase(); // strip query+hash
145
+ const entry = EXTRACTOR_CACHE.get(key);
146
+ if (entry && Date.now() - entry.ts < CACHE_TTL_MS) {
147
+ return entry.result;
148
+ }
149
+ EXTRACTOR_CACHE.delete(key); // expired — evict
150
+ return null;
151
+ }
152
+ function setCachedExtractorResult(url, result) {
153
+ const key = url.replace(/[?#].*$/, '').toLowerCase();
154
+ EXTRACTOR_CACHE.set(key, { result, ts: Date.now() });
155
+ // Keep cache size bounded at 500 entries (evict oldest)
156
+ if (EXTRACTOR_CACHE.size > 500) {
157
+ const oldest = EXTRACTOR_CACHE.keys().next().value;
158
+ if (oldest)
159
+ EXTRACTOR_CACHE.delete(oldest);
160
+ }
161
+ }
162
+ // ─────────────────────────────────────────────────────────────────────────────
134
163
  /**
135
- * Convenience: run the extractor for the URL (if one exists).
164
+ * Internal implementation: run the extractor for the URL (if one exists).
136
165
  * Returns null when no extractor matches or extraction fails.
137
166
  */
138
- export async function extractDomainData(html, url) {
167
+ async function _extractDomainDataImpl(html, url) {
139
168
  const extractor = getDomainExtractor(url);
140
169
  if (!extractor)
141
170
  return null;
@@ -146,6 +175,32 @@ export async function extractDomainData(html, url) {
146
175
  return null;
147
176
  }
148
177
  }
178
+ /**
179
+ * Convenience: run the extractor for the URL (if one exists).
180
+ * Wraps _extractDomainDataImpl with a 5-minute TTL cache (FIFO-bounded) so that
181
+ * rate-limited API responses fall back to cached results instead of
182
+ * garbage browser rendering.
183
+ */
184
+ export async function extractDomainData(html, url) {
185
+ // 1. Check fresh cache first
186
+ const cached = getCachedExtractorResult(url);
187
+ if (cached)
188
+ return cached;
189
+ // 2. Try the real extractor
190
+ const result = await _extractDomainDataImpl(html, url);
191
+ if (result && result.cleanContent.length > 20) {
192
+ // 3. Cache the successful result
193
+ setCachedExtractorResult(url, result);
194
+ return result;
195
+ }
196
+ // 4. Extractor failed/returned garbage — re-check the cache.
197
+ // NOTE(review): getCachedExtractorResult evicts expired entries on read (the step-1 call already did), so a genuinely stale entry is never actually served here — confirm intent.
198
+ const stale = getCachedExtractorResult(url);
199
+ if (stale)
200
+ return stale;
201
+ // 5. Nothing cached — return the raw result (null, or too short to cache) so the pipeline can fall back to fetch
202
+ return result;
203
+ }
149
204
  // ---------------------------------------------------------------------------
150
205
  // Helpers
151
206
  // ---------------------------------------------------------------------------
@@ -184,6 +239,13 @@ async function fetchJson(url, customHeaders) {
184
239
  redirect: 'follow',
185
240
  });
186
241
  clearTimeout(timer);
242
+ // Surface 429 as a thrown error so callers can detect rate-limiting
243
+ // and the cache wrapper can serve stale results instead of garbage.
244
+ if (resp.status === 429) {
245
+ const err = new Error(`429 Too Many Requests: ${url}`);
246
+ err.statusCode = 429;
247
+ throw err;
248
+ }
187
249
  const text = await resp.text();
188
250
  const parsed = tryParseJson(text);
189
251
  if (parsed === null && text.length > 0) {
@@ -4246,14 +4308,9 @@ async function semanticScholarExtractor(_html, url) {
4246
4308
  const data = await fetchJson(apiUrl);
4247
4309
  if (!data)
4248
4310
  return null;
4249
- // Handle rate limiting — return helpful message instead of null
4311
+ // Rate limited — return null so pipeline falls back to browser rendering
4250
4312
  if (data.code === '429' || (data.message && String(data.message).includes('Too Many Requests'))) {
4251
- return {
4252
- domain,
4253
- type: 'paper',
4254
- structured: { paperId, rateLimited: true },
4255
- cleanContent: `# Semantic Scholar — Rate Limited\n\n⚠️ API rate limit reached. View paper directly: https://www.semanticscholar.org/paper/${paperId}`,
4256
- };
4313
+ return null;
4257
4314
  }
4258
4315
  if (!data.title)
4259
4316
  return null;
@@ -4321,23 +4378,11 @@ async function semanticScholarExtractor(_html, url) {
4321
4378
  const fields = 'title,authors,year,citationCount,url,openAccessPdf';
4322
4379
  const apiUrl = `https://api.semanticscholar.org/graph/v1/paper/search?query=${encodeURIComponent(query)}&limit=10&fields=${fields}`;
4323
4380
  const data = await fetchJson(apiUrl);
4324
- // Handle rate limiting gracefully — return a helpful message instead of null
4381
+ // Rate limited or no data — return null so pipeline falls back to browser rendering
4325
4382
  if (!data)
4326
4383
  return null;
4327
4384
  if (data.code === '429' || (data.message && String(data.message).includes('Too Many Requests'))) {
4328
- const cleanContent = [
4329
- `# 🔍 Semantic Scholar — "${query}"`,
4330
- '',
4331
- '⚠️ **Rate limited by Semantic Scholar API.** The free tier has strict limits.',
4332
- '',
4333
- `Try again in a few seconds, or search directly: https://www.semanticscholar.org/search?q=${encodeURIComponent(query)}`,
4334
- ].join('\n');
4335
- return {
4336
- domain,
4337
- type: 'search',
4338
- structured: { query, total: 0, papers: [], rateLimited: true },
4339
- cleanContent,
4340
- };
4385
+ return null;
4341
4386
  }
4342
4387
  if (!Array.isArray(data.data))
4343
4388
  return null;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "webpeel",
3
- "version": "0.21.65",
3
+ "version": "0.21.67",
4
4
  "description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
5
5
  "author": "Jake Liu",
6
6
  "license": "AGPL-3.0-only",