scrapex 1.0.0-alpha.1 → 1.0.0-beta.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. package/README.md +164 -5
  2. package/dist/enhancer-ByjRD-t5.mjs +769 -0
  3. package/dist/enhancer-ByjRD-t5.mjs.map +1 -0
  4. package/dist/enhancer-j0xqKDJm.cjs +847 -0
  5. package/dist/enhancer-j0xqKDJm.cjs.map +1 -0
  6. package/dist/index-CDgcRnig.d.cts +268 -0
  7. package/dist/index-CDgcRnig.d.cts.map +1 -0
  8. package/dist/index-piS5wtki.d.mts +268 -0
  9. package/dist/index-piS5wtki.d.mts.map +1 -0
  10. package/dist/index.cjs +1192 -37
  11. package/dist/index.cjs.map +1 -1
  12. package/dist/index.d.cts +318 -2
  13. package/dist/index.d.cts.map +1 -1
  14. package/dist/index.d.mts +318 -2
  15. package/dist/index.d.mts.map +1 -1
  16. package/dist/index.mjs +1164 -6
  17. package/dist/index.mjs.map +1 -1
  18. package/dist/llm/index.cjs +250 -232
  19. package/dist/llm/index.cjs.map +1 -1
  20. package/dist/llm/index.d.cts +132 -85
  21. package/dist/llm/index.d.cts.map +1 -1
  22. package/dist/llm/index.d.mts +132 -85
  23. package/dist/llm/index.d.mts.map +1 -1
  24. package/dist/llm/index.mjs +243 -236
  25. package/dist/llm/index.mjs.map +1 -1
  26. package/dist/parsers/index.cjs +10 -199
  27. package/dist/parsers/index.d.cts +2 -133
  28. package/dist/parsers/index.d.mts +2 -133
  29. package/dist/parsers/index.mjs +2 -191
  30. package/dist/parsers-Bneuws8x.cjs +569 -0
  31. package/dist/parsers-Bneuws8x.cjs.map +1 -0
  32. package/dist/parsers-CwkYnyWY.mjs +482 -0
  33. package/dist/parsers-CwkYnyWY.mjs.map +1 -0
  34. package/dist/types-CadAXrme.d.mts +674 -0
  35. package/dist/types-CadAXrme.d.mts.map +1 -0
  36. package/dist/types-DPEtPihB.d.cts +674 -0
  37. package/dist/types-DPEtPihB.d.cts.map +1 -0
  38. package/package.json +15 -16
  39. package/dist/enhancer-Q6CSc1gA.mjs +0 -220
  40. package/dist/enhancer-Q6CSc1gA.mjs.map +0 -1
  41. package/dist/enhancer-oM4BhYYS.cjs +0 -268
  42. package/dist/enhancer-oM4BhYYS.cjs.map +0 -1
  43. package/dist/parsers/index.cjs.map +0 -1
  44. package/dist/parsers/index.d.cts.map +0 -1
  45. package/dist/parsers/index.d.mts.map +0 -1
  46. package/dist/parsers/index.mjs.map +0 -1
  47. package/dist/types-CNQZVW36.d.mts +0 -150
  48. package/dist/types-CNQZVW36.d.mts.map +0 -1
  49. package/dist/types-D0HYR95H.d.cts +0 -150
  50. package/dist/types-D0HYR95H.d.cts.map +0 -1
@@ -0,0 +1,847 @@
1
+ const require_parsers = require('./parsers-Bneuws8x.cjs');
2
+ let node_dns = require("node:dns");
3
+ let node_net = require("node:net");
4
+ let zod = require("zod");
5
+
6
+ //#region src/core/errors.ts
7
/**
 * Error type for scraping failures.
 *
 * Carries a machine-readable `code` and, when one is known, the HTTP
 * `statusCode` that produced the failure. The original error (if any) is
 * preserved on the standard `cause` property.
 */
class ScrapeError extends Error {
  code;
  statusCode;

  /**
   * @param {string} message - Human-readable description.
   * @param {string} code - Structured error code (e.g. "FETCH_FAILED").
   * @param {number} [statusCode] - HTTP status associated with the failure.
   * @param {unknown} [cause] - Underlying error, forwarded to `Error`.
   */
  constructor(message, code, statusCode, cause) {
    super(message, { cause });
    this.name = "ScrapeError";
    this.code = code;
    this.statusCode = statusCode;
    // captureStackTrace is V8-specific; hide this constructor from the trace when available.
    if (Error.captureStackTrace) Error.captureStackTrace(this, ScrapeError);
  }

  /**
   * Wrap an arbitrary thrown value in a ScrapeError.
   *
   * @param {unknown} error - The value that was thrown.
   * @param {string} [code="FETCH_FAILED"] - Code to use when wrapping.
   * @returns {ScrapeError} `error` itself when it already is a ScrapeError.
   */
  static from(error, code = "FETCH_FAILED") {
    if (error instanceof ScrapeError) return error;
    const isNativeError = error instanceof Error;
    return isNativeError
      ? new ScrapeError(error.message, code, undefined, error)
      : new ScrapeError(String(error), code);
  }

  /**
   * @returns {boolean} True for the codes treated as transient
   *   ("FETCH_FAILED" and "TIMEOUT").
   */
  isRetryable() {
    return ["FETCH_FAILED", "TIMEOUT"].includes(this.code);
  }

  /**
   * @returns {{name: string, message: string, code: string, statusCode: (number|undefined), stack: (string|undefined)}}
   *   Plain-object snapshot suitable for serialization.
   */
  toJSON() {
    const { name, message, code, statusCode, stack } = this;
    return { name, message, code, statusCode, stack };
  }
}
47
+
48
+ //#endregion
49
+ //#region src/common/errors.ts
50
+ /**
51
+ * Error normalization utilities for HTTP providers.
52
+ * Maps HTTP status codes to consistent ScrapeError codes.
53
+ */
54
/**
 * Translate an HTTP status code into a ScrapeError code.
 *
 * @param {number} status - HTTP response status.
 * @returns {string} "BLOCKED" for 401/403/429, "NOT_FOUND" for 404,
 *   "TIMEOUT" for 408, "LLM_ERROR" for any status >= 500, otherwise
 *   "FETCH_FAILED".
 */
function getErrorCodeFromStatus(status) {
  switch (status) {
    case 401:
    case 403:
    case 429:
      return "BLOCKED";
    case 404:
      return "NOT_FOUND";
    case 408:
      return "TIMEOUT";
    default:
      return status >= 500 ? "LLM_ERROR" : "FETCH_FAILED";
  }
}
65
/**
 * Pull a human-readable error message out of an API error response.
 *
 * The body is read as text and, when it parses as JSON, the first of these
 * is used: `error.message` / `error.msg` (or the stringified `error` object),
 * a string `error`, a string `message`, a string `detail`. Anything else
 * yields the raw body text; an empty or unreadable body yields
 * "HTTP <status> <statusText>".
 *
 * @param {{text: () => Promise<string>, status: number, statusText: string}} response
 * @returns {Promise<string>} Never rejects.
 */
async function parseErrorBody(response) {
  const statusLine = () => `HTTP ${response.status} ${response.statusText}`;

  let text;
  try {
    text = await response.text();
  } catch {
    // Body could not be read at all.
    return statusLine();
  }

  try {
    const payload = JSON.parse(text);
    const nested = payload.error;
    if (nested !== null && typeof nested === "object") {
      return String(nested.message ?? nested.msg ?? JSON.stringify(nested));
    }
    if (typeof nested === "string") return nested;
    for (const field of ["message", "detail"]) {
      if (typeof payload[field] === "string") return payload[field];
    }
    return text;
  } catch {
    // Not JSON (or not an object): fall back to the raw text when there is any.
    return text || statusLine();
  }
}
88
/**
 * Build a ScrapeError describing a non-OK HTTP response.
 *
 * The response body is read exactly once, as text. A fetch `Response` body
 * can only be consumed a single time, so reading it via `json()` for the
 * mapper and then again via `text()` for the fallback would leave the
 * fallback with nothing but the status line whenever the body is not JSON
 * or the mapper throws.
 *
 * @param {{status: number, statusText: string, text: () => Promise<string>}} response
 *   The failed response.
 * @param {string} providerName - Name used as the message prefix.
 * @param {(body: unknown) => string} [errorMapper] - Optional provider-specific
 *   extractor, called with the parsed JSON body. If it throws, the body is
 *   not JSON, or it returns nothing usable, the generic parsing is used.
 * @returns {Promise<ScrapeError>} Error with a status-derived code and the
 *   HTTP status attached.
 */
async function createHttpError(response, providerName, errorMapper) {
  const { status, statusText } = response;
  const code = getErrorCodeFromStatus(status);

  let rawBody;
  let bodyReadable = true;
  try {
    rawBody = await response.text();
  } catch {
    // Unreadable body: the generic parser will fall back to the status line.
    bodyReadable = false;
  }

  // Minimal response stand-in so parseErrorBody can re-use the text we already read.
  const replay = {
    status,
    statusText,
    text: async () => {
      if (!bodyReadable) throw new Error("Response body could not be read");
      return rawBody;
    },
  };

  let message;
  if (errorMapper && bodyReadable) {
    try {
      const mapped = errorMapper(JSON.parse(rawBody));
      // An empty mapper result would produce a useless "...: undefined" message.
      if (mapped != null && mapped !== "") message = mapped;
    } catch {
      // Non-JSON body or mapper failure: handled by the generic fallback below.
    }
  }
  if (message === undefined) message = await parseErrorBody(replay);

  return new ScrapeError(`${providerName} API error (${status}): ${message}`, code, status);
}
102
+
103
+ //#endregion
104
+ //#region src/common/resilience.ts
105
/**
 * Default retry configuration.
 *
 * - `maxAttempts`: total number of tries, including the first one.
 * - `backoffMs`: delay before the first retry.
 * - `backoffMultiplier`: factor applied to the delay on each further retry.
 * - `retryableStatuses`: HTTP statuses treated as transient.
 */
const DEFAULT_RETRY = {
  maxAttempts: 3,
  backoffMs: 1e3,
  backoffMultiplier: 2,
  retryableStatuses: [408, 429, 500, 502, 503, 504],
};

/**
 * System error codes that indicate a transient network failure.
 */
const RETRYABLE_ERROR_CODES = [
  "ECONNRESET",
  "ETIMEDOUT",
  "ECONNREFUSED",
  "EPIPE",
  "ENOTFOUND",
  "ENETUNREACH",
  "EAI_AGAIN",
];

/**
 * Decide whether a failure is worth retrying.
 *
 * Checks, in order: a transient system code on the error itself; a numeric
 * `statusCode` / `status` (decisive when present); the "TIMEOUT" /
 * "FETCH_FAILED" codes; a transient system code anywhere on the `cause`
 * chain; and finally a few well-known phrases in the message.
 *
 * The cause-chain check exists because network failures surfaced by `fetch`
 * arrive as a generic `TypeError` whose system code (e.g. ECONNRESET) lives
 * on `error.cause`, not on the error itself.
 *
 * @param {unknown} error - The thrown value; non-Error values are never retryable.
 * @param {number[]} [retryableStatuses] - HTTP statuses considered transient.
 * @returns {boolean}
 */
function isRetryableError(error, retryableStatuses = DEFAULT_RETRY.retryableStatuses) {
  if (!(error instanceof Error)) return false;

  const code = error.code;
  if (code && RETRYABLE_ERROR_CODES.includes(code)) return true;

  if ("statusCode" in error && typeof error.statusCode === "number") {
    return retryableStatuses.includes(error.statusCode);
  }
  if ("status" in error && typeof error.status === "number") {
    return retryableStatuses.includes(error.status);
  }

  if (code === "TIMEOUT" || code === "FETCH_FAILED") return true;

  // Bounded walk so a cyclic cause chain cannot loop forever.
  const maxCauseDepth = 5;
  let cause = error.cause;
  for (let depth = 0; depth < maxCauseDepth && cause !== null && typeof cause === "object"; depth++) {
    if (typeof cause.code === "string" && RETRYABLE_ERROR_CODES.includes(cause.code)) return true;
    cause = cause.cause;
  }

  const message = error.message.toLowerCase();
  const transientPhrases = ["timeout", "rate limit", "too many requests", "temporarily unavailable"];
  return transientPhrases.some((phrase) => message.includes(phrase));
}
151
/**
 * Pause for a number of milliseconds.
 *
 * @param {number} ms - Delay handed to `setTimeout`.
 * @returns {Promise<void>} Resolves once the timer fires.
 */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
157
/**
 * Run `fn`, retrying transient failures with exponential backoff.
 *
 * The delay before retry k (1-based) is
 * `backoffMs * backoffMultiplier ** (k - 1)`, scaled by a random factor in
 * [0.9, 1.1). Non-Error rejections are wrapped in an Error before being
 * reported or rethrown.
 *
 * @param {() => Promise<unknown>} fn - Operation to attempt.
 * @param {{maxAttempts?: number, backoffMs?: number, backoffMultiplier?: number, retryableStatuses?: number[]}} [config]
 *   Overrides for the defaults in DEFAULT_RETRY.
 * @param {(attempt: number, error: Error, delayMs: number) => void} [onRetry]
 *   Called before each wait.
 * @returns {Promise<{result: unknown, attempts: number}>} The value from `fn`
 *   and the attempt number that produced it.
 * @throws {Error} The last failure once attempts are exhausted or the
 *   failure is not retryable.
 */
async function withRetry(fn, config, onRetry) {
  const maxAttempts = config?.maxAttempts ?? DEFAULT_RETRY.maxAttempts;
  const backoffMs = config?.backoffMs ?? DEFAULT_RETRY.backoffMs;
  const multiplier = config?.backoffMultiplier ?? DEFAULT_RETRY.backoffMultiplier;
  const retryableStatuses = config?.retryableStatuses ?? DEFAULT_RETRY.retryableStatuses;

  let lastError;
  let attempt = 1;
  while (attempt <= maxAttempts) {
    try {
      const result = await fn();
      return { result, attempts: attempt };
    } catch (error) {
      lastError = error instanceof Error ? error : new Error(String(error));
      const outOfAttempts = attempt === maxAttempts;
      if (outOfAttempts || !isRetryableError(error, retryableStatuses)) throw lastError;

      const delayMs = backoffMs * multiplier ** (attempt - 1) * (0.9 + Math.random() * 0.2);
      onRetry?.(attempt, lastError, delayMs);
      await sleep(delayMs);
    }
    attempt += 1;
  }

  // Only reachable when maxAttempts < 1, i.e. fn was never called.
  throw lastError ?? new Error("Retry failed");
}
180
/**
 * Run `fn` with a hard deadline.
 *
 * `fn` receives an AbortSignal that is aborted when the deadline passes. The
 * returned promise also rejects at that point, so an operation that ignores
 * the signal cannot stall the caller. The abort reason and the rejection are
 * the same Error, marked with `name: "TimeoutError"` and `code: "TIMEOUT"`,
 * so callers can tell a timeout apart from a user-initiated abort.
 *
 * @param {(signal: AbortSignal) => unknown} fn - Operation to run.
 * @param {number} timeoutMs - Deadline in milliseconds.
 * @returns {Promise<unknown>} Whatever `fn` resolves to.
 * @throws {Error} The timeout error described above, or whatever `fn` throws.
 */
async function withTimeout(fn, timeoutMs) {
  const controller = new AbortController();
  let timeoutId;

  const deadline = new Promise((_resolve, reject) => {
    timeoutId = setTimeout(() => {
      const timeoutError = Object.assign(
        new Error(`Operation aborted: timeout of ${timeoutMs}ms exceeded`),
        { name: "TimeoutError", code: "TIMEOUT" },
      );
      // Abort first so signal-aware work is cancelled with the same reason.
      controller.abort(timeoutError);
      reject(timeoutError);
    }, timeoutMs);
  });

  try {
    return await Promise.race([fn(controller.signal), deadline]);
  } finally {
    // Always release the timer so it cannot keep the process alive.
    clearTimeout(timeoutId);
  }
}
192
/**
 * Fallback settings used by CircuitBreaker when a value is not configured.
 */
const DEFAULT_CIRCUIT_BREAKER = {
  // Failures since the last success before the circuit opens.
  failureThreshold: 5,
  // How long the circuit stays open before a trial request is allowed (ms).
  resetTimeoutMs: 30_000,
};
199
/**
 * Raised when a call is rejected because the circuit breaker is open.
 * The `isCircuitOpen` flag lets callers detect it without `instanceof`.
 */
class CircuitOpenError extends Error {
  isCircuitOpen = true;
  name = "CircuitOpenError";

  /**
   * @param {string} message - Description of why the call was rejected.
   */
  constructor(message) {
    super(message);
  }
}
209
/**
 * Circuit breaker: stops issuing requests after repeated failures.
 *
 * States held in `this.state.state`:
 * - "closed": requests flow; failures are counted.
 * - "open": requests are rejected until `resetTimeoutMs` has elapsed.
 * - "half-open": the reset timeout has passed; the next success closes the
 *   circuit and the next failure re-opens it.
 */
class CircuitBreaker {
  state;
  failureThreshold;
  resetTimeoutMs;

  /**
   * @param {{failureThreshold?: number, resetTimeoutMs?: number}} [config]
   *   Missing values fall back to DEFAULT_CIRCUIT_BREAKER.
   */
  constructor(config) {
    this.failureThreshold = config?.failureThreshold ?? DEFAULT_CIRCUIT_BREAKER.failureThreshold;
    this.resetTimeoutMs = config?.resetTimeoutMs ?? DEFAULT_CIRCUIT_BREAKER.resetTimeoutMs;
    this.state = { state: "closed", failures: 0 };
  }

  /**
   * @returns {boolean} True while requests must be rejected.
   */
  isOpen() {
    this.updateState();
    const { state: current } = this.state;
    return current === "open";
  }

  /**
   * @returns {string} "closed", "open" or "half-open", after applying any
   *   time-based transition.
   */
  getState() {
    this.updateState();
    const { state: current } = this.state;
    return current;
  }

  /**
   * Note a successful request: clears the failure count and closes the circuit.
   */
  recordSuccess() {
    Object.assign(this.state, {
      failures: 0,
      state: "closed",
      lastFailureTime: undefined,
      nextAttemptTime: undefined,
    });
  }

  /**
   * Note a failed request; opens the circuit once the threshold is reached.
   */
  recordFailure() {
    const tracker = this.state;
    tracker.failures++;
    tracker.lastFailureTime = Date.now();
    if (tracker.failures >= this.failureThreshold) {
      tracker.state = "open";
      tracker.nextAttemptTime = Date.now() + this.resetTimeoutMs;
    }
  }

  /**
   * Run `fn` under breaker protection, recording the outcome.
   *
   * @param {() => Promise<unknown>} fn - Operation to guard.
   * @returns {Promise<unknown>} Result of `fn`.
   * @throws {CircuitOpenError} When the circuit is open; `fn` is not called.
   */
  async execute(fn) {
    if (this.isOpen()) {
      const { nextAttemptTime } = this.state;
      const retryAt = nextAttemptTime ? new Date(nextAttemptTime).toISOString() : "unknown";
      throw new CircuitOpenError(`Circuit breaker is open. Next attempt at ${retryAt}`);
    }
    try {
      const outcome = await fn();
      this.recordSuccess();
      return outcome;
    } catch (error) {
      this.recordFailure();
      throw error;
    }
  }

  /**
   * Return to a fresh closed state, discarding all counters and timestamps.
   */
  reset() {
    this.state = { state: "closed", failures: 0 };
  }

  /**
   * Apply the time-based open -> half-open transition.
   */
  updateState() {
    const { state: current, nextAttemptTime } = this.state;
    const resetElapsed = current === "open" && nextAttemptTime && Date.now() >= nextAttemptTime;
    if (resetElapsed) this.state.state = "half-open";
  }
}
289
/**
 * Token bucket rate limiter.
 *
 * The bucket holds up to ten seconds' worth of requests (at least one token)
 * and refills continuously at `requestsPerMinute / 60` tokens per second.
 */
class RateLimiter {
  tokens;
  lastRefill;
  maxTokens;
  refillRate;

  /**
   * @param {{requestsPerMinute?: number}} [config] - Sustained request rate;
   *   defaults to 60 per minute.
   */
  constructor(config) {
    const requestsPerSecond = (config?.requestsPerMinute ?? 60) / 60;
    this.maxTokens = Math.max(1, Math.ceil(requestsPerSecond * 10));
    this.refillRate = requestsPerSecond;
    this.tokens = this.maxTokens;
    this.lastRefill = Date.now();
  }

  /**
   * Check whether one request could proceed right now, without consuming a token.
   *
   * @returns {boolean}
   */
  canProceed() {
    this.refill();
    return this.tokens >= 1;
  }

  /**
   * Try to take `tokens` from the bucket without waiting.
   *
   * @param {number} [tokens=1] - Number of tokens to take.
   * @returns {boolean} True if the tokens were taken, false if rate limited.
   */
  tryAcquire(tokens = 1) {
    this.refill();
    if (this.tokens >= tokens) {
      this.tokens -= tokens;
      return true;
    }
    return false;
  }

  /**
   * Wait until `tokens` are available, then take them.
   *
   * @param {number} [tokens=1] - Number of tokens to take.
   * @returns {Promise<void>}
   * @throws {RangeError} If the request can never be satisfied: more tokens
   *   than the bucket can hold, or a refill rate that is not positive.
   *   Without this check the wait would never end.
   */
  async acquire(tokens = 1) {
    if (this.tryAcquire(tokens)) return;
    if (tokens > this.maxTokens) {
      throw new RangeError(`Cannot acquire ${tokens} tokens: bucket capacity is ${this.maxTokens}`);
    }
    if (!(this.refillRate > 0)) {
      throw new RangeError("Rate limiter refill rate is not positive; tokens will never become available");
    }
    do {
      // tryAcquire just refilled, so this.tokens is current; wait for the shortfall.
      const shortfall = tokens - this.tokens;
      const waitMs = Math.max(1, Math.ceil((shortfall / this.refillRate) * 1e3));
      await sleep(waitMs);
    } while (!this.tryAcquire(tokens));
  }

  /**
   * Time until one full token is available.
   *
   * @returns {number} Milliseconds to wait; 0 when a request can proceed now.
   *   Accounts for the fraction of a token already accumulated.
   */
  getWaitTime() {
    this.refill();
    if (this.tokens >= 1) return 0;
    return Math.ceil(((1 - this.tokens) / this.refillRate) * 1e3);
  }

  /**
   * Add the tokens earned since the last refill, capped at the bucket size.
   */
  refill() {
    const now = Date.now();
    // Clamp at zero: a backwards clock adjustment must not remove tokens.
    const elapsedMs = Math.max(0, now - this.lastRefill);
    const earned = (elapsedMs / 1e3) * this.refillRate;
    this.tokens = Math.min(this.maxTokens, this.tokens + earned);
    this.lastRefill = now;
  }
}
352
/**
 * Counting semaphore that caps how many operations run at once.
 * Callers that cannot get a permit are queued and resumed in FIFO order.
 */
class Semaphore {
  permits;
  waiting = [];

  /**
   * @param {number} permits - Number of operations allowed to run concurrently.
   */
  constructor(permits) {
    this.permits = permits;
  }

  /**
   * Take a permit, queueing until one is released if none are free.
   *
   * @returns {Promise<void>}
   */
  async acquire() {
    const hasFreePermit = this.permits > 0;
    if (hasFreePermit) {
      this.permits--;
      return;
    }
    return new Promise((wake) => {
      this.waiting.push(wake);
    });
  }

  /**
   * Give a permit back. If someone is queued the permit is handed straight
   * to them, so the free count only grows when the queue is empty.
   */
  release() {
    const nextInLine = this.waiting.shift();
    if (!nextInLine) {
      this.permits++;
      return;
    }
    nextInLine();
  }

  /**
   * Run `fn` while holding a permit; the permit is released whether `fn`
   * succeeds or throws.
   *
   * @param {() => Promise<unknown>} fn - Operation to run.
   * @returns {Promise<unknown>} Result of `fn`.
   */
  async execute(fn) {
    await this.acquire();
    try {
      const outcome = await fn();
      return outcome;
    } finally {
      this.release();
    }
  }
}
393
/**
 * Run `fn` with timeout, retry and, when supplied, circuit breaker, rate
 * limiter and concurrency control.
 *
 * Order of operations: reject immediately if the circuit is open, wait for
 * the rate limiter, then (inside the semaphore, if any) retry
 * `withTimeout(fn)` and report the final outcome to the circuit breaker.
 *
 * @param fn - Async function to execute; receives the timeout AbortSignal.
 * @param config - `timeoutMs` (default 30000) and `retry` settings. Only
 *   these two are read from config.
 * @param state - Caller-owned `circuitBreaker`, `rateLimiter` and `semaphore`
 *   instances. Each feature is active only when its instance is present, and
 *   passing the same instances to several calls shares their state.
 * @param callbacks - Optional `onRetry` callback forwarded to the retry loop.
 * @returns The `{ result, attempts }` object produced by the retry loop.
 * @throws {CircuitOpenError} When the circuit breaker is open.
 */
async function withResilience(fn, config, state, callbacks) {
  const timeoutMs = config?.timeoutMs ?? 3e4;

  if (state?.circuitBreaker?.isOpen()) throw new CircuitOpenError("Circuit breaker is open");
  if (state?.rateLimiter) await state.rateLimiter.acquire();

  const runGuarded = async () => {
    const attemptOnce = () => withTimeout(fn, timeoutMs);
    try {
      const outcome = await withRetry(attemptOnce, config?.retry, callbacks?.onRetry);
      state?.circuitBreaker?.recordSuccess();
      return outcome;
    } catch (error) {
      // Only the final failure (after retries) counts against the breaker.
      state?.circuitBreaker?.recordFailure();
      throw error;
    }
  };

  return state?.semaphore ? state.semaphore.execute(runGuarded) : runGuarded();
}
423
+
424
+ //#endregion
425
+ //#region src/common/http-base.ts
426
+ /**
427
+ * Shared HTTP provider infrastructure for LLM and Embedding providers.
428
+ * Provides SSRF protection, resilience, and error normalization.
429
+ */
430
/**
 * Host patterns blocked for SSRF protection. They are matched against a
 * normalised host: lower-cased, without IPv6 brackets or a trailing dot, and
 * with IPv4-mapped IPv6 addresses reduced to their IPv4 form.
 */
const PRIVATE_IP_PATTERNS = [
  /^10\./, // 10.0.0.0/8
  /^172\.(1[6-9]|2\d|3[01])\./, // 172.16.0.0/12
  /^192\.168\./, // 192.168.0.0/16
  /^127\./, // loopback
  /^0\./, // 0.0.0.0/8
  /^169\.254\./, // link-local
  /^100\.(6[4-9]|[7-9]\d|1[01]\d|12[0-7])\./, // 100.64.0.0/10 (carrier-grade NAT)
  /^::1$/, // IPv6 loopback
  /^::$/, // IPv6 unspecified address
  /^(fc|fd)[0-9a-f]{2}:/i, // fc00::/7 unique local
  /^fe[89ab][0-9a-f]:/i, // fe80::/10 link-local
  /^fe[c-f][0-9a-f]:/i, // fec0::/10 site-local
  /^::ffff:(10\.|172\.(1[6-9]|2\d|3[01])\.|192\.168\.|127\.|0\.)/i,
  /(^|\.)localhost$/i, // localhost and any *.localhost name
];

/**
 * Return the IPv4 address embedded in an IPv4-mapped IPv6 address.
 * Handles both the dotted form (`::ffff:127.0.0.1`) and the hex form that
 * the URL parser produces (`::ffff:7f00:1`).
 *
 * @param {string} host - Lower-cased host without brackets.
 * @returns {string|null} Dotted IPv4 address, or null if `host` is not mapped.
 */
function extractMappedIPv4(host) {
  const dotted = /^::ffff:(\d{1,3}(?:\.\d{1,3}){3})$/.exec(host);
  if (dotted) return dotted[1];

  const hex = /^::ffff:([0-9a-f]{1,4}):([0-9a-f]{1,4})$/.exec(host);
  if (!hex) return null;
  const high = Number.parseInt(hex[1], 16);
  const low = Number.parseInt(hex[2], 16);
  return [high >> 8, high & 255, low >> 8, low & 255].join(".");
}

/**
 * Check if a hostname/IP refers to a private, loopback or otherwise
 * internal destination.
 *
 * The input is normalised first because `URL.hostname` keeps the square
 * brackets around IPv6 literals and may carry a trailing dot, either of
 * which would otherwise slip past the anchored patterns.
 *
 * @param {string} hostname - Host as found in a URL or returned by DNS.
 * @returns {boolean} True if the host matches a blocked range or name.
 */
function isPrivateHost(hostname) {
  let host = String(hostname).trim().toLowerCase();
  if (host.startsWith("[") && host.endsWith("]")) host = host.slice(1, -1);
  if (host.endsWith(".")) host = host.slice(0, -1);
  host = extractMappedIPv4(host) ?? host;
  return PRIVATE_IP_PATTERNS.some((pattern) => pattern.test(host));
}
454
/**
 * Validate a URL for security before it is used for an HTTP request.
 *
 * @param {string} url - URL to check.
 * @param {{requireHttps?: boolean, allowPrivate?: boolean}} [options]
 *   `requireHttps` (default true) rejects anything but https;
 *   `allowPrivate` (default false) permits private/internal hosts.
 * @returns {URL} The parsed URL.
 * @throws {ScrapeError} "INVALID_URL" if the URL cannot be parsed;
 *   "VALIDATION_ERROR" if the scheme or host is not allowed.
 */
function validateUrl(url, options = {}) {
  const requireHttps = options?.requireHttps ?? true;
  const allowPrivate = options?.allowPrivate ?? false;

  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    throw new ScrapeError(`Invalid URL: ${url}`, "INVALID_URL");
  }

  const { protocol, hostname } = parsed;
  if (requireHttps && protocol !== "https:") {
    throw new ScrapeError(`HTTPS required. Got: ${protocol}`, "VALIDATION_ERROR");
  }
  // Relaxing the HTTPS requirement only opens up plain http; other schemes
  // (ftp:, data:, file:, ...) are never valid targets for an HTTP provider.
  if (protocol !== "https:" && protocol !== "http:") {
    throw new ScrapeError(`Unsupported protocol: ${protocol}`, "VALIDATION_ERROR");
  }
  if (!allowPrivate && isPrivateHost(hostname)) {
    throw new ScrapeError(`Private/internal addresses not allowed: ${hostname}`, "VALIDATION_ERROR");
  }
  return parsed;
}
470
/**
 * Validate a URL and, unless disabled, resolve its hostname to make sure it
 * does not point at a private address.
 *
 * The DNS step is skipped when `resolveDns` is false, when `allowPrivate` is
 * true, or when the hostname is already an IP literal according to
 * `net.isIP` (the literal has then been checked by validateUrl).
 *
 * NOTE(review): `URL.hostname` keeps the square brackets on IPv6 literals,
 * so `isIP` presumably returns 0 for them and they fall through to the DNS
 * lookup - confirm whether that is intended.
 *
 * @param {string} url - URL to check.
 * @param {{requireHttps?: boolean, allowPrivate?: boolean, resolveDns?: boolean}} [options]
 * @returns {Promise<void>}
 * @throws {ScrapeError} Validation errors from validateUrl;
 *   "VALIDATION_ERROR" if any resolved address is private;
 *   "FETCH_FAILED" if the hostname cannot be resolved.
 */
async function validateUrlWithDns(url, options = {}) {
  const { hostname: host } = validateUrl(url, options);

  const resolveDns = options.resolveDns ?? true;
  const allowPrivate = options.allowPrivate ?? false;
  const dnsCheckNeeded = resolveDns && !allowPrivate;
  if (!dnsCheckNeeded) return;
  if (node_net.isIP(host)) return;

  try {
    const resolved = await node_dns.promises.lookup(host, { all: true });
    const privateHit = resolved.find((entry) => isPrivateHost(entry.address));
    if (privateHit) {
      throw new ScrapeError(`DNS resolved to private address: ${host} -> ${privateHit.address}`, "VALIDATION_ERROR");
    }
  } catch (error) {
    // Our own validation error passes through; anything else is a lookup failure.
    if (error instanceof ScrapeError) throw error;
    const reason = error instanceof Error ? error.message : String(error);
    throw new ScrapeError(`Failed to resolve hostname: ${host} (${reason})`, "FETCH_FAILED");
  }
}
488
/**
 * Base class for HTTP-backed providers. Bundles URL validation, timeout,
 * optional retry, optional circuit breaker and rate limiter, and a
 * concurrency limit around `fetch`.
 */
class BaseHttpProvider {
  baseUrl;
  model;
  headers;
  errorMapper;
  requireHttps;
  allowPrivate;
  resolveDns;
  allowRedirects;
  timeoutMs;
  retryConfig;
  concurrency;
  circuitBreaker;
  rateLimiter;
  semaphore;

  /**
   * @param config - Provider settings. Reads `baseUrl`, `model`, `headers`,
   *   `errorMapper`, the security flags `requireHttps` (default true),
   *   `allowPrivate` (default false), `resolveDns` (default true),
   *   `allowRedirects` (default false), and `resilience`
   *   (`timeoutMs` default 30000, `retry`, `concurrency` default 1,
   *   `circuitBreaker`, `rateLimit`, and shared `state`).
   * @throws {ScrapeError} If `baseUrl` fails validation.
   */
  constructor(config) {
    const resilience = config.resilience;

    // A single trailing slash is dropped from the base URL.
    this.baseUrl = config.baseUrl.replace(/\/$/, "");
    this.model = config.model;
    this.headers = { "Content-Type": "application/json", ...config.headers };
    this.errorMapper = config.errorMapper;

    this.requireHttps = config.requireHttps ?? true;
    this.allowPrivate = config.allowPrivate ?? false;
    this.resolveDns = config.resolveDns ?? true;
    this.allowRedirects = config.allowRedirects ?? false;

    this.timeoutMs = resilience?.timeoutMs ?? 3e4;
    this.retryConfig = resilience?.retry;
    this.concurrency = resilience?.concurrency ?? 1;

    // Instances supplied via shared state win over ones built from config.
    const shared = resilience?.state;
    let breaker = shared?.circuitBreaker;
    if (breaker == null && resilience?.circuitBreaker) breaker = new CircuitBreaker(resilience.circuitBreaker);
    this.circuitBreaker = breaker ?? undefined;

    let limiter = shared?.rateLimiter;
    if (limiter == null && resilience?.rateLimit) limiter = new RateLimiter(resilience.rateLimit);
    this.rateLimiter = limiter ?? undefined;

    this.semaphore = shared?.semaphore ?? new Semaphore(this.concurrency);

    validateUrl(this.baseUrl, {
      requireHttps: this.requireHttps,
      allowPrivate: this.allowPrivate,
    });
  }

  /**
   * Expose the resilience primitives so they can be shared with other
   * providers or persisted across calls.
   *
   * @returns {{circuitBreaker: (CircuitBreaker|undefined), rateLimiter: (RateLimiter|undefined), semaphore: Semaphore}}
   */
  getResilienceState() {
    const { circuitBreaker, rateLimiter, semaphore } = this;
    return { circuitBreaker, rateLimiter, semaphore };
  }

  /**
   * Make an HTTP request with security and resilience.
   *
   * The URL is validated (including DNS, when enabled) before anything is
   * sent. The request body, when truthy, is JSON-encoded; the response is
   * parsed as JSON.
   *
   * @param {string} url - Request URL.
   * @param {{method?: string, headers?: object, body?: unknown, signal?: AbortSignal}} [options]
   *   `method` defaults to "POST".
   * @returns {Promise<{data: unknown, status: number, headers: Headers}>}
   * @throws {ScrapeError} For validation failures and non-OK responses.
   * @throws {CircuitOpenError} When the circuit breaker is open.
   */
  async fetch(url, options = {}) {
    const securityOptions = {
      requireHttps: this.requireHttps,
      allowPrivate: this.allowPrivate,
      resolveDns: this.resolveDns,
      allowRedirects: this.allowRedirects,
    };
    await validateUrlWithDns(url, securityOptions);

    if (this.circuitBreaker?.isOpen()) {
      throw new CircuitOpenError("Circuit breaker is open. Too many recent failures.");
    }
    if (this.rateLimiter) await this.rateLimiter.acquire();

    const sendRequest = async (timeoutSignal) => {
      // Abort on whichever comes first: the caller's signal or the timeout.
      const callerSignal = options.signal;
      const signal = callerSignal ? AbortSignal.any([callerSignal, timeoutSignal]) : timeoutSignal;

      const response = await fetch(url, {
        method: options.method ?? "POST",
        headers: { ...this.headers, ...options.headers },
        body: options.body ? JSON.stringify(options.body) : undefined,
        signal,
        redirect: this.allowRedirects ? "follow" : "error",
      });

      // When redirects are followed, the final URL is validated as well.
      if (this.allowRedirects && response.redirected) {
        await validateUrlWithDns(response.url, securityOptions);
      }
      if (!response.ok) {
        throw await createHttpError(response, this.constructor.name, this.errorMapper);
      }

      const data = await response.json();
      return { data, status: response.status, headers: response.headers };
    };

    if (!this.semaphore) throw new ScrapeError("Semaphore not initialized", "VALIDATION_ERROR");

    return this.semaphore.execute(async () => {
      const attempt = () => withTimeout(sendRequest, this.timeoutMs);
      try {
        // Retries happen only when a retry configuration was supplied.
        const result = this.retryConfig
          ? (await withRetry(attempt, this.retryConfig)).result
          : await attempt();
        this.circuitBreaker?.recordSuccess();
        return result;
      } catch (error) {
        this.circuitBreaker?.recordFailure();
        throw error;
      }
    });
  }
}
594
+
595
+ //#endregion
596
+ //#region src/llm/types.ts
597
/**
 * Zod schemas for the structured outputs requested from the LLM.
 * The `describe()` texts are part of each schema's metadata.
 */

/** Output of the summarize step. */
const SummarySchema = zod.z.object({
  summary: zod.z.string().describe("A concise 2-3 sentence summary of the content"),
});

/** Output of the tag extraction step. */
const TagsSchema = zod.z.object({
  tags: zod.z.array(zod.z.string()).describe("5-10 relevant tags/keywords"),
});

/** Output of the entity extraction step: one string list per entity kind. */
const EntitiesSchema = zod.z.object({
  people: zod.z.array(zod.z.string()).describe("People mentioned"),
  organizations: zod.z.array(zod.z.string()).describe("Organizations/companies"),
  technologies: zod.z.array(zod.z.string()).describe("Technologies/tools/frameworks"),
  locations: zod.z.array(zod.z.string()).describe("Locations/places"),
  concepts: zod.z.array(zod.z.string()).describe("Key concepts/topics"),
});

/** Output of the classification step: a content type plus a 0-1 confidence. */
const ClassifySchema = zod.z.object({
  contentType: zod.z
    .enum(["article", "repo", "docs", "package", "video", "tool", "product", "unknown"])
    .describe("The type of content"),
  confidence: zod.z.number().min(0).max(1).describe("Confidence score 0-1"),
});
622
+
623
+ //#endregion
624
+ //#region src/llm/enhancer.ts
625
/**
 * Enhance scraped data with LLM-powered features.
 *
 * The requested steps run concurrently against the same context, built from
 * the title, URL and either the excerpt or the first 10000 characters of
 * the text content. If any step fails, the whole call rejects.
 *
 * @param data - Scraped page; reads `title`, `url`, `excerpt`, `textContent`.
 * @param provider - LLM provider handed to each step.
 * @param {string[]} types - Any of "summarize", "tags", "entities", "classify".
 * @returns {Promise<object>} Object with `summary`, `suggestedTags`,
 *   `entities` and/or `contentType`, depending on the requested steps.
 *   `contentType` is only set when the classifier's confidence exceeds 0.7.
 */
async function enhance(data, provider, types) {
  const results = {};
  const content = data.excerpt || data.textContent.slice(0, 1e4);
  const context = `Title: ${data.title}\nURL: ${data.url}\n\nContent:\n${content}`;

  // Each entry: [requested type, task that stores its output on `results`].
  const steps = [
    ["summarize", async () => {
      results.summary = await summarize(context, provider);
    }],
    ["tags", async () => {
      results.suggestedTags = await extractTags(context, provider);
    }],
    ["entities", async () => {
      results.entities = await extractEntities(context, provider);
    }],
    ["classify", async () => {
      const verdict = await classify(context, provider);
      // Low-confidence classifications are discarded.
      if (verdict.confidence > 0.7) results.contentType = verdict.contentType;
    }],
  ];

  const pending = [];
  for (const [type, run] of steps) {
    if (types.includes(type)) pending.push(run());
  }
  await Promise.all(pending);
  return results;
}
648
/**
 * Ask a custom question about the scraped content.
 *
 * Placeholders in `prompt` are filled in first. With `options.schema` the
 * answer is extracted as structured data; otherwise the provider's plain
 * completion is returned. Unless the prompt used `{{content}}`, the page
 * title, URL and content are appended to a plain-completion prompt.
 *
 * NOTE(review): on the schema path the already-processed prompt is passed
 * to `extract` as its template, so placeholders are applied a second time
 * and `extract` no longer sees a literal `{{content}}` - confirm this is
 * intended.
 *
 * @param data - Scraped page; reads `title`, `url`, `excerpt`, `textContent`.
 * @param provider - LLM provider.
 * @param {string} prompt - Question or template.
 * @param {{key?: string, schema?: object}} [options] - `key` names the result
 *   field (default "response"); `schema` switches to structured extraction.
 * @returns {Promise<{custom: object}>} The answer under `custom[key]`.
 */
async function ask(data, provider, prompt, options) {
  const key = options?.key || "response";
  const content = data.excerpt || data.textContent.slice(0, 1e4);
  const processedPrompt = applyPlaceholders(prompt, data, content);

  let answer;
  if (options?.schema) {
    answer = await extract(data, provider, options.schema, processedPrompt);
  } else {
    const templateUsesContent = prompt.includes("{{content}}");
    const fullPrompt = templateUsesContent
      ? processedPrompt
      : `${processedPrompt}\n\nTitle: ${data.title}\nURL: ${data.url}\n\nContent:\n${content}`;
    answer = await provider.complete(fullPrompt);
  }
  return { custom: { [key]: answer } };
}
664
/**
 * Apply placeholder replacements to a prompt template.
 *
 * Supported placeholders: `{{title}}`, `{{url}}`, `{{content}}`,
 * `{{description}}`, `{{excerpt}}` and `{{domain}}` (hostname of `data.url`,
 * or empty if the URL cannot be parsed).
 *
 * Substitution is done in a single pass with a replacer function, for two
 * reasons: scraped text containing `$&`, `$$`, `$1` and similar sequences
 * must be inserted literally rather than interpreted as replacement
 * patterns, and placeholder-looking text inside an inserted value (e.g. a
 * page title of "{{content}}") must not be expanded again.
 *
 * @param {string} prompt - Template text.
 * @param data - Scraped page; reads `title`, `url`, `description`, `excerpt`.
 * @param {string} content - Text substituted for `{{content}}`.
 * @returns {string} The prompt with all placeholders filled in.
 */
function applyPlaceholders(prompt, data, content) {
  let domain = "";
  try {
    domain = new URL(data.url).hostname;
  } catch {
    // Unparseable URL: {{domain}} deliberately expands to an empty string.
  }

  const values = {
    title: data.title,
    url: data.url,
    content,
    description: data.description || "",
    excerpt: data.excerpt || "",
    domain,
  };

  return prompt.replace(
    /\{\{(title|url|content|description|excerpt|domain)\}\}/g,
    (_placeholder, name) => String(values[name]),
  );
}
677
/**
 * Extract structured data using the LLM and a simple field schema.
 *
 * `schema` maps field names to type names: "string", "number", "boolean",
 * "string[]" or "number[]", optionally suffixed with "?" to make the field
 * optional. Unrecognised type names are treated as "string". Only the first
 * 4000 characters of the text content are sent.
 *
 * @param data - Scraped page; reads `title`, `url`, `textContent` (and the
 *   fields used by placeholders when a template is given).
 * @param provider - LLM provider exposing `completeJSON(prompt, zodSchema)`.
 * @param {Record<string, string>} schema - Field name to type name.
 * @param {string} [promptTemplate] - Custom prompt; placeholders are filled
 *   in, and the content is appended unless the template uses `{{content}}`.
 * @returns {Promise<object>} Whatever `provider.completeJSON` returns.
 */
async function extract(data, provider, schema, promptTemplate) {
  // Factories rather than shared instances, so every field gets a fresh zod type.
  const typeFactories = new Map([
    ["string", () => zod.z.string()],
    ["number", () => zod.z.number()],
    ["boolean", () => zod.z.boolean()],
    ["string[]", () => zod.z.array(zod.z.string())],
    ["number[]", () => zod.z.array(zod.z.number())],
  ]);
  const fallbackFactory = typeFactories.get("string");

  const zodShape = {};
  for (const [key, type] of Object.entries(schema)) {
    const isOptional = type.endsWith("?");
    const baseType = isOptional ? type.slice(0, -1) : type;
    const zodType = (typeFactories.get(baseType) ?? fallbackFactory)();
    zodShape[key] = isOptional ? zodType.optional() : zodType;
  }
  const zodSchema = zod.z.object(zodShape);

  const content = data.textContent.slice(0, 4e3);

  let prompt;
  if (promptTemplate) {
    prompt = applyPlaceholders(promptTemplate, data, content);
    if (!promptTemplate.includes("{{content}}")) prompt += `\n\nContext:\n${content}`;
  } else {
    const fieldList = Object.entries(schema)
      .map(([key, type]) => `- ${key} (${type})`)
      .join("\n");
    prompt = [
      "Extract the following information from this content:",
      "",
      `Title: ${data.title}`,
      `URL: ${data.url}`,
      "",
      "Content:",
      `${content}`,
      "",
      "Extract these fields:",
      fieldList,
    ].join("\n");
  }

  return provider.completeJSON(prompt, zodSchema);
}
724
/**
 * Ask the LLM for a short summary of the content.
 *
 * @param {string} context - Prepared page context (title, URL, content).
 * @param provider - LLM provider exposing `completeJSON(prompt, schema)`.
 * @returns {Promise<string>} The `summary` field of the structured response.
 */
async function summarize(context, provider) {
  const instruction = "Summarize the following content in 2-3 concise sentences:";
  const { summary } = await provider.completeJSON(`${instruction}\n\n${context}`, SummarySchema);
  return summary;
}
733
/**
 * Ask the LLM for relevant tags/keywords.
 *
 * @param {string} context - Prepared page context (title, URL, content).
 * @param provider - LLM provider exposing `completeJSON(prompt, schema)`.
 * @returns {Promise<string[]>} The `tags` field of the structured response.
 */
async function extractTags(context, provider) {
  const instruction = "Extract 5-10 relevant tags or keywords from the following content. Focus on technologies, concepts, and topics mentioned:";
  const { tags } = await provider.completeJSON(`${instruction}\n\n${context}`, TagsSchema);
  return tags;
}
742
/**
 * Ask the LLM for the named entities mentioned in the content.
 *
 * @param {string} context - Prepared page context (title, URL, content).
 * @param provider - LLM provider exposing `completeJSON(prompt, schema)`.
 * @returns {Promise<object>} The structured response, validated against
 *   EntitiesSchema by the provider.
 */
async function extractEntities(context, provider) {
  const instruction = "Extract named entities from the following content. Identify people, organizations, technologies, locations, and key concepts:";
  const prompt = [instruction, "", context].join("\n");
  return provider.completeJSON(prompt, EntitiesSchema);
}
751
/**
 * Ask the LLM to classify the content into one of the known content types.
 *
 * @param {string} context - Prepared page context (title, URL, content).
 * @param provider - LLM provider exposing `completeJSON(prompt, schema)`.
 * @returns {Promise<object>} The structured response (content type and
 *   confidence), validated against ClassifySchema by the provider.
 */
async function classify(context, provider) {
  const promptLines = [
    "Classify the following content into one of these categories:",
    "- article: Blog post, news article, essay",
    "- repo: Code repository, open source project",
    "- docs: Documentation, API reference, guides",
    "- package: npm/pip package page",
    "- video: Video content, YouTube",
    "- tool: Software tool, web application",
    "- product: Commercial product, e-commerce",
    "",
    context,
  ];
  return provider.completeJSON(promptLines.join("\n"), ClassifySchema);
}
767
+
768
+ //#endregion
769
+ Object.defineProperty(exports, 'BaseHttpProvider', {
770
+ enumerable: true,
771
+ get: function () {
772
+ return BaseHttpProvider;
773
+ }
774
+ });
775
+ Object.defineProperty(exports, 'CircuitBreaker', {
776
+ enumerable: true,
777
+ get: function () {
778
+ return CircuitBreaker;
779
+ }
780
+ });
781
+ Object.defineProperty(exports, 'ClassifySchema', {
782
+ enumerable: true,
783
+ get: function () {
784
+ return ClassifySchema;
785
+ }
786
+ });
787
+ Object.defineProperty(exports, 'EntitiesSchema', {
788
+ enumerable: true,
789
+ get: function () {
790
+ return EntitiesSchema;
791
+ }
792
+ });
793
+ Object.defineProperty(exports, 'RateLimiter', {
794
+ enumerable: true,
795
+ get: function () {
796
+ return RateLimiter;
797
+ }
798
+ });
799
+ Object.defineProperty(exports, 'ScrapeError', {
800
+ enumerable: true,
801
+ get: function () {
802
+ return ScrapeError;
803
+ }
804
+ });
805
+ Object.defineProperty(exports, 'Semaphore', {
806
+ enumerable: true,
807
+ get: function () {
808
+ return Semaphore;
809
+ }
810
+ });
811
+ Object.defineProperty(exports, 'SummarySchema', {
812
+ enumerable: true,
813
+ get: function () {
814
+ return SummarySchema;
815
+ }
816
+ });
817
+ Object.defineProperty(exports, 'TagsSchema', {
818
+ enumerable: true,
819
+ get: function () {
820
+ return TagsSchema;
821
+ }
822
+ });
823
+ Object.defineProperty(exports, 'ask', {
824
+ enumerable: true,
825
+ get: function () {
826
+ return ask;
827
+ }
828
+ });
829
+ Object.defineProperty(exports, 'enhance', {
830
+ enumerable: true,
831
+ get: function () {
832
+ return enhance;
833
+ }
834
+ });
835
+ Object.defineProperty(exports, 'extract', {
836
+ enumerable: true,
837
+ get: function () {
838
+ return extract;
839
+ }
840
+ });
841
+ Object.defineProperty(exports, 'withResilience', {
842
+ enumerable: true,
843
+ get: function () {
844
+ return withResilience;
845
+ }
846
+ });
847
+ //# sourceMappingURL=enhancer-j0xqKDJm.cjs.map