scrapex 0.5.3 → 1.0.0-beta.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. package/LICENSE +1 -1
  2. package/README.md +551 -145
  3. package/dist/enhancer-ByjRD-t5.mjs +769 -0
  4. package/dist/enhancer-ByjRD-t5.mjs.map +1 -0
  5. package/dist/enhancer-j0xqKDJm.cjs +847 -0
  6. package/dist/enhancer-j0xqKDJm.cjs.map +1 -0
  7. package/dist/index-CDgcRnig.d.cts +268 -0
  8. package/dist/index-CDgcRnig.d.cts.map +1 -0
  9. package/dist/index-piS5wtki.d.mts +268 -0
  10. package/dist/index-piS5wtki.d.mts.map +1 -0
  11. package/dist/index.cjs +2007 -0
  12. package/dist/index.cjs.map +1 -0
  13. package/dist/index.d.cts +580 -0
  14. package/dist/index.d.cts.map +1 -0
  15. package/dist/index.d.mts +580 -0
  16. package/dist/index.d.mts.map +1 -0
  17. package/dist/index.mjs +1956 -0
  18. package/dist/index.mjs.map +1 -0
  19. package/dist/llm/index.cjs +334 -0
  20. package/dist/llm/index.cjs.map +1 -0
  21. package/dist/llm/index.d.cts +258 -0
  22. package/dist/llm/index.d.cts.map +1 -0
  23. package/dist/llm/index.d.mts +258 -0
  24. package/dist/llm/index.d.mts.map +1 -0
  25. package/dist/llm/index.mjs +317 -0
  26. package/dist/llm/index.mjs.map +1 -0
  27. package/dist/parsers/index.cjs +11 -0
  28. package/dist/parsers/index.d.cts +2 -0
  29. package/dist/parsers/index.d.mts +2 -0
  30. package/dist/parsers/index.mjs +3 -0
  31. package/dist/parsers-Bneuws8x.cjs +569 -0
  32. package/dist/parsers-Bneuws8x.cjs.map +1 -0
  33. package/dist/parsers-CwkYnyWY.mjs +482 -0
  34. package/dist/parsers-CwkYnyWY.mjs.map +1 -0
  35. package/dist/types-CadAXrme.d.mts +674 -0
  36. package/dist/types-CadAXrme.d.mts.map +1 -0
  37. package/dist/types-DPEtPihB.d.cts +674 -0
  38. package/dist/types-DPEtPihB.d.cts.map +1 -0
  39. package/package.json +79 -100
  40. package/dist/index.d.ts +0 -45
  41. package/dist/index.js +0 -8
  42. package/dist/scrapex.cjs.development.js +0 -1130
  43. package/dist/scrapex.cjs.development.js.map +0 -1
  44. package/dist/scrapex.cjs.production.min.js +0 -2
  45. package/dist/scrapex.cjs.production.min.js.map +0 -1
  46. package/dist/scrapex.esm.js +0 -1122
  47. package/dist/scrapex.esm.js.map +0 -1
@@ -0,0 +1,674 @@
1
+ import { CheerioAPI } from "cheerio";
2
+
3
+ //#region src/common/resilience.d.ts
4
+
5
+ /**
6
+ * Shared resilience utilities for HTTP providers.
7
+ * Provides retry, circuit breaker, rate limiting, timeout, and concurrency control.
8
+ */
9
+ /**
10
+ * Retry configuration.
11
+ */
12
+ interface RetryConfig$1 {
13
+ /** Maximum retry attempts. @default 3 */
14
+ maxAttempts?: number;
15
+ /** Initial backoff delay in ms. @default 1000 */
16
+ backoffMs?: number;
17
+ /** Backoff multiplier. @default 2 */
18
+ backoffMultiplier?: number;
19
+ /** HTTP status codes to retry. @default [408, 429, 500, 502, 503, 504] */
20
+ retryableStatuses?: number[];
21
+ }
22
+ /**
23
+ * Circuit breaker configuration.
24
+ */
25
+ interface CircuitBreakerConfig$1 {
26
+ /** Failures before opening circuit. @default 5 */
27
+ failureThreshold?: number;
28
+ /** Time in ms before attempting to close circuit. @default 30000 */
29
+ resetTimeoutMs?: number;
30
+ }
31
+ /**
32
+ * Rate limit configuration.
33
+ */
34
+ interface RateLimitConfig$1 {
35
+ /** Max requests per minute */
36
+ requestsPerMinute?: number;
37
+ /** Max tokens per minute (for LLM providers) */
38
+ tokensPerMinute?: number;
39
+ }
40
+ /**
41
+ * Circuit breaker state.
42
+ */
43
+ type CircuitState$1 = 'closed' | 'open' | 'half-open';
44
+ /**
45
+ * Resilience configuration for HTTP providers.
46
+ */
47
+ interface ResilienceConfig$1 {
48
+ retry?: RetryConfig$1;
49
+ circuitBreaker?: CircuitBreakerConfig$1;
50
+ rateLimit?: RateLimitConfig$1;
51
+ /** Request timeout in ms. @default 30000 */
52
+ timeoutMs?: number;
53
+ /** Max concurrent requests. @default 1 */
54
+ concurrency?: number;
55
+ /** Optional shared state for circuit breaker / rate limiter / semaphore */
56
+ state?: ResilienceState$1;
57
+ }
58
+ /**
59
+ * Shared resilience state for persistence across calls.
60
+ */
61
+ interface ResilienceState$1 {
62
+ circuitBreaker?: {
63
+ isOpen(): boolean;
64
+ recordSuccess(): void;
65
+ recordFailure(): void;
66
+ getState?(): CircuitState$1;
67
+ };
68
+ rateLimiter?: {
69
+ acquire(): Promise<void>;
70
+ };
71
+ semaphore?: {
72
+ execute<T>(fn: () => Promise<T>): Promise<T>;
73
+ };
74
+ }
75
+ //#endregion
76
+ //#region src/common/http-base.d.ts
77
+ /**
78
+ * Base configuration for HTTP providers.
79
+ */
80
+ interface BaseHttpConfig<TError = unknown> {
81
+ /** Base URL for the API endpoint */
82
+ baseUrl: string;
83
+ /** Model identifier */
84
+ model: string;
85
+ /** Additional headers */
86
+ headers?: Record<string, string>;
87
+ /** Extract error message from failed response */
88
+ errorMapper?: (response: TError) => string;
89
+ /** Security options */
90
+ requireHttps?: boolean;
91
+ allowPrivate?: boolean;
92
+ resolveDns?: boolean;
93
+ allowRedirects?: boolean;
94
+ /** Resilience options */
95
+ resilience?: ResilienceConfig$1;
96
+ }
97
+ /**
98
+ * Fetch request options for base provider.
99
+ */
100
+ interface FetchOptions$1 {
101
+ method?: 'GET' | 'POST';
102
+ body?: unknown;
103
+ headers?: Record<string, string>;
104
+ signal?: AbortSignal;
105
+ }
106
+ /**
107
+ * Result of a fetch request.
108
+ */
109
+ interface FetchResult$1<T> {
110
+ data: T;
111
+ status: number;
112
+ headers: Headers;
113
+ }
114
+ /**
115
+ * Base HTTP provider with shared security and resilience.
116
+ */
117
+ declare abstract class BaseHttpProvider<TError = unknown> {
118
+ protected readonly baseUrl: string;
119
+ protected readonly model: string;
120
+ protected readonly headers: Record<string, string>;
121
+ protected readonly errorMapper?: (response: TError) => string;
122
+ protected readonly requireHttps: boolean;
123
+ protected readonly allowPrivate: boolean;
124
+ protected readonly resolveDns: boolean;
125
+ protected readonly allowRedirects: boolean;
126
+ protected readonly timeoutMs: number;
127
+ protected readonly retryConfig?: RetryConfig$1;
128
+ protected readonly concurrency: number;
129
+ private circuitBreaker?;
130
+ private rateLimiter?;
131
+ private semaphore?;
132
+ constructor(config: BaseHttpConfig<TError>);
133
+ /**
134
+ * Get the current resilience state for persistence across calls.
135
+ */
136
+ getResilienceState(): ResilienceState$1;
137
+ /**
138
+ * Make an HTTP request with security and resilience.
139
+ */
140
+ protected fetch<T>(url: string, options?: FetchOptions$1): Promise<FetchResult$1<T>>;
141
+ }
142
+ //#endregion
143
+ //#region src/embeddings/providers/http.d.ts
144
+ /**
145
+ * HTTP embedding provider configuration.
146
+ */
147
+ interface HttpEmbeddingConfig<TRequest = unknown, TResponse = unknown, TError = unknown> extends BaseHttpConfig<TError> {
148
+ /**
149
+ * Build request body from input texts.
150
+ * @default { input: texts, model }
151
+ */
152
+ requestBuilder?: (texts: string[], model: string) => TRequest;
153
+ /**
154
+ * Extract embeddings array from response.
155
+ * @default (res) => res.data.map(d => d.embedding)
156
+ */
157
+ responseMapper?: (response: TResponse) => number[][];
158
+ }
159
+ /**
160
+ * Create a generic HTTP embedding provider.
161
+ */
162
+ declare function createHttpEmbedding<TRequest = unknown, TResponse = unknown, TError = unknown>(config: HttpEmbeddingConfig<TRequest, TResponse, TError>): EmbeddingProvider;
163
+ //#endregion
164
+ //#region src/embeddings/types.d.ts
165
+ /**
166
+ * Embedding provider configuration - discriminated union for type safety.
167
+ *
168
+ * Use preset factory functions to create providers:
169
+ * - `createOpenAIEmbedding()` - OpenAI API
170
+ * - `createAzureEmbedding()` - Azure OpenAI
171
+ * - `createOllamaEmbedding()` - Local Ollama
172
+ * - `createHuggingFaceEmbedding()` - HuggingFace Inference API
173
+ * - `createCohereEmbedding()` - Cohere API
174
+ * - `createTransformersEmbedding()` - Local Transformers.js
175
+ *
176
+ * @example Using a preset
177
+ * ```ts
178
+ * import { createOpenAIEmbedding } from 'scrapex/embeddings';
179
+ *
180
+ * const result = await scrape(url, {
181
+ * embeddings: {
182
+ * provider: { type: 'custom', provider: createOpenAIEmbedding() },
183
+ * },
184
+ * });
185
+ * ```
186
+ *
187
+ * @example Using inline HTTP config
188
+ * ```ts
189
+ * const result = await scrape(url, {
190
+ * embeddings: {
191
+ * provider: {
192
+ * type: 'http',
193
+ * config: {
194
+ * baseUrl: 'https://api.example.com/embed',
195
+ * model: 'custom-model',
196
+ * headers: { Authorization: 'Bearer ...' },
197
+ * },
198
+ * },
199
+ * },
200
+ * });
201
+ * ```
202
+ */
203
+ type EmbeddingProviderConfig = {
204
+ type: 'http';
205
+ config: HttpEmbeddingConfig;
206
+ } | {
207
+ type: 'custom';
208
+ provider: EmbeddingProvider;
209
+ };
210
+ /**
211
+ * Embedding provider interface - mirrors LLMProvider pattern.
212
+ */
213
+ interface EmbeddingProvider {
214
+ readonly name: string;
215
+ /**
216
+ * Generate embeddings for one or more texts.
217
+ */
218
+ embed(texts: string[], options: EmbedRequest): Promise<EmbedResponse>;
219
+ }
220
+ interface EmbedRequest {
221
+ /** Model to use. If undefined, provider uses its configured default. */
222
+ model?: string;
223
+ dimensions?: number;
224
+ signal?: AbortSignal;
225
+ }
226
+ interface EmbedResponse {
227
+ embeddings: number[][];
228
+ usage?: {
229
+ promptTokens: number;
230
+ totalTokens: number;
231
+ };
232
+ }
233
+ type EmbeddingInputType = 'textContent' | 'title+summary' | 'custom';
234
+ interface EmbeddingInputConfig {
235
+ /**
236
+ * Predefined input source. Ignored if `transform` is provided.
237
+ * @default 'textContent'
238
+ */
239
+ type?: EmbeddingInputType;
240
+ /**
241
+ * Custom function to generate input text from scraped data.
242
+ * Enables dynamic construction (e.g., "Combine price + title").
243
+ */
244
+ transform?: (data: Partial<ScrapedData>) => string;
245
+ /**
246
+ * Static custom input string. Used when type is 'custom'.
247
+ */
248
+ customText?: string;
249
+ }
250
+ interface ChunkingConfig {
251
+ /**
252
+ * Target chunk size in tokens.
253
+ * @default 500
254
+ */
255
+ size?: number;
256
+ /**
257
+ * Overlap between chunks in tokens.
258
+ * @default 50
259
+ */
260
+ overlap?: number;
261
+ /**
262
+ * Token counting strategy.
263
+ * - 'heuristic': chars / 4 (fast, approximate)
264
+ * - 'tiktoken': accurate for OpenAI models (lazy-loaded)
265
+ * - function: custom tokenizer
266
+ */
267
+ tokenizer?: 'heuristic' | 'tiktoken' | ((text: string) => number);
268
+ /**
269
+ * Hard cap on input length (characters) to prevent memory exhaustion.
270
+ * @default 100000 (100,000 characters)
271
+ */
272
+ maxInputLength?: number;
273
+ }
274
+ type EmbeddingAggregation = 'average' | 'max' | 'first' | 'all';
275
+ interface OutputConfig {
276
+ /**
277
+ * Aggregation strategy for chunk vectors.
278
+ * @default 'average'
279
+ */
280
+ aggregation?: EmbeddingAggregation;
281
+ /** Model-specific dimension override */
282
+ dimensions?: number;
283
+ }
284
+ interface PiiRedactionConfig {
285
+ /** Redact email addresses */
286
+ email?: boolean;
287
+ /** Redact phone numbers */
288
+ phone?: boolean;
289
+ /** Redact credit card numbers */
290
+ creditCard?: boolean;
291
+ /** Redact SSN patterns */
292
+ ssn?: boolean;
293
+ /** Redact IP addresses */
294
+ ipAddress?: boolean;
295
+ /** Additional patterns to redact */
296
+ customPatterns?: RegExp[];
297
+ }
298
+ interface SafetyConfig {
299
+ /**
300
+ * PII redaction patterns to apply before embedding.
301
+ * Critical for GDPR/CCPA compliance with third-party APIs.
302
+ */
303
+ piiRedaction?: PiiRedactionConfig;
304
+ /**
305
+ * Minimum text length to proceed with embedding.
306
+ * Skips with reason if below threshold.
307
+ */
308
+ minTextLength?: number;
309
+ /**
310
+ * Maximum tokens per API request to prevent billing DoS.
311
+ * @default 8192
312
+ */
313
+ maxTokens?: number;
314
+ /**
315
+ * Explicitly opt in to receive sensitive data in callbacks.
316
+ * When false (default), onChunk receives redacted content.
317
+ */
318
+ allowSensitiveCallbacks?: boolean;
319
+ }
320
+ interface EmbeddingCacheConfig {
321
+ /** Cache implementation */
322
+ store?: EmbeddingCache;
323
+ /** Time-to-live in milliseconds */
324
+ ttlMs?: number;
325
+ /** Maximum entries for in-memory cache */
326
+ maxEntries?: number;
327
+ /**
328
+ * Extra salt to disambiguate cache keys for custom providers/transforms.
329
+ */
330
+ cacheKeySalt?: string;
331
+ }
332
+ /**
333
+ * Content-addressable cache interface for embeddings.
334
+ * Keys are based on content hash, not URL.
335
+ */
336
+ interface EmbeddingCache {
337
+ get(key: string): Promise<EmbeddingResult | undefined>;
338
+ set(key: string, value: EmbeddingResult, options?: {
339
+ ttlMs?: number;
340
+ }): Promise<void>;
341
+ delete(key: string): Promise<boolean>;
342
+ clear(): Promise<void>;
343
+ }
344
+ interface RetryConfig {
345
+ /**
346
+ * Maximum retry attempts.
347
+ * @default 3
348
+ */
349
+ maxAttempts?: number;
350
+ /**
351
+ * Initial backoff delay in milliseconds.
352
+ * @default 1000
353
+ */
354
+ backoffMs?: number;
355
+ /**
356
+ * Backoff multiplier for exponential delay.
357
+ * @default 2
358
+ */
359
+ backoffMultiplier?: number;
360
+ }
361
+ interface CircuitBreakerConfig {
362
+ /**
363
+ * Number of failures before opening the circuit.
364
+ * @default 5
365
+ */
366
+ failureThreshold?: number;
367
+ /**
368
+ * Time to wait, in milliseconds, before attempting to close the circuit.
369
+ * @default 30000
370
+ */
371
+ resetTimeoutMs?: number;
372
+ }
373
+ interface RateLimitConfig {
374
+ /** Maximum requests per minute */
375
+ requestsPerMinute?: number;
376
+ /** Maximum tokens per minute */
377
+ tokensPerMinute?: number;
378
+ }
379
+ interface ResilienceState {
380
+ circuitBreaker?: {
381
+ isOpen(): boolean;
382
+ recordSuccess(): void;
383
+ recordFailure(): void;
384
+ getState?(): CircuitState;
385
+ };
386
+ rateLimiter?: {
387
+ acquire(): Promise<void>;
388
+ };
389
+ semaphore?: {
390
+ execute<T>(fn: () => Promise<T>): Promise<T>;
391
+ };
392
+ }
393
+ interface ResilienceConfig {
394
+ /** Retry configuration for transient failures */
395
+ retry?: RetryConfig;
396
+ /** Circuit breaker to prevent cascade failures */
397
+ circuitBreaker?: CircuitBreakerConfig;
398
+ /** Rate limiting per provider */
399
+ rateLimit?: RateLimitConfig;
400
+ /**
401
+ * Optional shared state for circuit breaker, rate limiter, and semaphore.
402
+ * Use to persist state across multiple calls.
403
+ */
404
+ state?: ResilienceState;
405
+ /**
406
+ * Request timeout in milliseconds.
407
+ * @default 30000
408
+ */
409
+ timeoutMs?: number;
410
+ /**
411
+ * Number of chunks to process concurrently.
412
+ * @default 1
413
+ */
414
+ concurrency?: number;
415
+ }
416
+ interface EmbeddingOptions {
417
+ /** Embedding provider configuration */
418
+ provider: EmbeddingProviderConfig;
419
+ /** Model identifier (overrides provider default) */
420
+ model?: string;
421
+ /** Input text configuration */
422
+ input?: EmbeddingInputConfig;
423
+ /** Chunking and tokenization settings */
424
+ chunking?: ChunkingConfig;
425
+ /** Output format and aggregation */
426
+ output?: OutputConfig;
427
+ /** Safety and compliance settings */
428
+ safety?: SafetyConfig;
429
+ /** Caching configuration */
430
+ cache?: EmbeddingCacheConfig;
431
+ /** Resilience and rate limiting */
432
+ resilience?: ResilienceConfig;
433
+ /**
434
+ * Callback for each chunk (receives redacted content by default).
435
+ */
436
+ onChunk?: (chunk: Readonly<string>, embedding: Readonly<number[]>) => void;
437
+ /**
438
+ * Metrics callback for observability.
439
+ */
440
+ onMetrics?: (metrics: EmbeddingMetrics) => void;
441
+ }
442
+ interface EmbeddingSource {
443
+ /** Model used for embedding (may be undefined for custom providers) */
444
+ model?: string;
445
+ /** Number of chunks processed */
446
+ chunks: number;
447
+ /** Total tokens processed */
448
+ tokens: number;
449
+ /** Content checksum for cache validation */
450
+ checksum: string;
451
+ /** Whether result was from cache */
452
+ cached: boolean;
453
+ /** Total latency in milliseconds */
454
+ latencyMs: number;
455
+ }
456
+ /**
457
+ * Successful embedding result with single aggregated vector.
458
+ */
459
+ interface EmbeddingSuccessSingle {
460
+ status: 'success';
461
+ aggregation: 'average' | 'max' | 'first';
462
+ vector: number[];
463
+ source: EmbeddingSource;
464
+ }
465
+ /**
466
+ * Successful embedding result with all chunk vectors.
467
+ */
468
+ interface EmbeddingSuccessMultiple {
469
+ status: 'success';
470
+ aggregation: 'all';
471
+ vectors: number[][];
472
+ source: EmbeddingSource;
473
+ }
474
+ /**
475
+ * Skipped embedding with reason.
476
+ */
477
+ interface EmbeddingSkipped {
478
+ status: 'skipped';
479
+ reason: string;
480
+ source: Partial<EmbeddingSource>;
481
+ }
482
+ /**
483
+ * Embedding result - discriminated union for type safety.
484
+ * Use `result.status` to narrow the type.
485
+ */
486
+ type EmbeddingResult = EmbeddingSuccessSingle | EmbeddingSuccessMultiple | EmbeddingSkipped;
487
+ interface EmbeddingMetrics {
488
+ /** Provider name */
489
+ provider: string;
490
+ /** Model used (may be undefined for custom providers) */
491
+ model?: string;
492
+ /** Input tokens processed */
493
+ inputTokens: number;
494
+ /** Output embedding dimensions */
495
+ outputDimensions: number;
496
+ /** Number of chunks processed */
497
+ chunks: number;
498
+ /** Total latency in milliseconds */
499
+ latencyMs: number;
500
+ /** Whether result was from cache */
501
+ cached: boolean;
502
+ /** Number of retry attempts */
503
+ retries: number;
504
+ /** Whether PII was redacted */
505
+ piiRedacted: boolean;
506
+ }
507
+ /**
508
+ * Internal chunk representation during processing.
509
+ */
510
+ interface TextChunk {
511
+ /** Chunk text content */
512
+ text: string;
513
+ /** Start position in original text */
514
+ startIndex: number;
515
+ /** End position in original text */
516
+ endIndex: number;
517
+ /** Estimated token count */
518
+ tokens: number;
519
+ }
520
+ /**
521
+ * Circuit breaker state.
522
+ */
523
+ type CircuitState = 'closed' | 'open' | 'half-open';
524
+ //#endregion
525
+ //#region src/core/types.d.ts
526
+ /**
527
+ * Content type classification for scraped URLs
528
+ */
529
+ type ContentType = 'article' | 'repo' | 'docs' | 'package' | 'video' | 'tool' | 'product' | 'unknown';
530
+ /**
531
+ * Extracted link from content
532
+ */
533
+ interface ExtractedLink {
534
+ url: string;
535
+ text: string;
536
+ isExternal: boolean;
537
+ }
538
+ /**
539
+ * Extracted entities from LLM enhancement
540
+ */
541
+ interface ExtractedEntities {
542
+ people: string[];
543
+ organizations: string[];
544
+ technologies: string[];
545
+ locations: string[];
546
+ concepts: string[];
547
+ }
548
+ /**
549
+ * Main result of metadata scraping - optimized for LLM consumption
550
+ */
551
+ interface ScrapedData {
552
+ url: string;
553
+ canonicalUrl: string;
554
+ domain: string;
555
+ title: string;
556
+ description: string;
557
+ image?: string;
558
+ favicon?: string;
559
+ content: string;
560
+ textContent: string;
561
+ excerpt: string;
562
+ wordCount: number;
563
+ author?: string;
564
+ publishedAt?: string;
565
+ modifiedAt?: string;
566
+ siteName?: string;
567
+ language?: string;
568
+ contentType: ContentType;
569
+ keywords: string[];
570
+ jsonLd?: Record<string, unknown>[];
571
+ links?: ExtractedLink[];
572
+ summary?: string;
573
+ suggestedTags?: string[];
574
+ entities?: ExtractedEntities;
575
+ extracted?: Record<string, unknown>;
576
+ custom?: Record<string, unknown>;
577
+ embeddings?: EmbeddingResult;
578
+ scrapedAt: string;
579
+ scrapeTimeMs: number;
580
+ error?: string;
581
+ }
582
+ /**
583
+ * LLM enhancement types
584
+ */
585
+ type EnhancementType = 'summarize' | 'tags' | 'entities' | 'classify';
586
+ /**
587
+ * Schema for structured LLM extraction
588
+ */
589
+ type ExtractionSchemaType = 'string' | 'number' | 'boolean' | 'string[]' | 'number[]' | `${string}?`;
590
+ type ExtractionSchema = Record<string, ExtractionSchemaType>;
591
+ /**
592
+ * Forward declaration for LLM provider (defined in llm/types.ts)
593
+ */
594
+ interface LLMProvider {
595
+ readonly name: string;
596
+ complete(prompt: string, options?: CompletionOptions): Promise<string>;
597
+ completeJSON<T>(prompt: string, schema: unknown): Promise<T>;
598
+ }
599
+ interface CompletionOptions {
600
+ maxTokens?: number;
601
+ temperature?: number;
602
+ systemPrompt?: string;
603
+ }
604
+ /**
605
+ * Forward declaration for Fetcher (defined in fetchers/types.ts)
606
+ */
607
+ interface Fetcher {
608
+ readonly name: string;
609
+ fetch(url: string, options: FetchOptions): Promise<FetchResult>;
610
+ }
611
+ interface FetchOptions {
612
+ timeout?: number;
613
+ userAgent?: string;
614
+ headers?: Record<string, string>;
615
+ }
616
+ interface FetchResult {
617
+ html: string;
618
+ finalUrl: string;
619
+ statusCode: number;
620
+ contentType: string;
621
+ headers?: Record<string, string>;
622
+ }
623
+ /**
624
+ * Forward declaration for Extractor (defined in extractors/types.ts)
625
+ */
626
+ interface Extractor {
627
+ readonly name: string;
628
+ readonly priority?: number;
629
+ extract(context: ExtractionContext): Promise<Partial<ScrapedData>>;
630
+ }
631
+ /**
632
+ * Shared context passed to all extractors
633
+ */
634
+ interface ExtractionContext {
635
+ url: string;
636
+ finalUrl: string;
637
+ html: string;
638
+ $: CheerioAPI;
639
+ getDocument(): Document;
640
+ results: Partial<ScrapedData>;
641
+ options: ScrapeOptions;
642
+ }
643
+ /**
644
+ * Options for scraping
645
+ */
646
+ interface ScrapeOptions {
647
+ /** Timeout in milliseconds (default: 10000) */
648
+ timeout?: number;
649
+ /** User agent string */
650
+ userAgent?: string;
651
+ /** Whether to extract full content (default: true) */
652
+ extractContent?: boolean;
653
+ /** Maximum content length in characters (default: 50000) */
654
+ maxContentLength?: number;
655
+ /** Custom fetcher (for Puppeteer/Playwright) */
656
+ fetcher?: Fetcher;
657
+ /** Custom extractors to run */
658
+ extractors?: Extractor[];
659
+ /** If true, only run custom extractors (replace defaults) */
660
+ replaceDefaultExtractors?: boolean;
661
+ /** Check robots.txt before scraping (default: false) */
662
+ respectRobots?: boolean;
663
+ /** LLM provider for enhancements */
664
+ llm?: LLMProvider;
665
+ /** LLM enhancement types to run */
666
+ enhance?: EnhancementType[];
667
+ /** Schema for structured LLM extraction */
668
+ extract?: ExtractionSchema;
669
+ /** Embedding generation options */
670
+ embeddings?: EmbeddingOptions;
671
+ }
672
+ //#endregion
673
+ export { EmbeddingSuccessMultiple as A, BaseHttpProvider as B, EmbeddingMetrics as C, EmbeddingResult as D, EmbeddingProviderConfig as E, SafetyConfig as F, TextChunk as I, HttpEmbeddingConfig as L, OutputConfig as M, PiiRedactionConfig as N, EmbeddingSkipped as O, ResilienceConfig as P, createHttpEmbedding as R, EmbeddingInputConfig as S, EmbeddingProvider as T, EmbedRequest as _, ExtractedLink as a, EmbeddingCache as b, ExtractionSchemaType as c, FetchResult as d, Fetcher as f, ChunkingConfig as g, ScrapedData as h, ExtractedEntities as i, EmbeddingSuccessSingle as j, EmbeddingSource as k, Extractor as l, ScrapeOptions as m, ContentType as n, ExtractionContext as o, LLMProvider as p, EnhancementType as r, ExtractionSchema as s, CompletionOptions as t, FetchOptions as u, EmbedResponse as v, EmbeddingOptions as w, EmbeddingCacheConfig as x, EmbeddingAggregation as y, BaseHttpConfig as z };
674
+ //# sourceMappingURL=types-DPEtPihB.d.cts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types-DPEtPihB.d.cts","names":[],"sources":["../src/common/resilience.ts","../src/common/http-base.ts","../src/embeddings/providers/http.ts","../src/embeddings/types.ts","../src/core/types.ts"],"sourcesContent":[],"mappings":";;;;;;;AAYA;AAcA;AAUA;AAUA;AAeiB,UAjDA,aAAA,CAiDgB;EACvB;EACS,WAAA,CAAA,EAAA,MAAA;EACL;EAMJ,SAAA,CAAA,EAAA,MAAA;EAAe;EAMR,iBAAA,CAAA,EAAe,MAAA;EAKf;EAGF,iBAAA,CAAA,EAAA,MAAA,EAAA;;;;;AAG8B,UA7D5B,sBAAA,CA6D4B;;;;EC3B5B,cAAA,CAAA,EAAc,MAAA;;;;;AA+Hd,UDvJA,iBAAA,CC0JL;EAOK;EASK,iBAAA,CAAA,EAAgB,MAAA;EAGR;EACgB,eAAA,CAAA,EAAA,MAAA;;;;;AAwEG,KD5OrC,cAAA,GC4OqC,QAAA,GAAA,MAAA,GAAA,WAAA;;;;ACzQzB,UF4CP,kBAAA,CE5CO;EAwGR,KAAA,CAAA,EF3DN,aE2DM;EACc,cAAA,CAAA,EF3DX,sBE2DW;EAAU,SAAA,CAAA,EF1D1B,iBE0D0B;EAAW;EAAzC,SAAA,CAAA,EAAA,MAAA;EACP;EAAiB,WAAA,CAAA,EAAA,MAAA;;UFrDV;;AGzBV;AAcA;;AAKyD,UHYxC,iBAAA,CGZwC;EAAR,cAAA,CAAA,EAAA;IAAO,MAAA,EAAA,EAAA,OAAA;IAGvC,aAAY,EAAA,EAAA,IAAA;IAOZ,aAAa,EAAA,EAAA,IAAA;IAYlB,QAAA,GAAA,EHLK,cGKa;EAEb,CAAA;EAKR,WAAA,CAAA,EAAA;IAKoB,OAAA,EAAA,EHdd,OGcc,CAAA,IAAA,CAAA;EAAR,CAAA;EAAO,SAAA,CAAA,EAAA;IAWX,OAAA,CAAA,CAAA,CAAA,CAAA,EAAc,EAAA,GAAA,GHtBN,OGsBM,CHtBE,CGsBF,CAAA,CAAA,EHtBO,OGsBP,CHtBe,CGsBf,CAAA;EA6BnB,CAAA;AAMZ;;;;;;AHzDsC,UC3BrB,cD2BqB,CAAA,SAAA,OAAA,CAAA,CAAA;EAAO;;;;EC3B5B;EAML,OAAA,CAAA,EAAA,MAAA,CAAA,MAAA,EAAA,MAAA,CAAA;EAEe;EAOZ,WAAA,CAAA,EAAA,CAAA,QAAA,EAPY,MAOZ,EAAA,GAAA,MAAA;EAAgB;EAgHd,YAAA,CAAA,EAAA,OAAY;EAUZ,YAAA,CAAA,EAAA,OAAW;EASN,UAAA,CAAA,EAAA,OAAgB;EAGR,cAAA,CAAA,EAAA,OAAA;EACgB;EAUX,UAAA,CAAA,EAjJpB,kBAiJoB;;;;;AC3MX,UD0KP,cAAA,CC1KO;EAwGR,MAAA,CAAA,EAAA,KAAA,GAAA,MAAmB;EACL,IAAA,CAAA,EAAA,OAAA;EAAU,OAAA,CAAA,EDoE5B,MCpE4B,CAAA,MAAA,EAAA,MAAA,CAAA;EAAW,MAAA,CAAA,EDqExC,WCrEwC;;;;;UD2ElC;QACT;EEzJI,MAAA,EAAA,MAAA;EAcK,OAAA,EF6IN,OE7IM;;;;;AAQA,uBF2IK,gBEvIA,CAAA,SAAA,OAAA,CAAA,CAAA;EAGL,mBAAa,OAAA,EAAA,MAAA;EAYlB,mBAAA,KAAkB,EAAA,MAAA;EAEb,mBAAA,OAAoB,EFyHP,MEzHO,CAAA,MAAA,EAAA,MAAA,CAAA;EAK5B,mBAAA,WAAA,CAAA,EAAA,CAAA,QAAA,EFqHqC,MErHrC,EAAA,GAAA,MAAA;EAKoB,mBAAA,YAAA,EAAA,OAAA
;EAAR,mBAAA,YAAA,EAAA,OAAA;EAAO,mBAAA,UAAA,EAAA,OAAA;EAWX,mBAAc,cAAA,EAAA,OAAA;EA6BnB,mBAAA,SAAoB,EAAA,MAAA;EAMf,mBAAY,WAKb,CAAA,EFuEmB,aEvEnB;EASC,mBAAA,WAAkB,EAAA,MAYhB;EAGF,QAAA,cAAY;EA2BZ,QAAA,WAAA;EAiBA,QAAA,SAAc;EACH,WAAA,CAAA,MAAA,EFUN,cEVM,CFUS,MEVT,CAAA;EAAR;;;EAEG,kBAAA,CAAA,CAAA,EFmDC,iBEnDD;EACZ;;AAOX;EAkBiB,UAAA,KAAA,CAAA,CAAA,CAAA,CAAA,GAAA,EAAoB,MAAA,EAAA,OAAA,CAAA,EFoCY,cEpCZ,CAAA,EFoCgC,OEpChC,CFoCwC,aEpCxC,CFoCoD,CEpCpD,CAAA,CAAA;AAarC;;;AHrNA;AAeA;;AAEmB,UE/CF,mBF+CE,CAAA,WAAA,OAAA,EAAA,YAAA,OAAA,EAAA,SAAA,OAAA,CAAA,SE9CT,cF8CS,CE9CM,MF8CN,CAAA,CAAA;EACL;;;AAYd;EAKiB,cAAA,CAAA,EAAA,CAAA,KAAA,EAAA,MAAA,EAAA,EAAA,KAAA,EAAA,MAAA,EAAA,GE3DsC,QF2DtC;EAGF;;;;EAGuB,cAAA,CAAA,EAAA,CAAA,QAAA,EE5DR,SF4DQ,EAAA,GAAA,MAAA,EAAA,EAAA;;;;;ACmMW,iBCjKjC,mBDiKiC,CAAA,WAAA,OAAA,EAAA,YAAA,OAAA,EAAA,SAAA,OAAA,CAAA,CAAA,MAAA,EChKvC,mBDgKuC,CChKnB,QDgKmB,EChKT,SDgKS,EChKE,MDgKF,CAAA,CAAA,EC/J9C,iBD+J8C;;;;AD9QjD;AAcA;AAUA;AAUA;AAeA;;;;;;AAeA;;;;;;;;;;;AChBA;;;;;AA+HA;AAUA;AASA;;;;;;;;;AA4E6E,KE7OjE,uBAAA,GF6OiE;EAAR,IAAA,EAAA,MAAA;EAAO,MAAA,EE5OhD,mBF4OgD;;;YE3O5C;AD/BhC,CAAA;;;;AACwB,UC0CP,iBAAA,CD1CO;EAwGR,SAAA,IAAA,EAAA,MAAA;EACc;;;EAApB,KAAA,CAAA,KAAA,EAAA,MAAA,EAAA,EAAA,OAAA,EC1DwB,YD0DxB,CAAA,EC1DuC,OD0DvC,CC1D+C,aD0D/C,CAAA;;AACU,UCxDH,YAAA,CDwDG;;;;EC9ER,MAAA,CAAA,EA0BD,WA1BC;AAcZ;AAKkC,UAUjB,aAAA,CAViB;EAAuB,UAAA,EAAA,MAAA,EAAA,EAAA;EAAR,KAAA,CAAA,EAAA;IAAO,YAAA,EAAA,MAAA;IAGvC,WAAY,EAAA,MAAA;EAOZ,CAAA;AAYjB;AAEiB,KAFL,kBAAA,GAEyB,aAAA,GAAA,eAAA,GAAA,QAAA;AAK5B,UALQ,oBAAA,CAKR;EAKoB;;;AAW7B;EA6BY,IAAA,CAAA,EA7CH,kBA6CuB;EAMf;AAcjB;AAeA;AA2BA;EAiBiB,SAAA,CAAA,EAAA,CAAA,IAAc,EAvHV,OAuHU,CAvHF,WAuHE,CAAA,EAAA,GAAA,MAAA;EACH;;;EAC8C,UAAA,CAAA,EAAA,MAAA;;AAE/D,UAhHM,cAAA,CAgHN;EAAO;AAOlB;AAkBA;AAaA;EAOiB,IAAA,CAAA,EAAA,MAAA;EAKA;;;;EAM6B,OAAA,CAAA,EAAA,MAAA;EAAR;;AAItC;;;;EAWU,SAAA,CAAA,EAAA,WAAA,GAAA,UAAA,GAAA,CAAA,CAAA,IAAA,EAAA,MAAA,EAAA,GAAA,MAAA,CAAA;EAAe;AAiBzB;;;EAQa,cAAA,CAAA,EAAA,MAAA;;AAIF,KAvLC,oBAAA,GAuLD,SAAA,GAAA,KAAA,GAAA,OAAA,GAAA,KAAA;AAED,UAnLO,YAA
A,CAmLP;EAEK;;;;EAQyB,WAAA,CAAA,EAxLxB,oBAwLwB;EAOvB;EAkBA,UAAA,CAAA,EAAA,MAAA;AAUjB;AAUiB,UA5NA,kBAAA,CA+NC;EAON;EAAkB,KAAA,CAAA,EAAA,OAAA;EAAyB;EAA2B,KAAA,CAAA,EAAA,OAAA;EAAgB;EAMjF,UAAA,CAAA,EAAA,OAAgB;EA4BhB;EAcL,GAAA,CAAA,EAAA,OAAA;;;;EC9aA,cAAW,CAAA,EDoKJ,MCpKI,EAAA;AAavB;AASiB,UDiJA,YAAA,CCjJiB;EAWjB;;;;EAsCJ,YAAA,CAAA,EDqGI,kBCrGJ;EACC;;;;EAiBF,aAAA,CAAA,EAAA,MAAe;EAKf;AAQZ;AAKA;;EAEyD,SAAA,CAAA,EAAA,MAAA;EACG;;;AAG5D;EASiB,uBAAO,CAAA,EAAA,OAAA;;AAE6B,UDsEpC,oBAAA,CCtEoC;EAAR;EAAO,KAAA,CAAA,EDwE1C,cCxE0C;EAGnC;EAMA,KAAA,CAAA,EAAA,MAAW;EAWX;EAGE,UAAA,CAAA,EAAA,MAAA;EAAoC;;;EAAT,YAAA,CAAA,EAAA,MAAA;AAM9C;;;;;AAgBW,UD0CM,cAAA,CC1CN;EAAa,GAAA,CAAA,GAAA,EAAA,MAAA,CAAA,ED2CJ,OC3CI,CD2CI,eC3CJ,GAAA,SAAA,CAAA;EAMP,GAAA,CAAA,GAAA,EAAA,MAAA,EAAa,KAAA,EDsCJ,eCtCI,EAAA,OAiBf,CAjBe,EAAA;IAclB,KAAA,CAAA,EAAA,MAAA;EAGG,CAAA,CAAA,EDqB2D,OCrB3D,CAAA,IAAA,CAAA;EASP,MAAA,CAAA,GAAA,EAAA,MAAA,CAAA,EDae,OCbf,CAAA,OAAA,CAAA;EAGI,KAAA,EAAA,EDWD,OCXC,CAAA,IAAA,CAAA;;AAMG,UDYE,WAAA,CCZF;EAAgB;;;;;;;;;;;;;;;;UD8Bd,oBAAA;;;;;;;;;;;;UAaA,eAAA;;;;;;UAOA,eAAA;;;;;iBAKA;;;eAGF;;;yBAGU,QAAQ,KAAK,QAAQ;;;UAI7B,gBAAA;;UAEP;;mBAES;;cAEL;;;;;UAKJ;;;;;;;;;;;;UAiBO,gBAAA;;YAEL;;;;UAIF;;aAEG;;WAEF;;WAEA;;UAED;;eAEK;;;;oBAIK,6BAA6B;;;;wBAIzB;;UAOP,eAAA;;;;;;;;;;;;;;;;;UAkBA,sBAAA;;;;UAIP;;;;;UAMO,wBAAA;;;;UAIP;;;;;UAMO,gBAAA;;;UAGP,QAAQ;;;;;;KAON,eAAA,GAAkB,yBAAyB,2BAA2B;UAMjE,gBAAA;;;;;;;;;;;;;;;;;;;;;;;UA4BA,SAAA;;;;;;;;;;;;;KAcL,YAAA;;;;AHxaZ;AAcA;AAUiB,KI9BL,WAAA,GJ8BoB,SAAA,GAAA,MAAA,GAAA,MAAA,GAAA,SAAA,GAAA,OAAA,GAAA,MAAA,GAAA,SAAA,GAAA,SAAA;AAUhC;AAeA;;AAEmB,UI5CF,aAAA,CJ4CE;EACL,GAAA,EAAA,MAAA;EAMJ,IAAA,EAAA,MAAA;EAAe,UAAA,EAAA,OAAA;AAMzB;;;;AAWyB,UI3DR,iBAAA,CJ2DQ;EAAqB,MAAA,EAAA,MAAA,EAAA;EAAR,aAAA,EAAA,MAAA,EAAA;EAAO,YAAA,EAAA,MAAA,EAAA;;;;AC3B7C;;;AAee,UGpCE,WAAA,CHoCF;EAAgB,GAAA,EAAA,MAAA;EAgHd,YAAA,EAAA,MAAY;EAUZ,MAAA,EAAA,MAAA;EASK,KAAA,EAAA,MAAA;EAGQ,WAAA,EAAA,MAAA;EACgB,KAAA,CAAA,EAAA,MAAA;EAUX,OAAA,CAAA,EAAA,MAAA;EAQE,OAAA,EAAA,MAAA;EAAf,WAAA,EAAA,MAAA;EA2CE,OAAA,EAAA,MAAA;EAWyB,SA
AA,EAAA,MAAA;EAAwC,MAAA,CAAA,EAAA,MAAA;EAAZ,WAAA,CAAA,EAAA,MAAA;EAAR,UAAA,CAAA,EAAA,MAAA;EAAO,QAAA,CAAA,EAAA,MAAA;;eGzN7D;;EFjDE,MAAA,CAAA,EEqDN,MFrDM,CAAA,MAAmB,EAAA,OAAA,CAAA,EAAA;EACX,KAAA,CAAA,EEuDf,aFvDe,EAAA;EAK8B,OAAA,CAAA,EAAA,MAAA;EAKzB,aAAA,CAAA,EAAA,MAAA,EAAA;EAVpB,QAAA,CAAA,EE4DG,iBF5DH;EAAc,SAAA,CAAA,EE6DV,MF7DU,CAAA,MAAA,EAAA,OAAA,CAAA;EAwGR,MAAA,CAAA,EExCL,MFwCK,CAAA,MAAmB,EAAA,OAAA,CAAA;EACL,UAAA,CAAA,EEtCf,eFsCe;EAAU,SAAA,EAAA,MAAA;EAAW,YAAA,EAAA,MAAA;EAAzC,KAAA,CAAA,EAAA,MAAA;;;;;KE3BE,eAAA;ADlDZ;AAcA;;AAKyD,KCoC7C,oBAAA,GDpC6C,QAAA,GAAA,QAAA,GAAA,SAAA,GAAA,UAAA,GAAA,UAAA,GAAA,GAAA,MAAA,GAAA;AAAR,KC4CrC,gBAAA,GAAmB,MD5CkB,CAAA,MAAA,EC4CH,oBD5CG,CAAA;;AAGjD;AAOA;AAYY,UC2BK,WAAA,CD3Ba;EAEb,SAAA,IAAA,EAAA,MAAA;EAKR,QAAA,CAAA,MAAA,EAAA,MAAA,EAAA,OAAA,CAAA,ECsB4B,iBDtB5B,CAAA,ECsBgD,ODtBhD,CAAA,MAAA,CAAA;EAKoB,YAAA,CAAA,CAAA,CAAA,CAAA,MAAA,EAAA,MAAA,EAAA,MAAA,EAAA,OAAA,CAAA,ECkBuB,ODlBvB,CCkB+B,CDlB/B,CAAA;;AAAD,UCqBX,iBAAA,CDrBW;EAWX,SAAA,CAAA,EAAA,MAAc;EA6BnB,WAAA,CAAA,EAAA,MAAA;EAMK,YAAA,CAAA,EAAY,MAAA;AAc7B;AAeA;AA2BA;AAiBA;AAC4B,UC1FX,OAAA,CD0FW;EAAR,SAAA,IAAA,EAAA,MAAA;EACM,KAAA,CAAA,GAAA,EAAA,MAAA,EAAA,OAAA,ECzFI,YDyFJ,CAAA,ECzFmB,ODyFnB,CCzF2B,WDyF3B,CAAA;;AACH,UCvFN,YAAA,CDuFM;EACZ,OAAA,CAAA,EAAA,MAAA;EAAO,SAAA,CAAA,EAAA,MAAA;EAOD,OAAA,CAAA,EC5FL,MD4FgB,CAAA,MAAA,EAAA,MAAA,CAAA;AAkB5B;AAaiB,UCxHA,WAAA,CDwHe;EAOf,IAAA,EAAA,MAAA;EAKA,QAAA,EAAA,MAAA;EAGF,UAAA,EAAA,MAAA;EAGkB,WAAA,EAAA,MAAA;EAAR,OAAA,CAAA,ECrIb,MDqIa,CAAA,MAAA,EAAA,MAAA,CAAA;;;;AAIzB;AAEU,UCrIO,SAAA,CDqIP;EAES,SAAA,IAAA,EAAA,MAAA;EAEL,SAAA,QAAA,CAAA,EAAA,MAAA;EAKJ,OAAA,CAAA,OAAA,EC3IS,iBD2IT,CAAA,EC3I6B,OD2I7B,CC3IqC,OD2IrC,CC3I6C,WD2I7C,CAAA,CAAA;;AAiBV;;;AAQa,UC9JI,iBAAA,CD8JJ;EAEF,GAAA,EAAA,MAAA;EAEA,QAAA,EAAA,MAAA;EAED,IAAA,EAAA,MAAA;EAEK,CAAA,EC/JV,UD+JU;EAIK,WAAA,EAAA,EChKH,QDgKG;EAA6B,OAAA,EC7JtC,OD6JsC,CC7J9B,WD6J8B,CAAA;EAIzB,OAAA,EC9Jb,aD8Ja;;AAOxB;AAkBA;AAUA;AAUiB,UCrMA,aAAA,CDqMgB;EAUrB;EAAkB,OAAA,CAAA,EAAA,MAAA;EAAyB;EAA2B,SAAA,CAAA,EAAA,MAAA;EAAgB;EAMjF,cAAA,CAAA,EAAA
,OAAgB;EA4BhB;EAcL,gBAAY,CAAA,EAAA,MAAA;;YCjPZ;;EA7LA,UAAA,CAAA,EAgMG,SAhMQ,EAAA;EAaN;EASA,wBAAiB,CAAA,EAAA,OAAA;EAWjB;EA0BF,aAAA,CAAA,EAAA,OAAA;EAIJ;EAGD,GAAA,CAAA,EAuIF,WAvIE;EAKG;EACC,OAAA,CAAA,EAoIF,eApIE,EAAA;EAGH;EAGI,OAAA,CAAA,EAiIH,gBAjIG;EAAe;EAWlB,UAAA,CAAA,EAyHG,gBAzHY;AAK3B"}