scrapex 1.0.0-alpha.1 → 1.0.0-beta.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. package/README.md +164 -5
  2. package/dist/embeddings/index.cjs +52 -0
  3. package/dist/embeddings/index.d.cts +3 -0
  4. package/dist/embeddings/index.d.mts +3 -0
  5. package/dist/embeddings/index.mjs +4 -0
  6. package/dist/embeddings-BjNTQSG9.cjs +1455 -0
  7. package/dist/embeddings-BjNTQSG9.cjs.map +1 -0
  8. package/dist/embeddings-Bsymy_jA.mjs +1215 -0
  9. package/dist/embeddings-Bsymy_jA.mjs.map +1 -0
  10. package/dist/{enhancer-oM4BhYYS.cjs → enhancer-Cs_WyWtJ.cjs} +2 -51
  11. package/dist/enhancer-Cs_WyWtJ.cjs.map +1 -0
  12. package/dist/{enhancer-Q6CSc1gA.mjs → enhancer-INx5NlgO.mjs} +2 -45
  13. package/dist/enhancer-INx5NlgO.mjs.map +1 -0
  14. package/dist/http-base-CHLf-Tco.cjs +684 -0
  15. package/dist/http-base-CHLf-Tco.cjs.map +1 -0
  16. package/dist/http-base-DM7YNo6X.mjs +618 -0
  17. package/dist/http-base-DM7YNo6X.mjs.map +1 -0
  18. package/dist/index-Bvseqli-.d.cts +268 -0
  19. package/dist/index-Bvseqli-.d.cts.map +1 -0
  20. package/dist/index-CIFjNySr.d.mts +268 -0
  21. package/dist/index-CIFjNySr.d.mts.map +1 -0
  22. package/dist/index-D6qfjmZQ.d.mts +401 -0
  23. package/dist/index-D6qfjmZQ.d.mts.map +1 -0
  24. package/dist/index-RFSpP5g8.d.cts +401 -0
  25. package/dist/index-RFSpP5g8.d.cts.map +1 -0
  26. package/dist/index.cjs +171 -51
  27. package/dist/index.cjs.map +1 -1
  28. package/dist/index.d.cts +61 -2
  29. package/dist/index.d.cts.map +1 -1
  30. package/dist/index.d.mts +61 -2
  31. package/dist/index.d.mts.map +1 -1
  32. package/dist/index.mjs +129 -6
  33. package/dist/index.mjs.map +1 -1
  34. package/dist/llm/index.cjs +252 -233
  35. package/dist/llm/index.cjs.map +1 -1
  36. package/dist/llm/index.d.cts +132 -85
  37. package/dist/llm/index.d.cts.map +1 -1
  38. package/dist/llm/index.d.mts +132 -85
  39. package/dist/llm/index.d.mts.map +1 -1
  40. package/dist/llm/index.mjs +244 -236
  41. package/dist/llm/index.mjs.map +1 -1
  42. package/dist/parsers/index.cjs +10 -199
  43. package/dist/parsers/index.d.cts +2 -133
  44. package/dist/parsers/index.d.mts +2 -133
  45. package/dist/parsers/index.mjs +2 -191
  46. package/dist/parsers-Bneuws8x.cjs +569 -0
  47. package/dist/parsers-Bneuws8x.cjs.map +1 -0
  48. package/dist/parsers-DsawHeo0.mjs +482 -0
  49. package/dist/parsers-DsawHeo0.mjs.map +1 -0
  50. package/dist/types-BOcHQU9s.d.mts +831 -0
  51. package/dist/types-BOcHQU9s.d.mts.map +1 -0
  52. package/dist/types-DutdBpqd.d.cts +831 -0
  53. package/dist/types-DutdBpqd.d.cts.map +1 -0
  54. package/package.json +15 -16
  55. package/dist/enhancer-Q6CSc1gA.mjs.map +0 -1
  56. package/dist/enhancer-oM4BhYYS.cjs.map +0 -1
  57. package/dist/parsers/index.cjs.map +0 -1
  58. package/dist/parsers/index.d.cts.map +0 -1
  59. package/dist/parsers/index.d.mts.map +0 -1
  60. package/dist/parsers/index.mjs.map +0 -1
  61. package/dist/types-CNQZVW36.d.mts +0 -150
  62. package/dist/types-CNQZVW36.d.mts.map +0 -1
  63. package/dist/types-D0HYR95H.d.cts +0 -150
  64. package/dist/types-D0HYR95H.d.cts.map +0 -1
@@ -0,0 +1,831 @@
1
+ import { CheerioAPI } from "cheerio";
2
+
3
+ //#region src/core/types.d.ts
4
+
5
+ /**
6
+ * Content type classification for scraped URLs (this is the type of ScrapedData.contentType).
7
+ */
8
+ type ContentType = 'article' | 'repo' | 'docs' | 'package' | 'video' | 'tool' | 'product' | 'unknown';
9
+ /**
10
+ * A link extracted from scraped content; ScrapedData.links is an array of these.
11
+ */
12
+ interface ExtractedLink {
13
+ url: string;
14
+ text: string;
15
+ isExternal: boolean; // NOTE(review): presumably judged against the scraped page's domain — confirm
16
+ }
17
+ /**
18
+ * Entities extracted by LLM enhancement, one string list per category (type of ScrapedData.entities).
19
+ */
20
+ interface ExtractedEntities {
21
+ people: string[];
22
+ organizations: string[];
23
+ technologies: string[];
24
+ locations: string[];
25
+ concepts: string[];
26
+ }
27
+ /**
28
+ * Main result of metadata scraping - optimized for LLM consumption. Members marked `?` may be absent from a result.
29
+ */
30
+ interface ScrapedData {
31
+ url: string;
32
+ canonicalUrl: string;
33
+ domain: string;
34
+ title: string;
35
+ description: string;
36
+ image?: string;
37
+ favicon?: string;
38
+ content: string;
39
+ textContent: string;
40
+ excerpt: string;
41
+ wordCount: number;
42
+ author?: string;
43
+ publishedAt?: string;
44
+ modifiedAt?: string;
45
+ siteName?: string;
46
+ language?: string;
47
+ contentType: ContentType; // always present; the union includes 'unknown'
48
+ keywords: string[];
49
+ jsonLd?: Record<string, unknown>[];
50
+ links?: ExtractedLink[];
51
+ summary?: string;
52
+ suggestedTags?: string[];
53
+ entities?: ExtractedEntities; // documented on ExtractedEntities as coming from LLM enhancement
54
+ extracted?: Record<string, unknown>; // NOTE(review): presumably the output of ScrapeOptions.extract — confirm
55
+ custom?: Record<string, unknown>;
56
+ embeddings?: EmbeddingResult; // discriminated union; check `status` before reading vectors
57
+ scrapedAt: string; // NOTE(review): timestamp format is not specified by this type — confirm
58
+ scrapeTimeMs: number; // NOTE(review): presumably elapsed scrape time in milliseconds — confirm
59
+ error?: string;
60
+ }
61
+ /**
62
+ * LLM enhancement types; ScrapeOptions.enhance takes an array of these.
63
+ */
64
+ type EnhancementType = 'summarize' | 'tags' | 'entities' | 'classify';
65
+ /**
66
+ * Schema for structured LLM extraction (passed as ScrapeOptions.extract): a map from field name to a type tag.
67
+ */
68
+ type ExtractionSchemaType = 'string' | 'number' | 'boolean' | 'string[]' | 'number[]' | `${string}?`; // the template member admits ANY string ending in '?' (presumably an optional marker — confirm)
69
+ type ExtractionSchema = Record<string, ExtractionSchemaType>;
70
+ /**
71
+ * Forward declaration for LLM provider (defined in llm/types.ts); supplied to a scrape through ScrapeOptions.llm.
72
+ */
73
+ interface LLMProvider {
74
+ readonly name: string;
75
+ complete(prompt: string, options?: CompletionOptions): Promise<string>; // `options` may be omitted
76
+ completeJSON<T>(prompt: string, schema: unknown): Promise<T>; // `schema` is untyped here; T is chosen by the caller and is not tied to `schema` by this signature
77
+ }
78
+ interface CompletionOptions { // optional settings accepted by LLMProvider.complete(); every field may be omitted
79
+ maxTokens?: number;
80
+ temperature?: number;
81
+ systemPrompt?: string;
82
+ }
83
+ /**
84
+ * Forward declaration for Fetcher (defined in fetchers/types.ts); a custom one can be passed as ScrapeOptions.fetcher.
85
+ */
86
+ interface Fetcher {
87
+ readonly name: string;
88
+ fetch(url: string, options: FetchOptions$1): Promise<FetchResult$1>; // `options` itself is required, though each of its fields is optional
89
+ }
90
+ interface FetchOptions$1 { // options argument of Fetcher.fetch(); distinct from the FetchOptions interface used by BaseHttpProvider
91
+ timeout?: number; // NOTE(review): unit not stated here; ScrapeOptions.timeout is documented in milliseconds — confirm they match
92
+ userAgent?: string;
93
+ headers?: Record<string, string>;
94
+ }
95
+ interface FetchResult$1 { // value that a Fetcher.fetch() promise resolves to; distinct from the generic FetchResult<T> used by BaseHttpProvider
96
+ html: string;
97
+ finalUrl: string;
98
+ statusCode: number;
99
+ contentType: string; // a plain string, not the ContentType union
100
+ headers?: Record<string, string>;
101
+ }
102
+ /**
103
+ * Forward declaration for Extractor (defined in extractors/types.ts); custom ones are supplied through ScrapeOptions.extractors.
104
+ */
105
+ interface Extractor {
106
+ readonly name: string;
107
+ readonly priority?: number; // NOTE(review): ordering direction (higher-first or lower-first) is not visible here — confirm
108
+ extract(context: ExtractionContext): Promise<Partial<ScrapedData>>; // may resolve to any subset of ScrapedData fields
109
+ }
110
+ /**
111
+ * Shared context passed to all extractors (the argument of Extractor.extract).
112
+ */
113
+ interface ExtractionContext {
114
+ url: string;
115
+ finalUrl: string;
116
+ html: string;
117
+ $: CheerioAPI; // type imported from "cheerio"
118
+ getDocument(): Document; // a method rather than a property — NOTE(review): presumably built on demand; confirm
119
+ results: Partial<ScrapedData>; // NOTE(review): presumably the fields gathered so far — confirm
120
+ options: ScrapeOptions;
121
+ }
122
+ /**
123
+ * Options for scraping; every member is optional.
124
+ */
125
+ interface ScrapeOptions {
126
+ /** Timeout in milliseconds (default: 10000) */
127
+ timeout?: number;
128
+ /** User agent string */
129
+ userAgent?: string;
130
+ /** Whether to extract full content (default: true) */
131
+ extractContent?: boolean;
132
+ /** Maximum content length in characters (default: 50000) */
133
+ maxContentLength?: number;
134
+ /** Custom fetcher (for Puppeteer/Playwright); any object implementing the Fetcher interface */
135
+ fetcher?: Fetcher;
136
+ /** Custom extractors to run (alongside the defaults unless `replaceDefaultExtractors` is true) */
137
+ extractors?: Extractor[];
138
+ /** If true, only run custom extractors (replace defaults) */
139
+ replaceDefaultExtractors?: boolean;
140
+ /** Check robots.txt before scraping (default: false) */
141
+ respectRobots?: boolean;
142
+ /** LLM provider for enhancements (NOTE(review): presumably also required for `extract` — confirm) */
143
+ llm?: LLMProvider;
144
+ /** LLM enhancement types to run */
145
+ enhance?: EnhancementType[];
146
+ /** Schema for structured LLM extraction */
147
+ extract?: ExtractionSchema;
148
+ /** Embedding generation options; `provider` is the only required member of EmbeddingOptions */
149
+ embeddings?: EmbeddingOptions;
150
+ }
151
+ //#endregion
152
+ //#region src/common/resilience.d.ts
153
+ /**
154
+ * Shared resilience utilities for HTTP providers.
155
+ * Provides retry, circuit breaker, rate limiting, timeout, and concurrency control.
156
+ */
157
+ /**
158
+ * Retry configuration accepted by withRetry and ResilienceConfig$1.retry. Unlike the embeddings RetryConfig in this file, it also has `retryableStatuses`.
159
+ */
160
+ interface RetryConfig$1 {
161
+ /** Maximum retry attempts. @default 3 */
162
+ maxAttempts?: number;
163
+ /** Initial backoff delay in ms. @default 1000 */
164
+ backoffMs?: number;
165
+ /** Backoff multiplier. @default 2 */
166
+ backoffMultiplier?: number;
167
+ /** HTTP status codes to retry. @default [408, 429, 500, 502, 503, 504] */
168
+ retryableStatuses?: number[];
169
+ }
170
+ /**
171
+ * Circuit breaker configuration (the optional constructor argument of CircuitBreaker).
172
+ */
173
+ interface CircuitBreakerConfig$1 {
174
+ /** Failures before opening circuit. @default 5 */
175
+ failureThreshold?: number;
176
+ /** Time in ms before attempting to close circuit. @default 30000 */
177
+ resetTimeoutMs?: number;
178
+ }
179
+ /**
180
+ * Rate limit configuration (the constructor argument of RateLimiter). Both limits are optional and no defaults are documented here.
181
+ */
182
+ interface RateLimitConfig$1 {
183
+ /** Max requests per minute */
184
+ requestsPerMinute?: number;
185
+ /** Max tokens per minute (for LLM providers) */
186
+ tokensPerMinute?: number;
187
+ }
188
+ /**
189
+ * Circuit breaker state, as returned by CircuitBreaker.getState().
190
+ */
191
+ type CircuitState$1 = 'closed' | 'open' | 'half-open';
192
+ /**
193
+ * Resilience configuration for HTTP providers (the type of BaseHttpConfig.resilience); every member is optional.
194
+ */
195
+ interface ResilienceConfig$1 {
196
+ retry?: RetryConfig$1;
197
+ circuitBreaker?: CircuitBreakerConfig$1;
198
+ rateLimit?: RateLimitConfig$1;
199
+ /** Request timeout in ms. @default 30000 */
200
+ timeoutMs?: number;
201
+ /** Max concurrent requests. @default 1 */
202
+ concurrency?: number;
203
+ /** Optional shared state for circuit breaker / rate limiter / semaphore */
204
+ state?: ResilienceState$1;
205
+ }
206
+ /**
207
+ * Shared resilience state for persistence across calls. Each member is a structural shape that the CircuitBreaker, RateLimiter and Semaphore classes in this file satisfy.
208
+ */
209
+ interface ResilienceState$1 {
210
+ circuitBreaker?: {
211
+ isOpen(): boolean;
212
+ recordSuccess(): void;
213
+ recordFailure(): void;
214
+ getState?(): CircuitState$1; // optional, so a custom breaker need not expose its state
215
+ };
216
+ rateLimiter?: {
217
+ acquire(): Promise<void>;
218
+ };
219
+ semaphore?: {
220
+ execute<T>(fn: () => Promise<T>): Promise<T>;
221
+ };
222
+ }
223
+ /**
224
+ * Check if an error is retryable. `retryableStatuses` is optional (NOTE(review): presumably falls back to the list documented on RetryConfig$1 — confirm).
225
+ */
226
+ declare function isRetryableError(error: unknown, retryableStatuses?: number[]): boolean;
227
+ /**
228
+ * Execute a function with retry logic. Resolves to `{ result, attempts }`; `onRetry`, when given, receives the attempt number, the error and the delay in ms.
229
+ */
230
+ declare function withRetry<T>(fn: () => Promise<T>, config?: RetryConfig$1, onRetry?: (attempt: number, error: Error, delayMs: number) => void): Promise<{
231
+ result: T;
232
+ attempts: number;
233
+ }>;
234
+ /**
235
+ * Execute a function with timeout. `fn` is handed an AbortSignal (NOTE(review): presumably aborted when `timeoutMs` elapses — confirm).
236
+ */
237
+ declare function withTimeout<T>(fn: (signal: AbortSignal) => Promise<T>, timeoutMs: number): Promise<T>;
238
+ /**
239
+ * Create an AbortSignal that times out after the specified number of milliseconds.
240
+ * If parentSignal is provided, the returned signal will also abort when the parent aborts.
241
+ */
242
+ declare function createTimeoutSignal(timeoutMs: number, parentSignal?: AbortSignal): AbortSignal;
243
+ /**
244
+ * Error thrown when circuit breaker is open.
245
+ */
246
+ declare class CircuitOpenError extends Error {
247
+ readonly isCircuitOpen = true; // literal `true` marker property, so the error can be recognised by a property check
248
+ constructor(message: string);
249
+ }
250
+ /**
251
+ * Circuit breaker implementation.
252
+ * Prevents cascade failures by stopping requests once failures reach the configured threshold (see CircuitBreakerConfig$1.failureThreshold).
253
+ */
254
+ declare class CircuitBreaker {
255
+ private state;
256
+ private readonly failureThreshold;
257
+ private readonly resetTimeoutMs;
258
+ constructor(config?: CircuitBreakerConfig$1); // config is optional; documented defaults are failureThreshold 5 and resetTimeoutMs 30000
259
+ /**
260
+ * Check if requests are blocked.
261
+ */
262
+ isOpen(): boolean;
263
+ /**
264
+ * Get current circuit state ('closed', 'open' or 'half-open').
265
+ */
266
+ getState(): CircuitState$1;
267
+ /**
268
+ * Record a successful request.
269
+ */
270
+ recordSuccess(): void;
271
+ /**
272
+ * Record a failed request.
273
+ */
274
+ recordFailure(): void;
275
+ /**
276
+ * Execute a function with circuit breaker protection (NOTE(review): presumably rejects with CircuitOpenError while open — confirm).
277
+ */
278
+ execute<T>(fn: () => Promise<T>): Promise<T>;
279
+ /**
280
+ * Reset the circuit breaker.
281
+ */
282
+ reset(): void;
283
+ /**
284
+ * Update state based on time (open -> half-open transition).
285
+ */
286
+ private updateState;
287
+ }
288
+ /**
289
+ * Token bucket rate limiter. Unlike CircuitBreaker, its constructor requires a config argument.
290
+ */
291
+ declare class RateLimiter {
292
+ private tokens;
293
+ private lastRefill;
294
+ private readonly maxTokens;
295
+ private readonly refillRate;
296
+ constructor(config: RateLimitConfig$1);
297
+ /**
298
+ * Check if a request is allowed without consuming tokens.
299
+ */
300
+ canProceed(): boolean;
301
+ /**
302
+ * Attempt to acquire tokens for a request without waiting.
303
+ * Returns true if allowed, false if rate limited. NOTE(review): the count used when `tokens` is omitted is not visible here — confirm.
304
+ */
305
+ tryAcquire(tokens?: number): boolean;
306
+ /**
307
+ * Wait until tokens are available, then acquire. The returned promise carries no value.
308
+ */
309
+ acquire(tokens?: number): Promise<void>;
310
+ /**
311
+ * Get time until next token is available (in milliseconds).
312
+ */
313
+ getWaitTime(): number;
314
+ /**
315
+ * Refill tokens based on elapsed time.
316
+ */
317
+ private refill;
318
+ }
319
+ /**
320
+ * Semaphore for limiting concurrent operations; the permit count is fixed by the constructor argument.
321
+ */
322
+ declare class Semaphore {
323
+ private permits;
324
+ private waiting;
325
+ constructor(permits: number);
326
+ /**
327
+ * Acquire a permit, waiting if necessary.
328
+ */
329
+ acquire(): Promise<void>;
330
+ /**
331
+ * Release a permit.
332
+ */
333
+ release(): void;
334
+ /**
335
+ * Execute function with semaphore protection (NOTE(review): presumably acquires before and releases after `fn`, even on rejection — confirm).
336
+ */
337
+ execute<T>(fn: () => Promise<T>): Promise<T>;
338
+ }
339
+ /**
340
+ * Execute a function with all resilience features. Resolves to `{ result, attempts }`.
341
+ *
342
+ * @param fn - The async function to execute with resilience; it is handed an AbortSignal
343
+ * @param config - Configuration for retry and timeout behavior
344
+ * @param state - Pre-instantiated resilience primitives for stateful features.
345
+ * Circuit breaker, rate limiter, and semaphore must be instantiated by the caller
346
+ * and passed via state to enable those features. This allows sharing state across
347
+ * multiple calls for proper circuit breaker tracking and rate limiting.
348
+ * The config parameter is only used for retry and timeout settings. NOTE(review): ResilienceConfig$1 also has a `state` field; its precedence versus this parameter is not visible here — confirm.
349
+ * @param callbacks - Optional callbacks for retry events
350
+ */
351
+ declare function withResilience<T>(fn: (signal: AbortSignal) => Promise<T>, config?: ResilienceConfig$1, state?: ResilienceState$1, callbacks?: {
352
+ onRetry?: (attempt: number, error: Error, delayMs: number) => void;
353
+ }): Promise<{
354
+ result: T;
355
+ attempts: number;
356
+ }>;
357
+ //#endregion
358
+ //#region src/common/http-base.d.ts
359
+ /**
360
+ * Base configuration for HTTP providers. TError (default unknown) is the type handed to `errorMapper`.
361
+ */
362
+ interface BaseHttpConfig<TError = unknown> {
363
+ /** Base URL for the API endpoint */
364
+ baseUrl: string;
365
+ /** Model identifier */
366
+ model: string;
367
+ /** Additional headers */
368
+ headers?: Record<string, string>;
369
+ /** Extract error message from failed response */
370
+ errorMapper?: (response: TError) => string;
371
+ /** Security options (as a doc comment this attaches to `requireHttps` only; the three flags below presumably belong to the same group) */
372
+ requireHttps?: boolean;
373
+ allowPrivate?: boolean;
374
+ resolveDns?: boolean;
375
+ allowRedirects?: boolean;
376
+ /** Resilience options (retry, circuit breaker, rate limit, timeout, concurrency, shared state) */
377
+ resilience?: ResilienceConfig$1;
378
+ }
379
+ /**
380
+ * Fetch request options for base provider (the optional second argument of BaseHttpProvider.fetch).
381
+ */
382
+ interface FetchOptions {
383
+ method?: 'GET' | 'POST'; // only these two methods are allowed by the type
384
+ body?: unknown;
385
+ headers?: Record<string, string>;
386
+ signal?: AbortSignal;
387
+ }
388
+ /**
389
+ * Result of a fetch request (what BaseHttpProvider.fetch resolves to); T is the type of `data`.
390
+ */
391
+ interface FetchResult<T> {
392
+ data: T;
393
+ status: number;
394
+ headers: Headers;
395
+ }
396
+ /**
397
+ * Base HTTP provider with shared security and resilience. Abstract: it is meant to be subclassed (HttpEmbeddingProvider in this file does so).
398
+ */
399
+ declare abstract class BaseHttpProvider<TError = unknown> {
400
+ protected readonly baseUrl: string;
401
+ protected readonly model: string;
402
+ protected readonly headers: Record<string, string>;
403
+ protected readonly errorMapper?: (response: TError) => string;
404
+ protected readonly requireHttps: boolean;
405
+ protected readonly allowPrivate: boolean;
406
+ protected readonly resolveDns: boolean;
407
+ protected readonly allowRedirects: boolean;
408
+ protected readonly timeoutMs: number;
409
+ protected readonly retryConfig?: RetryConfig$1;
410
+ protected readonly concurrency: number;
411
+ private circuitBreaker?; // private members are emitted without types in declaration output
412
+ private rateLimiter?;
413
+ private semaphore?;
414
+ constructor(config: BaseHttpConfig<TError>);
415
+ /**
416
+ * Get the current resilience state for persistence across calls.
417
+ */
418
+ getResilienceState(): ResilienceState$1;
419
+ /**
420
+ * Make an HTTP request with security and resilience. Protected, so only subclasses can call it.
421
+ */
422
+ protected fetch<T>(url: string, options?: FetchOptions): Promise<FetchResult<T>>;
423
+ }
424
+ //#endregion
425
+ //#region src/embeddings/providers/http.d.ts
426
+ /**
427
+ * HTTP embedding provider configuration: BaseHttpConfig plus two optional hooks for shaping the request and reading the response.
428
+ */
429
+ interface HttpEmbeddingConfig<TRequest = unknown, TResponse = unknown, TError = unknown> extends BaseHttpConfig<TError> {
430
+ /**
431
+ * Build request body from input texts and the model identifier.
432
+ * @default { input: texts, model }
433
+ */
434
+ requestBuilder?: (texts: string[], model: string) => TRequest;
435
+ /**
436
+ * Extract embeddings array (number[][]) from response.
437
+ * @default (res) => res.data.map(d => d.embedding)
438
+ */
439
+ responseMapper?: (response: TResponse) => number[][];
440
+ }
441
+ /**
442
+ * HTTP-based embedding provider (extends BaseHttpProvider, implements EmbeddingProvider).
443
+ * Works with any REST API using native fetch.
444
+ */
445
+ declare class HttpEmbeddingProvider<TRequest = unknown, TResponse = unknown, TError = unknown> extends BaseHttpProvider<TError> implements EmbeddingProvider {
446
+ readonly name = "http-embedding";
447
+ private readonly requestBuilder;
448
+ private readonly responseMapper;
449
+ constructor(config: HttpEmbeddingConfig<TRequest, TResponse, TError>);
450
+ /**
451
+ * Generate embeddings for one or more texts. `options` is a required argument, though every EmbedRequest field is optional.
452
+ */
453
+ embed(texts: string[], options: EmbedRequest): Promise<EmbedResponse>;
454
+ }
455
+ /**
456
+ * Create a generic HTTP embedding provider. The declared return type is the EmbeddingProvider interface, not the concrete class.
457
+ */
458
+ declare function createHttpEmbedding<TRequest = unknown, TResponse = unknown, TError = unknown>(config: HttpEmbeddingConfig<TRequest, TResponse, TError>): EmbeddingProvider;
459
+ //#endregion
460
+ //#region src/embeddings/types.d.ts
461
+ /**
462
+ * Embedding provider configuration - discriminated union on `type` ('http' or 'custom') for type safety.
463
+ *
464
+ * Use preset factory functions to create providers, then wrap the result as `{ type: 'custom', provider }`:
465
+ * - `createOpenAIEmbedding()` - OpenAI API
466
+ * - `createAzureEmbedding()` - Azure OpenAI
467
+ * - `createOllamaEmbedding()` - Local Ollama
468
+ * - `createHuggingFaceEmbedding()` - HuggingFace Inference API
469
+ * - `createCohereEmbedding()` - Cohere API
470
+ * - `createTransformersEmbedding()` - Local Transformers.js
471
+ *
472
+ * @example Using a preset
473
+ * ```ts
474
+ * import { createOpenAIEmbedding } from 'scrapex/embeddings';
475
+ *
476
+ * const result = await scrape(url, {
477
+ * embeddings: {
478
+ * provider: { type: 'custom', provider: createOpenAIEmbedding() },
479
+ * },
480
+ * });
481
+ * ```
482
+ *
483
+ * @example Using inline HTTP config
484
+ * ```ts
485
+ * const result = await scrape(url, {
486
+ * embeddings: {
487
+ * provider: {
488
+ * type: 'http',
489
+ * config: {
490
+ * baseUrl: 'https://api.example.com/embed',
491
+ * model: 'custom-model',
492
+ * headers: { Authorization: 'Bearer ...' },
493
+ * },
494
+ * },
495
+ * },
496
+ * });
497
+ * ```
498
+ */
499
+ type EmbeddingProviderConfig = {
500
+ type: 'http';
501
+ config: HttpEmbeddingConfig;
502
+ } | {
503
+ type: 'custom';
504
+ provider: EmbeddingProvider;
505
+ };
506
+ /**
507
+ * Embedding provider interface - mirrors LLMProvider pattern (a readonly `name` plus one async method).
508
+ */
509
+ interface EmbeddingProvider {
510
+ readonly name: string;
511
+ /**
512
+ * Generate embeddings for one or more texts (NOTE(review): presumably one vector per input, in input order — confirm).
513
+ */
514
+ embed(texts: string[], options: EmbedRequest): Promise<EmbedResponse>;
515
+ }
516
+ interface EmbedRequest { // per-call options passed to EmbeddingProvider.embed(); every field is optional
517
+ /** Model to use. If undefined, provider uses its configured default. */
518
+ model?: string;
519
+ dimensions?: number; // NOTE(review): presumably the requested output vector size — confirm
520
+ signal?: AbortSignal;
521
+ }
522
+ interface EmbedResponse { // value an EmbeddingProvider.embed() promise resolves to
523
+ embeddings: number[][];
524
+ usage?: { // optional as a whole; when present, both counts are required
525
+ promptTokens: number;
526
+ totalTokens: number;
527
+ };
528
+ }
529
+ type EmbeddingInputType = 'textContent' | 'title+summary' | 'custom'; // type of EmbeddingInputConfig.type, whose documented default is 'textContent'
530
+ interface EmbeddingInputConfig { // type of EmbeddingOptions.input; all three members are optional
531
+ /**
532
+ * Predefined input source. Ignored if `transform` is provided.
533
+ * @default 'textContent'
534
+ */
535
+ type?: EmbeddingInputType;
536
+ /**
537
+ * Custom function to generate input text from scraped data (a Partial<ScrapedData>, so any field may be missing).
538
+ * Enables dynamic construction (e.g., "Combine price + title").
539
+ */
540
+ transform?: (data: Partial<ScrapedData>) => string;
541
+ /**
542
+ * Static custom input string. Used when type is 'custom'.
543
+ */
544
+ customText?: string;
545
+ }
546
+ interface ChunkingConfig { // type of EmbeddingOptions.chunking
547
+ /**
548
+ * Target chunk size in tokens.
549
+ * @default 500
550
+ */
551
+ size?: number;
552
+ /**
553
+ * Overlap between chunks in tokens.
554
+ * @default 50
555
+ */
556
+ overlap?: number;
557
+ /**
558
+ * Token counting strategy (no default is documented here).
559
+ * - 'heuristic': chars / 4 (fast, approximate)
560
+ * - 'tiktoken': accurate for OpenAI models (lazy-loaded)
561
+ * - function: custom tokenizer that returns the token count for a given text
562
+ */
563
+ tokenizer?: 'heuristic' | 'tiktoken' | ((text: string) => number);
564
+ /**
565
+ * Hard cap on input length (characters) to prevent memory exhaustion.
566
+ * @default 100000 (characters; the "100KB" figure only holds for single-byte text)
567
+ */
568
+ maxInputLength?: number;
569
+ }
570
+ type EmbeddingAggregation = 'average' | 'max' | 'first' | 'all'; // 'all' corresponds to EmbeddingSuccessMultiple (`vectors`); the other three to EmbeddingSuccessSingle (`vector`)
571
+ interface OutputConfig { // type of EmbeddingOptions.output
572
+ /**
573
+ * Aggregation strategy for chunk vectors.
574
+ * @default 'average'
575
+ */
576
+ aggregation?: EmbeddingAggregation;
577
+ /** Model-specific dimension override */
578
+ dimensions?: number;
579
+ }
580
+ interface PiiRedactionConfig { // type of SafetyConfig.piiRedaction; NOTE(review): behaviour when a flag is omitted is not documented here — confirm
581
+ /** Redact email addresses */
582
+ email?: boolean;
583
+ /** Redact phone numbers */
584
+ phone?: boolean;
585
+ /** Redact credit card numbers */
586
+ creditCard?: boolean;
587
+ /** Redact SSN patterns */
588
+ ssn?: boolean;
589
+ /** Redact IP addresses */
590
+ ipAddress?: boolean;
591
+ /** Additional patterns to redact, given as regular expressions */
592
+ customPatterns?: RegExp[];
593
+ }
594
+ interface SafetyConfig { // type of EmbeddingOptions.safety
595
+ /**
596
+ * PII redaction patterns to apply before embedding.
597
+ * Critical for GDPR/CCPA compliance with third-party APIs.
598
+ */
599
+ piiRedaction?: PiiRedactionConfig;
600
+ /**
601
+ * Minimum text length to proceed with embedding (NOTE(review): unit presumably characters — confirm).
602
+ * Skips with reason if below threshold (see EmbeddingSkipped.reason).
603
+ */
604
+ minTextLength?: number;
605
+ /**
606
+ * Maximum tokens per API request to prevent billing DoS.
607
+ * @default 8192
608
+ */
609
+ maxTokens?: number;
610
+ /**
611
+ * Explicitly opt-in to receive sensitive data in callbacks.
612
+ * When false (default), EmbeddingOptions.onChunk receives redacted content.
613
+ */
614
+ allowSensitiveCallbacks?: boolean;
615
+ }
616
+ interface EmbeddingCacheConfig { // type of EmbeddingOptions.cache
617
+ /** Cache implementation (any object satisfying the EmbeddingCache interface) */
618
+ store?: EmbeddingCache;
619
+ /** Time-to-live in milliseconds */
620
+ ttlMs?: number;
621
+ /** Maximum entries for in-memory cache */
622
+ maxEntries?: number;
623
+ /**
624
+ * Extra salt to disambiguate cache keys for custom providers/transforms.
625
+ */
626
+ cacheKeySalt?: string;
627
+ }
628
+ /**
629
+ * Content-addressable cache interface for embeddings; all four methods return Promises.
630
+ * Keys are based on content hash, not URL.
631
+ */
632
+ interface EmbeddingCache {
633
+ get(key: string): Promise<EmbeddingResult | undefined>; // NOTE(review): undefined presumably signals a miss — confirm
634
+ set(key: string, value: EmbeddingResult, options?: {
635
+ ttlMs?: number;
636
+ }): Promise<void>;
637
+ delete(key: string): Promise<boolean>; // NOTE(review): meaning of the boolean (presumably "an entry was removed") is not stated — confirm
638
+ clear(): Promise<void>;
639
+ }
640
+ interface RetryConfig { // embeddings-side retry settings; same fields and defaults as RetryConfig$1 except that it has no `retryableStatuses`
641
+ /**
642
+ * Maximum retry attempts.
643
+ * @default 3
644
+ */
645
+ maxAttempts?: number;
646
+ /**
647
+ * Initial backoff delay in milliseconds.
648
+ * @default 1000
649
+ */
650
+ backoffMs?: number;
651
+ /**
652
+ * Backoff multiplier for exponential delay.
653
+ * @default 2
654
+ */
655
+ backoffMultiplier?: number;
656
+ }
657
+ interface CircuitBreakerConfig { // same fields and documented defaults as CircuitBreakerConfig$1
658
+ /**
659
+ * Number of failures before opening the circuit.
660
+ * @default 5
661
+ */
662
+ failureThreshold?: number;
663
+ /**
664
+ * Time in milliseconds to wait before attempting to close the circuit.
665
+ * @default 30000
666
+ */
667
+ resetTimeoutMs?: number;
668
+ }
669
+ interface RateLimitConfig { // same fields as RateLimitConfig$1; both limits are optional
670
+ /** Maximum requests per minute */
671
+ requestsPerMinute?: number;
672
+ /** Maximum tokens per minute */
673
+ tokensPerMinute?: number;
674
+ }
675
+ interface ResilienceState { // same member shapes as ResilienceState$1, typed against CircuitState instead of CircuitState$1
676
+ circuitBreaker?: {
677
+ isOpen(): boolean;
678
+ recordSuccess(): void;
679
+ recordFailure(): void;
680
+ getState?(): CircuitState; // optional method
681
+ };
682
+ rateLimiter?: {
683
+ acquire(): Promise<void>;
684
+ };
685
+ semaphore?: {
686
+ execute<T>(fn: () => Promise<T>): Promise<T>;
687
+ };
688
+ }
689
+ interface ResilienceConfig { // type of EmbeddingOptions.resilience; every member is optional
690
+ /** Retry configuration for transient failures */
691
+ retry?: RetryConfig;
692
+ /** Circuit breaker to prevent cascade failures */
693
+ circuitBreaker?: CircuitBreakerConfig;
694
+ /** Rate limiting per provider */
695
+ rateLimit?: RateLimitConfig;
696
+ /**
697
+ * Optional shared state for circuit breaker, rate limiter and semaphore.
698
+ * Use to persist state across multiple calls.
699
+ */
700
+ state?: ResilienceState;
701
+ /**
702
+ * Request timeout in milliseconds.
703
+ * @default 30000
704
+ */
705
+ timeoutMs?: number;
706
+ /**
707
+ * Concurrent chunk processing.
708
+ * @default 1
709
+ */
710
+ concurrency?: number;
711
+ }
712
+ interface EmbeddingOptions { // type of ScrapeOptions.embeddings
713
+ /** Embedding provider configuration (the only required option) */
714
+ provider: EmbeddingProviderConfig;
715
+ /** Model identifier (overrides provider default) */
716
+ model?: string;
717
+ /** Input text configuration */
718
+ input?: EmbeddingInputConfig;
719
+ /** Chunking and tokenization settings */
720
+ chunking?: ChunkingConfig;
721
+ /** Output format and aggregation */
722
+ output?: OutputConfig;
723
+ /** Safety and compliance settings */
724
+ safety?: SafetyConfig;
725
+ /** Caching configuration */
726
+ cache?: EmbeddingCacheConfig;
727
+ /** Resilience and rate limiting */
728
+ resilience?: ResilienceConfig;
729
+ /**
730
+ * Callback for each chunk (receives redacted content by default; see SafetyConfig.allowSensitiveCallbacks).
731
+ */
732
+ onChunk?: (chunk: Readonly<string>, embedding: Readonly<number[]>) => void; // Readonly<string> is just string; Readonly<number[]> is readonly number[]
733
+ /**
734
+ * Metrics callback for observability; receives an EmbeddingMetrics object.
735
+ */
736
+ onMetrics?: (metrics: EmbeddingMetrics) => void;
737
+ }
738
+ interface EmbeddingSource { // `source` field of every EmbeddingResult variant (Partial on EmbeddingSkipped)
739
+ /** Model used for embedding (may be undefined for custom providers) */
740
+ model?: string;
741
+ /** Number of chunks processed */
742
+ chunks: number;
743
+ /** Total tokens processed */
744
+ tokens: number;
745
+ /** Content checksum for cache validation */
746
+ checksum: string;
747
+ /** Whether result was from cache */
748
+ cached: boolean;
749
+ /** Total latency in milliseconds */
750
+ latencyMs: number;
751
+ }
752
+ /**
753
+ * Successful embedding result with single aggregated vector (aggregation 'average', 'max' or 'first').
754
+ */
755
+ interface EmbeddingSuccessSingle {
756
+ status: 'success';
757
+ aggregation: 'average' | 'max' | 'first';
758
+ vector: number[];
759
+ source: EmbeddingSource;
760
+ }
761
+ /**
762
+ * Successful embedding result with all chunk vectors (aggregation 'all'); exposes `vectors` rather than `vector`.
763
+ */
764
+ interface EmbeddingSuccessMultiple {
765
+ status: 'success';
766
+ aggregation: 'all';
767
+ vectors: number[][];
768
+ source: EmbeddingSource;
769
+ }
770
+ /**
771
+ * Skipped embedding with reason; `source` is Partial, so any of its fields may be missing.
772
+ */
773
+ interface EmbeddingSkipped {
774
+ status: 'skipped';
775
+ reason: string;
776
+ source: Partial<EmbeddingSource>;
777
+ }
778
+ /**
779
+ * Embedding result - discriminated union for type safety.
780
+ * Use `result.status` to separate success from skipped, then `result.aggregation` to tell the single-vector and all-vectors success variants apart.
781
+ */
782
+ type EmbeddingResult = EmbeddingSuccessSingle | EmbeddingSuccessMultiple | EmbeddingSkipped;
783
+ interface EmbeddingMetrics { // payload handed to EmbeddingOptions.onMetrics
784
+ /** Provider name */
785
+ provider: string;
786
+ /** Model used (may be undefined for custom providers) */
787
+ model?: string;
788
+ /** Input tokens processed */
789
+ inputTokens: number;
790
+ /** Output embedding dimensions */
791
+ outputDimensions: number;
792
+ /** Number of chunks processed */
793
+ chunks: number;
794
+ /** Total latency in milliseconds */
795
+ latencyMs: number;
796
+ /** Whether result was from cache */
797
+ cached: boolean;
798
+ /** Number of retry attempts */
799
+ retries: number;
800
+ /** Whether PII was redacted */
801
+ piiRedacted: boolean;
802
+ }
803
+ /**
804
+ * Internal chunk representation during processing.
805
+ */
806
+ interface TextChunk {
807
+ /** Chunk text content */
808
+ text: string;
809
+ /** Start position in original text */
810
+ startIndex: number;
811
+ /** End position in original text (NOTE(review): inclusive vs exclusive is not stated — confirm) */
812
+ endIndex: number;
813
+ /** Estimated token count */
814
+ tokens: number;
815
+ }
816
+ /**
817
+ * Circuit breaker state; the same three literals as CircuitState$1.
818
+ */
819
+ type CircuitState = 'closed' | 'open' | 'half-open';
820
+ /**
821
+ * Circuit breaker internal state tracking: current state, a failure counter and two optional timestamps.
822
+ */
823
+ interface CircuitBreakerState {
824
+ state: CircuitState;
825
+ failures: number;
826
+ lastFailureTime?: number; // NOTE(review): time unit/epoch is not stated here — confirm
827
+ nextAttemptTime?: number; // NOTE(review): time unit/epoch is not stated here — confirm
828
+ }
829
+ //#endregion
830
+ export { FetchResult$1 as $, HttpEmbeddingProvider as A, withResilience as B, RateLimitConfig as C, SafetyConfig as D, RetryConfig as E, CircuitOpenError as F, EnhancementType as G, withTimeout as H, RateLimiter as I, ExtractionContext as J, ExtractedEntities as K, Semaphore as L, BaseHttpConfig as M, BaseHttpProvider as N, TextChunk as O, CircuitBreaker as P, FetchOptions$1 as Q, createTimeoutSignal as R, PiiRedactionConfig as S, ResilienceState as T, CompletionOptions as U, withRetry as V, ContentType as W, ExtractionSchemaType as X, ExtractionSchema as Y, Extractor as Z, EmbeddingSkipped as _, EmbedRequest as a, EmbeddingSuccessSingle as b, EmbeddingCache as c, EmbeddingInputType as d, Fetcher as et, EmbeddingMetrics as f, EmbeddingResult as g, EmbeddingProviderConfig as h, CircuitState as i, createHttpEmbedding as j, HttpEmbeddingConfig as k, EmbeddingCacheConfig as l, EmbeddingProvider as m, CircuitBreakerConfig as n, ScrapeOptions as nt, EmbedResponse as o, EmbeddingOptions as p, ExtractedLink as q, CircuitBreakerState as r, ScrapedData as rt, EmbeddingAggregation as s, ChunkingConfig as t, LLMProvider as tt, EmbeddingInputConfig as u, EmbeddingSource as v, ResilienceConfig as w, OutputConfig as x, EmbeddingSuccessMultiple as y, isRetryableError as z };
831
+ //# sourceMappingURL=types-BOcHQU9s.d.mts.map