scrapex 0.5.3 → 1.0.0-beta.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. package/LICENSE +1 -1
  2. package/README.md +551 -145
  3. package/dist/enhancer-ByjRD-t5.mjs +769 -0
  4. package/dist/enhancer-ByjRD-t5.mjs.map +1 -0
  5. package/dist/enhancer-j0xqKDJm.cjs +847 -0
  6. package/dist/enhancer-j0xqKDJm.cjs.map +1 -0
  7. package/dist/index-CDgcRnig.d.cts +268 -0
  8. package/dist/index-CDgcRnig.d.cts.map +1 -0
  9. package/dist/index-piS5wtki.d.mts +268 -0
  10. package/dist/index-piS5wtki.d.mts.map +1 -0
  11. package/dist/index.cjs +2007 -0
  12. package/dist/index.cjs.map +1 -0
  13. package/dist/index.d.cts +580 -0
  14. package/dist/index.d.cts.map +1 -0
  15. package/dist/index.d.mts +580 -0
  16. package/dist/index.d.mts.map +1 -0
  17. package/dist/index.mjs +1956 -0
  18. package/dist/index.mjs.map +1 -0
  19. package/dist/llm/index.cjs +334 -0
  20. package/dist/llm/index.cjs.map +1 -0
  21. package/dist/llm/index.d.cts +258 -0
  22. package/dist/llm/index.d.cts.map +1 -0
  23. package/dist/llm/index.d.mts +258 -0
  24. package/dist/llm/index.d.mts.map +1 -0
  25. package/dist/llm/index.mjs +317 -0
  26. package/dist/llm/index.mjs.map +1 -0
  27. package/dist/parsers/index.cjs +11 -0
  28. package/dist/parsers/index.d.cts +2 -0
  29. package/dist/parsers/index.d.mts +2 -0
  30. package/dist/parsers/index.mjs +3 -0
  31. package/dist/parsers-Bneuws8x.cjs +569 -0
  32. package/dist/parsers-Bneuws8x.cjs.map +1 -0
  33. package/dist/parsers-CwkYnyWY.mjs +482 -0
  34. package/dist/parsers-CwkYnyWY.mjs.map +1 -0
  35. package/dist/types-CadAXrme.d.mts +674 -0
  36. package/dist/types-CadAXrme.d.mts.map +1 -0
  37. package/dist/types-DPEtPihB.d.cts +674 -0
  38. package/dist/types-DPEtPihB.d.cts.map +1 -0
  39. package/package.json +79 -100
  40. package/dist/index.d.ts +0 -45
  41. package/dist/index.js +0 -8
  42. package/dist/scrapex.cjs.development.js +0 -1130
  43. package/dist/scrapex.cjs.development.js.map +0 -1
  44. package/dist/scrapex.cjs.production.min.js +0 -2
  45. package/dist/scrapex.cjs.production.min.js.map +0 -1
  46. package/dist/scrapex.esm.js +0 -1122
  47. package/dist/scrapex.esm.js.map +0 -1
@@ -0,0 +1,580 @@
1
+ import { A as EmbeddingSuccessMultiple, C as EmbeddingMetrics, D as EmbeddingResult, E as EmbeddingProviderConfig, F as SafetyConfig, I as TextChunk, L as HttpEmbeddingConfig, M as OutputConfig, N as PiiRedactionConfig, O as EmbeddingSkipped, P as ResilienceConfig, R as createHttpEmbedding, S as EmbeddingInputConfig, T as EmbeddingProvider, _ as EmbedRequest, a as ExtractedLink, b as EmbeddingCache, c as ExtractionSchemaType, d as FetchResult, f as Fetcher, g as ChunkingConfig, h as ScrapedData, i as ExtractedEntities, j as EmbeddingSuccessSingle, k as EmbeddingSource, l as Extractor, m as ScrapeOptions, n as ContentType, o as ExtractionContext, p as LLMProvider, r as EnhancementType, s as ExtractionSchema, t as CompletionOptions, u as FetchOptions, v as EmbedResponse, w as EmbeddingOptions, x as EmbeddingCacheConfig, y as EmbeddingAggregation } from "./types-DPEtPihB.cjs";
2
+ import { b as ParserResult, m as FeedMeta, n as RSSParserOptions, p as FeedItem, t as RSSParser, v as ParsedFeed } from "./index-CDgcRnig.cjs";
3
+
4
+ //#region src/core/context.d.ts
5
+
6
+ /**
7
+ * Create an extraction context with lazy JSDOM loading.
8
+ *
9
+ * Cheerio is always available for fast DOM queries.
10
+ * JSDOM is only loaded when getDocument() is called (for Readability).
11
+ */
12
+ declare function createExtractionContext(url: string, finalUrl: string, html: string, options: ScrapeOptions): ExtractionContext;
13
+ /**
14
+ * Merge partial results into the context
15
+ */
16
+ declare function mergeResults(context: ExtractionContext, extracted: Partial<ScrapedData>): ExtractionContext;
17
+ //#endregion
18
+ //#region src/core/errors.d.ts
19
+ /**
20
+ * Error codes for scraping failures
21
+ */
22
+ type ScrapeErrorCode = 'FETCH_FAILED' | 'TIMEOUT' | 'INVALID_URL' | 'BLOCKED' | 'NOT_FOUND' | 'ROBOTS_BLOCKED' | 'PARSE_ERROR' | 'LLM_ERROR' | 'VALIDATION_ERROR';
23
+ /**
24
+ * Custom error class for scraping failures with structured error codes
25
+ */
26
+ declare class ScrapeError extends Error {
27
+ readonly code: ScrapeErrorCode;
28
+ readonly statusCode?: number;
29
+ constructor(message: string, code: ScrapeErrorCode, statusCode?: number, cause?: Error);
30
+ /**
31
+ * Create a ScrapeError from an unknown error
32
+ */
33
+ static from(error: unknown, code?: ScrapeErrorCode): ScrapeError;
34
+ /**
35
+ * Check if error is retryable (network issues, timeouts)
36
+ */
37
+ isRetryable(): boolean;
38
+ /**
39
+ * Convert to a plain object for serialization
40
+ */
41
+ toJSON(): Record<string, unknown>;
42
+ }
43
+ //#endregion
44
+ //#region src/core/scrape.d.ts
45
+ /**
46
+ * Scrape a URL and extract metadata and content.
47
+ *
48
+ * @param url - The URL to scrape
49
+ * @param options - Scraping options
50
+ * @returns Scraped data with metadata and content
51
+ *
52
+ * @example
53
+ * ```ts
54
+ * const result = await scrape('https://example.com/article');
55
+ * console.log(result.title, result.content);
56
+ * ```
57
+ */
58
+ declare function scrape(url: string, options?: ScrapeOptions): Promise<ScrapedData>;
59
+ /**
60
+ * Scrape from raw HTML string (no fetch).
61
+ *
62
+ * @param html - The HTML content
63
+ * @param url - The URL (for resolving relative links)
64
+ * @param options - Scraping options
65
+ * @returns Scraped data with metadata and content
66
+ *
67
+ * @example
68
+ * ```ts
69
+ * const html = await fetchSomehow('https://example.com');
70
+ * const result = await scrapeHtml(html, 'https://example.com');
71
+ * ```
72
+ */
73
+ declare function scrapeHtml(html: string, url: string, options?: ScrapeOptions): Promise<ScrapedData>;
74
+ //#endregion
75
+ //#region src/embeddings/aggregation.d.ts
76
+ /**
77
+ * Aggregate multiple embedding vectors into a single vector or return all.
78
+ *
79
+ * @param vectors - Array of embedding vectors (must all have same dimensions)
80
+ * @param strategy - Aggregation strategy
81
+ * @returns Aggregated result based on strategy
82
+ */
83
+ declare function aggregateVectors(vectors: number[][], strategy?: EmbeddingAggregation): AggregationResult;
84
+ /**
85
+ * Result of vector aggregation.
86
+ */
87
+ type AggregationResult = {
88
+ type: 'single';
89
+ vector: number[];
90
+ dimensions: number;
91
+ } | {
92
+ type: 'multiple';
93
+ vectors: number[][];
94
+ dimensions: number;
95
+ };
96
+ /**
97
+ * Compute cosine similarity between two vectors.
98
+ * Both vectors should be normalized for accurate results.
99
+ */
100
+ declare function cosineSimilarity(a: number[], b: number[]): number;
101
+ //#endregion
102
+ //#region src/embeddings/cache.d.ts
103
+ /**
104
+ * In-memory LRU cache with TTL support.
105
+ * Content-addressable: uses content hash as key, not URL.
106
+ */
107
+ declare class InMemoryEmbeddingCache implements EmbeddingCache {
108
+ private cache;
109
+ private readonly maxEntries;
110
+ private readonly defaultTtlMs;
111
+ constructor(options?: {
112
+ maxEntries?: number;
113
+ ttlMs?: number;
114
+ });
115
+ get(key: string): Promise<EmbeddingResult | undefined>;
116
+ set(key: string, value: EmbeddingResult, options?: {
117
+ ttlMs?: number;
118
+ }): Promise<void>;
119
+ delete(key: string): Promise<boolean>;
120
+ clear(): Promise<void>;
121
+ /**
122
+ * Get cache statistics.
123
+ */
124
+ getStats(): CacheStats;
125
+ /**
126
+ * Evict expired entries.
127
+ */
128
+ cleanup(): number;
129
+ /**
130
+ * Evict least recently used entry.
131
+ */
132
+ private evictLRU;
133
+ }
134
+ /**
135
+ * Cache statistics.
136
+ */
137
+ interface CacheStats {
138
+ /** Current number of entries */
139
+ size: number;
140
+ /** Maximum allowed entries */
141
+ maxEntries: number;
142
+ /** Number of expired entries (not yet cleaned up) */
143
+ expired: number;
144
+ /** Cache utilization (0-1) */
145
+ utilization: number;
146
+ }
147
+ //#endregion
148
+ //#region src/embeddings/chunking.d.ts
149
+ /**
150
+ * Split text into overlapping chunks optimized for embedding.
151
+ * Respects sentence boundaries when possible.
152
+ */
153
+ declare function chunkText(text: string, config?: ChunkingConfig): TextChunk[];
154
+ /**
155
+ * Estimate total tokens for a text without chunking.
156
+ */
157
+ declare function estimateTokens(text: string, tokenizer?: ChunkingConfig['tokenizer']): number;
158
+ //#endregion
159
+ //#region src/embeddings/pipeline.d.ts
160
+ /**
161
+ * Generate embeddings for scraped data.
162
+ * This is the main entry point for the embedding pipeline.
163
+ */
164
+ declare function generateEmbeddings(data: Partial<ScrapedData>, options: EmbeddingOptions): Promise<EmbeddingResult>;
165
+ /**
166
+ * Embed arbitrary text directly.
167
+ * Standalone function for embedding text outside of scrape().
168
+ */
169
+ declare function embed(text: string, options: EmbeddingOptions): Promise<EmbeddingResult>;
170
+ /**
171
+ * Embed from existing ScrapedData.
172
+ * Useful when you've already scraped and want to add embeddings later.
173
+ */
174
+ declare function embedScrapedData(data: ScrapedData, options: EmbeddingOptions): Promise<EmbeddingResult>;
175
+ //#endregion
176
+ //#region src/embeddings/providers/presets.d.ts
177
+ /**
178
+ * Create an OpenAI embedding provider.
179
+ *
180
+ * @example
181
+ * ```ts
182
+ * const provider = createOpenAIEmbedding({ apiKey: 'sk-...' });
183
+ * const { embeddings } = await provider.embed(['Hello'], { model: 'text-embedding-3-small' });
184
+ * ```
185
+ */
186
+ declare function createOpenAIEmbedding(options?: {
187
+ apiKey?: string;
188
+ model?: string;
189
+ baseUrl?: string;
190
+ organization?: string;
191
+ }): EmbeddingProvider;
192
+ /**
193
+ * Create an Azure OpenAI embedding provider.
194
+ *
195
+ * @example
196
+ * ```ts
197
+ * const provider = createAzureEmbedding({
198
+ * endpoint: 'https://my-resource.openai.azure.com',
199
+ * deploymentName: 'text-embedding-ada-002',
200
+ * apiVersion: '2023-05-15',
201
+ * });
202
+ * ```
203
+ */
204
+ declare function createAzureEmbedding(options: {
205
+ endpoint: string;
206
+ deploymentName: string;
207
+ apiVersion: string;
208
+ apiKey?: string;
209
+ }): EmbeddingProvider;
210
+ /**
211
+ * Create an Ollama embedding provider for local models.
212
+ *
213
+ * LIMITATION: Ollama's /api/embeddings endpoint processes one text at a time,
214
+ * not batches. When multiple chunks are embedded, each chunk triggers a
215
+ * separate HTTP request. This is handled transparently by the pipeline's
216
+ * sequential chunk processing, but may be slower than batch-capable providers.
217
+ * For high-throughput scenarios, consider using OpenAI, Cohere, or HuggingFace
218
+ * which support batch embedding in a single request.
219
+ *
220
+ * @example
221
+ * ```ts
222
+ * const provider = createOllamaEmbedding({ model: 'nomic-embed-text' });
223
+ * ```
224
+ */
225
+ declare function createOllamaEmbedding(options?: {
226
+ baseUrl?: string;
227
+ model?: string;
228
+ }): EmbeddingProvider;
229
+ /**
230
+ * Create a HuggingFace Inference API embedding provider.
231
+ *
232
+ * @example
233
+ * ```ts
234
+ * const provider = createHuggingFaceEmbedding({
235
+ * model: 'sentence-transformers/all-MiniLM-L6-v2',
236
+ * });
237
+ * ```
238
+ */
239
+ declare function createHuggingFaceEmbedding(options: {
240
+ model: string;
241
+ apiKey?: string;
242
+ }): EmbeddingProvider;
243
+ /**
244
+ * Feature extraction pipeline type for Transformers.js
245
+ */
246
+ type FeatureExtractionPipeline = (text: string, options?: {
247
+ pooling?: 'mean' | 'cls' | 'max';
248
+ normalize?: boolean;
249
+ }) => Promise<{
250
+ data: Float32Array;
251
+ }>;
252
+ /**
253
+ * Transformers.js module interface for dependency injection.
254
+ */
255
+ interface TransformersModule {
256
+ pipeline: (task: 'feature-extraction', model: string, options?: {
257
+ quantized?: boolean;
258
+ }) => Promise<FeatureExtractionPipeline>;
259
+ env?: {
260
+ cacheDir?: string;
261
+ };
262
+ }
263
+ /**
264
+ * Create a local Transformers.js embedding provider.
265
+ * Uses dependency injection - user provides the imported transformers module.
266
+ *
267
+ * @example
268
+ * ```typescript
269
+ * import * as transformers from '@huggingface/transformers';
270
+ * import { createTransformersEmbedding } from 'scrapex';
271
+ *
272
+ * const provider = createTransformersEmbedding(transformers, {
273
+ * model: 'Xenova/all-MiniLM-L6-v2',
274
+ * });
275
+ * ```
276
+ *
277
+ * Required Node.js dependencies:
278
+ * ```
279
+ * npm install @huggingface/transformers onnxruntime-node
280
+ * ```
281
+ */
282
+ declare function createTransformersEmbedding(transformers: TransformersModule, options?: {
283
+ model?: string;
284
+ quantized?: boolean;
285
+ pooling?: 'mean' | 'cls' | 'max';
286
+ normalize?: boolean;
287
+ cacheDir?: string;
288
+ }): EmbeddingProvider;
289
+ /** Recommended models for Transformers.js */
290
+ declare const TRANSFORMERS_MODELS: {
291
+ /** Default - Fast, general purpose (384 dimensions, ~23MB) */
292
+ readonly DEFAULT: "Xenova/all-MiniLM-L6-v2";
293
+ /** Higher quality, more resources (768 dimensions, ~110MB) */
294
+ readonly QUALITY: "Xenova/all-mpnet-base-v2";
295
+ /** Optimized for retrieval (384 dimensions, ~33MB) */
296
+ readonly RETRIEVAL: "Xenova/bge-small-en-v1.5";
297
+ /** Multi-language support (384 dimensions, ~118MB) */
298
+ readonly MULTILINGUAL: "Xenova/multilingual-e5-small";
299
+ };
300
+ //#endregion
301
+ //#region src/embeddings/providers/index.d.ts
302
+ /**
303
+ * Create an embedding provider from configuration.
304
+ * This is the main factory function for creating providers.
305
+ */
306
+ declare function createEmbeddingProvider(config: EmbeddingProviderConfig): EmbeddingProvider;
307
+ //#endregion
308
+ //#region src/embeddings/safety.d.ts
309
+ /**
310
+ * Create a redaction function based on configuration.
311
+ * Returns a function that applies all configured PII patterns.
312
+ */
313
+ declare function createPiiRedactor(config: PiiRedactionConfig): (text: string) => RedactionResult;
314
+ /**
315
+ * Result of PII redaction operation.
316
+ */
317
+ interface RedactionResult {
318
+ /** Redacted text */
319
+ text: string;
320
+ /** Whether any redactions were made */
321
+ redacted: boolean;
322
+ /** Total number of redactions */
323
+ redactionCount: number;
324
+ /** Count by redaction type */
325
+ redactionsByType: Record<string, number>;
326
+ }
327
+ /**
328
+ * Simple redaction that applies all default patterns.
329
+ * Use createPiiRedactor() for fine-grained control.
330
+ */
331
+ declare function redactPii(text: string): RedactionResult;
332
+ //#endregion
333
+ //#region src/extractors/content.d.ts
334
+ /**
335
+ * Extracts main content using Mozilla Readability.
336
+ * Converts HTML to Markdown for LLM consumption.
337
+ */
338
+ declare class ContentExtractor implements Extractor {
339
+ readonly name = "content";
340
+ readonly priority = 50;
341
+ extract(context: ExtractionContext): Promise<Partial<ScrapedData>>;
342
+ private extractFallback;
343
+ private createExcerpt;
344
+ private detectContentType;
345
+ }
346
+ //#endregion
347
+ //#region src/extractors/favicon.d.ts
348
+ /**
349
+ * Extracts favicon URL from the page.
350
+ * Checks multiple sources in order of preference.
351
+ */
352
+ declare class FaviconExtractor implements Extractor {
353
+ readonly name = "favicon";
354
+ readonly priority = 70;
355
+ extract(context: ExtractionContext): Promise<Partial<ScrapedData>>;
356
+ }
357
+ //#endregion
358
+ //#region src/extractors/jsonld.d.ts
359
+ /**
360
+ * Extracts JSON-LD structured data from the page.
361
+ * Also extracts additional metadata from structured data.
362
+ */
363
+ declare class JsonLdExtractor implements Extractor {
364
+ readonly name = "jsonld";
365
+ readonly priority = 80;
366
+ extract(context: ExtractionContext): Promise<Partial<ScrapedData>>;
367
+ private extractMetadata;
368
+ private getType;
369
+ private getString;
370
+ private getAuthor;
371
+ private getImage;
372
+ private getKeywords;
373
+ }
374
+ //#endregion
375
+ //#region src/extractors/links.d.ts
376
+ /**
377
+ * Extracts links from the page content.
378
+ * Filters out navigation/footer links and focuses on content links.
379
+ */
380
+ declare class LinksExtractor implements Extractor {
381
+ readonly name = "links";
382
+ readonly priority = 30;
383
+ extract(context: ExtractionContext): Promise<Partial<ScrapedData>>;
384
+ }
385
+ //#endregion
386
+ //#region src/extractors/meta.d.ts
387
+ /**
388
+ * Extracts metadata from HTML meta tags, Open Graph, and Twitter cards.
389
+ * Runs first to provide basic metadata for other extractors.
390
+ */
391
+ declare class MetaExtractor implements Extractor {
392
+ readonly name = "meta";
393
+ readonly priority = 100;
394
+ extract(context: ExtractionContext): Promise<Partial<ScrapedData>>;
395
+ }
396
+ //#endregion
397
+ //#region src/extractors/index.d.ts
398
+ /**
399
+ * Default extractors in priority order.
400
+ * Higher priority runs first.
401
+ */
402
+ declare function createDefaultExtractors(): Extractor[];
403
+ /**
404
+ * Sort extractors by priority (higher first).
405
+ */
406
+ declare function sortExtractors(extractors: Extractor[]): Extractor[];
407
+ //#endregion
408
+ //#region src/fetchers/types.d.ts
409
+ /**
410
+ * Fetcher interface - allows swapping fetch implementation
411
+ * for Puppeteer, Playwright, or custom solutions
412
+ */
413
+ interface Fetcher$1 {
414
+ /**
415
+ * Fetch HTML from a URL
416
+ * @returns HTML content and final URL (after redirects)
417
+ */
418
+ fetch(url: string, options?: FetchOptions$1): Promise<FetchResult$1>;
419
+ /** Fetcher name for logging */
420
+ readonly name: string;
421
+ }
422
+ /**
423
+ * Options for fetching
424
+ */
425
+ interface FetchOptions$1 {
426
+ /** Timeout in milliseconds (default: 10000) */
427
+ timeout?: number;
428
+ /** User agent string */
429
+ userAgent?: string;
430
+ /** Additional headers to send */
431
+ headers?: Record<string, string>;
432
+ /**
433
+ * Allowed MIME types.
434
+ * Defaults to HTML/XHTML if undefined.
435
+ */
436
+ allowedContentTypes?: string[];
437
+ }
438
+ /**
439
+ * Result from fetching a URL
440
+ */
441
+ interface FetchResult$1 {
442
+ /** Raw HTML content */
443
+ html: string;
444
+ /** Final URL after redirects */
445
+ finalUrl: string;
446
+ /** HTTP status code */
447
+ statusCode: number;
448
+ /** Content-Type header */
449
+ contentType: string;
450
+ /** Response headers (optional) */
451
+ headers?: Record<string, string>;
452
+ }
453
+ /**
454
+ * Default user agent string
455
+ */
456
+ declare const DEFAULT_USER_AGENT = "Scrapex-Bot/2.0 (+https://github.com/developer-rakeshpaul/scrapex)";
457
+ /**
458
+ * Default timeout in milliseconds
459
+ */
460
+ declare const DEFAULT_TIMEOUT = 10000;
461
+ //#endregion
462
+ //#region src/fetchers/fetch.d.ts
463
+ /**
464
+ * Default fetcher using native fetch API.
465
+ * Works in Node.js 18+ without polyfills.
466
+ */
467
+ declare class NativeFetcher implements Fetcher$1 {
468
+ readonly name = "native-fetch";
469
+ fetch(url: string, options?: FetchOptions$1): Promise<FetchResult$1>;
470
+ }
471
+ /**
472
+ * Default fetcher instance
473
+ */
474
+ declare const defaultFetcher: NativeFetcher;
475
+ //#endregion
476
+ //#region src/fetchers/robots.d.ts
477
+ /**
478
+ * Result of robots.txt check
479
+ */
480
+ interface RobotsCheckResult {
481
+ allowed: boolean;
482
+ reason?: string;
483
+ }
484
+ /**
485
+ * Check if URL is allowed by robots.txt
486
+ *
487
+ * @param url - The URL to check
488
+ * @param userAgent - User agent to check rules for
489
+ * @returns Whether the URL is allowed and optional reason
490
+ */
491
+ declare function checkRobotsTxt(url: string, userAgent?: string): Promise<RobotsCheckResult>;
492
+ //#endregion
493
+ //#region src/utils/feed.d.ts
494
+ /**
495
+ * Fetch and parse an RSS/Atom feed from a URL.
496
+ * Uses scrapex's fetcher infrastructure for consistent behavior.
497
+ */
498
+ declare function fetchFeed(url: string, options?: {
499
+ fetcher?: Fetcher$1;
500
+ timeout?: number;
501
+ userAgent?: string;
502
+ parserOptions?: RSSParserOptions;
503
+ }): Promise<ParserResult<ParsedFeed, FeedMeta>>;
504
+ /**
505
+ * Detect RSS/Atom feed URLs from HTML.
506
+ * Supports RSS, Atom, and RDF feed types.
507
+ */
508
+ declare function discoverFeeds(html: string, baseUrl: string): string[];
509
+ /**
510
+ * Filter feed items by date range.
511
+ * Items without publishedAt are included by default.
512
+ */
513
+ declare function filterByDate(items: FeedItem[], options: {
514
+ after?: Date;
515
+ before?: Date;
516
+ includeUndated?: boolean;
517
+ }): FeedItem[];
518
+ /**
519
+ * Convert feed items to markdown for LLM consumption.
520
+ * Uses ISO 8601 date format for consistency across environments.
521
+ */
522
+ declare function feedToMarkdown(feed: ParsedFeed, options?: {
523
+ includeContent?: boolean;
524
+ maxItems?: number;
525
+ }): string;
526
+ /**
527
+ * Extract plain text from feed items for LLM processing.
528
+ * Concatenates title, description, and content.
529
+ */
530
+ declare function feedToText(feed: ParsedFeed, options?: {
531
+ maxItems?: number;
532
+ separator?: string;
533
+ }): string;
534
+ /**
535
+ * Paginate through a feed using rel="next" links (RFC 5005).
536
+ * Returns an async generator that yields each page.
537
+ */
538
+ declare function paginateFeed(url: string, options?: {
539
+ fetcher?: Fetcher$1;
540
+ timeout?: number;
541
+ userAgent?: string;
542
+ maxPages?: number;
543
+ }): AsyncGenerator<ParsedFeed, void, unknown>;
544
+ //#endregion
545
+ //#region src/utils/url.d.ts
546
+ /**
547
+ * Validate if a string is a valid URL
548
+ */
549
+ declare function isValidUrl(url: string): boolean;
550
+ /**
551
+ * Normalize URL by removing tracking params and trailing slashes
552
+ */
553
+ declare function normalizeUrl(url: string): string;
554
+ /**
555
+ * Extract domain from URL (without www prefix)
556
+ */
557
+ declare function extractDomain(url: string): string;
558
+ /**
559
+ * Resolve a potentially relative URL against a base URL
560
+ */
561
+ declare function resolveUrl(url: string | undefined | null, baseUrl: string): string | undefined;
562
+ /**
563
+ * Check if a URL is external relative to a domain
564
+ */
565
+ declare function isExternalUrl(url: string, baseDomain: string): boolean;
566
+ /**
567
+ * Extract protocol from URL
568
+ */
569
+ declare function getProtocol(url: string): string;
570
+ /**
571
+ * Get the path portion of a URL
572
+ */
573
+ declare function getPath(url: string): string;
574
+ /**
575
+ * Check if URL matches a pattern (supports * wildcard)
576
+ */
577
+ declare function matchesUrlPattern(url: string, pattern: string): boolean;
578
+ //#endregion
579
+ export { type ChunkingConfig, type CompletionOptions, ContentExtractor, type ContentType, DEFAULT_TIMEOUT, DEFAULT_USER_AGENT, type EmbedRequest, type EmbedResponse, type EmbeddingCache, type EmbeddingCacheConfig, type EmbeddingInputConfig, type EmbeddingMetrics, type EmbeddingOptions, type EmbeddingProvider, type EmbeddingProviderConfig, type EmbeddingResult, type EmbeddingSkipped, type EmbeddingSource, type EmbeddingSuccessMultiple, type EmbeddingSuccessSingle, type EnhancementType, type ExtractedEntities, type ExtractedLink, type ExtractionContext, type ExtractionSchema, type ExtractionSchemaType, type Extractor, FaviconExtractor, type FetchOptions, type FetchResult, type Fetcher, type HttpEmbeddingConfig, InMemoryEmbeddingCache, JsonLdExtractor, type LLMProvider, LinksExtractor, MetaExtractor, NativeFetcher, type OutputConfig, RSSParser, type ResilienceConfig, type RobotsCheckResult, type SafetyConfig, ScrapeError, type ScrapeErrorCode, type ScrapeOptions, type ScrapedData, TRANSFORMERS_MODELS, aggregateVectors, checkRobotsTxt, chunkText, cosineSimilarity, createAzureEmbedding, createDefaultExtractors, createEmbeddingProvider, createExtractionContext, createHttpEmbedding, createHuggingFaceEmbedding, createOllamaEmbedding, createOpenAIEmbedding, createPiiRedactor, createTransformersEmbedding, defaultFetcher, discoverFeeds, embed, embedScrapedData, estimateTokens, extractDomain, feedToMarkdown, feedToText, fetchFeed, filterByDate, generateEmbeddings, getPath, getProtocol, isExternalUrl, isValidUrl, matchesUrlPattern, mergeResults, normalizeUrl, paginateFeed, redactPii, resolveUrl, scrape, scrapeHtml, sortExtractors };
580
+ //# sourceMappingURL=index.d.cts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.cts","names":[],"sources":["../src/core/context.ts","../src/core/errors.ts","../src/core/scrape.ts","../src/embeddings/aggregation.ts","../src/embeddings/cache.ts","../src/embeddings/chunking.ts","../src/embeddings/pipeline.ts","../src/embeddings/providers/presets.ts","../src/embeddings/providers/index.ts","../src/embeddings/safety.ts","../src/extractors/content.ts","../src/extractors/favicon.ts","../src/extractors/jsonld.ts","../src/extractors/links.ts","../src/extractors/meta.ts","../src/extractors/index.ts","../src/fetchers/types.ts","../src/fetchers/fetch.ts","../src/fetchers/robots.ts","../src/utils/feed.ts","../src/utils/url.ts"],"sourcesContent":[],"mappings":";;;;;AA2DA;;;;;;iBArCgB,uBAAA,uDAIL,gBACR;;;ACxBH;AAca,iBD0CG,YAAA,CC1CS,OAAA,ED2Cd,iBC3Cc,EAAA,SAAA,ED4CZ,OC5CY,CD4CJ,WC5CI,CAAA,CAAA,ED6CtB,iBC7CsB;;;;;;KAdb,eAAA;ADmBZ;AAqCA;;AAEqB,cC5CR,WAAA,SAAoB,KAAA,CD4CZ;EAAR,SAAA,IAAA,EC3CW,eD2CX;EACV,SAAA,UAAA,CAAA,EAAA,MAAA;EAAiB,WAAA,CAAA,OAAA,EAAA,MAAA,EAAA,IAAA,ECzCiB,eDyCjB,EAAA,UAAA,CAAA,EAAA,MAAA,EAAA,KAAA,CAAA,ECzC+D,KDyC/D;;;;EC3DR,OAAA,IAAA,CAAA,KAAA,EAAe,OAAA,EAAA,IAAA,CAAA,EAiCS,eAjCT,CAAA,EAiC4C,WAjC5C;EAcd;;;EAIsE,WAAA,CAAA,CAAA,EAAA,OAAA;EAe/C;;;EAnBH,MAAA,CAAA,CAAA,EAyCrB,MAzCqB,CAAA,MAAA,EAAA,OAAA,CAAA;;;;;;;ADKjC;AAqCA;;;;;;;;;ACxDY,iBCmBU,MAAA,CDnBK,GAAA,EAAA,MAAA,EAAA,OAAA,CAAA,ECmBwB,aDnBxB,CAAA,ECmB6C,ODnB7C,CCmBqD,WDnBrD,CAAA;AAc3B;;;;;;;;;;;;ACKA;;AAAgF,iBA4J1D,UAAA,CA5J0D,IAAA,EAAA,MAAA,EAAA,GAAA,EAAA,MAAA,EAAA,OAAA,CAAA,EA+JrE,aA/JqE,CAAA,EAgK7E,OAhK6E,CAgKrE,WAhKqE,CAAA;;;;;;AFAhF;AAqCA;;;AAEa,iBGpDG,gBAAA,CHoDH,OAAA,EAAA,MAAA,EAAA,EAAA,EAAA,QAAA,CAAA,EGlDD,oBHkDC,CAAA,EGjDV,iBHiDU;;;;KGYD,iBAAA;;EFtEA,MAAA,EAAA,MAAA,EAAA;EAcC,UAAA,EAAA,MAAY;CACD,GAAA;EAGa,IAAA,EAAA,UAAA;EAA8C,OAAA,EAAA,MAAA,EAAA,EAAA;EAe/C,UAAA,EAAA,MAAA;CAAmC;;;ACdvE;;AAAgF,iBCmIhE,gBAAA,CDnIgE,CAAA,EAAA,MAAA,EAAA,EAAA,CAAA,EAAA,MAAA,EAAA,CAAA,EAAA,MAAA;;;;;;AAAhF;AAAmD,cEmLtC,sBAAA,YAAkC,cFnLI,CAAA;EAA6B,QAAA,KAAA;EAAR,iBAAA,UAAA;EAA
O,iBAAA,YAAA;EA4JzD,WAAA,CAAA,OAInB,CAJ6B,EAAA;IAGrB,UAAA,CAAA,EAAA,MAAA;IACA,KAAA,CAAA,EAAA,MAAA;EAAR,CAAA;EAAO,GAAA,CAAA,GAAA,EAAA,MAAA,CAAA,EE8BgB,OF9BhB,CE8BwB,eF9BxB,GAAA,SAAA,CAAA;0BEmDsB;;MAAgD;EDhOhE,MAAA,CAAA,GAAA,EAAA,MAAA,CAAgB,ECiPH,ODjPG,CAAA,OAEpB,CAAA;EA8DA,KAAA,CAAA,CAAA,ECqLK,ODrLL,CAAA,IAAiB,CAAA;EAgFb;;;cC4GF;EA5DD;;;EAgCmB,OAAA,CAAA,CAAA,EAAA,MAAA;EAAgD;;;EA4BlE,QAAA,QAAA;;;AA0Dd;;UAAiB,UAAA;;EC5ND,IAAA,EAAA,MAAS;EAiFT;;;;ECpIM;EACN,WAAA,EAAA,MAAA;;;;;;;;AL9CJ,iBIgGI,SAAA,CJhGW,IAAA,EAAA,MAAA,EAAA,MAAA,CAAA,EIgGsB,cJhGtB,CAAA,EIgGuC,SJhGvC,EAAA;AAc3B;;;AAImF,iBI+JnE,cAAA,CJ/JmE,IAAA,EAAA,MAAA,EAAA,SAAA,CAAA,EI+J1B,cJ/J0B,CAAA,WAAA,CAAA,CAAA,EAAA,MAAA;;;;;ADCnF;AAqCA;AACW,iBMZW,kBAAA,CNYX,IAAA,EMXH,ONWG,CMXK,WNWL,CAAA,EAAA,OAAA,EMVA,gBNUA,CAAA,EMTR,ONSQ,CMTA,eNSA,CAAA;;;;;iBMkPW,KAAA,wBAA6B,mBAAmB,QAAQ;;;AL3S9E;AAcA;AACwB,iBKkTF,gBAAA,CLlTE,IAAA,EKmThB,WLnTgB,EAAA,OAAA,EKoTb,gBLpTa,CAAA,EKqTrB,OLrTqB,CKqTb,eLrTa,CAAA;;;ADyCxB;;;;;;;;;ACxDY,iBM6BI,qBAAA,CN7BW,OAiCS,CAjCT,EAAA;EAcd,MAAA,CAAA,EAAA,MAAY;EACD,KAAA,CAAA,EAAA,MAAA;EAGa,OAAA,CAAA,EAAA,MAAA;EAA8C,YAAA,CAAA,EAAA,MAAA;CAe/C,CAAA,EMChC,iBNDgC;;;;;;;;ACdpC;;;;;AA4JsB,iBKvGN,oBAAA,CLuGgB,OAAA,EAAA;EAGrB,QAAA,EAAA,MAAA;EACA,cAAA,EAAA,MAAA;EAAR,UAAA,EAAA,MAAA;EAAO,MAAA,CAAA,EAAA,MAAA;IKtGN;;;AJvEJ;AAgEA;AAgFA;;;;ACgDA;;;;;;;AA4Dc,iBGxIE,qBAAA,CHwIF,OA0Dd,CA1Dc,EAAA;EA5DiC,OAAA,CAAA,EAAA,MAAA;EAAc,KAAA,CAAA,EAAA,MAAA;AAsH7D,CAAA,CAAA,EG/LI,iBH+LuB;;;;AC5N3B;AAiFA;;;;ACpIA;;AACQ,iBCwGQ,0BAAA,CDxGR,OAAA,EAAA;EACG,KAAA,EAAA,MAAA;EACA,MAAA,CAAA,EAAA,MAAA;CAAR,CAAA,ECyGC,iBDzGD;;ACnBH;AA2CA;AAkDA,KA6GK,yBAAA,GA7GgC,CAAA,IAAA,EAAA,MAGjC,EAAA,OA6GQ,CA7GR,EAAA;EAyBY,OAAA,CAAA,EAAA,MAAA,GAAA,KAAA,GAAA,KAA0B;EAiFrC,SAAA,CAAA,EAAA,OAAA;AAGO,CAAA,EAAA,GAAP,OAKK,CAAA;EA4BM,IAAA,EAjCK,YAiCL;AAkEhB,CAAA,CAAA;;;;ACpTA,UDsNU,kBAAA,CCtNM;;;QD2NT,QAAQ;EEtNC,GAAA,CAAA,EAAA;IAkEC,QAAA,CAAA,EAAA,MAAe;EAehB,CAAA;;;;AC7FhB;;;;;;;;;;ACdA;;;;;;;iBJuQgB,2BAAA,eACA,2BKrQqC;;;EAJxC,OAAA,CAAA,EAAA,MAAA,GAAgB,KAAA,GAAA,KAAA;EAI
J,SAAA,CAAA,EAAA,OAAA;EAAoC,QAAA,CAAA,EAAA,MAAA;CAAR,CAAA,EL6QlD,iBK7QkD;;AAJb,cL0U3B,mBK1U2B,EAAA;EAAS;;;;ECCpC;EAIY,SAAA,SAAA,EAAA,0BAAA;EAAoC;EAAR,SAAA,YAAA,EAAA,8BAAA;CAAR;;;AbgD7C;;;;AAGG,iBQlCa,uBAAA,CRkCb,MAAA,EQlC6C,uBRkC7C,CAAA,EQlCuE,iBRkCvE;;;;;;AAxCH;AAqCgB,iBS1BA,iBAAA,CT0BY,MAAA,ES1Bc,kBT0Bd,CAAA,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,GS1BqD,eT0BrD;;;;AAGzB,USqCc,eAAA,CTrCd;EAAiB;;;;EC3DR;EAcC,cAAY,EAAA,MAAA;EACD;EAGa,gBAAA,EQsFjB,MRtFiB,CAAA,MAAA,EAAA,MAAA,CAAA;;;;;;AAJC,iBQiGtB,SAAA,CRjGsB,IAAA,EAAA,MAAA,CAAA,EQiGG,eRjGH;;;;;;ADKtC;AAqCgB,cUtCH,gBAAA,YAA4B,SVsCb,CAAA;EACjB,SAAA,IAAA,GAAA,SAAA;EACU,SAAA,QAAA,GAAA,EAAA;EAAR,OAAA,CAAA,OAAA,EUpCY,iBVoCZ,CAAA,EUpCgC,OVoChC,CUpCwC,OVoCxC,CUpCgD,WVoChD,CAAA,CAAA;EACV,QAAA,eAAA;EAAiB,QAAA,aAAA;;;;;;;;AAxCpB;AAqCgB,cWpDH,gBAAA,YAA4B,SXoDb,CAAA;EACjB,SAAA,IAAA,GAAA,SAAA;EACU,SAAA,QAAA,GAAA,EAAA;EAAR,OAAA,CAAA,OAAA,EWlDY,iBXkDZ,CAAA,EWlDgC,OXkDhC,CWlDwC,OXkDxC,CWlDgD,WXkDhD,CAAA,CAAA;;;;;;;AAvCb;AAqCgB,cYrDH,eAAA,YAA2B,SZqDZ,CAAA;EACjB,SAAA,IAAA,GAAA,QAAA;EACU,SAAA,QAAA,GAAA,EAAA;EAAR,OAAA,CAAA,OAAA,EYnDY,iBZmDZ,CAAA,EYnDgC,OZmDhC,CYnDwC,OZmDxC,CYnDgD,WZmDhD,CAAA,CAAA;EACV,QAAA,eAAA;EAAiB,QAAA,OAAA;;;;EC3DR,QAAA,WAAe;AAc3B;;;;;;ADKA;AAqCgB,capDH,cAAA,YAA0B,SboDX,CAAA;EACjB,SAAA,IAAA,GAAA,OAAA;EACU,SAAA,QAAA,GAAA,EAAA;EAAR,OAAA,CAAA,OAAA,EalDY,iBbkDZ,CAAA,EalDgC,ObkDhC,CalDwC,ObkDxC,CalDgD,WbkDhD,CAAA,CAAA;;;;;;;AAvCb;AAqCgB,ccrDH,aAAA,YAAyB,SdqDV,CAAA;EACjB,SAAA,IAAA,GAAA,MAAA;EACU,SAAA,QAAA,GAAA,GAAA;EAAR,OAAA,CAAA,OAAA,EcnDY,iBdmDZ,CAAA,EcnDgC,OdmDhC,CcnDwC,OdmDxC,CcnDgD,WdmDhD,CAAA,CAAA;;;;;;;;AACO,iBe5CJ,uBAAA,CAAA,Cf4CI,Ee5CuB,Sf4CvB,EAAA;;;;AC3DR,iBc4BI,cAAA,Cd5BW,UAAA,Ec4BgB,Sd5BhB,EAAA,CAAA,Ec4B8B,Sd5B9B,EAAA;;;;;;;ADmBX,UgBlBC,SAAA,ChBkBD;EAqCA;;;;EAGb,KAAA,CAAA,GAAA,EAAA,MAAA,EAAA,OAAA,CAAA,EgBrD4B,chBqD5B,CAAA,EgBrD2C,OhBqD3C,CgBrDmD,ahBqDnD,CAAA;EAAiB;;;;AC3DpB;AAcA;AACwB,UeAP,cAAA,CfAO;EAGa;EAA8C,OAAA,CAAA,EAAA,MAAA;EAe/C;EAAmC,SAAA,CAAA,EAAA,MAAA;EAsB3D;EAzCqB,OAAA,CAAA,EeSrB,MfTqB,CAAA,MAAA,EAAA,MAAA,CAAA
;EAAK;;;;ECKhB,mBAAM,CAAA,EAAA,MAAA,EAAA;;;;;AA4JN,Uc5IL,aAAA,Cd4Ie;EAGrB;EACA,IAAA,EAAA,MAAA;EAAR;EAAO,QAAA,EAAA,MAAA;;;;EC7KM,WAAA,EAAA,MAAgB;EAgEpB;EAgFI,OAAA,CAAA,EarGJ,MbqGI,CAAA,MAAgB,EAAA,MAAA,CAAA;;;;ACgDhC;AAWkC,cY1JrB,kBAAA,GZ0JqB,oEAAA;;;;AAsCL,cY1LhB,eAAA,GZ0LgB,KAAA;;;;;;AJpO7B;AAqCgB,ciB9CH,aAAA,YAAyB,SjB8CV,CAAA;EACjB,SAAA,IAAA,GAAA,cAAA;EACU,KAAA,CAAA,GAAA,EAAA,MAAA,EAAA,OAAA,CAAA,EiB7Ce,cjB6Cf,CAAA,EiB7CmC,OjB6CnC,CiB7C2C,ajB6C3C,CAAA;;;;;ciBiER,gBAAc;;;;;;UCzHV,iBAAA;ElBiBD,OAAA,EAAA,OAAA;EAqCA,MAAA,CAAA,EAAA,MAAY;;;;;;;;;ACxDhB,iBiBsBU,cAAA,CjBtBK,GAAA,EAAA,MAAA,EAAA,SAAA,CAAA,EAAA,MAAA,CAAA,EiByBxB,OjBzBwB,CiByBhB,iBjBzBgB,CAAA;;;;ADmB3B;AAqCA;;AAEqB,iBmBnDC,SAAA,CnBmDD,GAAA,EAAA,MAAA,EAAA,QAAA,EAAA;EAAR,OAAA,CAAA,EmBhDC,SnBgDD;EACV,OAAA,CAAA,EAAA,MAAA;EAAiB,SAAA,CAAA,EAAA,MAAA;kBmB9CA;IAEjB,QAAQ,aAAa,YAAY;;AlBfpC;AAcA;;AAIqC,iBkBqBrB,aAAA,ClBrBqB,IAAA,EAAA,MAAA,EAAA,OAAA,EAAA,MAAA,CAAA,EAAA,MAAA,EAAA;;;;;AAJJ,iBkB4DjB,YAAA,ClB5DiB,KAAA,EkB6DxB,QlB7DwB,EAAA,EAAA,OAAA,EAAA;EAAK,KAAA,CAAA,EkB8DjB,IlB9DiB;WkB8DF;;IACjC;AjB1DH;;;;AAA+E,iBiB6E/D,cAAA,CjB7E+D,IAAA,EiB8EvE,UjB9EuE,EAAA,OAgKpE,CAhKoE,EAAA;EA4JzD,cAAU,CAAA,EAAA,OAAA;EAGrB,QAAA,CAAA,EAAA,MAAA;CACA,CAAA,EAAA,MAAA;;;;;iBiBpCK,UAAA,OACR,mBhBMR;EAhJgB,QAAA,CAAA,EAAA,MAAA;EAgEJ,SAAA,CAAA,EAAA,MAAA;AAgFZ,CAAA,CAAA,EAAgB,MAAA;;;;ACgDhB;AAWkC,iBe1CX,YAAA,Cf0CW,GAAA,EAAA,MAAA,EAAA,OA0CjB,CA1CiB,EAAA;EAAR,OAAA,CAAA,EevCZ,SfuCY;EAqBM,OAAA,CAAA,EAAA,MAAA;EAAgD,SAAA,CAAA,EAAA,MAAA;EAiBnD,QAAA,CAAA,EAAA,MAAA;CAIZ,CAAA,Ee5Ed,cf4Ec,Ce5EC,Uf4ED,EAAA,IAAA,EAAA,OAAA,CAAA;;;;;;iBgBnOD,UAAA;ApBLhB;AAqCA;;AAEqB,iBoBtBL,YAAA,CpBsBK,GAAA,EAAA,MAAA,CAAA,EAAA,MAAA;;;;iBoBEL,aAAA;;;AnB5DhB;AAca,iBmB0DG,UAAA,CnB1DS,GAAA,EAAA,MAAA,GAAA,SAAA,GAAA,IAAA,EAAA,OAAA,EAAA,MAAA,CAAA,EAAA,MAAA,GAAA,SAAA;;;;AAmBW,iBmBoDpB,aAAA,CnBpDoB,GAAA,EAAA,MAAA,EAAA,UAAA,EAAA,MAAA,CAAA,EAAA,OAAA;;;;AAnBE,iBmBoFtB,WAAA,CnBpFsB,GAAA,EAAA,MAAA,CAAA,EAAA,MAAA;;;;ACKhB,iBkB0FN,OAAA,ClB1FY,GAAA,EAAA,MAAA,CAAA,EAAA,MAAA;;;;AAAmD,iBkBqG/D,iBAAA,Cl
BrG+D,GAAA,EAAA,MAAA,EAAA,OAAA,EAAA,MAAA,CAAA,EAAA,OAAA"}