scrapex 1.0.0-alpha.1 → 1.0.0-beta.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +164 -5
- package/dist/enhancer-ByjRD-t5.mjs +769 -0
- package/dist/enhancer-ByjRD-t5.mjs.map +1 -0
- package/dist/enhancer-j0xqKDJm.cjs +847 -0
- package/dist/enhancer-j0xqKDJm.cjs.map +1 -0
- package/dist/index-CDgcRnig.d.cts +268 -0
- package/dist/index-CDgcRnig.d.cts.map +1 -0
- package/dist/index-piS5wtki.d.mts +268 -0
- package/dist/index-piS5wtki.d.mts.map +1 -0
- package/dist/index.cjs +1192 -37
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +318 -2
- package/dist/index.d.cts.map +1 -1
- package/dist/index.d.mts +318 -2
- package/dist/index.d.mts.map +1 -1
- package/dist/index.mjs +1164 -6
- package/dist/index.mjs.map +1 -1
- package/dist/llm/index.cjs +250 -232
- package/dist/llm/index.cjs.map +1 -1
- package/dist/llm/index.d.cts +132 -85
- package/dist/llm/index.d.cts.map +1 -1
- package/dist/llm/index.d.mts +132 -85
- package/dist/llm/index.d.mts.map +1 -1
- package/dist/llm/index.mjs +243 -236
- package/dist/llm/index.mjs.map +1 -1
- package/dist/parsers/index.cjs +10 -199
- package/dist/parsers/index.d.cts +2 -133
- package/dist/parsers/index.d.mts +2 -133
- package/dist/parsers/index.mjs +2 -191
- package/dist/parsers-Bneuws8x.cjs +569 -0
- package/dist/parsers-Bneuws8x.cjs.map +1 -0
- package/dist/parsers-CwkYnyWY.mjs +482 -0
- package/dist/parsers-CwkYnyWY.mjs.map +1 -0
- package/dist/types-CadAXrme.d.mts +674 -0
- package/dist/types-CadAXrme.d.mts.map +1 -0
- package/dist/types-DPEtPihB.d.cts +674 -0
- package/dist/types-DPEtPihB.d.cts.map +1 -0
- package/package.json +15 -16
- package/dist/enhancer-Q6CSc1gA.mjs +0 -220
- package/dist/enhancer-Q6CSc1gA.mjs.map +0 -1
- package/dist/enhancer-oM4BhYYS.cjs +0 -268
- package/dist/enhancer-oM4BhYYS.cjs.map +0 -1
- package/dist/parsers/index.cjs.map +0 -1
- package/dist/parsers/index.d.cts.map +0 -1
- package/dist/parsers/index.d.mts.map +0 -1
- package/dist/parsers/index.mjs.map +0 -1
- package/dist/types-CNQZVW36.d.mts +0 -150
- package/dist/types-CNQZVW36.d.mts.map +0 -1
- package/dist/types-D0HYR95H.d.cts +0 -150
- package/dist/types-D0HYR95H.d.cts.map +0 -1
package/dist/index.d.mts
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
|
-
import { a as ExtractedLink, c as ExtractionSchemaType, d as FetchResult, f as Fetcher, h as ScrapedData, i as ExtractedEntities, l as Extractor, m as ScrapeOptions, n as ContentType, o as ExtractionContext, p as LLMProvider, r as EnhancementType, s as ExtractionSchema, t as CompletionOptions, u as FetchOptions } from "./types-
|
|
1
|
+
import { A as EmbeddingSuccessMultiple, C as EmbeddingMetrics, D as EmbeddingResult, E as EmbeddingProviderConfig, F as SafetyConfig, I as TextChunk, L as HttpEmbeddingConfig, M as OutputConfig, N as PiiRedactionConfig, O as EmbeddingSkipped, P as ResilienceConfig, R as createHttpEmbedding, S as EmbeddingInputConfig, T as EmbeddingProvider, _ as EmbedRequest, a as ExtractedLink, b as EmbeddingCache, c as ExtractionSchemaType, d as FetchResult, f as Fetcher, g as ChunkingConfig, h as ScrapedData, i as ExtractedEntities, j as EmbeddingSuccessSingle, k as EmbeddingSource, l as Extractor, m as ScrapeOptions, n as ContentType, o as ExtractionContext, p as LLMProvider, r as EnhancementType, s as ExtractionSchema, t as CompletionOptions, u as FetchOptions, v as EmbedResponse, w as EmbeddingOptions, x as EmbeddingCacheConfig, y as EmbeddingAggregation } from "./types-CadAXrme.mjs";
|
|
2
|
+
import { b as ParserResult, m as FeedMeta, n as RSSParserOptions, p as FeedItem, t as RSSParser, v as ParsedFeed } from "./index-piS5wtki.mjs";
|
|
2
3
|
|
|
3
4
|
//#region src/core/context.d.ts
|
|
4
5
|
|
|
@@ -71,6 +72,264 @@ declare function scrape(url: string, options?: ScrapeOptions): Promise<ScrapedDa
|
|
|
71
72
|
*/
|
|
72
73
|
declare function scrapeHtml(html: string, url: string, options?: ScrapeOptions): Promise<ScrapedData>;
|
|
73
74
|
//#endregion
|
|
75
|
+
//#region src/embeddings/aggregation.d.ts
|
|
76
|
+
/**
|
|
77
|
+
* Aggregate multiple embedding vectors into a single vector or return all.
|
|
78
|
+
*
|
|
79
|
+
* @param vectors - Array of embedding vectors (must all have same dimensions)
|
|
80
|
+
* @param strategy - Aggregation strategy
|
|
81
|
+
* @returns Aggregated result based on strategy
|
|
82
|
+
*/
|
|
83
|
+
declare function aggregateVectors(vectors: number[][], strategy?: EmbeddingAggregation): AggregationResult;
|
|
84
|
+
/**
|
|
85
|
+
* Result of vector aggregation.
|
|
86
|
+
*/
|
|
87
|
+
type AggregationResult = {
|
|
88
|
+
type: 'single';
|
|
89
|
+
vector: number[];
|
|
90
|
+
dimensions: number;
|
|
91
|
+
} | {
|
|
92
|
+
type: 'multiple';
|
|
93
|
+
vectors: number[][];
|
|
94
|
+
dimensions: number;
|
|
95
|
+
};
|
|
96
|
+
/**
|
|
97
|
+
* Compute cosine similarity between two vectors.
|
|
98
|
+
* Both vectors should be normalized for accurate results.
|
|
99
|
+
*/
|
|
100
|
+
declare function cosineSimilarity(a: number[], b: number[]): number;
|
|
101
|
+
//#endregion
|
|
102
|
+
//#region src/embeddings/cache.d.ts
|
|
103
|
+
/**
|
|
104
|
+
* In-memory LRU cache with TTL support.
|
|
105
|
+
* Content-addressable: uses content hash as key, not URL.
|
|
106
|
+
*/
|
|
107
|
+
declare class InMemoryEmbeddingCache implements EmbeddingCache {
|
|
108
|
+
private cache;
|
|
109
|
+
private readonly maxEntries;
|
|
110
|
+
private readonly defaultTtlMs;
|
|
111
|
+
constructor(options?: {
|
|
112
|
+
maxEntries?: number;
|
|
113
|
+
ttlMs?: number;
|
|
114
|
+
});
|
|
115
|
+
get(key: string): Promise<EmbeddingResult | undefined>;
|
|
116
|
+
set(key: string, value: EmbeddingResult, options?: {
|
|
117
|
+
ttlMs?: number;
|
|
118
|
+
}): Promise<void>;
|
|
119
|
+
delete(key: string): Promise<boolean>;
|
|
120
|
+
clear(): Promise<void>;
|
|
121
|
+
/**
|
|
122
|
+
* Get cache statistics.
|
|
123
|
+
*/
|
|
124
|
+
getStats(): CacheStats;
|
|
125
|
+
/**
|
|
126
|
+
* Evict expired entries.
|
|
127
|
+
*/
|
|
128
|
+
cleanup(): number;
|
|
129
|
+
/**
|
|
130
|
+
* Evict least recently used entry.
|
|
131
|
+
*/
|
|
132
|
+
private evictLRU;
|
|
133
|
+
}
|
|
134
|
+
/**
|
|
135
|
+
* Cache statistics.
|
|
136
|
+
*/
|
|
137
|
+
interface CacheStats {
|
|
138
|
+
/** Current number of entries */
|
|
139
|
+
size: number;
|
|
140
|
+
/** Maximum allowed entries */
|
|
141
|
+
maxEntries: number;
|
|
142
|
+
/** Number of expired entries (not yet cleaned up) */
|
|
143
|
+
expired: number;
|
|
144
|
+
/** Cache utilization (0-1) */
|
|
145
|
+
utilization: number;
|
|
146
|
+
}
|
|
147
|
+
//#endregion
|
|
148
|
+
//#region src/embeddings/chunking.d.ts
|
|
149
|
+
/**
|
|
150
|
+
* Split text into overlapping chunks optimized for embedding.
|
|
151
|
+
* Respects sentence boundaries when possible.
|
|
152
|
+
*/
|
|
153
|
+
declare function chunkText(text: string, config?: ChunkingConfig): TextChunk[];
|
|
154
|
+
/**
|
|
155
|
+
* Estimate total tokens for a text without chunking.
|
|
156
|
+
*/
|
|
157
|
+
declare function estimateTokens(text: string, tokenizer?: ChunkingConfig['tokenizer']): number;
|
|
158
|
+
//#endregion
|
|
159
|
+
//#region src/embeddings/pipeline.d.ts
|
|
160
|
+
/**
|
|
161
|
+
* Generate embeddings for scraped data.
|
|
162
|
+
* This is the main entry point for the embedding pipeline.
|
|
163
|
+
*/
|
|
164
|
+
declare function generateEmbeddings(data: Partial<ScrapedData>, options: EmbeddingOptions): Promise<EmbeddingResult>;
|
|
165
|
+
/**
|
|
166
|
+
* Embed arbitrary text directly.
|
|
167
|
+
* Standalone function for embedding text outside of scrape().
|
|
168
|
+
*/
|
|
169
|
+
declare function embed(text: string, options: EmbeddingOptions): Promise<EmbeddingResult>;
|
|
170
|
+
/**
|
|
171
|
+
* Embed from existing ScrapedData.
|
|
172
|
+
* Useful when you've already scraped and want to add embeddings later.
|
|
173
|
+
*/
|
|
174
|
+
declare function embedScrapedData(data: ScrapedData, options: EmbeddingOptions): Promise<EmbeddingResult>;
|
|
175
|
+
//#endregion
|
|
176
|
+
//#region src/embeddings/providers/presets.d.ts
|
|
177
|
+
/**
|
|
178
|
+
* Create an OpenAI embedding provider.
|
|
179
|
+
*
|
|
180
|
+
* @example
|
|
181
|
+
* ```ts
|
|
182
|
+
* const provider = createOpenAIEmbedding({ apiKey: 'sk-...' });
|
|
183
|
+
* const { embeddings } = await provider.embed(['Hello'], { model: 'text-embedding-3-small' });
|
|
184
|
+
* ```
|
|
185
|
+
*/
|
|
186
|
+
declare function createOpenAIEmbedding(options?: {
|
|
187
|
+
apiKey?: string;
|
|
188
|
+
model?: string;
|
|
189
|
+
baseUrl?: string;
|
|
190
|
+
organization?: string;
|
|
191
|
+
}): EmbeddingProvider;
|
|
192
|
+
/**
|
|
193
|
+
* Create an Azure OpenAI embedding provider.
|
|
194
|
+
*
|
|
195
|
+
* @example
|
|
196
|
+
* ```ts
|
|
197
|
+
* const provider = createAzureEmbedding({
|
|
198
|
+
* endpoint: 'https://my-resource.openai.azure.com',
|
|
199
|
+
* deploymentName: 'text-embedding-ada-002',
|
|
200
|
+
* apiVersion: '2023-05-15',
|
|
201
|
+
* });
|
|
202
|
+
* ```
|
|
203
|
+
*/
|
|
204
|
+
declare function createAzureEmbedding(options: {
|
|
205
|
+
endpoint: string;
|
|
206
|
+
deploymentName: string;
|
|
207
|
+
apiVersion: string;
|
|
208
|
+
apiKey?: string;
|
|
209
|
+
}): EmbeddingProvider;
|
|
210
|
+
/**
|
|
211
|
+
* Create an Ollama embedding provider for local models.
|
|
212
|
+
*
|
|
213
|
+
* LIMITATION: Ollama's /api/embeddings endpoint processes one text at a time,
|
|
214
|
+
* not batches. When multiple chunks are embedded, each chunk triggers a
|
|
215
|
+
* separate HTTP request. This is handled transparently by the pipeline's
|
|
216
|
+
* sequential chunk processing, but may be slower than batch-capable providers.
|
|
217
|
+
* For high-throughput scenarios, consider using OpenAI, Cohere, or HuggingFace
|
|
218
|
+
* which support batch embedding in a single request.
|
|
219
|
+
*
|
|
220
|
+
* @example
|
|
221
|
+
* ```ts
|
|
222
|
+
* const provider = createOllamaEmbedding({ model: 'nomic-embed-text' });
|
|
223
|
+
* ```
|
|
224
|
+
*/
|
|
225
|
+
declare function createOllamaEmbedding(options?: {
|
|
226
|
+
baseUrl?: string;
|
|
227
|
+
model?: string;
|
|
228
|
+
}): EmbeddingProvider;
|
|
229
|
+
/**
|
|
230
|
+
* Create a HuggingFace Inference API embedding provider.
|
|
231
|
+
*
|
|
232
|
+
* @example
|
|
233
|
+
* ```ts
|
|
234
|
+
* const provider = createHuggingFaceEmbedding({
|
|
235
|
+
* model: 'sentence-transformers/all-MiniLM-L6-v2',
|
|
236
|
+
* });
|
|
237
|
+
* ```
|
|
238
|
+
*/
|
|
239
|
+
declare function createHuggingFaceEmbedding(options: {
|
|
240
|
+
model: string;
|
|
241
|
+
apiKey?: string;
|
|
242
|
+
}): EmbeddingProvider;
|
|
243
|
+
/**
|
|
244
|
+
* Feature extraction pipeline type for Transformers.js
|
|
245
|
+
*/
|
|
246
|
+
type FeatureExtractionPipeline = (text: string, options?: {
|
|
247
|
+
pooling?: 'mean' | 'cls' | 'max';
|
|
248
|
+
normalize?: boolean;
|
|
249
|
+
}) => Promise<{
|
|
250
|
+
data: Float32Array;
|
|
251
|
+
}>;
|
|
252
|
+
/**
|
|
253
|
+
* Transformers.js module interface for dependency injection.
|
|
254
|
+
*/
|
|
255
|
+
interface TransformersModule {
|
|
256
|
+
pipeline: (task: 'feature-extraction', model: string, options?: {
|
|
257
|
+
quantized?: boolean;
|
|
258
|
+
}) => Promise<FeatureExtractionPipeline>;
|
|
259
|
+
env?: {
|
|
260
|
+
cacheDir?: string;
|
|
261
|
+
};
|
|
262
|
+
}
|
|
263
|
+
/**
|
|
264
|
+
* Create a local Transformers.js embedding provider.
|
|
265
|
+
* Uses dependency injection - user provides the imported transformers module.
|
|
266
|
+
*
|
|
267
|
+
* @example
|
|
268
|
+
* ```typescript
|
|
269
|
+
* import * as transformers from '@huggingface/transformers';
|
|
270
|
+
* import { createTransformersEmbedding } from 'scrapex/embeddings';
|
|
271
|
+
*
|
|
272
|
+
* const provider = createTransformersEmbedding(transformers, {
|
|
273
|
+
* model: 'Xenova/all-MiniLM-L6-v2',
|
|
274
|
+
* });
|
|
275
|
+
* ```
|
|
276
|
+
*
|
|
277
|
+
* Required Node.js dependencies:
|
|
278
|
+
* ```
|
|
279
|
+
* npm install @huggingface/transformers onnxruntime-node
|
|
280
|
+
* ```
|
|
281
|
+
*/
|
|
282
|
+
declare function createTransformersEmbedding(transformers: TransformersModule, options?: {
|
|
283
|
+
model?: string;
|
|
284
|
+
quantized?: boolean;
|
|
285
|
+
pooling?: 'mean' | 'cls' | 'max';
|
|
286
|
+
normalize?: boolean;
|
|
287
|
+
cacheDir?: string;
|
|
288
|
+
}): EmbeddingProvider;
|
|
289
|
+
/** Recommended models for Transformers.js */
|
|
290
|
+
declare const TRANSFORMERS_MODELS: {
|
|
291
|
+
/** Default - Fast, general purpose (384 dimensions, ~23MB) */
|
|
292
|
+
readonly DEFAULT: "Xenova/all-MiniLM-L6-v2";
|
|
293
|
+
/** Higher quality, more resources (768 dimensions, ~110MB) */
|
|
294
|
+
readonly QUALITY: "Xenova/all-mpnet-base-v2";
|
|
295
|
+
/** Optimized for retrieval (384 dimensions, ~33MB) */
|
|
296
|
+
readonly RETRIEVAL: "Xenova/bge-small-en-v1.5";
|
|
297
|
+
/** Multi-language support (384 dimensions, ~118MB) */
|
|
298
|
+
readonly MULTILINGUAL: "Xenova/multilingual-e5-small";
|
|
299
|
+
};
|
|
300
|
+
//#endregion
|
|
301
|
+
//#region src/embeddings/providers/index.d.ts
|
|
302
|
+
/**
|
|
303
|
+
* Create an embedding provider from configuration.
|
|
304
|
+
* This is the main factory function for creating providers.
|
|
305
|
+
*/
|
|
306
|
+
declare function createEmbeddingProvider(config: EmbeddingProviderConfig): EmbeddingProvider;
|
|
307
|
+
//#endregion
|
|
308
|
+
//#region src/embeddings/safety.d.ts
|
|
309
|
+
/**
|
|
310
|
+
* Create a redaction function based on configuration.
|
|
311
|
+
* Returns a function that applies all configured PII patterns.
|
|
312
|
+
*/
|
|
313
|
+
declare function createPiiRedactor(config: PiiRedactionConfig): (text: string) => RedactionResult;
|
|
314
|
+
/**
|
|
315
|
+
* Result of PII redaction operation.
|
|
316
|
+
*/
|
|
317
|
+
interface RedactionResult {
|
|
318
|
+
/** Redacted text */
|
|
319
|
+
text: string;
|
|
320
|
+
/** Whether any redactions were made */
|
|
321
|
+
redacted: boolean;
|
|
322
|
+
/** Total number of redactions */
|
|
323
|
+
redactionCount: number;
|
|
324
|
+
/** Count by redaction type */
|
|
325
|
+
redactionsByType: Record<string, number>;
|
|
326
|
+
}
|
|
327
|
+
/**
|
|
328
|
+
* Simple redaction that applies all default patterns.
|
|
329
|
+
* Use createPiiRedactor() for fine-grained control.
|
|
330
|
+
*/
|
|
331
|
+
declare function redactPii(text: string): RedactionResult;
|
|
332
|
+
//#endregion
|
|
74
333
|
//#region src/extractors/content.d.ts
|
|
75
334
|
/**
|
|
76
335
|
* Extracts main content using Mozilla Readability.
|
|
@@ -170,6 +429,11 @@ interface FetchOptions$1 {
|
|
|
170
429
|
userAgent?: string;
|
|
171
430
|
/** Additional headers to send */
|
|
172
431
|
headers?: Record<string, string>;
|
|
432
|
+
/**
|
|
433
|
+
* Allowed MIME types.
|
|
434
|
+
* Defaults to HTML/XHTML if undefined.
|
|
435
|
+
*/
|
|
436
|
+
allowedContentTypes?: string[];
|
|
173
437
|
}
|
|
174
438
|
/**
|
|
175
439
|
* Result from fetching a URL
|
|
@@ -226,6 +490,58 @@ interface RobotsCheckResult {
|
|
|
226
490
|
*/
|
|
227
491
|
declare function checkRobotsTxt(url: string, userAgent?: string): Promise<RobotsCheckResult>;
|
|
228
492
|
//#endregion
|
|
493
|
+
//#region src/utils/feed.d.ts
|
|
494
|
+
/**
|
|
495
|
+
* Fetch and parse an RSS/Atom feed from a URL.
|
|
496
|
+
* Uses scrapex's fetcher infrastructure for consistent behavior.
|
|
497
|
+
*/
|
|
498
|
+
declare function fetchFeed(url: string, options?: {
|
|
499
|
+
fetcher?: Fetcher$1;
|
|
500
|
+
timeout?: number;
|
|
501
|
+
userAgent?: string;
|
|
502
|
+
parserOptions?: RSSParserOptions;
|
|
503
|
+
}): Promise<ParserResult<ParsedFeed, FeedMeta>>;
|
|
504
|
+
/**
|
|
505
|
+
* Detect RSS/Atom feed URLs from HTML.
|
|
506
|
+
* Supports RSS, Atom, and RDF feed types.
|
|
507
|
+
*/
|
|
508
|
+
declare function discoverFeeds(html: string, baseUrl: string): string[];
|
|
509
|
+
/**
|
|
510
|
+
* Filter feed items by date range.
|
|
511
|
+
* Items without publishedAt are included by default.
|
|
512
|
+
*/
|
|
513
|
+
declare function filterByDate(items: FeedItem[], options: {
|
|
514
|
+
after?: Date;
|
|
515
|
+
before?: Date;
|
|
516
|
+
includeUndated?: boolean;
|
|
517
|
+
}): FeedItem[];
|
|
518
|
+
/**
|
|
519
|
+
* Convert feed items to markdown for LLM consumption.
|
|
520
|
+
* Uses ISO 8601 date format for consistency across environments.
|
|
521
|
+
*/
|
|
522
|
+
declare function feedToMarkdown(feed: ParsedFeed, options?: {
|
|
523
|
+
includeContent?: boolean;
|
|
524
|
+
maxItems?: number;
|
|
525
|
+
}): string;
|
|
526
|
+
/**
|
|
527
|
+
* Extract plain text from feed items for LLM processing.
|
|
528
|
+
* Concatenates title, description, and content.
|
|
529
|
+
*/
|
|
530
|
+
declare function feedToText(feed: ParsedFeed, options?: {
|
|
531
|
+
maxItems?: number;
|
|
532
|
+
separator?: string;
|
|
533
|
+
}): string;
|
|
534
|
+
/**
|
|
535
|
+
* Paginate through a feed using rel="next" links (RFC 5005).
|
|
536
|
+
* Returns an async generator that yields each page.
|
|
537
|
+
*/
|
|
538
|
+
declare function paginateFeed(url: string, options?: {
|
|
539
|
+
fetcher?: Fetcher$1;
|
|
540
|
+
timeout?: number;
|
|
541
|
+
userAgent?: string;
|
|
542
|
+
maxPages?: number;
|
|
543
|
+
}): AsyncGenerator<ParsedFeed, void, unknown>;
|
|
544
|
+
//#endregion
|
|
229
545
|
//#region src/utils/url.d.ts
|
|
230
546
|
/**
|
|
231
547
|
* Validate if a string is a valid URL
|
|
@@ -260,5 +576,5 @@ declare function getPath(url: string): string;
|
|
|
260
576
|
*/
|
|
261
577
|
declare function matchesUrlPattern(url: string, pattern: string): boolean;
|
|
262
578
|
//#endregion
|
|
263
|
-
export { type CompletionOptions, ContentExtractor, type ContentType, DEFAULT_TIMEOUT, DEFAULT_USER_AGENT, type EnhancementType, type ExtractedEntities, type ExtractedLink, type ExtractionContext, type ExtractionSchema, type ExtractionSchemaType, type Extractor, FaviconExtractor, type FetchOptions, type FetchResult, type Fetcher, JsonLdExtractor, type LLMProvider, LinksExtractor, MetaExtractor, NativeFetcher, type RobotsCheckResult, ScrapeError, type ScrapeErrorCode, type ScrapeOptions, type ScrapedData, checkRobotsTxt, createDefaultExtractors, createExtractionContext, defaultFetcher, extractDomain, getPath, getProtocol, isExternalUrl, isValidUrl, matchesUrlPattern, mergeResults, normalizeUrl, resolveUrl, scrape, scrapeHtml, sortExtractors };
|
|
579
|
+
export { type ChunkingConfig, type CompletionOptions, ContentExtractor, type ContentType, DEFAULT_TIMEOUT, DEFAULT_USER_AGENT, type EmbedRequest, type EmbedResponse, type EmbeddingCache, type EmbeddingCacheConfig, type EmbeddingInputConfig, type EmbeddingMetrics, type EmbeddingOptions, type EmbeddingProvider, type EmbeddingProviderConfig, type EmbeddingResult, type EmbeddingSkipped, type EmbeddingSource, type EmbeddingSuccessMultiple, type EmbeddingSuccessSingle, type EnhancementType, type ExtractedEntities, type ExtractedLink, type ExtractionContext, type ExtractionSchema, type ExtractionSchemaType, type Extractor, FaviconExtractor, type FetchOptions, type FetchResult, type Fetcher, type HttpEmbeddingConfig, InMemoryEmbeddingCache, JsonLdExtractor, type LLMProvider, LinksExtractor, MetaExtractor, NativeFetcher, type OutputConfig, RSSParser, type ResilienceConfig, type RobotsCheckResult, type SafetyConfig, ScrapeError, type ScrapeErrorCode, type ScrapeOptions, type ScrapedData, TRANSFORMERS_MODELS, aggregateVectors, checkRobotsTxt, chunkText, cosineSimilarity, createAzureEmbedding, createDefaultExtractors, createEmbeddingProvider, createExtractionContext, createHttpEmbedding, createHuggingFaceEmbedding, createOllamaEmbedding, createOpenAIEmbedding, createPiiRedactor, createTransformersEmbedding, defaultFetcher, discoverFeeds, embed, embedScrapedData, estimateTokens, extractDomain, feedToMarkdown, feedToText, fetchFeed, filterByDate, generateEmbeddings, getPath, getProtocol, isExternalUrl, isValidUrl, matchesUrlPattern, mergeResults, normalizeUrl, paginateFeed, redactPii, resolveUrl, scrape, scrapeHtml, sortExtractors };
|
|
264
580
|
//# sourceMappingURL=index.d.mts.map
|
package/dist/index.d.mts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.mts","names":[],"sources":["../src/core/context.ts","../src/core/errors.ts","../src/core/scrape.ts","../src/extractors/content.ts","../src/extractors/favicon.ts","../src/extractors/jsonld.ts","../src/extractors/links.ts","../src/extractors/meta.ts","../src/extractors/index.ts","../src/fetchers/types.ts","../src/fetchers/fetch.ts","../src/fetchers/robots.ts","../src/utils/url.ts"],"sourcesContent":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"index.d.mts","names":[],"sources":["../src/core/context.ts","../src/core/errors.ts","../src/core/scrape.ts","../src/embeddings/aggregation.ts","../src/embeddings/cache.ts","../src/embeddings/chunking.ts","../src/embeddings/pipeline.ts","../src/embeddings/providers/presets.ts","../src/embeddings/providers/index.ts","../src/embeddings/safety.ts","../src/extractors/content.ts","../src/extractors/favicon.ts","../src/extractors/jsonld.ts","../src/extractors/links.ts","../src/extractors/meta.ts","../src/extractors/index.ts","../src/fetchers/types.ts","../src/fetchers/fetch.ts","../src/fetchers/robots.ts","../src/utils/feed.ts","../src/utils/url.ts"],"sourcesContent":[],"mappings":";;;;;AA2DA;;;;;;iBArCgB,uBAAA,uDAIL,gBACR;;;ACxBH;AAca,iBD0CG,YAAA,CC1CS,OAAA,ED2Cd,iBC3Cc,EAAA,SAAA,ED4CZ,OC5CY,CD4CJ,WC5CI,CAAA,CAAA,ED6CtB,iBC7CsB;;;;;;KAdb,eAAA;ADmBZ;AAqCA;;AAEqB,cC5CR,WAAA,SAAoB,KAAA,CD4CZ;EAAR,SAAA,IAAA,EC3CW,eD2CX;EACV,SAAA,UAAA,CAAA,EAAA,MAAA;EAAiB,WAAA,CAAA,OAAA,EAAA,MAAA,EAAA,IAAA,ECzCiB,eDyCjB,EAAA,UAAA,CAAA,EAAA,MAAA,EAAA,KAAA,CAAA,ECzC+D,KDyC/D;;;;EC3DR,OAAA,IAAA,CAAA,KAAA,EAAe,OAAA,EAAA,IAAA,CAAA,EAiCS,eAjCT,CAAA,EAiC4C,WAjC5C;EAcd;;;EAIsE,WAAA,CAAA,CAAA,EAAA,OAAA;EAe/C;;;EAnBH,MAAA,CAAA,CAAA,EAyCrB,MAzCqB,CAAA,MAAA,EAAA,OAAA,CAAA;;;;;;;ADKjC;AAqCA;;;;;;;;;ACxDY,iBCmBU,MAAA,CDnBK,GAAA,EAAA,MAAA,EAAA,OAAA,CAAA,ECmBwB,aDnBxB,CAAA,ECmB6C,ODnB7C,CCmBqD,WDnBrD,CAAA;AAc3B;;;;;;;;;;;;ACKA;;AAAgF,iBA4J1D,UAAA,CA5J0D,IAAA,EAAA,MAAA,EAAA,GAAA,EAAA,MAAA,EAAA,OAAA,CAAA,EA+JrE,aA/JqE,CAAA,EAgK7E,OAhK6E,CAgKrE,WAhKqE,CAAA;;;;;;AFAhF;AAqCA;;;AAEa,iBGpDG,gBAAA,CHoDH,OAAA,EAAA,MAAA,EAAA,EAAA,EAAA,QAAA,CAAA,EGlDD,oBHkDC,CAAA,EGjDV,iBHiDU;;;;KGYD,iBAAA;;EFtEA,MAAA,EAAA,MAAA,EAAA;EAcC,UAAA,EAAA,MAAY;CACD,GAAA;EAGa,IAAA,EAAA,UAAA;EAA8C,OAAA,EAAA,MAAA,EAAA,EAAA;EAe/C,UAAA,EAAA,MAAA;CAAmC;;;ACdvE;;AAAgF,iBCmIhE,gBAAA,CDnIgE,CAAA,EAAA,MAAA,EAAA,EAAA,CAAA,EAAA,MAAA,EAAA,CAAA,EAAA,MAAA;;;;;;AAAhF;AAAmD,cEmLtC,sBAAA,YAAkC,cFnLI,CAAA;EAA6B,QAAA,KAAA;EAAR,iBAAA,UAAA;EAAO,iBAAA,YAAA;EA4JzD,WAAA,CAAA,OAInB,CAJ6B,EAAA;IAGrB,UAAA,CAAA,EAAA,MAAA;IACA,KAAA,CAAA,EAAA,MAAA;EAAR,CAAA;EAAO,GAAA,CAAA,GAAA,EAAA,MAAA,CAAA,EE8BgB,OF9BhB,CE8BwB,eF9BxB,GAAA,SAAA,CAAA;0BEmDsB;;MAAgD;EDhOhE,MAAA,CAAA,GAAA,EAAA,MAAA,CAAgB,ECiPH,ODjPG,CAAA,OAEpB,CAAA;EA8DA,KAAA,CAAA,CAAA,ECqLK,ODrLL,CAAA,IAAiB,CAAA;EAgFb;;;cC4GF;EA5DD;;;EAgCmB,OAAA,CAAA,CAAA,EAAA,MAAA;EAAgD;;;EA4BlE,QAAA,QAAA;;;AA0Dd;;UAAiB,UAAA;;EC5ND,IAAA,EAAA,MAAS;EAiFT;;;;ECpIM;EACN,WAAA,EAAA,MAAA;;;;;;;;AL9CJ,iBIgGI,SAAA,CJhGW,IAAA,EAAA,MAAA,EAAA,MAAA,CAAA,EIgGsB,cJhGtB,CAAA,EIgGuC,SJhGvC,EAAA;AAc3B;;;AAImF,iBI+JnE,cAAA,CJ/JmE,IAAA,EAAA,MAAA,EAAA,SAAA,CAAA,EI+J1B,cJ/J0B,CAAA,WAAA,CAAA,CAAA,EAAA,MAAA;;;;;ADCnF;AAqCA;AACW,iBMZW,kBAAA,CNYX,IAAA,EMXH,ONWG,CMXK,WNWL,CAAA,EAAA,OAAA,EMVA,gBNUA,CAAA,EMTR,ONSQ,CMTA,eNSA,CAAA;;;;;iBMkPW,KAAA,wBAA6B,mBAAmB,QAAQ;;;AL3S9E;AAcA;AACwB,iBKkTF,gBAAA,CLlTE,IAAA,EKmThB,WLnTgB,EAAA,OAAA,EKoTb,gBLpTa,CAAA,EKqTrB,OLrTqB,CKqTb,eLrTa,CAAA;;;ADyCxB;;;;;;;;;ACxDY,iBM6BI,qBAAA,CN7BW,OAiCS,CAjCT,EAAA;EAcd,MAAA,CAAA,EAAA,MAAY;EACD,KAAA,CAAA,EAAA,MAAA;EAGa,OAAA,CAAA,EAAA,MAAA;EAA8C,YAAA,CAAA,EAAA,MAAA;CAe/C,CAAA,EMChC,iBNDgC;;;;;;;;ACdpC;;;;;AA4JsB,iBKvGN,oBAAA,CLuGgB,OAAA,EAAA;EAGrB,QAAA,EAAA,MAAA;EACA,cAAA,EAAA,MAAA;EAAR,UAAA,EAAA,MAAA;EAAO,MAAA,CAAA,EAAA,MAAA;IKtGN;;;AJvEJ;AAgEA;AAgFA;;;;ACgDA;;;;;;;AA4Dc,iBGxIE,qBAAA,CHwIF,OA0Dd,CA1Dc,EAAA;EA5DiC,OAAA,CAAA,EAAA,MAAA;EAAc,KAAA,CAAA,EAAA,MAAA;AAsH7D,CAAA,CAAA,EG/LI,iBH+LuB;;;;AC5N3B;AAiFA;;;;ACpIA;;AACQ,iBCwGQ,0BAAA,CDxGR,OAAA,EAAA;EACG,KAAA,EAAA,MAAA;EACA,MAAA,CAAA,EAAA,MAAA;CAAR,CAAA,ECyGC,iBDzGD;;ACnBH;AA2CA;AAkDA,KA6GK,yBAAA,GA7GgC,CAAA,IAAA,EAAA,MAGjC,EAAA,OA6GQ,CA7GR,EAAA;EAyBY,OAAA,CAAA,EAAA,MAAA,GAAA,KAAA,GAAA,KAA0B;EAiFrC,SAAA,CAAA,EAAA,OAAA;AAGO,CAAA,EAAA,GAAP,OAKK,CAAA;EA4BM,IAAA,EAjCK,YAiCL;AAkEhB,CAAA,CAAA;;;;ACpTA,UDsNU,kBAAA,CCtNM;;;QD2NT,QAAQ;EEtNC,GAAA,CAAA,EAAA;IAkEC,QAAA,CAAA,EAAA,MAAe;EAehB,CAAA;;;;AC7FhB;;;;;;;;;;ACdA;;;;;;;iBJuQgB,2BAAA,eACA,2BKrQqC;;;EAJxC,OAAA,CAAA,EAAA,MAAA,GAAgB,KAAA,GAAA,KAAA;EAIJ,SAAA,CAAA,EAAA,OAAA;EAAoC,QAAA,CAAA,EAAA,MAAA;CAAR,CAAA,EL6QlD,iBK7QkD;;AAJb,cL0U3B,mBK1U2B,EAAA;EAAS;;;;ECCpC;EAIY,SAAA,SAAA,EAAA,0BAAA;EAAoC;EAAR,SAAA,YAAA,EAAA,8BAAA;CAAR;;;AbgD7C;;;;AAGG,iBQlCa,uBAAA,CRkCb,MAAA,EQlC6C,uBRkC7C,CAAA,EQlCuE,iBRkCvE;;;;;;AAxCH;AAqCgB,iBS1BA,iBAAA,CT0BY,MAAA,ES1Bc,kBT0Bd,CAAA,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,GS1BqD,eT0BrD;;;;AAGzB,USqCc,eAAA,CTrCd;EAAiB;;;;EC3DR;EAcC,cAAY,EAAA,MAAA;EACD;EAGa,gBAAA,EQsFjB,MRtFiB,CAAA,MAAA,EAAA,MAAA,CAAA;;;;;;AAJC,iBQiGtB,SAAA,CRjGsB,IAAA,EAAA,MAAA,CAAA,EQiGG,eRjGH;;;;;;ADKtC;AAqCgB,cUtCH,gBAAA,YAA4B,SVsCb,CAAA;EACjB,SAAA,IAAA,GAAA,SAAA;EACU,SAAA,QAAA,GAAA,EAAA;EAAR,OAAA,CAAA,OAAA,EUpCY,iBVoCZ,CAAA,EUpCgC,OVoChC,CUpCwC,OVoCxC,CUpCgD,WVoChD,CAAA,CAAA;EACV,QAAA,eAAA;EAAiB,QAAA,aAAA;;;;;;;;AAxCpB;AAqCgB,cWpDH,gBAAA,YAA4B,SXoDb,CAAA;EACjB,SAAA,IAAA,GAAA,SAAA;EACU,SAAA,QAAA,GAAA,EAAA;EAAR,OAAA,CAAA,OAAA,EWlDY,iBXkDZ,CAAA,EWlDgC,OXkDhC,CWlDwC,OXkDxC,CWlDgD,WXkDhD,CAAA,CAAA;;;;;;;AAvCb;AAqCgB,cYrDH,eAAA,YAA2B,SZqDZ,CAAA;EACjB,SAAA,IAAA,GAAA,QAAA;EACU,SAAA,QAAA,GAAA,EAAA;EAAR,OAAA,CAAA,OAAA,EYnDY,iBZmDZ,CAAA,EYnDgC,OZmDhC,CYnDwC,OZmDxC,CYnDgD,WZmDhD,CAAA,CAAA;EACV,QAAA,eAAA;EAAiB,QAAA,OAAA;;;;EC3DR,QAAA,WAAe;AAc3B;;;;;;ADKA;AAqCgB,capDH,cAAA,YAA0B,SboDX,CAAA;EACjB,SAAA,IAAA,GAAA,OAAA;EACU,SAAA,QAAA,GAAA,EAAA;EAAR,OAAA,CAAA,OAAA,EalDY,iBbkDZ,CAAA,EalDgC,ObkDhC,CalDwC,ObkDxC,CalDgD,WbkDhD,CAAA,CAAA;;;;;;;AAvCb;AAqCgB,ccrDH,aAAA,YAAyB,SdqDV,CAAA;EACjB,SAAA,IAAA,GAAA,MAAA;EACU,SAAA,QAAA,GAAA,GAAA;EAAR,OAAA,CAAA,OAAA,EcnDY,iBdmDZ,CAAA,EcnDgC,OdmDhC,CcnDwC,OdmDxC,CcnDgD,WdmDhD,CAAA,CAAA;;;;;;;;AACO,iBe5CJ,uBAAA,CAAA,Cf4CI,Ee5CuB,Sf4CvB,EAAA;;;;AC3DR,iBc4BI,cAAA,Cd5BW,UAAA,Ec4BgB,Sd5BhB,EAAA,CAAA,Ec4B8B,Sd5B9B,EAAA;;;;;;;ADmBX,UgBlBC,SAAA,ChBkBD;EAqCA;;;;EAGb,KAAA,CAAA,GAAA,EAAA,MAAA,EAAA,OAAA,CAAA,EgBrD4B,chBqD5B,CAAA,EgBrD2C,OhBqD3C,CgBrDmD,ahBqDnD,CAAA;EAAiB;;;;AC3DpB;AAcA;AACwB,UeAP,cAAA,CfAO;EAGa;EAA8C,OAAA,CAAA,EAAA,MAAA;EAe/C;EAAmC,SAAA,CAAA,EAAA,MAAA;EAsB3D;EAzCqB,OAAA,CAAA,EeSrB,MfTqB,CAAA,MAAA,EAAA,MAAA,CAAA;EAAK;;;;ECKhB,mBAAM,CAAA,EAAA,MAAA,EAAA;;;;;AA4JN,Uc5IL,aAAA,Cd4Ie;EAGrB;EACA,IAAA,EAAA,MAAA;EAAR;EAAO,QAAA,EAAA,MAAA;;;;EC7KM,WAAA,EAAA,MAAgB;EAgEpB;EAgFI,OAAA,CAAA,EarGJ,MbqGI,CAAA,MAAgB,EAAA,MAAA,CAAA;;;;ACgDhC;AAWkC,cY1JrB,kBAAA,GZ0JqB,oEAAA;;;;AAsCL,cY1LhB,eAAA,GZ0LgB,KAAA;;;;;;AJpO7B;AAqCgB,ciB9CH,aAAA,YAAyB,SjB8CV,CAAA;EACjB,SAAA,IAAA,GAAA,cAAA;EACU,KAAA,CAAA,GAAA,EAAA,MAAA,EAAA,OAAA,CAAA,EiB7Ce,cjB6Cf,CAAA,EiB7CmC,OjB6CnC,CiB7C2C,ajB6C3C,CAAA;;;;;ciBiER,gBAAc;;;;;;UCzHV,iBAAA;ElBiBD,OAAA,EAAA,OAAA;EAqCA,MAAA,CAAA,EAAA,MAAY;;;;;;;;;ACxDhB,iBiBsBU,cAAA,CjBtBK,GAAA,EAAA,MAAA,EAAA,SAAA,CAAA,EAAA,MAAA,CAAA,EiByBxB,OjBzBwB,CiByBhB,iBjBzBgB,CAAA;;;;ADmB3B;AAqCA;;AAEqB,iBmBnDC,SAAA,CnBmDD,GAAA,EAAA,MAAA,EAAA,QAAA,EAAA;EAAR,OAAA,CAAA,EmBhDC,SnBgDD;EACV,OAAA,CAAA,EAAA,MAAA;EAAiB,SAAA,CAAA,EAAA,MAAA;kBmB9CA;IAEjB,QAAQ,aAAa,YAAY;;AlBfpC;AAcA;;AAIqC,iBkBqBrB,aAAA,ClBrBqB,IAAA,EAAA,MAAA,EAAA,OAAA,EAAA,MAAA,CAAA,EAAA,MAAA,EAAA;;;;;AAJJ,iBkB4DjB,YAAA,ClB5DiB,KAAA,EkB6DxB,QlB7DwB,EAAA,EAAA,OAAA,EAAA;EAAK,KAAA,CAAA,EkB8DjB,IlB9DiB;WkB8DF;;IACjC;AjB1DH;;;;AAA+E,iBiB6E/D,cAAA,CjB7E+D,IAAA,EiB8EvE,UjB9EuE,EAAA,OAgKpE,CAhKoE,EAAA;EA4JzD,cAAU,CAAA,EAAA,OAAA;EAGrB,QAAA,CAAA,EAAA,MAAA;CACA,CAAA,EAAA,MAAA;;;;;iBiBpCK,UAAA,OACR,mBhBMR;EAhJgB,QAAA,CAAA,EAAA,MAAA;EAgEJ,SAAA,CAAA,EAAA,MAAA;AAgFZ,CAAA,CAAA,EAAgB,MAAA;;;;ACgDhB;AAWkC,iBe1CX,YAAA,Cf0CW,GAAA,EAAA,MAAA,EAAA,OA0CjB,CA1CiB,EAAA;EAAR,OAAA,CAAA,EevCZ,SfuCY;EAqBM,OAAA,CAAA,EAAA,MAAA;EAAgD,SAAA,CAAA,EAAA,MAAA;EAiBnD,QAAA,CAAA,EAAA,MAAA;CAIZ,CAAA,Ee5Ed,cf4Ec,Ce5EC,Uf4ED,EAAA,IAAA,EAAA,OAAA,CAAA;;;;;;iBgBnOD,UAAA;ApBLhB;AAqCA;;AAEqB,iBoBtBL,YAAA,CpBsBK,GAAA,EAAA,MAAA,CAAA,EAAA,MAAA;;;;iBoBEL,aAAA;;;AnB5DhB;AAca,iBmB0DG,UAAA,CnB1DS,GAAA,EAAA,MAAA,GAAA,SAAA,GAAA,IAAA,EAAA,OAAA,EAAA,MAAA,CAAA,EAAA,MAAA,GAAA,SAAA;;;;AAmBW,iBmBoDpB,aAAA,CnBpDoB,GAAA,EAAA,MAAA,EAAA,UAAA,EAAA,MAAA,CAAA,EAAA,OAAA;;;;AAnBE,iBmBoFtB,WAAA,CnBpFsB,GAAA,EAAA,MAAA,CAAA,EAAA,MAAA;;;;ACKhB,iBkB0FN,OAAA,ClB1FY,GAAA,EAAA,MAAA,CAAA,EAAA,MAAA;;;;AAAmD,iBkBqG/D,iBAAA,ClBrG+D,GAAA,EAAA,MAAA,EAAA,OAAA,EAAA,MAAA,CAAA,EAAA,OAAA"}
|