scrapex 1.0.0-beta.4 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +164 -3
- package/dist/embeddings/index.cjs +2 -2
- package/dist/embeddings/index.d.cts +2 -2
- package/dist/embeddings/index.d.mts +2 -2
- package/dist/embeddings/index.mjs +2 -2
- package/dist/{embeddings-DHVu52VF.cjs → embeddings-BzRSRX9t.cjs} +3 -3
- package/dist/{embeddings-DHVu52VF.cjs.map → embeddings-BzRSRX9t.cjs.map} +1 -1
- package/dist/{embeddings-Dama_IQl.mjs → embeddings-CukTWZVJ.mjs} +2 -2
- package/dist/{embeddings-Dama_IQl.mjs.map → embeddings-CukTWZVJ.mjs.map} +1 -1
- package/dist/{enhancer-Cs_WyWtJ.cjs → enhancer-18LeSP31.cjs} +2 -2
- package/dist/{enhancer-Cs_WyWtJ.cjs.map → enhancer-18LeSP31.cjs.map} +1 -1
- package/dist/{enhancer-INx5NlgO.mjs → enhancer-CIFsqRhc.mjs} +1 -1
- package/dist/{enhancer-INx5NlgO.mjs.map → enhancer-CIFsqRhc.mjs.map} +1 -1
- package/dist/{http-base-CHLf-Tco.cjs → http-base-B5EXbNwR.cjs} +2 -2
- package/dist/{http-base-CHLf-Tco.cjs.map → http-base-B5EXbNwR.cjs.map} +1 -1
- package/dist/{http-base-DM7YNo6X.mjs → http-base-B8qhcRit.mjs} +1 -1
- package/dist/{http-base-DM7YNo6X.mjs.map → http-base-B8qhcRit.mjs.map} +1 -1
- package/dist/{index-Bvseqli-.d.cts → index-B87QDy6g.d.mts} +39 -2
- package/dist/index-B87QDy6g.d.mts.map +1 -0
- package/dist/{index-CliKPY5N.d.cts → index-BKGgfF5C.d.cts} +2 -2
- package/dist/{index-CliKPY5N.d.cts.map → index-BKGgfF5C.d.cts.map} +1 -1
- package/dist/{index-REk4CMxV.d.mts → index-DoA7xuDF.d.mts} +2 -2
- package/dist/{index-REk4CMxV.d.mts.map → index-DoA7xuDF.d.mts.map} +1 -1
- package/dist/{index-CIFjNySr.d.mts → index-qCDikize.d.cts} +39 -2
- package/dist/index-qCDikize.d.cts.map +1 -0
- package/dist/index.cjs +10 -331
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +66 -9
- package/dist/index.d.cts.map +1 -1
- package/dist/index.d.mts +66 -9
- package/dist/index.d.mts.map +1 -1
- package/dist/index.mjs +5 -327
- package/dist/index.mjs.map +1 -1
- package/dist/llm/index.cjs +3 -3
- package/dist/llm/index.d.cts +1 -1
- package/dist/llm/index.d.mts +1 -1
- package/dist/llm/index.mjs +2 -2
- package/dist/parsers/index.cjs +2 -1
- package/dist/parsers/index.d.cts +2 -2
- package/dist/parsers/index.d.mts +2 -2
- package/dist/parsers/index.mjs +2 -2
- package/dist/parsers-C-AnBv0B.mjs +946 -0
- package/dist/parsers-C-AnBv0B.mjs.map +1 -0
- package/dist/{parsers-Bneuws8x.cjs → parsers-DsqevSf6.cjs} +496 -2
- package/dist/parsers-DsqevSf6.cjs.map +1 -0
- package/dist/{types-BdlvuOtp.d.mts → types--8EgNAtb.d.cts} +4 -134
- package/dist/types--8EgNAtb.d.cts.map +1 -0
- package/dist/types-BeKod8eQ.d.cts +134 -0
- package/dist/types-BeKod8eQ.d.cts.map +1 -0
- package/dist/{types-BYsD9u71.d.cts → types-dyXBh3Xy.d.mts} +4 -134
- package/dist/types-dyXBh3Xy.d.mts.map +1 -0
- package/dist/types-tpb9d2vq.d.mts +134 -0
- package/dist/types-tpb9d2vq.d.mts.map +1 -0
- package/package.json +1 -1
- package/dist/index-Bvseqli-.d.cts.map +0 -1
- package/dist/index-CIFjNySr.d.mts.map +0 -1
- package/dist/parsers-Bneuws8x.cjs.map +0 -1
- package/dist/parsers-DsawHeo0.mjs +0 -482
- package/dist/parsers-DsawHeo0.mjs.map +0 -1
- package/dist/types-BYsD9u71.d.cts.map +0 -1
- package/dist/types-BdlvuOtp.d.mts.map +0 -1
package/README.md
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
Modern web scraper with LLM-enhanced extraction, extensible pipeline, and pluggable parsers.
|
|
4
4
|
|
|
5
|
-
> **
|
|
5
|
+
> **Release**: v1.0.0 (API stable; breaking changes documented below).
|
|
6
6
|
|
|
7
7
|
## Features
|
|
8
8
|
|
|
@@ -13,13 +13,14 @@ Modern web scraper with LLM-enhanced extraction, extensible pipeline, and plugga
|
|
|
13
13
|
- **Smart Extraction** - Uses Mozilla Readability for content, Cheerio for metadata
|
|
14
14
|
- **Markdown Parsing** - Parse markdown content, awesome lists, and GitHub repos
|
|
15
15
|
- **RSS/Atom Feeds** - Parse RSS 2.0, RSS 1.0 (RDF), and Atom feeds with pagination support
|
|
16
|
+
- **Content Normalization** - Clean, embedding-ready text with boilerplate removal
|
|
16
17
|
- **TypeScript First** - Full type safety with comprehensive type exports
|
|
17
18
|
- **Dual Format** - ESM and CommonJS builds
|
|
18
19
|
|
|
19
20
|
## Installation
|
|
20
21
|
|
|
21
22
|
```bash
|
|
22
|
-
npm install scrapex@beta
|
|
23
|
+
npm install scrapex
|
|
23
24
|
```
|
|
24
25
|
|
|
25
26
|
### Optional Peer Dependencies
|
|
@@ -29,6 +30,9 @@ npm install scrapex@beta
|
|
|
29
30
|
npm install openai # OpenAI/Ollama/LM Studio
|
|
30
31
|
npm install @anthropic-ai/sdk # Anthropic Claude
|
|
31
32
|
|
|
33
|
+
# For local embeddings (zero API cost)
|
|
34
|
+
npm install @huggingface/transformers onnxruntime-node
|
|
35
|
+
|
|
32
36
|
# For JavaScript-rendered pages
|
|
33
37
|
npm install puppeteer
|
|
34
38
|
```
|
|
@@ -60,6 +64,10 @@ const result = await scrape('https://example.com', {
|
|
|
60
64
|
userAgent: 'MyBot/1.0',
|
|
61
65
|
extractContent: true,
|
|
62
66
|
maxContentLength: 50000,
|
|
67
|
+
normalize: {
|
|
68
|
+
mode: 'full',
|
|
69
|
+
removeBoilerplate: true,
|
|
70
|
+
},
|
|
63
71
|
respectRobots: false,
|
|
64
72
|
});
|
|
65
73
|
```
|
|
@@ -117,6 +125,17 @@ interface ScrapedData {
|
|
|
117
125
|
entities?: ExtractedEntities;
|
|
118
126
|
extracted?: Record<string, unknown>;
|
|
119
127
|
|
|
128
|
+
// Custom extractor results
|
|
129
|
+
custom?: Record<string, unknown>;
|
|
130
|
+
|
|
131
|
+
// Embeddings (when enabled)
|
|
132
|
+
embeddings?: EmbeddingResult;
|
|
133
|
+
|
|
134
|
+
// Normalized text (when enabled)
|
|
135
|
+
normalizedText?: string;
|
|
136
|
+
normalizationMeta?: NormalizationMeta;
|
|
137
|
+
normalizedBlocks?: ContentBlock[];
|
|
138
|
+
|
|
120
139
|
// Meta
|
|
121
140
|
scrapedAt: string;
|
|
122
141
|
scrapeTimeMs: number;
|
|
@@ -173,7 +192,84 @@ Features include:
|
|
|
173
192
|
|
|
174
193
|
See the [Embeddings Guide](https://scrapex.dev/guides/embeddings) for full documentation.
|
|
175
194
|
|
|
176
|
-
##
|
|
195
|
+
## Content Normalization
|
|
196
|
+
|
|
197
|
+
Clean, embedding-ready text with boilerplate removal and block classification:
|
|
198
|
+
|
|
199
|
+
```typescript
|
|
200
|
+
const result = await scrape(url, {
|
|
201
|
+
normalize: {
|
|
202
|
+
mode: 'full', // or 'summary' for score-ranked blocks
|
|
203
|
+
removeBoilerplate: true, // filter nav, footer, promos
|
|
204
|
+
maxChars: 5000, // truncate at sentence boundary
|
|
205
|
+
},
|
|
206
|
+
});
|
|
207
|
+
|
|
208
|
+
console.log(result.normalizedText); // Clean text ready for embedding
|
|
209
|
+
console.log(result.normalizationMeta); // { charCount, tokenEstimate, hash, ... }
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
### Standalone Normalization
|
|
213
|
+
|
|
214
|
+
Use normalization without scraping:
|
|
215
|
+
|
|
216
|
+
```typescript
|
|
217
|
+
import { parseBlocks, normalizeText } from 'scrapex';
|
|
218
|
+
import { load } from 'cheerio';
|
|
219
|
+
|
|
220
|
+
const $ = load(html);
|
|
221
|
+
const blocks = parseBlocks($);
|
|
222
|
+
const result = await normalizeText(blocks, { mode: 'full' });
|
|
223
|
+
|
|
224
|
+
console.log(result.text);
|
|
225
|
+
console.log(result.meta);
|
|
226
|
+
```
|
|
227
|
+
|
|
228
|
+
### Custom Classifiers
|
|
229
|
+
|
|
230
|
+
Filter blocks with custom logic:
|
|
231
|
+
|
|
232
|
+
```typescript
|
|
233
|
+
import { combineClassifiers, defaultBlockClassifier } from 'scrapex';
|
|
234
|
+
|
|
235
|
+
const myClassifier = combineClassifiers(
|
|
236
|
+
defaultBlockClassifier,
|
|
237
|
+
(block) => {
|
|
238
|
+
if (block.text.includes('Advertisement')) {
|
|
239
|
+
return { accept: false, label: 'ad' };
|
|
240
|
+
}
|
|
241
|
+
return { accept: true };
|
|
242
|
+
}
|
|
243
|
+
);
|
|
244
|
+
|
|
245
|
+
const result = await scrape(url, {
|
|
246
|
+
normalize: { blockClassifier: myClassifier },
|
|
247
|
+
});
|
|
248
|
+
```
|
|
249
|
+
|
|
250
|
+
### Normalization Options
|
|
251
|
+
|
|
252
|
+
```typescript
|
|
253
|
+
interface NormalizeOptions {
|
|
254
|
+
mode?: 'summary' | 'full'; // Output mode (default: 'full')
|
|
255
|
+
maxChars?: number; // Max characters in output
|
|
256
|
+
minChars?: number; // Minimum characters required
|
|
257
|
+
maxBlocks?: number; // Max blocks to process (default: 2000)
|
|
258
|
+
truncate?: 'sentence' | 'word' | 'char'; // Truncation strategy
|
|
259
|
+
dropSelectors?: string[]; // Selectors to drop before parsing
|
|
260
|
+
removeBoilerplate?: boolean; // Filter nav/footer/promos (default: true)
|
|
261
|
+
decodeEntities?: boolean; // Decode HTML entities (default: true)
|
|
262
|
+
normalizeUnicode?: boolean; // Normalize Unicode to NFC (default: true)
|
|
263
|
+
preserveLineBreaks?: boolean; // Preserve paragraph breaks (default: true)
|
|
264
|
+
stripLinks?: boolean; // Strip Markdown links (default: true)
|
|
265
|
+
includeHtml?: boolean; // Include raw HTML in blocks (default: false)
|
|
266
|
+
languageHint?: string; // Language hint for metadata
|
|
267
|
+
blockClassifier?: ContentBlockClassifier; // Custom classifier
|
|
268
|
+
debug?: boolean; // Include blocks in output
|
|
269
|
+
}
|
|
270
|
+
```
|
|
271
|
+
|
|
272
|
+
## Breaking Changes
|
|
177
273
|
|
|
178
274
|
- LLM provider classes (e.g., `AnthropicProvider`) were removed. Use preset factories like
|
|
179
275
|
`createOpenAI`, `createAnthropic`, `createOllama`, and `createLMStudio` instead.
|
|
@@ -409,6 +505,27 @@ const markdown = feedToMarkdown(result.data, { maxItems: 10 });
|
|
|
409
505
|
const text = feedToText(result.data);
|
|
410
506
|
```
|
|
411
507
|
|
|
508
|
+
### Normalizing Feed Items
|
|
509
|
+
|
|
510
|
+
Convert feed item content to clean, embedding-ready text:
|
|
511
|
+
|
|
512
|
+
```typescript
|
|
513
|
+
import { RSSParser, normalizeFeedItem } from 'scrapex';
|
|
514
|
+
|
|
515
|
+
const parser = new RSSParser();
|
|
516
|
+
const result = parser.parse(feedXml);
|
|
517
|
+
|
|
518
|
+
for (const item of result.data.items) {
|
|
519
|
+
const normalized = await normalizeFeedItem(item, {
|
|
520
|
+
mode: 'full',
|
|
521
|
+
removeBoilerplate: true,
|
|
522
|
+
});
|
|
523
|
+
|
|
524
|
+
console.log(normalized.text); // Clean text from content/description
|
|
525
|
+
console.log(normalized.meta); // { charCount, tokenEstimate, hash, ... }
|
|
526
|
+
}
|
|
527
|
+
```
|
|
528
|
+
|
|
412
529
|
### Custom Fields (Podcast/Media)
|
|
413
530
|
|
|
414
531
|
Extract custom namespace fields like iTunes podcast tags:
|
|
@@ -429,6 +546,25 @@ console.log(item.customFields?.duration); // '10:00'
|
|
|
429
546
|
console.log(item.customFields?.explicit); // 'no'
|
|
430
547
|
```
|
|
431
548
|
|
|
549
|
+
#### Attribute Extraction
|
|
550
|
+
|
|
551
|
+
Use `selector@attr` syntax to extract XML attribute values:
|
|
552
|
+
|
|
553
|
+
```typescript
|
|
554
|
+
const parser = new RSSParser({
|
|
555
|
+
customFields: {
|
|
556
|
+
// Extract url attribute from media:thumbnail element
|
|
557
|
+
thumbnail: 'media\\:thumbnail@url',
|
|
558
|
+
// Extract url attribute from media:content element
|
|
559
|
+
mediaUrl: 'media\\:content@url',
|
|
560
|
+
},
|
|
561
|
+
});
|
|
562
|
+
|
|
563
|
+
const result = parser.parse(mediaRssFeed);
|
|
564
|
+
console.log(result.data.items[0]?.customFields?.thumbnail);
|
|
565
|
+
// => "https://example.com/images/thumbnail.jpg"
|
|
566
|
+
```
|
|
567
|
+
|
|
432
568
|
### Security
|
|
433
569
|
|
|
434
570
|
The RSS parser enforces strict URL security:
|
|
@@ -536,6 +672,8 @@ interface ScrapeOptions {
|
|
|
536
672
|
llm?: LLMProvider; // LLM provider
|
|
537
673
|
enhance?: EnhancementType[]; // LLM enhancements
|
|
538
674
|
extract?: ExtractionSchema; // Structured extraction
|
|
675
|
+
embeddings?: EmbeddingOptions; // Vector embeddings
|
|
676
|
+
normalize?: NormalizeOptions; // Content normalization
|
|
539
677
|
}
|
|
540
678
|
```
|
|
541
679
|
|
|
@@ -554,6 +692,29 @@ type EnhancementType =
|
|
|
554
692
|
- Node.js 20+
|
|
555
693
|
- TypeScript 5.0+ (for type imports)
|
|
556
694
|
|
|
695
|
+
## Get Help
|
|
696
|
+
|
|
697
|
+
- [Documentation](https://scrapex.dev) - Guides and API reference
|
|
698
|
+
- [GitHub Issues](https://github.com/developer-rakeshpaul/scrapex/issues) - Bug reports and feature requests
|
|
699
|
+
- [GitHub Discussions](https://github.com/developer-rakeshpaul/scrapex/discussions) - Questions and ideas
|
|
700
|
+
- [Stack Overflow](https://stackoverflow.com/questions/tagged/scrapex) - Community Q&A
|
|
701
|
+
|
|
702
|
+
## Contributing
|
|
703
|
+
|
|
704
|
+
Contributions are welcome, whether they are bug reports, feature requests, or pull requests.
|
|
705
|
+
|
|
706
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines on:
|
|
707
|
+
- Reporting bugs
|
|
708
|
+
- Suggesting features
|
|
709
|
+
- Submitting pull requests
|
|
710
|
+
- Development setup
|
|
711
|
+
|
|
712
|
+
## Support
|
|
713
|
+
|
|
714
|
+
If you find scrapex useful, consider supporting its development:
|
|
715
|
+
|
|
716
|
+
[Buy Me a Coffee](https://buymeacoffee.com/binaryroute)
|
|
717
|
+
|
|
557
718
|
## License
|
|
558
719
|
|
|
559
720
|
MIT
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
const require_http_base = require('../http-base-CHLf-Tco.cjs');
|
|
2
|
-
const require_embeddings = require('../embeddings-DHVu52VF.cjs');
|
|
1
|
+
const require_http_base = require('../http-base-B5EXbNwR.cjs');
|
|
2
|
+
const require_embeddings = require('../embeddings-BzRSRX9t.cjs');
|
|
3
3
|
|
|
4
4
|
exports.CircuitBreaker = require_http_base.CircuitBreaker;
|
|
5
5
|
exports.CircuitOpenError = require_http_base.CircuitOpenError;
|
|
@@ -1,3 +1,3 @@
|
|
|
1
|
-
import { A as HttpEmbeddingProvider, B as withResilience, C as RateLimitConfig, D as SafetyConfig, E as RetryConfig, F as CircuitOpenError, H as withTimeout, I as RateLimiter, L as Semaphore, O as TextChunk, P as CircuitBreaker, R as createTimeoutSignal, S as PiiRedactionConfig, T as ResilienceState, V as withRetry, _ as EmbeddingSkipped, a as EmbedRequest, b as EmbeddingSuccessSingle, c as EmbeddingCache, d as EmbeddingInputType, f as EmbeddingMetrics, g as EmbeddingResult, h as EmbeddingProviderConfig, i as CircuitState, j as createHttpEmbedding, k as HttpEmbeddingConfig, l as EmbeddingCacheConfig, m as EmbeddingProvider, n as CircuitBreakerConfig, o as EmbedResponse, p as EmbeddingOptions, r as CircuitBreakerState, s as EmbeddingAggregation, t as ChunkingConfig, u as EmbeddingInputConfig, v as EmbeddingSource, w as ResilienceConfig, x as OutputConfig, y as EmbeddingSuccessMultiple, z as isRetryableError } from "../types
|
|
2
|
-
import { A as createNoOpCache, B as euclideanDistance, C as createTokenizer, D as needsChunking, E as heuristicTokenCount, F as validateCachedResult, H as normalizeVector, I as AggregationResult, L as aggregateVectors, M as generateChecksum, N as getDefaultCache, O as CacheStats, P as resetDefaultCache, R as cosineSimilarity, S as chunkText, T as getChunkingStats, V as getDimensions, _ as generateEmbeddings, a as createEmbeddingProvider, b as selectInput, c as createAzureEmbedding, d as createOllamaEmbedding, f as createOpenAIEmbedding, g as embedScrapedData, h as embed, i as redactPii, j as generateCacheKey, k as InMemoryEmbeddingCache, l as createCohereEmbedding, m as getDefaultModel, n as containsPii, o as isEmbeddingProvider, p as createTransformersEmbedding, r as createPiiRedactor, s as TRANSFORMERS_MODELS, t as RedactionResult, u as createHuggingFaceEmbedding, v as InputValidation, w as estimateTokens, x as validateInput, y as previewInput, z as dotProduct } from "../index-
|
|
1
|
+
import { A as HttpEmbeddingProvider, B as withResilience, C as RateLimitConfig, D as SafetyConfig, E as RetryConfig, F as CircuitOpenError, H as withTimeout, I as RateLimiter, L as Semaphore, O as TextChunk, P as CircuitBreaker, R as createTimeoutSignal, S as PiiRedactionConfig, T as ResilienceState, V as withRetry, _ as EmbeddingSkipped, a as EmbedRequest, b as EmbeddingSuccessSingle, c as EmbeddingCache, d as EmbeddingInputType, f as EmbeddingMetrics, g as EmbeddingResult, h as EmbeddingProviderConfig, i as CircuitState, j as createHttpEmbedding, k as HttpEmbeddingConfig, l as EmbeddingCacheConfig, m as EmbeddingProvider, n as CircuitBreakerConfig, o as EmbedResponse, p as EmbeddingOptions, r as CircuitBreakerState, s as EmbeddingAggregation, t as ChunkingConfig, u as EmbeddingInputConfig, v as EmbeddingSource, w as ResilienceConfig, x as OutputConfig, y as EmbeddingSuccessMultiple, z as isRetryableError } from "../types--8EgNAtb.cjs";
|
|
2
|
+
import { A as createNoOpCache, B as euclideanDistance, C as createTokenizer, D as needsChunking, E as heuristicTokenCount, F as validateCachedResult, H as normalizeVector, I as AggregationResult, L as aggregateVectors, M as generateChecksum, N as getDefaultCache, O as CacheStats, P as resetDefaultCache, R as cosineSimilarity, S as chunkText, T as getChunkingStats, V as getDimensions, _ as generateEmbeddings, a as createEmbeddingProvider, b as selectInput, c as createAzureEmbedding, d as createOllamaEmbedding, f as createOpenAIEmbedding, g as embedScrapedData, h as embed, i as redactPii, j as generateCacheKey, k as InMemoryEmbeddingCache, l as createCohereEmbedding, m as getDefaultModel, n as containsPii, o as isEmbeddingProvider, p as createTransformersEmbedding, r as createPiiRedactor, s as TRANSFORMERS_MODELS, t as RedactionResult, u as createHuggingFaceEmbedding, v as InputValidation, w as estimateTokens, x as validateInput, y as previewInput, z as dotProduct } from "../index-BKGgfF5C.cjs";
|
|
3
3
|
export { AggregationResult, CacheStats, ChunkingConfig, CircuitBreaker, CircuitBreakerConfig, CircuitBreakerState, CircuitOpenError, CircuitState, EmbedRequest, EmbedResponse, EmbeddingAggregation, EmbeddingCache, EmbeddingCacheConfig, EmbeddingInputConfig, EmbeddingInputType, EmbeddingMetrics, EmbeddingOptions, EmbeddingProvider, EmbeddingProviderConfig, EmbeddingResult, EmbeddingSkipped, EmbeddingSource, EmbeddingSuccessMultiple, EmbeddingSuccessSingle, HttpEmbeddingConfig, HttpEmbeddingProvider, InMemoryEmbeddingCache, InputValidation, OutputConfig, PiiRedactionConfig, RateLimitConfig, RateLimiter, RedactionResult, ResilienceConfig, ResilienceState, RetryConfig, SafetyConfig, Semaphore, TRANSFORMERS_MODELS, TextChunk, aggregateVectors, chunkText, containsPii, cosineSimilarity, createAzureEmbedding, createCohereEmbedding, createEmbeddingProvider, createHttpEmbedding, createHuggingFaceEmbedding, createNoOpCache, createOllamaEmbedding, createOpenAIEmbedding, createPiiRedactor, createTimeoutSignal, createTokenizer, createTransformersEmbedding, dotProduct, embed, embedScrapedData, estimateTokens, euclideanDistance, generateCacheKey, generateChecksum, generateEmbeddings, getChunkingStats, getDefaultCache, getDefaultModel, getDimensions, heuristicTokenCount, isEmbeddingProvider, isRetryableError, needsChunking, normalizeVector, previewInput, redactPii, resetDefaultCache, selectInput, validateCachedResult, validateInput, withResilience, withRetry, withTimeout };
|
|
@@ -1,3 +1,3 @@
|
|
|
1
|
-
import { A as HttpEmbeddingProvider, B as withResilience, C as RateLimitConfig, D as SafetyConfig, E as RetryConfig, F as CircuitOpenError, H as withTimeout, I as RateLimiter, L as Semaphore, O as TextChunk, P as CircuitBreaker, R as createTimeoutSignal, S as PiiRedactionConfig, T as ResilienceState, V as withRetry, _ as EmbeddingSkipped, a as EmbedRequest, b as EmbeddingSuccessSingle, c as EmbeddingCache, d as EmbeddingInputType, f as EmbeddingMetrics, g as EmbeddingResult, h as EmbeddingProviderConfig, i as CircuitState, j as createHttpEmbedding, k as HttpEmbeddingConfig, l as EmbeddingCacheConfig, m as EmbeddingProvider, n as CircuitBreakerConfig, o as EmbedResponse, p as EmbeddingOptions, r as CircuitBreakerState, s as EmbeddingAggregation, t as ChunkingConfig, u as EmbeddingInputConfig, v as EmbeddingSource, w as ResilienceConfig, x as OutputConfig, y as EmbeddingSuccessMultiple, z as isRetryableError } from "../types-
|
|
2
|
-
import { A as createNoOpCache, B as euclideanDistance, C as createTokenizer, D as needsChunking, E as heuristicTokenCount, F as validateCachedResult, H as normalizeVector, I as AggregationResult, L as aggregateVectors, M as generateChecksum, N as getDefaultCache, O as CacheStats, P as resetDefaultCache, R as cosineSimilarity, S as chunkText, T as getChunkingStats, V as getDimensions, _ as generateEmbeddings, a as createEmbeddingProvider, b as selectInput, c as createAzureEmbedding, d as createOllamaEmbedding, f as createOpenAIEmbedding, g as embedScrapedData, h as embed, i as redactPii, j as generateCacheKey, k as InMemoryEmbeddingCache, l as createCohereEmbedding, m as getDefaultModel, n as containsPii, o as isEmbeddingProvider, p as createTransformersEmbedding, r as createPiiRedactor, s as TRANSFORMERS_MODELS, t as RedactionResult, u as createHuggingFaceEmbedding, v as InputValidation, w as estimateTokens, x as validateInput, y as previewInput, z as dotProduct } from "../index-
|
|
1
|
+
import { A as HttpEmbeddingProvider, B as withResilience, C as RateLimitConfig, D as SafetyConfig, E as RetryConfig, F as CircuitOpenError, H as withTimeout, I as RateLimiter, L as Semaphore, O as TextChunk, P as CircuitBreaker, R as createTimeoutSignal, S as PiiRedactionConfig, T as ResilienceState, V as withRetry, _ as EmbeddingSkipped, a as EmbedRequest, b as EmbeddingSuccessSingle, c as EmbeddingCache, d as EmbeddingInputType, f as EmbeddingMetrics, g as EmbeddingResult, h as EmbeddingProviderConfig, i as CircuitState, j as createHttpEmbedding, k as HttpEmbeddingConfig, l as EmbeddingCacheConfig, m as EmbeddingProvider, n as CircuitBreakerConfig, o as EmbedResponse, p as EmbeddingOptions, r as CircuitBreakerState, s as EmbeddingAggregation, t as ChunkingConfig, u as EmbeddingInputConfig, v as EmbeddingSource, w as ResilienceConfig, x as OutputConfig, y as EmbeddingSuccessMultiple, z as isRetryableError } from "../types-dyXBh3Xy.mjs";
|
|
2
|
+
import { A as createNoOpCache, B as euclideanDistance, C as createTokenizer, D as needsChunking, E as heuristicTokenCount, F as validateCachedResult, H as normalizeVector, I as AggregationResult, L as aggregateVectors, M as generateChecksum, N as getDefaultCache, O as CacheStats, P as resetDefaultCache, R as cosineSimilarity, S as chunkText, T as getChunkingStats, V as getDimensions, _ as generateEmbeddings, a as createEmbeddingProvider, b as selectInput, c as createAzureEmbedding, d as createOllamaEmbedding, f as createOpenAIEmbedding, g as embedScrapedData, h as embed, i as redactPii, j as generateCacheKey, k as InMemoryEmbeddingCache, l as createCohereEmbedding, m as getDefaultModel, n as containsPii, o as isEmbeddingProvider, p as createTransformersEmbedding, r as createPiiRedactor, s as TRANSFORMERS_MODELS, t as RedactionResult, u as createHuggingFaceEmbedding, v as InputValidation, w as estimateTokens, x as validateInput, y as previewInput, z as dotProduct } from "../index-DoA7xuDF.mjs";
|
|
3
3
|
export { AggregationResult, CacheStats, ChunkingConfig, CircuitBreaker, CircuitBreakerConfig, CircuitBreakerState, CircuitOpenError, CircuitState, EmbedRequest, EmbedResponse, EmbeddingAggregation, EmbeddingCache, EmbeddingCacheConfig, EmbeddingInputConfig, EmbeddingInputType, EmbeddingMetrics, EmbeddingOptions, EmbeddingProvider, EmbeddingProviderConfig, EmbeddingResult, EmbeddingSkipped, EmbeddingSource, EmbeddingSuccessMultiple, EmbeddingSuccessSingle, HttpEmbeddingConfig, HttpEmbeddingProvider, InMemoryEmbeddingCache, InputValidation, OutputConfig, PiiRedactionConfig, RateLimitConfig, RateLimiter, RedactionResult, ResilienceConfig, ResilienceState, RetryConfig, SafetyConfig, Semaphore, TRANSFORMERS_MODELS, TextChunk, aggregateVectors, chunkText, containsPii, cosineSimilarity, createAzureEmbedding, createCohereEmbedding, createEmbeddingProvider, createHttpEmbedding, createHuggingFaceEmbedding, createNoOpCache, createOllamaEmbedding, createOpenAIEmbedding, createPiiRedactor, createTimeoutSignal, createTokenizer, createTransformersEmbedding, dotProduct, embed, embedScrapedData, estimateTokens, euclideanDistance, generateCacheKey, generateChecksum, generateEmbeddings, getChunkingStats, getDefaultCache, getDefaultModel, getDimensions, heuristicTokenCount, isEmbeddingProvider, isRetryableError, needsChunking, normalizeVector, previewInput, redactPii, resetDefaultCache, selectInput, validateCachedResult, validateInput, withResilience, withRetry, withTimeout };
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { a as Semaphore, c as withResilience, i as RateLimiter, l as withRetry, n as CircuitBreaker, o as createTimeoutSignal, r as CircuitOpenError, s as isRetryableError, u as withTimeout } from "../http-base-
|
|
2
|
-
import { A as generateCacheKey, B as normalizeVector, C as createTokenizer, D as needsChunking, E as heuristicTokenCount, F as aggregateVectors, I as cosineSimilarity, L as dotProduct, M as getDefaultCache, N as resetDefaultCache, O as InMemoryEmbeddingCache, P as validateCachedResult, R as euclideanDistance, S as chunkText, T as getChunkingStats, _ as createHttpEmbedding, a as createPiiRedactor, b as selectInput, c as isEmbeddingProvider, d as createCohereEmbedding, f as createHuggingFaceEmbedding, g as HttpEmbeddingProvider, h as createTransformersEmbedding, i as containsPii, j as generateChecksum, k as createNoOpCache, l as TRANSFORMERS_MODELS, m as createOpenAIEmbedding, n as embedScrapedData, o as redactPii, p as createOllamaEmbedding, r as generateEmbeddings, s as createEmbeddingProvider, t as embed, u as createAzureEmbedding, v as getDefaultModel, w as estimateTokens, x as validateInput, y as previewInput, z as getDimensions } from "../embeddings-
|
|
1
|
+
import { a as Semaphore, c as withResilience, i as RateLimiter, l as withRetry, n as CircuitBreaker, o as createTimeoutSignal, r as CircuitOpenError, s as isRetryableError, u as withTimeout } from "../http-base-B8qhcRit.mjs";
|
|
2
|
+
import { A as generateCacheKey, B as normalizeVector, C as createTokenizer, D as needsChunking, E as heuristicTokenCount, F as aggregateVectors, I as cosineSimilarity, L as dotProduct, M as getDefaultCache, N as resetDefaultCache, O as InMemoryEmbeddingCache, P as validateCachedResult, R as euclideanDistance, S as chunkText, T as getChunkingStats, _ as createHttpEmbedding, a as createPiiRedactor, b as selectInput, c as isEmbeddingProvider, d as createCohereEmbedding, f as createHuggingFaceEmbedding, g as HttpEmbeddingProvider, h as createTransformersEmbedding, i as containsPii, j as generateChecksum, k as createNoOpCache, l as TRANSFORMERS_MODELS, m as createOpenAIEmbedding, n as embedScrapedData, o as redactPii, p as createOllamaEmbedding, r as generateEmbeddings, s as createEmbeddingProvider, t as embed, u as createAzureEmbedding, v as getDefaultModel, w as estimateTokens, x as validateInput, y as previewInput, z as getDimensions } from "../embeddings-CukTWZVJ.mjs";
|
|
3
3
|
|
|
4
4
|
export { CircuitBreaker, CircuitOpenError, HttpEmbeddingProvider, InMemoryEmbeddingCache, RateLimiter, Semaphore, TRANSFORMERS_MODELS, aggregateVectors, chunkText, containsPii, cosineSimilarity, createAzureEmbedding, createCohereEmbedding, createEmbeddingProvider, createHttpEmbedding, createHuggingFaceEmbedding, createNoOpCache, createOllamaEmbedding, createOpenAIEmbedding, createPiiRedactor, createTimeoutSignal, createTokenizer, createTransformersEmbedding, dotProduct, embed, embedScrapedData, estimateTokens, euclideanDistance, generateCacheKey, generateChecksum, generateEmbeddings, getChunkingStats, getDefaultCache, getDefaultModel, getDimensions, heuristicTokenCount, isEmbeddingProvider, isRetryableError, needsChunking, normalizeVector, previewInput, redactPii, resetDefaultCache, selectInput, validateCachedResult, validateInput, withResilience, withRetry, withTimeout };
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
const require_parsers = require('./parsers-Bneuws8x.cjs');
|
|
2
|
-
const require_http_base = require('./http-base-CHLf-Tco.cjs');
|
|
1
|
+
const require_parsers = require('./parsers-DsqevSf6.cjs');
|
|
2
|
+
const require_http_base = require('./http-base-B5EXbNwR.cjs');
|
|
3
3
|
let node_crypto = require("node:crypto");
|
|
4
4
|
|
|
5
5
|
//#region src/embeddings/aggregation.ts
|
|
@@ -1453,4 +1453,4 @@ Object.defineProperty(exports, 'validateInput', {
|
|
|
1453
1453
|
return validateInput;
|
|
1454
1454
|
}
|
|
1455
1455
|
});
|
|
1456
|
-
//# sourceMappingURL=embeddings-DHVu52VF.cjs.map
|
|
1456
|
+
//# sourceMappingURL=embeddings-BzRSRX9t.cjs.map
|