scrapex 1.0.0-beta.1 → 1.0.0-beta.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. package/dist/embeddings/index.cjs +52 -0
  2. package/dist/embeddings/index.d.cts +3 -0
  3. package/dist/embeddings/index.d.mts +3 -0
  4. package/dist/embeddings/index.mjs +4 -0
  5. package/dist/embeddings-BjNTQSG9.cjs +1455 -0
  6. package/dist/embeddings-BjNTQSG9.cjs.map +1 -0
  7. package/dist/embeddings-Bsymy_jA.mjs +1215 -0
  8. package/dist/embeddings-Bsymy_jA.mjs.map +1 -0
  9. package/dist/enhancer-Cs_WyWtJ.cjs +219 -0
  10. package/dist/enhancer-Cs_WyWtJ.cjs.map +1 -0
  11. package/dist/enhancer-INx5NlgO.mjs +177 -0
  12. package/dist/enhancer-INx5NlgO.mjs.map +1 -0
  13. package/dist/{enhancer-j0xqKDJm.cjs → http-base-CHLf-Tco.cjs} +36 -199
  14. package/dist/http-base-CHLf-Tco.cjs.map +1 -0
  15. package/dist/{enhancer-ByjRD-t5.mjs → http-base-DM7YNo6X.mjs} +25 -176
  16. package/dist/http-base-DM7YNo6X.mjs.map +1 -0
  17. package/dist/{index-CDgcRnig.d.cts → index-Bvseqli-.d.cts} +1 -1
  18. package/dist/{index-CDgcRnig.d.cts.map → index-Bvseqli-.d.cts.map} +1 -1
  19. package/dist/{index-piS5wtki.d.mts → index-CIFjNySr.d.mts} +1 -1
  20. package/dist/{index-piS5wtki.d.mts.map → index-CIFjNySr.d.mts.map} +1 -1
  21. package/dist/index-D6qfjmZQ.d.mts +401 -0
  22. package/dist/index-D6qfjmZQ.d.mts.map +1 -0
  23. package/dist/index-RFSpP5g8.d.cts +401 -0
  24. package/dist/index-RFSpP5g8.d.cts.map +1 -0
  25. package/dist/index.cjs +39 -1074
  26. package/dist/index.cjs.map +1 -1
  27. package/dist/index.d.cts +3 -260
  28. package/dist/index.d.cts.map +1 -1
  29. package/dist/index.d.mts +3 -260
  30. package/dist/index.d.mts.map +1 -1
  31. package/dist/index.mjs +4 -1039
  32. package/dist/index.mjs.map +1 -1
  33. package/dist/llm/index.cjs +7 -6
  34. package/dist/llm/index.cjs.map +1 -1
  35. package/dist/llm/index.d.cts +1 -1
  36. package/dist/llm/index.d.mts +1 -1
  37. package/dist/llm/index.mjs +2 -1
  38. package/dist/llm/index.mjs.map +1 -1
  39. package/dist/parsers/index.d.cts +1 -1
  40. package/dist/parsers/index.d.mts +1 -1
  41. package/dist/parsers/index.mjs +1 -1
  42. package/dist/{parsers-CwkYnyWY.mjs → parsers-DsawHeo0.mjs} +1 -1
  43. package/dist/{parsers-CwkYnyWY.mjs.map → parsers-DsawHeo0.mjs.map} +1 -1
  44. package/dist/{types-CadAXrme.d.mts → types-BOcHQU9s.d.mts} +308 -151
  45. package/dist/types-BOcHQU9s.d.mts.map +1 -0
  46. package/dist/{types-DPEtPihB.d.cts → types-DutdBpqd.d.cts} +308 -151
  47. package/dist/types-DutdBpqd.d.cts.map +1 -0
  48. package/package.json +1 -1
  49. package/dist/enhancer-ByjRD-t5.mjs.map +0 -1
  50. package/dist/enhancer-j0xqKDJm.cjs.map +0 -1
  51. package/dist/types-CadAXrme.d.mts.map +0 -1
  52. package/dist/types-DPEtPihB.d.cts.map +0 -1
@@ -0,0 +1,1215 @@
1
+ import { a as Semaphore, c as withResilience, d as ScrapeError, i as RateLimiter, n as CircuitBreaker, t as BaseHttpProvider } from "./http-base-DM7YNo6X.mjs";
2
+ import { createHash } from "node:crypto";
3
+
4
+ //#region src/embeddings/aggregation.ts
5
+ /**
6
+ * Aggregate multiple embedding vectors into a single vector or return all.
7
+ *
8
+ * @param vectors - Array of embedding vectors (must all have same dimensions)
9
+ * @param strategy - Aggregation strategy
10
+ * @returns Aggregated result based on strategy
11
+ */
12
+ function aggregateVectors(vectors, strategy = "average") {
13
+ if (vectors.length === 0) throw new Error("Cannot aggregate empty vector array");
14
+ const firstVector = vectors[0];
15
+ if (!firstVector) throw new Error("Cannot aggregate empty vector array");
16
+ const dimensions = firstVector.length;
17
+ for (let i = 1; i < vectors.length; i++) {
18
+ const vec = vectors[i];
19
+ if (!vec || vec.length !== dimensions) throw new Error(`Vector dimension mismatch: expected ${dimensions}, got ${vec?.length ?? 0} at index ${i}`);
20
+ }
21
+ switch (strategy) {
22
+ case "average": return {
23
+ type: "single",
24
+ vector: averageVectors(vectors),
25
+ dimensions
26
+ };
27
+ case "max": return {
28
+ type: "single",
29
+ vector: maxPoolVectors(vectors),
30
+ dimensions
31
+ };
32
+ case "first": return {
33
+ type: "single",
34
+ vector: firstVector,
35
+ dimensions
36
+ };
37
+ case "all": return {
38
+ type: "multiple",
39
+ vectors,
40
+ dimensions
41
+ };
42
+ default: {
43
+ const _exhaustive = strategy;
44
+ throw new Error(`Unknown aggregation strategy: ${_exhaustive}`);
45
+ }
46
+ }
47
+ }
48
+ /**
49
+ * Compute element-wise average of vectors.
50
+ */
51
+ function averageVectors(vectors) {
52
+ const first = vectors[0];
53
+ if (!first || vectors.length === 1) return first ?? [];
54
+ const dimensions = first.length;
55
+ const count = vectors.length;
56
+ const result = new Array(dimensions).fill(0);
57
+ for (const vector of vectors) for (let i = 0; i < dimensions; i++) {
58
+ const val = result[i];
59
+ if (val !== void 0) result[i] = val + (vector[i] ?? 0);
60
+ }
61
+ for (let i = 0; i < dimensions; i++) {
62
+ const val = result[i];
63
+ if (val !== void 0) result[i] = val / count;
64
+ }
65
+ return result;
66
+ }
67
+ /**
68
+ * Compute element-wise maximum of vectors (max pooling).
69
+ */
70
+ function maxPoolVectors(vectors) {
71
+ const first = vectors[0];
72
+ if (!first || vectors.length === 1) return first ?? [];
73
+ const dimensions = first.length;
74
+ const result = [...first];
75
+ for (let v = 1; v < vectors.length; v++) {
76
+ const vec = vectors[v];
77
+ if (!vec) continue;
78
+ for (let i = 0; i < dimensions; i++) {
79
+ const val = vec[i] ?? 0;
80
+ if (val > (result[i] ?? 0)) result[i] = val;
81
+ }
82
+ }
83
+ return result;
84
+ }
85
+ /**
86
+ * Normalize a vector to unit length (L2 normalization).
87
+ */
88
+ function normalizeVector(vector) {
89
+ const magnitude = Math.sqrt(vector.reduce((sum, val) => sum + val * val, 0));
90
+ if (magnitude === 0) return vector;
91
+ return vector.map((val) => val / magnitude);
92
+ }
93
+ /**
94
+ * Compute cosine similarity between two vectors.
95
+ * Both vectors should be normalized for accurate results.
96
+ */
97
+ function cosineSimilarity(a, b) {
98
+ if (a.length !== b.length) throw new Error(`Vector dimension mismatch: ${a.length} vs ${b.length}`);
99
+ let dot = 0;
100
+ let magnitudeA = 0;
101
+ let magnitudeB = 0;
102
+ for (let i = 0; i < a.length; i++) {
103
+ const aVal = a[i] ?? 0;
104
+ const bVal = b[i] ?? 0;
105
+ dot += aVal * bVal;
106
+ magnitudeA += aVal * aVal;
107
+ magnitudeB += bVal * bVal;
108
+ }
109
+ const magnitude = Math.sqrt(magnitudeA) * Math.sqrt(magnitudeB);
110
+ if (magnitude === 0) return 0;
111
+ return dot / magnitude;
112
+ }
113
+ /**
114
+ * Compute euclidean distance between two vectors.
115
+ */
116
+ function euclideanDistance(a, b) {
117
+ if (a.length !== b.length) throw new Error(`Vector dimension mismatch: ${a.length} vs ${b.length}`);
118
+ let sum = 0;
119
+ for (let i = 0; i < a.length; i++) {
120
+ const diff = (a[i] ?? 0) - (b[i] ?? 0);
121
+ sum += diff * diff;
122
+ }
123
+ return Math.sqrt(sum);
124
+ }
125
+ /**
126
+ * Compute dot product of two vectors.
127
+ */
128
+ function dotProduct(a, b) {
129
+ if (a.length !== b.length) throw new Error(`Vector dimension mismatch: ${a.length} vs ${b.length}`);
130
+ let result = 0;
131
+ for (let i = 0; i < a.length; i++) {
132
+ const aVal = a[i] ?? 0;
133
+ const bVal = b[i] ?? 0;
134
+ result += aVal * bVal;
135
+ }
136
+ return result;
137
+ }
138
+ /**
139
+ * Get the dimensions of a vector or set of vectors.
140
+ */
141
+ function getDimensions(vectors) {
142
+ if (vectors.length === 0) return 0;
143
+ const first = vectors[0];
144
+ if (typeof first === "number") return vectors.length;
145
+ return first?.length ?? 0;
146
+ }
147
+
148
+ //#endregion
149
+ //#region src/embeddings/cache.ts
150
+ /**
151
+ * Default maximum cache entries.
152
+ */
153
+ const DEFAULT_MAX_ENTRIES = 1e3;
154
+ /**
155
+ * Default TTL in milliseconds (1 hour).
156
+ */
157
+ const DEFAULT_TTL_MS = 3600 * 1e3;
158
+ /**
159
+ * Generate a content-addressable cache key.
160
+ * Key is based on content hash and embedding configuration.
161
+ * Note: custom RegExp patterns are serialized by source+flags; different
162
+ * constructions can yield different cache keys even if equivalent.
163
+ */
164
+ function generateCacheKey(params) {
165
+ const hash = createHash("sha256");
166
+ const fingerprint = stableStringify({
167
+ providerKey: params.providerKey,
168
+ model: params.model ?? "provider-default",
169
+ dimensions: params.dimensions ?? "default",
170
+ aggregation: params.aggregation ?? "average",
171
+ input: serializeInputConfig(params.input),
172
+ chunking: serializeChunkingConfig(params.chunking),
173
+ safety: serializeSafetyConfig(params.safety),
174
+ cacheKeySalt: params.cacheKeySalt
175
+ });
176
+ hash.update(fingerprint);
177
+ hash.update("\0");
178
+ hash.update(params.content);
179
+ return hash.digest("hex");
180
+ }
181
+ /**
182
+ * Generate a checksum for content verification.
183
+ */
184
+ function generateChecksum(content) {
185
+ return createHash("sha256").update(content).digest("hex").slice(0, 16);
186
+ }
187
+ function serializeInputConfig(config) {
188
+ if (!config) return void 0;
189
+ return normalizeObject({
190
+ type: config.type ?? "textContent",
191
+ hasTransform: Boolean(config.transform),
192
+ hasCustomText: Boolean(config.customText)
193
+ });
194
+ }
195
+ function serializeChunkingConfig(config) {
196
+ if (!config) return void 0;
197
+ return normalizeObject({
198
+ size: config.size,
199
+ overlap: config.overlap,
200
+ tokenizer: getTokenizerId(config.tokenizer),
201
+ maxInputLength: config.maxInputLength
202
+ });
203
+ }
204
+ function serializeSafetyConfig(config) {
205
+ if (!config) return void 0;
206
+ return normalizeObject({
207
+ piiRedaction: serializePiiConfig(config.piiRedaction),
208
+ minTextLength: config.minTextLength,
209
+ maxTokens: config.maxTokens
210
+ });
211
+ }
212
+ function serializePiiConfig(config) {
213
+ if (!config) return void 0;
214
+ return normalizeObject({
215
+ email: config.email ?? false,
216
+ phone: config.phone ?? false,
217
+ creditCard: config.creditCard ?? false,
218
+ ssn: config.ssn ?? false,
219
+ ipAddress: config.ipAddress ?? false,
220
+ customPatterns: config.customPatterns?.map((pattern) => `${pattern.source}/${pattern.flags}`)
221
+ });
222
+ }
223
+ function getTokenizerId(tokenizer) {
224
+ if (!tokenizer || tokenizer === "heuristic") return "heuristic";
225
+ if (tokenizer === "tiktoken") return "tiktoken";
226
+ return "custom";
227
+ }
228
+ function stableStringify(value) {
229
+ return stringifyNormalized(normalizeValue(value));
230
+ }
231
+ function normalizeValue(value) {
232
+ if (value === void 0) return void 0;
233
+ if (value === null) return null;
234
+ if (Array.isArray(value)) return value.map((entry) => normalizeValue(entry)).filter((entry) => entry !== void 0);
235
+ if (typeof value === "object") return normalizeObject(value);
236
+ return value;
237
+ }
238
+ function normalizeObject(value) {
239
+ const normalized = {};
240
+ for (const key of Object.keys(value).sort()) {
241
+ const entry = normalizeValue(value[key]);
242
+ if (entry !== void 0) normalized[key] = entry;
243
+ }
244
+ return normalized;
245
+ }
246
+ function stringifyNormalized(value) {
247
+ if (value === void 0) return "undefined";
248
+ if (value === null) return "null";
249
+ if (typeof value === "string") return JSON.stringify(value);
250
+ if (typeof value === "number" || typeof value === "boolean") return String(value);
251
+ if (Array.isArray(value)) return `[${value.map((entry) => stringifyNormalized(entry)).join(",")}]`;
252
+ if (typeof value === "object") {
253
+ const obj = value;
254
+ return `{${Object.keys(obj).sort().map((key) => `${JSON.stringify(key)}:${stringifyNormalized(obj[key])}`).join(",")}}`;
255
+ }
256
+ return JSON.stringify(value);
257
+ }
258
+ /**
259
+ * In-memory LRU cache with TTL support.
260
+ * Content-addressable: uses content hash as key, not URL.
261
+ */
262
+ var InMemoryEmbeddingCache = class {
263
+ cache;
264
+ maxEntries;
265
+ defaultTtlMs;
266
+ constructor(options) {
267
+ this.cache = /* @__PURE__ */ new Map();
268
+ this.maxEntries = options?.maxEntries ?? DEFAULT_MAX_ENTRIES;
269
+ this.defaultTtlMs = options?.ttlMs ?? DEFAULT_TTL_MS;
270
+ }
271
+ async get(key) {
272
+ const entry = this.cache.get(key);
273
+ if (!entry) return;
274
+ const now = Date.now();
275
+ if (now > entry.expiresAt) {
276
+ this.cache.delete(key);
277
+ return;
278
+ }
279
+ entry.accessedAt = now;
280
+ return entry.value;
281
+ }
282
+ async set(key, value, options) {
283
+ const now = Date.now();
284
+ const ttl = options?.ttlMs ?? this.defaultTtlMs;
285
+ if (this.cache.size >= this.maxEntries && !this.cache.has(key)) this.evictLRU();
286
+ this.cache.set(key, {
287
+ value,
288
+ createdAt: now,
289
+ expiresAt: now + ttl,
290
+ accessedAt: now
291
+ });
292
+ }
293
+ async delete(key) {
294
+ return this.cache.delete(key);
295
+ }
296
+ async clear() {
297
+ this.cache.clear();
298
+ }
299
+ /**
300
+ * Get cache statistics.
301
+ */
302
+ getStats() {
303
+ const now = Date.now();
304
+ let expired = 0;
305
+ for (const entry of this.cache.values()) if (now > entry.expiresAt) expired++;
306
+ return {
307
+ size: this.cache.size,
308
+ maxEntries: this.maxEntries,
309
+ expired,
310
+ utilization: this.cache.size / this.maxEntries
311
+ };
312
+ }
313
+ /**
314
+ * Evict expired entries.
315
+ */
316
+ cleanup() {
317
+ const now = Date.now();
318
+ let evicted = 0;
319
+ for (const [key, entry] of this.cache.entries()) if (now > entry.expiresAt) {
320
+ this.cache.delete(key);
321
+ evicted++;
322
+ }
323
+ return evicted;
324
+ }
325
+ /**
326
+ * Evict least recently used entry.
327
+ */
328
+ evictLRU() {
329
+ let oldestKey = null;
330
+ let oldestAccess = Number.POSITIVE_INFINITY;
331
+ for (const [key, entry] of this.cache.entries()) if (entry.accessedAt < oldestAccess) {
332
+ oldestAccess = entry.accessedAt;
333
+ oldestKey = key;
334
+ }
335
+ if (oldestKey) this.cache.delete(oldestKey);
336
+ }
337
+ };
338
+ /**
339
+ * Validate that a cached result matches expected parameters.
340
+ */
341
+ function validateCachedResult(result, expectedDimensions) {
342
+ if (result.status !== "success") return true;
343
+ if (!expectedDimensions) return true;
344
+ if (result.aggregation === "all") {
345
+ const firstVec = result.vectors[0];
346
+ if (!firstVec) return false;
347
+ return firstVec.length === expectedDimensions;
348
+ }
349
+ return result.vector.length === expectedDimensions;
350
+ }
351
+ /**
352
+ * Create a no-op cache that never stores anything.
353
+ * Useful for disabling caching while maintaining interface compatibility.
354
+ */
355
+ function createNoOpCache() {
356
+ return {
357
+ async get() {},
358
+ async set() {},
359
+ async delete() {
360
+ return false;
361
+ },
362
+ async clear() {}
363
+ };
364
+ }
365
+ /**
366
+ * Default in-memory cache instance.
367
+ * Optimized for moderate cache sizes (default 1000 entries).
368
+ */
369
+ let defaultCache = null;
370
+ /**
371
+ * Get or create the default cache instance.
372
+ */
373
+ function getDefaultCache() {
374
+ if (!defaultCache) defaultCache = new InMemoryEmbeddingCache();
375
+ return defaultCache;
376
+ }
377
+ /**
378
+ * Reset the default cache (mainly for testing).
379
+ */
380
+ async function resetDefaultCache() {
381
+ if (defaultCache) await defaultCache.clear();
382
+ defaultCache = null;
383
+ }
384
+
385
+ //#endregion
386
+ //#region src/embeddings/chunking.ts
387
+ /**
388
+ * Default chunk size in tokens.
389
+ */
390
+ const DEFAULT_CHUNK_SIZE$1 = 500;
391
+ /**
392
+ * Default overlap in tokens.
393
+ */
394
+ const DEFAULT_OVERLAP = 50;
395
+ /**
396
+ * Default maximum input length in characters.
397
+ */
398
+ const DEFAULT_MAX_INPUT_LENGTH = 1e5;
399
+ /**
400
+ * Heuristic token counting: approximately 4 characters per token.
401
+ * This is a reasonable approximation for English text.
402
+ */
403
+ function heuristicTokenCount(text) {
404
+ return Math.ceil(text.length / 4);
405
+ }
406
+ /**
407
+ * Convert token count to approximate character count.
408
+ */
409
+ function tokensToChars(tokens) {
410
+ return tokens * 4;
411
+ }
412
+ /**
413
+ * Create a tokenizer function based on configuration.
414
+ */
415
+ function createTokenizer(config) {
416
+ if (!config || config === "heuristic") return heuristicTokenCount;
417
+ if (config === "tiktoken") return heuristicTokenCount;
418
+ return config;
419
+ }
420
+ /**
421
+ * Find a natural break point in text (sentence or word boundary).
422
+ * Prefers common sentence boundaries (Latin + CJK), falls back to word boundaries.
423
+ */
424
+ function findBreakPoint(text, targetIndex) {
425
+ const searchStart = Math.max(0, targetIndex - Math.floor(targetIndex * .2));
426
+ const searchEnd = Math.min(text.length, targetIndex + Math.floor(targetIndex * .2));
427
+ const searchText = text.slice(searchStart, searchEnd);
428
+ const sentenceMatch = /[.!?。!?]\s*/g;
429
+ let lastSentenceEnd = -1;
430
+ for (const match of searchText.matchAll(sentenceMatch)) {
431
+ const absolutePos = searchStart + match.index + match[0].length;
432
+ if (absolutePos <= targetIndex) lastSentenceEnd = absolutePos;
433
+ }
434
+ if (lastSentenceEnd !== -1) return lastSentenceEnd;
435
+ const wordBoundary = text.lastIndexOf(" ", targetIndex);
436
+ if (wordBoundary > searchStart) return wordBoundary + 1;
437
+ return targetIndex;
438
+ }
439
+ /**
440
+ * Split text into overlapping chunks optimized for embedding.
441
+ * Respects sentence boundaries when possible.
442
+ */
443
+ function chunkText(text, config) {
444
+ const chunkSize = config?.size ?? DEFAULT_CHUNK_SIZE$1;
445
+ const rawOverlap = config?.overlap ?? DEFAULT_OVERLAP;
446
+ const safeOverlap = Math.max(0, rawOverlap);
447
+ const overlap = Math.min(safeOverlap, Math.max(0, chunkSize - 1));
448
+ const maxInputLength = config?.maxInputLength ?? DEFAULT_MAX_INPUT_LENGTH;
449
+ const tokenizer = createTokenizer(config?.tokenizer);
450
+ const normalizedText = (text.length > maxInputLength ? text.slice(0, maxInputLength) : text).replace(/\s+/g, " ").trim();
451
+ if (!normalizedText) return [];
452
+ const totalTokens = tokenizer(normalizedText);
453
+ if (totalTokens <= chunkSize) return [{
454
+ text: normalizedText,
455
+ startIndex: 0,
456
+ endIndex: normalizedText.length,
457
+ tokens: totalTokens
458
+ }];
459
+ const chunks = [];
460
+ const chunkSizeChars = tokensToChars(chunkSize);
461
+ const overlapChars = tokensToChars(overlap);
462
+ let startIndex = 0;
463
+ while (startIndex < normalizedText.length) {
464
+ const targetEnd = Math.min(startIndex + chunkSizeChars, normalizedText.length);
465
+ const endIndex = targetEnd < normalizedText.length ? findBreakPoint(normalizedText, targetEnd) : targetEnd;
466
+ const chunkText$1 = normalizedText.slice(startIndex, endIndex).trim();
467
+ if (chunkText$1) chunks.push({
468
+ text: chunkText$1,
469
+ startIndex,
470
+ endIndex,
471
+ tokens: tokenizer(chunkText$1)
472
+ });
473
+ if (endIndex >= normalizedText.length) break;
474
+ const nextStart = endIndex - overlapChars;
475
+ startIndex = Math.max(nextStart, startIndex + 1);
476
+ if (startIndex < normalizedText.length) {
477
+ const spaceIndex = normalizedText.indexOf(" ", startIndex);
478
+ if (spaceIndex !== -1 && spaceIndex < startIndex + overlapChars) startIndex = spaceIndex + 1;
479
+ }
480
+ }
481
+ return chunks;
482
+ }
483
+ /**
484
+ * Estimate total tokens for a text without chunking.
485
+ */
486
+ function estimateTokens(text, tokenizer) {
487
+ return createTokenizer(tokenizer)(text);
488
+ }
489
+ /**
490
+ * Check if text needs chunking based on token count.
491
+ */
492
+ function needsChunking(text, maxTokens = DEFAULT_CHUNK_SIZE$1, tokenizer) {
493
+ return createTokenizer(tokenizer)(text) > maxTokens;
494
+ }
495
+ /**
496
+ * Get statistics about potential chunking.
497
+ */
498
+ function getChunkingStats(text, config) {
499
+ const maxInputLength = config?.maxInputLength ?? DEFAULT_MAX_INPUT_LENGTH;
500
+ const chunkSize = config?.size ?? DEFAULT_CHUNK_SIZE$1;
501
+ const overlap = config?.overlap ?? DEFAULT_OVERLAP;
502
+ const tokenizer = createTokenizer(config?.tokenizer);
503
+ const inputLength = text.length;
504
+ const willTruncate = inputLength > maxInputLength;
505
+ const processedLength = willTruncate ? maxInputLength : inputLength;
506
+ const estimatedTokens = tokenizer(text.slice(0, processedLength).replace(/\s+/g, " ").trim());
507
+ let estimatedChunks = 1;
508
+ if (estimatedTokens > chunkSize) {
509
+ const clampedOverlap = Math.min(overlap, Math.max(0, chunkSize - 1));
510
+ const effectiveChunkSize = Math.max(1, chunkSize - clampedOverlap);
511
+ estimatedChunks = Math.ceil((estimatedTokens - clampedOverlap) / effectiveChunkSize);
512
+ }
513
+ return {
514
+ inputLength,
515
+ estimatedTokens,
516
+ estimatedChunks,
517
+ willTruncate
518
+ };
519
+ }
520
+
521
+ //#endregion
522
+ //#region src/embeddings/input.ts
523
+ /**
524
+ * Select and prepare input text for embedding based on configuration.
525
+ *
526
+ * @param data - Scraped data to extract input from
527
+ * @param config - Input configuration
528
+ * @returns Selected and prepared text, or undefined if no valid input
529
+ */
530
+ function selectInput(data, config) {
531
+ if (config?.transform) return normalizeText(config.transform(data));
532
+ if (config?.type === "custom" && config.customText) return normalizeText(config.customText);
533
+ const type = config?.type ?? "textContent";
534
+ switch (type) {
535
+ case "textContent": return selectTextContent(data);
536
+ case "title+summary": return selectTitleSummary(data);
537
+ case "custom": return selectTextContent(data);
538
+ default: {
539
+ const _exhaustive = type;
540
+ throw new Error(`Unknown input type: ${_exhaustive}`);
541
+ }
542
+ }
543
+ }
544
+ /**
545
+ * Select textContent as input.
546
+ */
547
+ function selectTextContent(data) {
548
+ if (data.textContent) return normalizeText(data.textContent);
549
+ if (data.content) return normalizeText(stripMarkdown(data.content));
550
+ if (data.excerpt) return normalizeText(data.excerpt);
551
+ if (data.description) return normalizeText(data.description);
552
+ }
553
+ /**
554
+ * Select title + summary (or fallbacks) as input.
555
+ * Optimized for semantic search and classification.
556
+ */
557
+ function selectTitleSummary(data) {
558
+ const parts = [];
559
+ if (data.title) parts.push(data.title);
560
+ if (data.summary) parts.push(data.summary);
561
+ else if (data.excerpt) parts.push(data.excerpt);
562
+ else if (data.description) parts.push(data.description);
563
+ if (parts.length === 0) return;
564
+ return normalizeText(parts.join("\n\n"));
565
+ }
566
+ /**
567
+ * Normalize text for embedding:
568
+ * - Collapse whitespace
569
+ * - Trim leading/trailing whitespace
570
+ * - Remove control characters
571
+ */
572
+ function normalizeText(text) {
573
+ if (!text) return "";
574
+ return text.replace(/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/g, "").replace(/[ \t]+/g, " ").replace(/\n{3,}/g, "\n\n").split("\n").map((line) => line.trim()).join("\n").trim();
575
+ }
576
+ /**
577
+ * Basic markdown stripping for when we need plain text from content.
578
+ * Not comprehensive, but handles common cases.
579
+ */
580
+ function stripMarkdown(markdown) {
581
+ return markdown.replace(/```[\s\S]*?```/g, "").replace(/`[^`]+`/g, "").replace(/\[([^\]]+)\]\([^)]+\)/g, "$1").replace(/!\[([^\]]*)\]\([^)]+\)/g, "$1").replace(/^#{1,6}\s+/gm, "").replace(/\*\*([^*]+)\*\*/g, "$1").replace(/\*([^*]+)\*/g, "$1").replace(/__([^_]+)__/g, "$1").replace(/_([^_]+)_/g, "$1").replace(/^>\s+/gm, "").replace(/^[-*_]{3,}$/gm, "").replace(/^[\s]*[-*+]\s+/gm, "").replace(/^[\s]*\d+\.\s+/gm, "");
582
+ }
583
+ /**
584
+ * Check if the selected input meets minimum requirements.
585
+ */
586
+ function validateInput(text, minLength = 10) {
587
+ if (!text) return {
588
+ valid: false,
589
+ reason: "No input text available"
590
+ };
591
+ if (text.length < minLength) return {
592
+ valid: false,
593
+ reason: `Input too short (${text.length} < ${minLength} characters)`
594
+ };
595
+ const wordCount = text.split(/\s+/).filter((w) => w.length > 1).length;
596
+ if (wordCount < 3) return {
597
+ valid: false,
598
+ reason: `Input has too few words (${wordCount} < 3)`
599
+ };
600
+ return {
601
+ valid: true,
602
+ text,
603
+ wordCount,
604
+ charCount: text.length
605
+ };
606
+ }
607
+ /**
608
+ * Get a preview of what input would be selected.
609
+ * Useful for debugging and testing.
610
+ */
611
+ function previewInput(data, config, maxLength = 200) {
612
+ const input = selectInput(data, config);
613
+ if (!input) return "[No input available]";
614
+ if (input.length <= maxLength) return input;
615
+ return `${input.slice(0, maxLength)}...`;
616
+ }
617
+
618
+ //#endregion
619
+ //#region src/embeddings/providers/base.ts
620
+ /**
621
+ * Generate a stable cache key identifier for provider configuration.
622
+ */
623
+ function getProviderCacheKey(config) {
624
+ switch (config.type) {
625
+ case "http": return `http:${config.config.baseUrl.replace(/\/$/, "")}:${config.config.model}`;
626
+ case "custom": return `custom:${config.provider.name}`;
627
+ default: {
628
+ const _exhaustive = config;
629
+ return String(_exhaustive);
630
+ }
631
+ }
632
+ }
633
+ /**
634
+ * Get default model for a provider type.
635
+ */
636
+ function getDefaultModel(providerType) {
637
+ switch (providerType) {
638
+ case "openai": return "text-embedding-3-small";
639
+ case "azure": return "text-embedding-ada-002";
640
+ case "transformers": return "Xenova/all-MiniLM-L6-v2";
641
+ default: return "default";
642
+ }
643
+ }
644
+
645
+ //#endregion
646
+ //#region src/embeddings/providers/http.ts
647
+ /**
648
+ * HTTP-based Embedding Provider using native fetch.
649
+ * Provides a unified interface for any REST-based embedding API.
650
+ */
651
+ /**
652
+ * HTTP-based embedding provider.
653
+ * Works with any REST API using native fetch.
654
+ */
655
+ var HttpEmbeddingProvider = class extends BaseHttpProvider {
656
+ name = "http-embedding";
657
+ requestBuilder;
658
+ responseMapper;
659
+ constructor(config) {
660
+ super(config);
661
+ this.requestBuilder = config.requestBuilder ?? ((texts, model) => ({
662
+ input: texts,
663
+ model
664
+ }));
665
+ this.responseMapper = config.responseMapper ?? ((response) => {
666
+ const resp = response;
667
+ if (Array.isArray(resp.data)) return resp.data.map((item) => item.embedding);
668
+ if (Array.isArray(resp.embeddings)) return resp.embeddings;
669
+ if (Array.isArray(resp.embedding)) return [resp.embedding];
670
+ if (Array.isArray(response)) return response;
671
+ throw new ScrapeError("Unable to parse embedding response. Provide a custom responseMapper.", "VALIDATION_ERROR");
672
+ });
673
+ }
674
+ /**
675
+ * Generate embeddings for one or more texts.
676
+ */
677
+ async embed(texts, options) {
678
+ const model = options.model || this.model;
679
+ const body = this.requestBuilder(texts, model);
680
+ const { data } = await this.fetch(this.baseUrl, {
681
+ body,
682
+ signal: options.signal
683
+ });
684
+ const embeddings = this.responseMapper(data);
685
+ if (embeddings.length !== texts.length) throw new ScrapeError(`Embedding count mismatch: expected ${texts.length}, got ${embeddings.length}`, "VALIDATION_ERROR");
686
+ return { embeddings };
687
+ }
688
+ };
689
+ /**
690
+ * Create a generic HTTP embedding provider.
691
+ */
692
+ function createHttpEmbedding(config) {
693
+ return new HttpEmbeddingProvider(config);
694
+ }
695
+
696
+ //#endregion
697
+ //#region src/embeddings/providers/presets.ts
698
+ /**
699
+ * Create an OpenAI embedding provider.
700
+ *
701
+ * @example
702
+ * ```ts
703
+ * const provider = createOpenAIEmbedding({ apiKey: 'sk-...' });
704
+ * const { embeddings } = await provider.embed(['Hello'], { model: 'text-embedding-3-small' });
705
+ * ```
706
+ */
707
+ function createOpenAIEmbedding(options) {
708
+ const apiKey = options?.apiKey ?? process.env.OPENAI_API_KEY;
709
+ if (!apiKey) throw new Error("OpenAI API key required. Set OPENAI_API_KEY env var or pass apiKey option.");
710
+ const headers = { Authorization: `Bearer ${apiKey}` };
711
+ if (options?.organization) headers["OpenAI-Organization"] = options.organization;
712
+ return new HttpEmbeddingProvider({
713
+ baseUrl: options?.baseUrl ?? "https://api.openai.com/v1/embeddings",
714
+ model: options?.model ?? "text-embedding-3-small",
715
+ headers,
716
+ requestBuilder: (texts, model) => ({
717
+ input: texts,
718
+ model
719
+ }),
720
+ responseMapper: (res) => res.data.map((item) => item.embedding)
721
+ });
722
+ }
723
+ /**
724
+ * Create an Azure OpenAI embedding provider.
725
+ *
726
+ * @example
727
+ * ```ts
728
+ * const provider = createAzureEmbedding({
729
+ * endpoint: 'https://my-resource.openai.azure.com',
730
+ * deploymentName: 'text-embedding-ada-002',
731
+ * apiVersion: '2023-05-15',
732
+ * });
733
+ * ```
734
+ */
735
+ function createAzureEmbedding(options) {
736
+ const apiKey = options.apiKey ?? process.env.AZURE_OPENAI_API_KEY;
737
+ if (!apiKey) throw new Error("Azure OpenAI API key required. Set AZURE_OPENAI_API_KEY env var or pass apiKey option.");
738
+ return new HttpEmbeddingProvider({
739
+ baseUrl: `${options.endpoint.replace(/\/$/, "")}/openai/deployments/${options.deploymentName}/embeddings?api-version=${options.apiVersion}`,
740
+ model: options.deploymentName,
741
+ headers: { "api-key": apiKey },
742
+ requestBuilder: (texts) => ({ input: texts }),
743
+ responseMapper: (res) => res.data.map((item) => item.embedding)
744
+ });
745
+ }
746
+ /**
747
+ * Create an Ollama embedding provider for local models.
748
+ *
749
+ * LIMITATION: Ollama's /api/embeddings endpoint processes one text at a time,
750
+ * not batches. When multiple chunks are embedded, each chunk triggers a
751
+ * separate HTTP request. This is handled transparently by the pipeline's
752
+ * sequential chunk processing, but may be slower than batch-capable providers.
753
+ * For high-throughput scenarios, consider using OpenAI, Cohere, or HuggingFace
754
+ * which support batch embedding in a single request.
755
+ *
756
+ * @example
757
+ * ```ts
758
+ * const provider = createOllamaEmbedding({ model: 'nomic-embed-text' });
759
+ * ```
760
+ */
761
+ function createOllamaEmbedding(options) {
762
+ return new HttpEmbeddingProvider({
763
+ baseUrl: options?.baseUrl ?? "http://localhost:11434/api/embeddings",
764
+ model: options?.model ?? "nomic-embed-text",
765
+ requireHttps: false,
766
+ allowPrivate: true,
767
+ requestBuilder: (texts, model) => ({
768
+ model,
769
+ prompt: texts[0]
770
+ }),
771
+ responseMapper: (res) => [res.embedding]
772
+ });
773
+ }
774
/**
 * Build an embedding provider for the HuggingFace Inference API.
 *
 * Auth is optional: an API key is taken from `options.apiKey`, then the
 * HF_TOKEN or HUGGINGFACE_API_KEY environment variables; without one the
 * request is sent unauthenticated.
 *
 * @param {object} options - Must include `model`; `apiKey` is optional.
 * @returns {HttpEmbeddingProvider} Provider targeting the model's endpoint.
 *
 * @example
 * ```ts
 * const provider = createHuggingFaceEmbedding({
 *   model: 'sentence-transformers/all-MiniLM-L6-v2',
 * });
 * ```
 */
function createHuggingFaceEmbedding(options) {
	const apiKey = options.apiKey ?? process.env.HF_TOKEN ?? process.env.HUGGINGFACE_API_KEY;
	const headers = apiKey ? { Authorization: `Bearer ${apiKey}` } : {};
	return new HttpEmbeddingProvider({
		baseUrl: `https://api-inference.huggingface.co/models/${options.model}`,
		model: options.model,
		headers,
		requestBuilder: (texts) => ({ inputs: texts }),
		responseMapper: (response) => {
			// The API returns either a batch (number[][]) or a single vector
			// (number[]); normalize both to the batch shape.
			if (!Array.isArray(response)) throw new Error("Unexpected HuggingFace response format");
			const isBatch = Array.isArray(response[0]) && typeof response[0][0] === "number";
			return isBatch ? response : [response];
		}
	});
}
802
/**
 * Build an embedding provider for the Cohere embed API.
 *
 * @param {object} [options] - Optional `apiKey`, `model`, and `inputType`.
 * @returns {HttpEmbeddingProvider} Provider targeting api.cohere.ai.
 * @throws {Error} When no API key is supplied and COHERE_API_KEY is unset.
 *
 * @example
 * ```ts
 * const provider = createCohereEmbedding({ model: 'embed-english-v3.0' });
 * ```
 */
function createCohereEmbedding(options) {
	const apiKey = options?.apiKey ?? process.env.COHERE_API_KEY;
	if (!apiKey) throw new Error("Cohere API key required. Set COHERE_API_KEY env var or pass apiKey option.");
	const model = options?.model ?? "embed-english-v3.0";
	// Cohere distinguishes document vs. query embeddings via input_type.
	const inputType = options?.inputType ?? "search_document";
	return new HttpEmbeddingProvider({
		baseUrl: "https://api.cohere.ai/v1/embed",
		model,
		headers: { Authorization: `Bearer ${apiKey}` },
		requestBuilder: (texts, requestModel) => ({
			texts,
			model: requestModel,
			input_type: inputType
		}),
		responseMapper: (res) => res.embeddings
	});
}
825
/**
 * Create a local Transformers.js embedding provider.
 * Uses dependency injection - user provides the imported transformers module.
 *
 * The feature-extraction pipeline is created lazily on first embed() and
 * cached; it is rebuilt only when a different model is requested.
 *
 * @param {object} transformers - The imported `@huggingface/transformers` module.
 * @param {object} [options] - `model`, `quantized`, `pooling`, `normalize`, `cacheDir`.
 * @returns {{name: string, embed: Function}} An EmbeddingProvider.
 *
 * @example
 * ```typescript
 * import * as transformers from '@huggingface/transformers';
 * import { createTransformersEmbedding } from 'scrapex/embeddings';
 *
 * const provider = createTransformersEmbedding(transformers, {
 *   model: 'Xenova/all-MiniLM-L6-v2',
 * });
 * ```
 *
 * Required Node.js dependencies:
 * ```
 * npm install @huggingface/transformers onnxruntime-node
 * ```
 */
function createTransformersEmbedding(transformers, options) {
	let pipeline = null;
	let currentModel = null;
	const config = {
		model: options?.model ?? "Xenova/all-MiniLM-L6-v2",
		quantized: options?.quantized ?? true,
		pooling: options?.pooling ?? "mean",
		normalize: options?.normalize ?? true
	};
	return {
		name: "transformers",
		/**
		 * Embed each text sequentially through the cached pipeline.
		 * @param {string[]} texts - Texts to embed.
		 * @param {object} [request] - Optional per-call overrides (e.g. `model`).
		 * @returns {Promise<{embeddings: number[][]}>}
		 */
		async embed(texts, request) {
			// FIX: original read `request.model` and threw a TypeError when
			// embed() was called without a request object; `request` is now optional.
			const model = request?.model || config.model;
			if (!pipeline || currentModel !== model) {
				const cacheDir = options?.cacheDir;
				const env = transformers.env;
				const priorCacheDir = env?.cacheDir;
				// Temporarily point the module's cache at the user's directory.
				if (cacheDir && env) env.cacheDir = cacheDir;
				try {
					pipeline = await transformers.pipeline("feature-extraction", model, { quantized: config.quantized });
				} finally {
					// Always restore the prior cacheDir, even if pipeline creation failed.
					if (cacheDir && env) {
						if (priorCacheDir === void 0) delete env.cacheDir;
						else env.cacheDir = priorCacheDir;
					}
				}
				currentModel = model;
			}
			const embeddings = [];
			for (const text of texts) {
				const output = await pipeline(text, {
					pooling: config.pooling,
					normalize: config.normalize
				});
				// `output.data` is a typed array; convert to a plain number[].
				embeddings.push(Array.from(output.data));
			}
			return { embeddings };
		}
	};
}
882
/**
 * Recommended model identifiers for the Transformers.js provider; pass one as
 * `options.model` to createTransformersEmbedding().
 */
const TRANSFORMERS_MODELS = {
	// Same model as the factory's built-in fallback.
	DEFAULT: "Xenova/all-MiniLM-L6-v2",
	// Presumably a larger, higher-quality (slower) model — name suggests mpnet-base; confirm trade-off.
	QUALITY: "Xenova/all-mpnet-base-v2",
	// BGE variant oriented toward retrieval/search workloads.
	RETRIEVAL: "Xenova/bge-small-en-v1.5",
	// Multilingual E5 variant for non-English text.
	MULTILINGUAL: "Xenova/multilingual-e5-small"
};
889
+
890
+ //#endregion
891
+ //#region src/embeddings/providers/index.ts
892
/**
 * Main factory: turn a provider configuration into an EmbeddingProvider.
 *
 * @param {object} config - `{ type: 'http', config }` or `{ type: 'custom', provider }`.
 * @returns {object} The constructed (or passed-through) provider.
 * @throws {ScrapeError} VALIDATION_ERROR for any unrecognized `type`.
 */
function createEmbeddingProvider(config) {
	if (config.type === "http") return createHttpEmbedding(config.config);
	// Custom providers are supplied ready-made by the caller.
	if (config.type === "custom") return config.provider;
	throw new ScrapeError(`Unknown embedding provider type: ${config.type}`, "VALIDATION_ERROR");
}
903
/**
 * Type guard: does `value` satisfy the EmbeddingProvider shape
 * (a string `name` and a callable `embed`)?
 *
 * @param {unknown} value - Candidate to inspect.
 * @returns {boolean} True when the value looks like a provider.
 */
function isEmbeddingProvider(value) {
	if (typeof value !== "object" || value === null) return false;
	const candidate = value;
	return typeof candidate.name === "string" && typeof candidate.embed === "function";
}
909
+
910
+ //#endregion
911
//#region src/embeddings/safety.ts
/**
 * PII redaction patterns. NOTE(review): the final credit-card alternative
 * (`[0-9]{13,19}`) matches ANY 13-19 digit run, which is broad — kept as-is
 * to preserve behavior, but it will redact non-card numbers of that length.
 */
const EMAIL_PATTERN = /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b/g;
const PHONE_PATTERN = /(?:\+?1[-.\s]?)?\(?[0-9]{3}\)?[-.\s]?[0-9]{3}[-.\s]?[0-9]{4}\b/g;
const CREDIT_CARD_PATTERN = /\b(?:4[0-9]{12}(?:[0-9]{3})?|5[1-5][0-9]{14}|3[47][0-9]{13}|6(?:011|5[0-9]{2})[0-9]{12}|(?:[0-9]{4}[-\s]){3}[0-9]{4}|[0-9]{13,19})\b/g;
const SSN_PATTERN = /\b[0-9]{3}-[0-9]{2}-[0-9]{4}\b/g;
const IPV4_PATTERN = /\b(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b/g;
const REDACTED = "[REDACTED]";
/**
 * Build a redaction function from a PII config. Each enabled pattern is
 * applied in a fixed order (credit card first, so long digit runs are not
 * partially consumed by the phone pattern).
 *
 * @param {object} config - Per-category booleans plus optional `customPatterns` (RegExp[]).
 * @returns {(text: string) => {text, redacted, redactionCount, redactionsByType}}
 */
function createPiiRedactor(config) {
	const patterns = [];
	// Order matters: credit card before phone, matching the original behavior.
	const builtIns = [
		["creditCard", CREDIT_CARD_PATTERN, config.creditCard],
		["email", EMAIL_PATTERN, config.email],
		["phone", PHONE_PATTERN, config.phone],
		["ssn", SSN_PATTERN, config.ssn],
		["ipAddress", IPV4_PATTERN, config.ipAddress]
	];
	for (const [name, pattern, enabled] of builtIns) {
		if (enabled) patterns.push({ name, pattern });
	}
	(config.customPatterns ?? []).forEach((customPattern, i) => {
		if (customPattern) patterns.push({ name: `custom_${i}`, pattern: customPattern });
	});
	return (text) => {
		let redactedText = text;
		let redactionCount = 0;
		const redactionsByType = {};
		for (const { name, pattern } of patterns) {
			// Shared /g regexes are stateful; reset before each use.
			pattern.lastIndex = 0;
			// Matches are counted against the ORIGINAL text, then replaced in
			// the accumulating result — same semantics as the original code.
			const hits = text.match(pattern)?.length ?? 0;
			if (hits === 0) continue;
			redactedText = redactedText.replace(pattern, REDACTED);
			redactionCount += hits;
			redactionsByType[name] = (redactionsByType[name] ?? 0) + hits;
		}
		return {
			text: redactedText,
			redacted: redactionCount > 0,
			redactionCount,
			redactionsByType
		};
	};
}
976
/**
 * Convenience wrapper: redact with every built-in PII category enabled.
 * Use createPiiRedactor() directly for fine-grained control.
 *
 * @param {string} text - Text to scrub.
 * @returns {object} The redaction result from the underlying redactor.
 */
function redactPii(text) {
	const everyCategory = {
		email: true,
		phone: true,
		creditCard: true,
		ssn: true,
		ipAddress: true
	};
	const redact = createPiiRedactor(everyCategory);
	return redact(text);
}
989
/**
 * Check whether text matches any PII pattern. Categories default to enabled
 * unless explicitly set to false in `config`.
 * Useful for validation before sending text to external APIs.
 *
 * @param {string} text - Text to inspect.
 * @param {object} [config] - Per-category booleans and optional `customPatterns`.
 * @returns {boolean} True if any enabled pattern matches.
 */
function containsPii(text, config) {
	// Every built-in category is on by default.
	const enabled = (flag) => flag ?? true;
	const patterns = [];
	if (enabled(config?.email)) patterns.push(EMAIL_PATTERN);
	if (enabled(config?.phone)) patterns.push(PHONE_PATTERN);
	if (enabled(config?.creditCard)) patterns.push(CREDIT_CARD_PATTERN);
	if (enabled(config?.ssn)) patterns.push(SSN_PATTERN);
	if (enabled(config?.ipAddress)) patterns.push(IPV4_PATTERN);
	for (const custom of config?.customPatterns ?? []) patterns.push(custom);
	return patterns.some((pattern) => {
		// /g regexes carry state across calls; reset before testing.
		pattern.lastIndex = 0;
		return pattern.test(text);
	});
}
1015
+
1016
+ //#endregion
1017
+ //#region src/embeddings/pipeline.ts
1018
// Fallback chunk size when chunking options omit `size`. Shares units with
// safety.maxTokens (applyMaxTokensToChunking clamps `size` against it).
const DEFAULT_CHUNK_SIZE = 500;
1019
/**
 * Resolve the model to embed with. An explicitly requested model wins;
 * otherwise an http provider's configured model is used; otherwise undefined.
 *
 * @param {object} providerConfig - Provider configuration (`type`-discriminated).
 * @param {string} [explicitModel] - Model explicitly passed in options.
 * @returns {string|undefined} The effective model name, if any.
 */
function getEffectiveModel(providerConfig, explicitModel) {
	if (explicitModel) return explicitModel;
	return providerConfig.type === "http" ? providerConfig.config.model : void 0;
}
1027
/**
 * Generate embeddings for scraped data — the main entry point of the pipeline.
 *
 * Flow: select + validate input → optional PII redaction → cache lookup →
 * chunk → embed each chunk (with rate limiting / circuit breaking / retries)
 * → aggregate vectors → cache and return.
 *
 * Never throws for ordinary failures: errors are converted to a "skipped"
 * result. Only ScrapeError with code INVALID_URL or BLOCKED is rethrown.
 *
 * @param {object} data - Scraped data to embed.
 * @param {object} options - provider, model, input, chunking, safety, cache,
 *   resilience, output, onChunk, onMetrics.
 * @returns {Promise<object>} A success result (vector or vectors + source
 *   metadata) or a skipped result with a reason.
 */
async function generateEmbeddings(data, options) {
	const startTime = Date.now();
	try {
		const provider = createEmbeddingProvider(options.provider);
		const model = getEffectiveModel(options.provider, options.model);
		// Reject inputs shorter than minTextLength (default 10).
		const validation = validateInput(selectInput(data, options.input), options.safety?.minTextLength ?? 10);
		if (!validation.valid) return createSkippedResult(validation.reason, { model });
		// Keep the unredacted text: onChunk may receive it when
		// safety.allowSensitiveCallbacks is set (see callbackChunks below).
		const originalInput = validation.text;
		let inputText = validation.text;
		let piiRedacted = false;
		if (options.safety?.piiRedaction) {
			const redactionResult = createPiiRedactor(options.safety.piiRedaction)(inputText);
			inputText = redactionResult.text;
			piiRedacted = redactionResult.redacted;
		}
		// safety.maxTokens clamps the chunk size/overlap before chunking.
		const effectiveChunking = applyMaxTokensToChunking(options.chunking, options.safety?.maxTokens);
		// Cache key covers provider, model, output shape, chunking, safety and
		// the (already redacted) content, so any change busts the cache.
		const cacheKey = generateCacheKey({
			providerKey: getProviderCacheKey(options.provider),
			model,
			dimensions: options.output?.dimensions,
			aggregation: options.output?.aggregation,
			input: options.input,
			chunking: effectiveChunking,
			safety: options.safety,
			cacheKeySalt: options.cache?.cacheKeySalt,
			content: inputText
		});
		const cache = options.cache?.store ?? getDefaultCache();
		const cachedResult = await cache.get(cacheKey);
		// Cache hit: report metrics (with cached=true, retries=0) and return a
		// copy whose source is marked cached.
		if (cachedResult && cachedResult.status === "success") {
			if (options.onMetrics) options.onMetrics({
				provider: provider.name,
				model,
				inputTokens: estimateTokens(inputText),
				outputDimensions: getDimensions(cachedResult.aggregation === "all" ? cachedResult.vectors : cachedResult.vector),
				chunks: cachedResult.source.chunks,
				latencyMs: Date.now() - startTime,
				cached: true,
				retries: 0,
				piiRedacted
			});
			return {
				...cachedResult,
				source: {
					...cachedResult.source,
					cached: true
				}
			};
		}
		const chunks = chunkText(inputText, effectiveChunking);
		// Chunk the ORIGINAL text with the same settings so onChunk can see
		// unredacted text when explicitly allowed; assumes both chunkings align
		// index-for-index (same settings, near-same text) — TODO confirm.
		const callbackChunks = options.onChunk && options.safety?.allowSensitiveCallbacks ? chunkText(originalInput, effectiveChunking) : null;
		if (chunks.length === 0) return createSkippedResult("No content after chunking", { model });
		// Resilience primitives may be shared across calls via resilience.state.
		const sharedState = options.resilience?.state;
		const rateLimiter = sharedState?.rateLimiter ?? (options.resilience?.rateLimit ? new RateLimiter(options.resilience.rateLimit) : null);
		const circuitBreaker = sharedState?.circuitBreaker ?? (options.resilience?.circuitBreaker ? new CircuitBreaker(options.resilience.circuitBreaker) : null);
		const concurrency = options.resilience?.concurrency ?? 1;
		const semaphore = sharedState?.semaphore ?? new Semaphore(concurrency);
		const embeddings = [];
		let totalTokens = 0;
		let retryCount = 0;
		// NOTE(review): each semaphore.execute is awaited before the next loop
		// iteration, so chunks are processed sequentially within this call;
		// concurrency > 1 only matters for a semaphore shared across calls.
		for (let i = 0; i < chunks.length; i++) {
			const chunk = chunks[i];
			if (!chunk) continue;
			if (rateLimiter) await rateLimiter.acquire();
			// An open breaker aborts the remaining chunks; `chunks: i` records
			// how many chunks completed before the abort.
			if (circuitBreaker?.isOpen()) return createSkippedResult("Circuit breaker is open", {
				model,
				chunks: i
			});
			await semaphore.execute(async () => {
				// Rate limiter and semaphore are handled in this loop, so only
				// the circuit breaker is delegated to withResilience.
				const { result: result$1 } = await withResilience(async (signal) => {
					return provider.embed([chunk.text], {
						model,
						dimensions: options.output?.dimensions,
						signal
					});
				}, options.resilience, {
					circuitBreaker: circuitBreaker ?? void 0,
					rateLimiter: void 0,
					semaphore: void 0
				}, { onRetry: () => {
					retryCount++;
				} });
				// Prefer provider-reported usage; fall back to the chunk's estimate.
				if (result$1.usage) totalTokens += result$1.usage.totalTokens;
				else totalTokens += chunk.tokens;
				const embedding = result$1.embeddings[0];
				if (embedding) {
					embeddings.push(embedding);
					if (options.onChunk) {
						// Redacted text by default; unredacted only when allowed above.
						const callbackText = callbackChunks?.[i]?.text ?? chunk.text;
						options.onChunk(callbackText, embedding);
					}
				}
			});
		}
		const aggregation = options.output?.aggregation ?? "average";
		const aggregated = aggregateVectors(embeddings, aggregation);
		const source = {
			model,
			chunks: chunks.length,
			tokens: totalTokens || estimateTokens(inputText),
			checksum: generateChecksum(inputText),
			cached: false,
			latencyMs: Date.now() - startTime
		};
		let result;
		// "single" aggregation yields one vector; otherwise all chunk vectors.
		if (aggregated.type === "single") result = {
			status: "success",
			aggregation,
			vector: aggregated.vector,
			source
		};
		else result = {
			status: "success",
			aggregation: "all",
			vectors: aggregated.vectors,
			source
		};
		await cache.set(cacheKey, result, { ttlMs: options.cache?.ttlMs });
		if (options.onMetrics) {
			const metrics = {
				provider: provider.name,
				model,
				inputTokens: source.tokens,
				outputDimensions: aggregated.dimensions,
				chunks: chunks.length,
				latencyMs: source.latencyMs,
				cached: false,
				retries: retryCount,
				piiRedacted
			};
			options.onMetrics(metrics);
		}
		return result;
	} catch (error) {
		const reason = error instanceof Error ? error.message : String(error);
		// Hard failures propagate; everything else degrades to "skipped".
		if (error instanceof ScrapeError && ["INVALID_URL", "BLOCKED"].includes(error.code)) throw error;
		return createSkippedResult(reason, { latencyMs: Date.now() - startTime });
	}
}
1170
/**
 * Clamp chunking settings so no chunk exceeds `maxTokens`.
 * Returns the config untouched when maxTokens is absent or non-positive;
 * overlap is additionally capped below the clamped size.
 *
 * @param {object} [chunking] - Chunking options (size, overlap, ...).
 * @param {number} [maxTokens] - Upper bound on chunk size.
 * @returns {object|undefined} Clamped chunking options (or the input as-is).
 */
function applyMaxTokensToChunking(chunking, maxTokens) {
	if (!maxTokens || maxTokens <= 0) return chunking;
	const size = Math.min(chunking?.size ?? DEFAULT_CHUNK_SIZE, maxTokens);
	// Overlap must stay strictly below the chunk size (and never negative).
	const overlapCap = Math.max(0, size - 1);
	const overlap = Math.min(chunking?.overlap ?? 50, overlapCap);
	return {
		...chunking,
		size,
		overlap
	};
}
1182
/**
 * Embed arbitrary text directly, outside of scrape().
 * Wraps the text as { textContent } and forces input.type accordingly,
 * then delegates to generateEmbeddings().
 *
 * @param {string} text - Raw text to embed.
 * @param {object} options - Same options as generateEmbeddings().
 * @returns {Promise<object>} The embedding result.
 */
async function embed(text, options) {
	const input = {
		...options.input,
		type: "textContent"
	};
	return generateEmbeddings({ textContent: text }, {
		...options,
		input
	});
}
1195
/**
 * Embed from existing ScrapedData.
 * Useful when you've already scraped and want to add embeddings later.
 * Thin alias over generateEmbeddings() with identical semantics.
 *
 * @param {object} data - Previously scraped data.
 * @param {object} options - Same options as generateEmbeddings().
 * @returns {Promise<object>} The embedding result.
 */
async function embedScrapedData(data, options) {
	return generateEmbeddings(data, options);
}
1202
/**
 * Build a "skipped" embedding result carrying the skip reason and whatever
 * partial source metadata is known at that point.
 *
 * @param {string} reason - Why embedding was skipped.
 * @param {object} [partialSource] - Partial source metadata; defaults to {}.
 * @returns {{status: "skipped", reason: string, source: object}}
 */
function createSkippedResult(reason, partialSource) {
	const source = partialSource ?? {};
	return {
		status: "skipped",
		reason,
		source
	};
}
1212
+
1213
+ //#endregion
1214
+ export { generateCacheKey as A, normalizeVector as B, createTokenizer as C, needsChunking as D, heuristicTokenCount as E, aggregateVectors as F, cosineSimilarity as I, dotProduct as L, getDefaultCache as M, resetDefaultCache as N, InMemoryEmbeddingCache as O, validateCachedResult as P, euclideanDistance as R, chunkText as S, getChunkingStats as T, createHttpEmbedding as _, createPiiRedactor as a, selectInput as b, isEmbeddingProvider as c, createCohereEmbedding as d, createHuggingFaceEmbedding as f, HttpEmbeddingProvider as g, createTransformersEmbedding as h, containsPii as i, generateChecksum as j, createNoOpCache as k, TRANSFORMERS_MODELS as l, createOpenAIEmbedding as m, embedScrapedData as n, redactPii as o, createOllamaEmbedding as p, generateEmbeddings as r, createEmbeddingProvider as s, embed as t, createAzureEmbedding as u, getDefaultModel as v, estimateTokens as w, validateInput as x, previewInput as y, getDimensions as z };
1215
+ //# sourceMappingURL=embeddings-Bsymy_jA.mjs.map