scrapex 1.0.0-alpha.1 → 1.0.0-beta.1

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between these versions as they appear in their respective public registries.
Files changed (50)
  1. package/README.md +164 -5
  2. package/dist/enhancer-ByjRD-t5.mjs +769 -0
  3. package/dist/enhancer-ByjRD-t5.mjs.map +1 -0
  4. package/dist/enhancer-j0xqKDJm.cjs +847 -0
  5. package/dist/enhancer-j0xqKDJm.cjs.map +1 -0
  6. package/dist/index-CDgcRnig.d.cts +268 -0
  7. package/dist/index-CDgcRnig.d.cts.map +1 -0
  8. package/dist/index-piS5wtki.d.mts +268 -0
  9. package/dist/index-piS5wtki.d.mts.map +1 -0
  10. package/dist/index.cjs +1192 -37
  11. package/dist/index.cjs.map +1 -1
  12. package/dist/index.d.cts +318 -2
  13. package/dist/index.d.cts.map +1 -1
  14. package/dist/index.d.mts +318 -2
  15. package/dist/index.d.mts.map +1 -1
  16. package/dist/index.mjs +1164 -6
  17. package/dist/index.mjs.map +1 -1
  18. package/dist/llm/index.cjs +250 -232
  19. package/dist/llm/index.cjs.map +1 -1
  20. package/dist/llm/index.d.cts +132 -85
  21. package/dist/llm/index.d.cts.map +1 -1
  22. package/dist/llm/index.d.mts +132 -85
  23. package/dist/llm/index.d.mts.map +1 -1
  24. package/dist/llm/index.mjs +243 -236
  25. package/dist/llm/index.mjs.map +1 -1
  26. package/dist/parsers/index.cjs +10 -199
  27. package/dist/parsers/index.d.cts +2 -133
  28. package/dist/parsers/index.d.mts +2 -133
  29. package/dist/parsers/index.mjs +2 -191
  30. package/dist/parsers-Bneuws8x.cjs +569 -0
  31. package/dist/parsers-Bneuws8x.cjs.map +1 -0
  32. package/dist/parsers-CwkYnyWY.mjs +482 -0
  33. package/dist/parsers-CwkYnyWY.mjs.map +1 -0
  34. package/dist/types-CadAXrme.d.mts +674 -0
  35. package/dist/types-CadAXrme.d.mts.map +1 -0
  36. package/dist/types-DPEtPihB.d.cts +674 -0
  37. package/dist/types-DPEtPihB.d.cts.map +1 -0
  38. package/package.json +15 -16
  39. package/dist/enhancer-Q6CSc1gA.mjs +0 -220
  40. package/dist/enhancer-Q6CSc1gA.mjs.map +0 -1
  41. package/dist/enhancer-oM4BhYYS.cjs +0 -268
  42. package/dist/enhancer-oM4BhYYS.cjs.map +0 -1
  43. package/dist/parsers/index.cjs.map +0 -1
  44. package/dist/parsers/index.d.cts.map +0 -1
  45. package/dist/parsers/index.d.mts.map +0 -1
  46. package/dist/parsers/index.mjs.map +0 -1
  47. package/dist/types-CNQZVW36.d.mts +0 -150
  48. package/dist/types-CNQZVW36.d.mts.map +0 -1
  49. package/dist/types-D0HYR95H.d.cts +0 -150
  50. package/dist/types-D0HYR95H.d.cts.map +0 -1
package/dist/index.cjs CHANGED
@@ -1,36 +1,11 @@
1
- //#region rolldown:runtime
2
- var __create = Object.create;
3
- var __defProp = Object.defineProperty;
4
- var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
5
- var __getOwnPropNames = Object.getOwnPropertyNames;
6
- var __getProtoOf = Object.getPrototypeOf;
7
- var __hasOwnProp = Object.prototype.hasOwnProperty;
8
- var __copyProps = (to, from, except, desc) => {
9
- if (from && typeof from === "object" || typeof from === "function") {
10
- for (var keys = __getOwnPropNames(from), i = 0, n = keys.length, key; i < n; i++) {
11
- key = keys[i];
12
- if (!__hasOwnProp.call(to, key) && key !== except) {
13
- __defProp(to, key, {
14
- get: ((k) => from[k]).bind(null, key),
15
- enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable
16
- });
17
- }
18
- }
19
- }
20
- return to;
21
- };
22
- var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", {
23
- value: mod,
24
- enumerable: true
25
- }) : target, mod));
26
-
27
- //#endregion
28
- const require_enhancer = require('./enhancer-oM4BhYYS.cjs');
1
+ const require_parsers = require('./parsers-Bneuws8x.cjs');
2
+ const require_enhancer = require('./enhancer-j0xqKDJm.cjs');
29
3
  let cheerio = require("cheerio");
30
- cheerio = __toESM(cheerio);
31
- let __mozilla_readability = require("@mozilla/readability");
4
+ cheerio = require_parsers.__toESM(cheerio);
5
+ let node_crypto = require("node:crypto");
6
+ let _mozilla_readability = require("@mozilla/readability");
32
7
  let turndown = require("turndown");
33
- turndown = __toESM(turndown);
8
+ turndown = require_parsers.__toESM(turndown);
34
9
 
35
10
  //#region src/core/context.ts
36
11
  let jsdomModule = null;
@@ -81,6 +56,1042 @@ function mergeResults(context, extracted) {
81
56
  };
82
57
  }
83
58
 
59
+ //#endregion
60
+ //#region src/embeddings/aggregation.ts
61
+ /**
62
+ * Aggregate multiple embedding vectors into a single vector or return all.
63
+ *
64
+ * @param vectors - Array of embedding vectors (must all have same dimensions)
65
+ * @param strategy - Aggregation strategy
66
+ * @returns Aggregated result based on strategy
67
+ */
68
+ function aggregateVectors(vectors, strategy = "average") {
69
+ if (vectors.length === 0) throw new Error("Cannot aggregate empty vector array");
70
+ const firstVector = vectors[0];
71
+ if (!firstVector) throw new Error("Cannot aggregate empty vector array");
72
+ const dimensions = firstVector.length;
73
+ for (let i = 1; i < vectors.length; i++) {
74
+ const vec = vectors[i];
75
+ if (!vec || vec.length !== dimensions) throw new Error(`Vector dimension mismatch: expected ${dimensions}, got ${vec?.length ?? 0} at index ${i}`);
76
+ }
77
+ switch (strategy) {
78
+ case "average": return {
79
+ type: "single",
80
+ vector: averageVectors(vectors),
81
+ dimensions
82
+ };
83
+ case "max": return {
84
+ type: "single",
85
+ vector: maxPoolVectors(vectors),
86
+ dimensions
87
+ };
88
+ case "first": return {
89
+ type: "single",
90
+ vector: firstVector,
91
+ dimensions
92
+ };
93
+ case "all": return {
94
+ type: "multiple",
95
+ vectors,
96
+ dimensions
97
+ };
98
+ default: {
99
+ const _exhaustive = strategy;
100
+ throw new Error(`Unknown aggregation strategy: ${_exhaustive}`);
101
+ }
102
+ }
103
+ }
104
+ /**
105
+ * Compute element-wise average of vectors.
106
+ */
107
+ function averageVectors(vectors) {
108
+ const first = vectors[0];
109
+ if (!first || vectors.length === 1) return first ?? [];
110
+ const dimensions = first.length;
111
+ const count = vectors.length;
112
+ const result = new Array(dimensions).fill(0);
113
+ for (const vector of vectors) for (let i = 0; i < dimensions; i++) {
114
+ const val = result[i];
115
+ if (val !== void 0) result[i] = val + (vector[i] ?? 0);
116
+ }
117
+ for (let i = 0; i < dimensions; i++) {
118
+ const val = result[i];
119
+ if (val !== void 0) result[i] = val / count;
120
+ }
121
+ return result;
122
+ }
123
+ /**
124
+ * Compute element-wise maximum of vectors (max pooling).
125
+ */
126
+ function maxPoolVectors(vectors) {
127
+ const first = vectors[0];
128
+ if (!first || vectors.length === 1) return first ?? [];
129
+ const dimensions = first.length;
130
+ const result = [...first];
131
+ for (let v = 1; v < vectors.length; v++) {
132
+ const vec = vectors[v];
133
+ if (!vec) continue;
134
+ for (let i = 0; i < dimensions; i++) {
135
+ const val = vec[i] ?? 0;
136
+ if (val > (result[i] ?? 0)) result[i] = val;
137
+ }
138
+ }
139
+ return result;
140
+ }
141
+ /**
142
+ * Compute cosine similarity between two vectors.
143
+ * Both vectors should be normalized for accurate results.
144
+ */
145
+ function cosineSimilarity(a, b) {
146
+ if (a.length !== b.length) throw new Error(`Vector dimension mismatch: ${a.length} vs ${b.length}`);
147
+ let dot = 0;
148
+ let magnitudeA = 0;
149
+ let magnitudeB = 0;
150
+ for (let i = 0; i < a.length; i++) {
151
+ const aVal = a[i] ?? 0;
152
+ const bVal = b[i] ?? 0;
153
+ dot += aVal * bVal;
154
+ magnitudeA += aVal * aVal;
155
+ magnitudeB += bVal * bVal;
156
+ }
157
+ const magnitude = Math.sqrt(magnitudeA) * Math.sqrt(magnitudeB);
158
+ if (magnitude === 0) return 0;
159
+ return dot / magnitude;
160
+ }
161
+ /**
162
+ * Get the dimensions of a vector or set of vectors.
163
+ */
164
+ function getDimensions(vectors) {
165
+ if (vectors.length === 0) return 0;
166
+ const first = vectors[0];
167
+ if (typeof first === "number") return vectors.length;
168
+ return first?.length ?? 0;
169
+ }
170
+
171
+ //#endregion
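As a usage sketch (imports assume the package root, matching the export list at the bottom of this bundle): pool per-chunk vectors, then score against a query vector.

```ts
import { aggregateVectors, cosineSimilarity } from 'scrapex';

// Two per-chunk embeddings pooled into one document vector.
const pooled = aggregateVectors([[1, 0], [0, 1]], 'average');

if (pooled.type === 'single') {
  // pooled.vector is [0.5, 0.5]; compare it against a query embedding.
  const score = cosineSimilarity(pooled.vector, [1, 0]); // ≈ 0.7071
  console.log(score, pooled.dimensions); // dimensions === 2
}
```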
172
+ //#region src/embeddings/cache.ts
173
+ /**
174
+ * Default maximum cache entries.
175
+ */
176
+ const DEFAULT_MAX_ENTRIES = 1e3;
177
+ /**
178
+ * Default TTL in milliseconds (1 hour).
179
+ */
180
+ const DEFAULT_TTL_MS = 3600 * 1e3;
181
+ /**
182
+ * Generate a content-addressable cache key.
183
+ * Key is based on content hash and embedding configuration.
184
+ * Note: custom RegExp patterns are serialized by source+flags; different
185
+ * constructions can yield different cache keys even if equivalent.
186
+ */
187
+ function generateCacheKey(params) {
188
+ const hash = (0, node_crypto.createHash)("sha256");
189
+ const fingerprint = stableStringify({
190
+ providerKey: params.providerKey,
191
+ model: params.model ?? "provider-default",
192
+ dimensions: params.dimensions ?? "default",
193
+ aggregation: params.aggregation ?? "average",
194
+ input: serializeInputConfig(params.input),
195
+ chunking: serializeChunkingConfig(params.chunking),
196
+ safety: serializeSafetyConfig(params.safety),
197
+ cacheKeySalt: params.cacheKeySalt
198
+ });
199
+ hash.update(fingerprint);
200
+ hash.update("\0");
201
+ hash.update(params.content);
202
+ return hash.digest("hex");
203
+ }
204
+ /**
205
+ * Generate a checksum for content verification.
206
+ */
207
+ function generateChecksum(content) {
208
+ return (0, node_crypto.createHash)("sha256").update(content).digest("hex").slice(0, 16);
209
+ }
210
+ function serializeInputConfig(config) {
211
+ if (!config) return void 0;
212
+ return normalizeObject({
213
+ type: config.type ?? "textContent",
214
+ hasTransform: Boolean(config.transform),
215
+ hasCustomText: Boolean(config.customText)
216
+ });
217
+ }
218
+ function serializeChunkingConfig(config) {
219
+ if (!config) return void 0;
220
+ return normalizeObject({
221
+ size: config.size,
222
+ overlap: config.overlap,
223
+ tokenizer: getTokenizerId(config.tokenizer),
224
+ maxInputLength: config.maxInputLength
225
+ });
226
+ }
227
+ function serializeSafetyConfig(config) {
228
+ if (!config) return void 0;
229
+ return normalizeObject({
230
+ piiRedaction: serializePiiConfig(config.piiRedaction),
231
+ minTextLength: config.minTextLength,
232
+ maxTokens: config.maxTokens
233
+ });
234
+ }
235
+ function serializePiiConfig(config) {
236
+ if (!config) return void 0;
237
+ return normalizeObject({
238
+ email: config.email ?? false,
239
+ phone: config.phone ?? false,
240
+ creditCard: config.creditCard ?? false,
241
+ ssn: config.ssn ?? false,
242
+ ipAddress: config.ipAddress ?? false,
243
+ customPatterns: config.customPatterns?.map((pattern) => `${pattern.source}/${pattern.flags}`)
244
+ });
245
+ }
246
+ function getTokenizerId(tokenizer) {
247
+ if (!tokenizer || tokenizer === "heuristic") return "heuristic";
248
+ if (tokenizer === "tiktoken") return "tiktoken";
249
+ return "custom";
250
+ }
251
+ function stableStringify(value) {
252
+ return stringifyNormalized(normalizeValue(value));
253
+ }
254
+ function normalizeValue(value) {
255
+ if (value === void 0) return void 0;
256
+ if (value === null) return null;
257
+ if (Array.isArray(value)) return value.map((entry) => normalizeValue(entry)).filter((entry) => entry !== void 0);
258
+ if (typeof value === "object") return normalizeObject(value);
259
+ return value;
260
+ }
261
+ function normalizeObject(value) {
262
+ const normalized = {};
263
+ for (const key of Object.keys(value).sort()) {
264
+ const entry = normalizeValue(value[key]);
265
+ if (entry !== void 0) normalized[key] = entry;
266
+ }
267
+ return normalized;
268
+ }
269
+ function stringifyNormalized(value) {
270
+ if (value === void 0) return "undefined";
271
+ if (value === null) return "null";
272
+ if (typeof value === "string") return JSON.stringify(value);
273
+ if (typeof value === "number" || typeof value === "boolean") return String(value);
274
+ if (Array.isArray(value)) return `[${value.map((entry) => stringifyNormalized(entry)).join(",")}]`;
275
+ if (typeof value === "object") {
276
+ const obj = value;
277
+ return `{${Object.keys(obj).sort().map((key) => `${JSON.stringify(key)}:${stringifyNormalized(obj[key])}`).join(",")}}`;
278
+ }
279
+ return JSON.stringify(value);
280
+ }
281
+ /**
282
+ * In-memory LRU cache with TTL support.
283
+ * Content-addressable: uses content hash as key, not URL.
284
+ */
285
+ var InMemoryEmbeddingCache = class {
286
+ cache;
287
+ maxEntries;
288
+ defaultTtlMs;
289
+ constructor(options) {
290
+ this.cache = /* @__PURE__ */ new Map();
291
+ this.maxEntries = options?.maxEntries ?? DEFAULT_MAX_ENTRIES;
292
+ this.defaultTtlMs = options?.ttlMs ?? DEFAULT_TTL_MS;
293
+ }
294
+ async get(key) {
295
+ const entry = this.cache.get(key);
296
+ if (!entry) return;
297
+ const now = Date.now();
298
+ if (now > entry.expiresAt) {
299
+ this.cache.delete(key);
300
+ return;
301
+ }
302
+ entry.accessedAt = now;
303
+ return entry.value;
304
+ }
305
+ async set(key, value, options) {
306
+ const now = Date.now();
307
+ const ttl = options?.ttlMs ?? this.defaultTtlMs;
308
+ if (this.cache.size >= this.maxEntries && !this.cache.has(key)) this.evictLRU();
309
+ this.cache.set(key, {
310
+ value,
311
+ createdAt: now,
312
+ expiresAt: now + ttl,
313
+ accessedAt: now
314
+ });
315
+ }
316
+ async delete(key) {
317
+ return this.cache.delete(key);
318
+ }
319
+ async clear() {
320
+ this.cache.clear();
321
+ }
322
+ /**
323
+ * Get cache statistics.
324
+ */
325
+ getStats() {
326
+ const now = Date.now();
327
+ let expired = 0;
328
+ for (const entry of this.cache.values()) if (now > entry.expiresAt) expired++;
329
+ return {
330
+ size: this.cache.size,
331
+ maxEntries: this.maxEntries,
332
+ expired,
333
+ utilization: this.cache.size / this.maxEntries
334
+ };
335
+ }
336
+ /**
337
+ * Evict expired entries.
338
+ */
339
+ cleanup() {
340
+ const now = Date.now();
341
+ let evicted = 0;
342
+ for (const [key, entry] of this.cache.entries()) if (now > entry.expiresAt) {
343
+ this.cache.delete(key);
344
+ evicted++;
345
+ }
346
+ return evicted;
347
+ }
348
+ /**
349
+ * Evict least recently used entry.
350
+ */
351
+ evictLRU() {
352
+ let oldestKey = null;
353
+ let oldestAccess = Number.POSITIVE_INFINITY;
354
+ for (const [key, entry] of this.cache.entries()) if (entry.accessedAt < oldestAccess) {
355
+ oldestAccess = entry.accessedAt;
356
+ oldestKey = key;
357
+ }
358
+ if (oldestKey) this.cache.delete(oldestKey);
359
+ }
360
+ };
361
+ /**
362
+ * Default in-memory cache instance.
363
+ * Optimized for moderate cache sizes (default 1000 entries).
364
+ */
365
+ let defaultCache = null;
366
+ /**
367
+ * Get or create the default cache instance.
368
+ */
369
+ function getDefaultCache() {
370
+ if (!defaultCache) defaultCache = new InMemoryEmbeddingCache();
371
+ return defaultCache;
372
+ }
373
+
374
+ //#endregion
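A minimal sketch of driving the cache directly; the pipeline further down accepts the same object via `cache: { store }`.

```ts
import { InMemoryEmbeddingCache } from 'scrapex';

// Bounded LRU with a 10-minute TTL. Keys are content hashes rather
// than URLs, so identical content re-uses its cached embedding.
const store = new InMemoryEmbeddingCache({ maxEntries: 500, ttlMs: 10 * 60 * 1000 });

console.log(store.getStats()); // { size, maxEntries, expired, utilization }

// Evict expired entries eagerly instead of waiting for reads.
const evicted = store.cleanup();
```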
375
+ //#region src/embeddings/chunking.ts
376
+ /**
377
+ * Default chunk size in tokens.
378
+ */
379
+ const DEFAULT_CHUNK_SIZE$1 = 500;
380
+ /**
381
+ * Default overlap in tokens.
382
+ */
383
+ const DEFAULT_OVERLAP = 50;
384
+ /**
385
+ * Default maximum input length in characters.
386
+ */
387
+ const DEFAULT_MAX_INPUT_LENGTH = 1e5;
388
+ /**
389
+ * Heuristic token counting: approximately 4 characters per token.
390
+ * This is a reasonable approximation for English text.
391
+ */
392
+ function heuristicTokenCount(text) {
393
+ return Math.ceil(text.length / 4);
394
+ }
395
+ /**
396
+ * Convert token count to approximate character count.
397
+ */
398
+ function tokensToChars(tokens) {
399
+ return tokens * 4;
400
+ }
401
+ /**
402
+ * Create a tokenizer function based on configuration.
403
+ */
404
+ function createTokenizer(config) {
405
+ if (!config || config === "heuristic") return heuristicTokenCount;
406
+ if (config === "tiktoken") return heuristicTokenCount;
407
+ return config;
408
+ }
409
+ /**
410
+ * Find a natural break point in text (sentence or word boundary).
411
+ * Prefers common sentence boundaries (Latin + CJK), falls back to word boundaries.
412
+ */
413
+ function findBreakPoint(text, targetIndex) {
414
+ const searchStart = Math.max(0, targetIndex - Math.floor(targetIndex * .2));
415
+ const searchEnd = Math.min(text.length, targetIndex + Math.floor(targetIndex * .2));
416
+ const searchText = text.slice(searchStart, searchEnd);
417
+ const sentenceMatch = /[.!?。!?]\s*/g;
418
+ let lastSentenceEnd = -1;
419
+ for (const match of searchText.matchAll(sentenceMatch)) {
420
+ const absolutePos = searchStart + match.index + match[0].length;
421
+ if (absolutePos <= targetIndex) lastSentenceEnd = absolutePos;
422
+ }
423
+ if (lastSentenceEnd !== -1) return lastSentenceEnd;
424
+ const wordBoundary = text.lastIndexOf(" ", targetIndex);
425
+ if (wordBoundary > searchStart) return wordBoundary + 1;
426
+ return targetIndex;
427
+ }
428
+ /**
429
+ * Split text into overlapping chunks optimized for embedding.
430
+ * Respects sentence boundaries when possible.
431
+ */
432
+ function chunkText(text, config) {
433
+ const chunkSize = config?.size ?? DEFAULT_CHUNK_SIZE$1;
434
+ const rawOverlap = config?.overlap ?? DEFAULT_OVERLAP;
435
+ const safeOverlap = Math.max(0, rawOverlap);
436
+ const overlap = Math.min(safeOverlap, Math.max(0, chunkSize - 1));
437
+ const maxInputLength = config?.maxInputLength ?? DEFAULT_MAX_INPUT_LENGTH;
438
+ const tokenizer = createTokenizer(config?.tokenizer);
439
+ const normalizedText = (text.length > maxInputLength ? text.slice(0, maxInputLength) : text).replace(/\s+/g, " ").trim();
440
+ if (!normalizedText) return [];
441
+ const totalTokens = tokenizer(normalizedText);
442
+ if (totalTokens <= chunkSize) return [{
443
+ text: normalizedText,
444
+ startIndex: 0,
445
+ endIndex: normalizedText.length,
446
+ tokens: totalTokens
447
+ }];
448
+ const chunks = [];
449
+ const chunkSizeChars = tokensToChars(chunkSize);
450
+ const overlapChars = tokensToChars(overlap);
451
+ let startIndex = 0;
452
+ while (startIndex < normalizedText.length) {
453
+ const targetEnd = Math.min(startIndex + chunkSizeChars, normalizedText.length);
454
+ const endIndex = targetEnd < normalizedText.length ? findBreakPoint(normalizedText, targetEnd) : targetEnd;
455
+ const chunkText$1 = normalizedText.slice(startIndex, endIndex).trim();
456
+ if (chunkText$1) chunks.push({
457
+ text: chunkText$1,
458
+ startIndex,
459
+ endIndex,
460
+ tokens: tokenizer(chunkText$1)
461
+ });
462
+ if (endIndex >= normalizedText.length) break;
463
+ const nextStart = endIndex - overlapChars;
464
+ startIndex = Math.max(nextStart, startIndex + 1);
465
+ if (startIndex < normalizedText.length) {
466
+ const spaceIndex = normalizedText.indexOf(" ", startIndex);
467
+ if (spaceIndex !== -1 && spaceIndex < startIndex + overlapChars) startIndex = spaceIndex + 1;
468
+ }
469
+ }
470
+ return chunks;
471
+ }
472
+ /**
473
+ * Estimate total tokens for a text without chunking.
474
+ */
475
+ function estimateTokens(text, tokenizer) {
476
+ return createTokenizer(tokenizer)(text);
477
+ }
478
+
479
+ //#endregion
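A chunking sketch; `article` is a stand-in for any long input text.

```ts
import { chunkText, estimateTokens } from 'scrapex';

declare const article: string; // placeholder for real input

// ~200-token chunks with a 20-token overlap. The default heuristic
// tokenizer assumes roughly 4 characters per token.
const chunks = chunkText(article, { size: 200, overlap: 20 });

for (const chunk of chunks) {
  // Each chunk records its character range and estimated token count.
  console.log(chunk.startIndex, chunk.endIndex, chunk.tokens);
}

console.log(estimateTokens(article)); // heuristic total
```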
480
+ //#region src/embeddings/input.ts
481
+ /**
482
+ * Select and prepare input text for embedding based on configuration.
483
+ *
484
+ * @param data - Scraped data to extract input from
485
+ * @param config - Input configuration
486
+ * @returns Selected and prepared text, or undefined if no valid input
487
+ */
488
+ function selectInput(data, config) {
489
+ if (config?.transform) return normalizeText(config.transform(data));
490
+ if (config?.type === "custom" && config.customText) return normalizeText(config.customText);
491
+ const type = config?.type ?? "textContent";
492
+ switch (type) {
493
+ case "textContent": return selectTextContent(data);
494
+ case "title+summary": return selectTitleSummary(data);
495
+ case "custom": return selectTextContent(data);
496
+ default: {
497
+ const _exhaustive = type;
498
+ throw new Error(`Unknown input type: ${_exhaustive}`);
499
+ }
500
+ }
501
+ }
502
+ /**
503
+ * Select textContent as input.
504
+ */
505
+ function selectTextContent(data) {
506
+ if (data.textContent) return normalizeText(data.textContent);
507
+ if (data.content) return normalizeText(stripMarkdown(data.content));
508
+ if (data.excerpt) return normalizeText(data.excerpt);
509
+ if (data.description) return normalizeText(data.description);
510
+ }
511
+ /**
512
+ * Select title + summary (or fallbacks) as input.
513
+ * Optimized for semantic search and classification.
514
+ */
515
+ function selectTitleSummary(data) {
516
+ const parts = [];
517
+ if (data.title) parts.push(data.title);
518
+ if (data.summary) parts.push(data.summary);
519
+ else if (data.excerpt) parts.push(data.excerpt);
520
+ else if (data.description) parts.push(data.description);
521
+ if (parts.length === 0) return;
522
+ return normalizeText(parts.join("\n\n"));
523
+ }
524
+ /**
525
+ * Normalize text for embedding:
526
+ * - Collapse whitespace
527
+ * - Trim leading/trailing whitespace
528
+ * - Remove control characters
529
+ */
530
+ function normalizeText(text) {
531
+ if (!text) return "";
532
+ return text.replace(/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/g, "").replace(/[ \t]+/g, " ").replace(/\n{3,}/g, "\n\n").split("\n").map((line) => line.trim()).join("\n").trim();
533
+ }
534
+ /**
535
+ * Basic markdown stripping for when we need plain text from content.
536
+ * Not comprehensive, but handles common cases.
537
+ */
538
+ function stripMarkdown(markdown) {
539
+ return markdown.replace(/```[\s\S]*?```/g, "").replace(/`[^`]+`/g, "").replace(/\[([^\]]+)\]\([^)]+\)/g, "$1").replace(/!\[([^\]]*)\]\([^)]+\)/g, "$1").replace(/^#{1,6}\s+/gm, "").replace(/\*\*([^*]+)\*\*/g, "$1").replace(/\*([^*]+)\*/g, "$1").replace(/__([^_]+)__/g, "$1").replace(/_([^_]+)_/g, "$1").replace(/^>\s+/gm, "").replace(/^[-*_]{3,}$/gm, "").replace(/^[\s]*[-*+]\s+/gm, "").replace(/^[\s]*\d+\.\s+/gm, "");
540
+ }
541
+ /**
542
+ * Check if the selected input meets minimum requirements.
543
+ */
544
+ function validateInput(text, minLength = 10) {
545
+ if (!text) return {
546
+ valid: false,
547
+ reason: "No input text available"
548
+ };
549
+ if (text.length < minLength) return {
550
+ valid: false,
551
+ reason: `Input too short (${text.length} < ${minLength} characters)`
552
+ };
553
+ const wordCount = text.split(/\s+/).filter((w) => w.length > 1).length;
554
+ if (wordCount < 3) return {
555
+ valid: false,
556
+ reason: `Input has too few words (${wordCount} < 3)`
557
+ };
558
+ return {
559
+ valid: true,
560
+ text,
561
+ wordCount,
562
+ charCount: text.length
563
+ };
564
+ }
565
+
566
+ //#endregion
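selectInput and validateInput are internal to the bundle; callers reach them through the `input` option. A sketch of the built-in selection modes via embedScrapedData (wrapping a preset as a `custom` provider config is one plausible wiring, not necessarily the only one):

```ts
import { embedScrapedData, createOllamaEmbedding } from 'scrapex';

declare const data: { title?: string; summary?: string; textContent?: string };

const provider = { type: 'custom', provider: createOllamaEmbedding() } as const;

// Default: embed textContent (falls back to content/excerpt/description).
await embedScrapedData(data, { provider });

// Compact form for search/classification: title plus summary.
await embedScrapedData(data, { provider, input: { type: 'title+summary' } });

// Or derive the text yourself; `transform` takes precedence over everything.
await embedScrapedData(data, {
  provider,
  input: { transform: (d) => `${d.title ?? ''}\n${d.textContent ?? ''}` },
});
```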
567
+ //#region src/embeddings/providers/base.ts
568
+ /**
569
+ * Generate a stable cache key identifier for provider configuration.
570
+ */
571
+ function getProviderCacheKey(config) {
572
+ switch (config.type) {
573
+ case "http": return `http:${config.config.baseUrl.replace(/\/$/, "")}:${config.config.model}`;
574
+ case "custom": return `custom:${config.provider.name}`;
575
+ default: {
576
+ const _exhaustive = config;
577
+ return String(_exhaustive);
578
+ }
579
+ }
580
+ }
581
+
582
+ //#endregion
583
+ //#region src/embeddings/providers/http.ts
584
+ /**
585
+ * HTTP-based Embedding Provider using native fetch.
586
+ * Provides a unified interface for any REST-based embedding API.
587
+ */
588
+ /**
589
+ * HTTP-based embedding provider.
590
+ * Works with any REST API using native fetch.
591
+ */
592
+ var HttpEmbeddingProvider = class extends require_enhancer.BaseHttpProvider {
593
+ name = "http-embedding";
594
+ requestBuilder;
595
+ responseMapper;
596
+ constructor(config) {
597
+ super(config);
598
+ this.requestBuilder = config.requestBuilder ?? ((texts, model) => ({
599
+ input: texts,
600
+ model
601
+ }));
602
+ this.responseMapper = config.responseMapper ?? ((response) => {
603
+ const resp = response;
604
+ if (Array.isArray(resp.data)) return resp.data.map((item) => item.embedding);
605
+ if (Array.isArray(resp.embeddings)) return resp.embeddings;
606
+ if (Array.isArray(resp.embedding)) return [resp.embedding];
607
+ if (Array.isArray(response)) return response;
608
+ throw new require_enhancer.ScrapeError("Unable to parse embedding response. Provide a custom responseMapper.", "VALIDATION_ERROR");
609
+ });
610
+ }
611
+ /**
612
+ * Generate embeddings for one or more texts.
613
+ */
614
+ async embed(texts, options) {
615
+ const model = options.model || this.model;
616
+ const body = this.requestBuilder(texts, model);
617
+ const { data } = await this.fetch(this.baseUrl, {
618
+ body,
619
+ signal: options.signal
620
+ });
621
+ const embeddings = this.responseMapper(data);
622
+ if (embeddings.length !== texts.length) throw new require_enhancer.ScrapeError(`Embedding count mismatch: expected ${texts.length}, got ${embeddings.length}`, "VALIDATION_ERROR");
623
+ return { embeddings };
624
+ }
625
+ };
626
+ /**
627
+ * Create a generic HTTP embedding provider.
628
+ */
629
+ function createHttpEmbedding(config) {
630
+ return new HttpEmbeddingProvider(config);
631
+ }
632
+
633
+ //#endregion
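A sketch against a hypothetical REST endpoint (`embed.example.com` and `EMBED_API_KEY` are placeholders). The default responseMapper already handles OpenAI-style `{ data: [{ embedding }] }`, `{ embeddings }`, `{ embedding }`, and bare arrays, so the override is only needed for other shapes.

```ts
import { createHttpEmbedding } from 'scrapex';

const provider = createHttpEmbedding({
  baseUrl: 'https://embed.example.com/v1/vectors',
  model: 'example-embed-1',
  headers: { Authorization: `Bearer ${process.env.EMBED_API_KEY}` },
  requestBuilder: (texts, model) => ({ documents: texts, model }),
  responseMapper: (res) => (res as { vectors: number[][] }).vectors,
});

const { embeddings } = await provider.embed(['hello', 'world'], { model: 'example-embed-1' });
console.log(embeddings.length); // 2 — one vector per input, enforced above
```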
634
+ //#region src/embeddings/providers/presets.ts
635
+ /**
636
+ * Create an OpenAI embedding provider.
637
+ *
638
+ * @example
639
+ * ```ts
640
+ * const provider = createOpenAIEmbedding({ apiKey: 'sk-...' });
641
+ * const { embeddings } = await provider.embed(['Hello'], { model: 'text-embedding-3-small' });
642
+ * ```
643
+ */
644
+ function createOpenAIEmbedding(options) {
645
+ const apiKey = options?.apiKey ?? process.env.OPENAI_API_KEY;
646
+ if (!apiKey) throw new Error("OpenAI API key required. Set OPENAI_API_KEY env var or pass apiKey option.");
647
+ const headers = { Authorization: `Bearer ${apiKey}` };
648
+ if (options?.organization) headers["OpenAI-Organization"] = options.organization;
649
+ return new HttpEmbeddingProvider({
650
+ baseUrl: options?.baseUrl ?? "https://api.openai.com/v1/embeddings",
651
+ model: options?.model ?? "text-embedding-3-small",
652
+ headers,
653
+ requestBuilder: (texts, model) => ({
654
+ input: texts,
655
+ model
656
+ }),
657
+ responseMapper: (res) => res.data.map((item) => item.embedding)
658
+ });
659
+ }
660
+ /**
661
+ * Create an Azure OpenAI embedding provider.
662
+ *
663
+ * @example
664
+ * ```ts
665
+ * const provider = createAzureEmbedding({
666
+ * endpoint: 'https://my-resource.openai.azure.com',
667
+ * deploymentName: 'text-embedding-ada-002',
668
+ * apiVersion: '2023-05-15',
669
+ * });
670
+ * ```
671
+ */
672
+ function createAzureEmbedding(options) {
673
+ const apiKey = options.apiKey ?? process.env.AZURE_OPENAI_API_KEY;
674
+ if (!apiKey) throw new Error("Azure OpenAI API key required. Set AZURE_OPENAI_API_KEY env var or pass apiKey option.");
675
+ return new HttpEmbeddingProvider({
676
+ baseUrl: `${options.endpoint.replace(/\/$/, "")}/openai/deployments/${options.deploymentName}/embeddings?api-version=${options.apiVersion}`,
677
+ model: options.deploymentName,
678
+ headers: { "api-key": apiKey },
679
+ requestBuilder: (texts) => ({ input: texts }),
680
+ responseMapper: (res) => res.data.map((item) => item.embedding)
681
+ });
682
+ }
683
+ /**
684
+ * Create an Ollama embedding provider for local models.
685
+ *
686
+ * LIMITATION: Ollama's /api/embeddings endpoint processes one text at a time,
687
+ * not batches. When multiple chunks are embedded, each chunk triggers a
688
+ * separate HTTP request. This is handled transparently by the pipeline's
689
+ * sequential chunk processing, but may be slower than batch-capable providers.
690
+ * For high-throughput scenarios, consider using OpenAI, Cohere, or HuggingFace
691
+ * which support batch embedding in a single request.
692
+ *
693
+ * @example
694
+ * ```ts
695
+ * const provider = createOllamaEmbedding({ model: 'nomic-embed-text' });
696
+ * ```
697
+ */
698
+ function createOllamaEmbedding(options) {
699
+ return new HttpEmbeddingProvider({
700
+ baseUrl: options?.baseUrl ?? "http://localhost:11434/api/embeddings",
701
+ model: options?.model ?? "nomic-embed-text",
702
+ requireHttps: false,
703
+ allowPrivate: true,
704
+ requestBuilder: (texts, model) => ({
705
+ model,
706
+ prompt: texts[0]
707
+ }),
708
+ responseMapper: (res) => [res.embedding]
709
+ });
710
+ }
711
+ /**
712
+ * Create a HuggingFace Inference API embedding provider.
713
+ *
714
+ * @example
715
+ * ```ts
716
+ * const provider = createHuggingFaceEmbedding({
717
+ * model: 'sentence-transformers/all-MiniLM-L6-v2',
718
+ * });
719
+ * ```
720
+ */
721
+ function createHuggingFaceEmbedding(options) {
722
+ const apiKey = options.apiKey ?? process.env.HF_TOKEN ?? process.env.HUGGINGFACE_API_KEY;
723
+ const headers = {};
724
+ if (apiKey) headers.Authorization = `Bearer ${apiKey}`;
725
+ return new HttpEmbeddingProvider({
726
+ baseUrl: `https://api-inference.huggingface.co/models/${options.model}`,
727
+ model: options.model,
728
+ headers,
729
+ requestBuilder: (texts) => ({ inputs: texts }),
730
+ responseMapper: (response) => {
731
+ if (Array.isArray(response)) {
732
+ if (Array.isArray(response[0]) && typeof response[0][0] === "number") return response;
733
+ return [response];
734
+ }
735
+ throw new Error("Unexpected HuggingFace response format");
736
+ }
737
+ });
738
+ }
739
+ /**
740
+ * Create a local Transformers.js embedding provider.
741
+ * Uses dependency injection - user provides the imported transformers module.
742
+ *
743
+ * @example
744
+ * ```typescript
745
+ * import * as transformers from '@huggingface/transformers';
746
+ * import { createTransformersEmbedding } from 'scrapex/embeddings';
747
+ *
748
+ * const provider = createTransformersEmbedding(transformers, {
749
+ * model: 'Xenova/all-MiniLM-L6-v2',
750
+ * });
751
+ * ```
752
+ *
753
+ * Required Node.js dependencies:
754
+ * ```
755
+ * npm install @huggingface/transformers onnxruntime-node
756
+ * ```
757
+ */
758
+ function createTransformersEmbedding(transformers, options) {
759
+ let pipeline = null;
760
+ let currentModel = null;
761
+ const config = {
762
+ model: options?.model ?? "Xenova/all-MiniLM-L6-v2",
763
+ quantized: options?.quantized ?? true,
764
+ pooling: options?.pooling ?? "mean",
765
+ normalize: options?.normalize ?? true
766
+ };
767
+ return {
768
+ name: "transformers",
769
+ async embed(texts, request) {
770
+ const model = request.model || config.model;
771
+ if (!pipeline || currentModel !== model) {
772
+ const cacheDir = options?.cacheDir;
773
+ const env = transformers.env;
774
+ const priorCacheDir = env?.cacheDir;
775
+ if (cacheDir && env) env.cacheDir = cacheDir;
776
+ try {
777
+ pipeline = await transformers.pipeline("feature-extraction", model, { quantized: config.quantized });
778
+ } finally {
779
+ if (cacheDir && env) if (priorCacheDir === void 0) delete env.cacheDir;
780
+ else env.cacheDir = priorCacheDir;
781
+ }
782
+ currentModel = model;
783
+ }
784
+ const embeddings = [];
785
+ for (const text of texts) {
786
+ const output = await pipeline(text, {
787
+ pooling: config.pooling,
788
+ normalize: config.normalize
789
+ });
790
+ embeddings.push(Array.from(output.data));
791
+ }
792
+ return { embeddings };
793
+ }
794
+ };
795
+ }
796
+ /** Recommended models for Transformers.js */
797
+ const TRANSFORMERS_MODELS = {
798
+ DEFAULT: "Xenova/all-MiniLM-L6-v2",
799
+ QUALITY: "Xenova/all-mpnet-base-v2",
800
+ RETRIEVAL: "Xenova/bge-small-en-v1.5",
801
+ MULTILINGUAL: "Xenova/multilingual-e5-small"
802
+ };
803
+
804
+ //#endregion
805
+ //#region src/embeddings/providers/index.ts
806
+ /**
807
+ * Create an embedding provider from configuration.
808
+ * This is the main factory function for creating providers.
809
+ */
810
+ function createEmbeddingProvider(config) {
811
+ switch (config.type) {
812
+ case "http": return createHttpEmbedding(config.config);
813
+ case "custom": return config.provider;
814
+ default: throw new require_enhancer.ScrapeError(`Unknown embedding provider type: ${config.type}`, "VALIDATION_ERROR");
815
+ }
816
+ }
817
+
818
+ //#endregion
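The factory's two config shapes, sketched: the `http` fields mirror the Ollama preset above, while the `custom` branch accepts any object with a `name` and an `embed(texts, options)`.

```ts
import { createEmbeddingProvider } from 'scrapex';

// 'http': the generic fetch-based provider.
const local = createEmbeddingProvider({
  type: 'http',
  config: {
    baseUrl: 'http://localhost:11434/api/embeddings',
    model: 'nomic-embed-text',
    requireHttps: false,
    allowPrivate: true,
  },
});

// 'custom': e.g. a fixed-vector stub for tests.
const stub = createEmbeddingProvider({
  type: 'custom',
  provider: {
    name: 'fixed-vector',
    async embed(texts: string[]) {
      return { embeddings: texts.map(() => [0, 0, 0]) };
    },
  },
});
```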
819
+ //#region src/embeddings/safety.ts
820
+ /**
821
+ * PII redaction patterns with high precision to minimize false positives.
822
+ * Patterns are designed to match common formats while avoiding over-matching.
823
+ */
824
+ const EMAIL_PATTERN = /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b/g;
825
+ const PHONE_PATTERN = /(?:\+?1[-.\s]?)?\(?[0-9]{3}\)?[-.\s]?[0-9]{3}[-.\s]?[0-9]{4}\b/g;
826
+ const CREDIT_CARD_PATTERN = /\b(?:4[0-9]{12}(?:[0-9]{3})?|5[1-5][0-9]{14}|3[47][0-9]{13}|6(?:011|5[0-9]{2})[0-9]{12}|(?:[0-9]{4}[-\s]){3}[0-9]{4}|[0-9]{13,19})\b/g;
827
+ const SSN_PATTERN = /\b[0-9]{3}-[0-9]{2}-[0-9]{4}\b/g;
828
+ const IPV4_PATTERN = /\b(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b/g;
829
+ const REDACTED = "[REDACTED]";
830
+ /**
831
+ * Create a redaction function based on configuration.
832
+ * Returns a function that applies all configured PII patterns.
833
+ */
834
+ function createPiiRedactor(config) {
835
+ const patterns = [];
836
+ if (config.creditCard) patterns.push({
837
+ name: "creditCard",
838
+ pattern: CREDIT_CARD_PATTERN
839
+ });
840
+ if (config.email) patterns.push({
841
+ name: "email",
842
+ pattern: EMAIL_PATTERN
843
+ });
844
+ if (config.phone) patterns.push({
845
+ name: "phone",
846
+ pattern: PHONE_PATTERN
847
+ });
848
+ if (config.ssn) patterns.push({
849
+ name: "ssn",
850
+ pattern: SSN_PATTERN
851
+ });
852
+ if (config.ipAddress) patterns.push({
853
+ name: "ipAddress",
854
+ pattern: IPV4_PATTERN
855
+ });
856
+ if (config.customPatterns) for (let i = 0; i < config.customPatterns.length; i++) {
857
+ const customPattern = config.customPatterns[i];
858
+ if (customPattern) patterns.push({
859
+ name: `custom_${i}`,
860
+ pattern: customPattern
861
+ });
862
+ }
863
+ return (text) => {
864
+ let redactedText = text;
865
+ let totalRedactions = 0;
866
+ const redactionsByType = {};
867
+ for (const { name, pattern } of patterns) {
868
+ pattern.lastIndex = 0;
869
+ const matchCount = text.match(pattern)?.length ?? 0;
870
+ if (matchCount > 0) {
871
+ redactedText = redactedText.replace(pattern, REDACTED);
872
+ totalRedactions += matchCount;
873
+ redactionsByType[name] = (redactionsByType[name] ?? 0) + matchCount;
874
+ }
875
+ }
876
+ return {
877
+ text: redactedText,
878
+ redacted: totalRedactions > 0,
879
+ redactionCount: totalRedactions,
880
+ redactionsByType
881
+ };
882
+ };
883
+ }
884
+ /**
885
+ * Simple redaction that applies all default patterns.
886
+ * Use createPiiRedactor() for fine-grained control.
887
+ */
888
+ function redactPii(text) {
889
+ return createPiiRedactor({
890
+ email: true,
891
+ phone: true,
892
+ creditCard: true,
893
+ ssn: true,
894
+ ipAddress: true
895
+ })(text);
896
+ }
897
+
898
+ //#endregion
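A redaction sketch; the `ORDER-…` RegExp is a hypothetical custom pattern.

```ts
import { createPiiRedactor, redactPii } from 'scrapex';

// Fine-grained: emails plus one custom pattern.
const redact = createPiiRedactor({
  email: true,
  customPatterns: [/\bORDER-\d{6}\b/g],
});

const result = redact('Mail alice@example.com about ORDER-123456');
// result.text             => 'Mail [REDACTED] about [REDACTED]'
// result.redactionsByType => { email: 1, custom_0: 1 }

// Convenience wrapper: every built-in pattern at once.
const all = redactPii('Call 555-123-4567 from 203.0.113.9');
console.log(all.redactionCount); // 2 (phone + ipAddress)
```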
899
+ //#region src/embeddings/pipeline.ts
900
+ const DEFAULT_CHUNK_SIZE = 500;
901
+ /**
902
+ * Get the effective model for embedding.
903
+ * Prioritizes: explicit options.model > provider config model
904
+ */
905
+ function getEffectiveModel(providerConfig, explicitModel) {
906
+ if (explicitModel) return explicitModel;
907
+ if (providerConfig.type === "http") return providerConfig.config.model;
908
+ }
909
+ /**
910
+ * Generate embeddings for scraped data.
911
+ * This is the main entry point for the embedding pipeline.
912
+ */
913
+ async function generateEmbeddings(data, options) {
914
+ const startTime = Date.now();
915
+ try {
916
+ const provider = createEmbeddingProvider(options.provider);
917
+ const model = getEffectiveModel(options.provider, options.model);
918
+ const validation = validateInput(selectInput(data, options.input), options.safety?.minTextLength ?? 10);
919
+ if (!validation.valid) return createSkippedResult(validation.reason, { model });
920
+ const originalInput = validation.text;
921
+ let inputText = validation.text;
922
+ let piiRedacted = false;
923
+ if (options.safety?.piiRedaction) {
924
+ const redactionResult = createPiiRedactor(options.safety.piiRedaction)(inputText);
925
+ inputText = redactionResult.text;
926
+ piiRedacted = redactionResult.redacted;
927
+ }
928
+ const effectiveChunking = applyMaxTokensToChunking(options.chunking, options.safety?.maxTokens);
929
+ const cacheKey = generateCacheKey({
930
+ providerKey: getProviderCacheKey(options.provider),
931
+ model,
932
+ dimensions: options.output?.dimensions,
933
+ aggregation: options.output?.aggregation,
934
+ input: options.input,
935
+ chunking: effectiveChunking,
936
+ safety: options.safety,
937
+ cacheKeySalt: options.cache?.cacheKeySalt,
938
+ content: inputText
939
+ });
940
+ const cache = options.cache?.store ?? getDefaultCache();
941
+ const cachedResult = await cache.get(cacheKey);
942
+ if (cachedResult && cachedResult.status === "success") {
943
+ if (options.onMetrics) options.onMetrics({
944
+ provider: provider.name,
945
+ model,
946
+ inputTokens: estimateTokens(inputText),
947
+ outputDimensions: getDimensions(cachedResult.aggregation === "all" ? cachedResult.vectors : cachedResult.vector),
948
+ chunks: cachedResult.source.chunks,
949
+ latencyMs: Date.now() - startTime,
950
+ cached: true,
951
+ retries: 0,
952
+ piiRedacted
953
+ });
954
+ return {
955
+ ...cachedResult,
956
+ source: {
957
+ ...cachedResult.source,
958
+ cached: true
959
+ }
960
+ };
961
+ }
962
+ const chunks = chunkText(inputText, effectiveChunking);
963
+ const callbackChunks = options.onChunk && options.safety?.allowSensitiveCallbacks ? chunkText(originalInput, effectiveChunking) : null;
964
+ if (chunks.length === 0) return createSkippedResult("No content after chunking", { model });
965
+ const sharedState = options.resilience?.state;
966
+ const rateLimiter = sharedState?.rateLimiter ?? (options.resilience?.rateLimit ? new require_enhancer.RateLimiter(options.resilience.rateLimit) : null);
967
+ const circuitBreaker = sharedState?.circuitBreaker ?? (options.resilience?.circuitBreaker ? new require_enhancer.CircuitBreaker(options.resilience.circuitBreaker) : null);
968
+ const concurrency = options.resilience?.concurrency ?? 1;
969
+ const semaphore = sharedState?.semaphore ?? new require_enhancer.Semaphore(concurrency);
970
+ const embeddings = [];
971
+ let totalTokens = 0;
972
+ let retryCount = 0;
973
+ for (let i = 0; i < chunks.length; i++) {
974
+ const chunk = chunks[i];
975
+ if (!chunk) continue;
976
+ if (rateLimiter) await rateLimiter.acquire();
977
+ if (circuitBreaker?.isOpen()) return createSkippedResult("Circuit breaker is open", {
978
+ model,
979
+ chunks: i
980
+ });
981
+ await semaphore.execute(async () => {
982
+ const { result: result$1 } = await require_enhancer.withResilience(async (signal) => {
983
+ return provider.embed([chunk.text], {
984
+ model,
985
+ dimensions: options.output?.dimensions,
986
+ signal
987
+ });
988
+ }, options.resilience, {
989
+ circuitBreaker: circuitBreaker ?? void 0,
990
+ rateLimiter: void 0,
991
+ semaphore: void 0
992
+ }, { onRetry: () => {
993
+ retryCount++;
994
+ } });
995
+ if (result$1.usage) totalTokens += result$1.usage.totalTokens;
996
+ else totalTokens += chunk.tokens;
997
+ const embedding = result$1.embeddings[0];
998
+ if (embedding) {
999
+ embeddings.push(embedding);
1000
+ if (options.onChunk) {
1001
+ const callbackText = callbackChunks?.[i]?.text ?? chunk.text;
1002
+ options.onChunk(callbackText, embedding);
1003
+ }
1004
+ }
1005
+ });
1006
+ }
1007
+ const aggregation = options.output?.aggregation ?? "average";
1008
+ const aggregated = aggregateVectors(embeddings, aggregation);
1009
+ const source = {
1010
+ model,
1011
+ chunks: chunks.length,
1012
+ tokens: totalTokens || estimateTokens(inputText),
1013
+ checksum: generateChecksum(inputText),
1014
+ cached: false,
1015
+ latencyMs: Date.now() - startTime
1016
+ };
1017
+ let result;
1018
+ if (aggregated.type === "single") result = {
1019
+ status: "success",
1020
+ aggregation,
1021
+ vector: aggregated.vector,
1022
+ source
1023
+ };
1024
+ else result = {
1025
+ status: "success",
1026
+ aggregation: "all",
1027
+ vectors: aggregated.vectors,
1028
+ source
1029
+ };
1030
+ await cache.set(cacheKey, result, { ttlMs: options.cache?.ttlMs });
1031
+ if (options.onMetrics) {
1032
+ const metrics = {
1033
+ provider: provider.name,
1034
+ model,
1035
+ inputTokens: source.tokens,
1036
+ outputDimensions: aggregated.dimensions,
1037
+ chunks: chunks.length,
1038
+ latencyMs: source.latencyMs,
1039
+ cached: false,
1040
+ retries: retryCount,
1041
+ piiRedacted
1042
+ };
1043
+ options.onMetrics(metrics);
1044
+ }
1045
+ return result;
1046
+ } catch (error) {
1047
+ const reason = error instanceof Error ? error.message : String(error);
1048
+ if (error instanceof require_enhancer.ScrapeError && ["INVALID_URL", "BLOCKED"].includes(error.code)) throw error;
1049
+ return createSkippedResult(reason, { latencyMs: Date.now() - startTime });
1050
+ }
1051
+ }
1052
+ function applyMaxTokensToChunking(chunking, maxTokens) {
1053
+ if (!maxTokens || maxTokens <= 0) return chunking;
1054
+ const baseSize = chunking?.size ?? DEFAULT_CHUNK_SIZE;
1055
+ const baseOverlap = chunking?.overlap ?? 50;
1056
+ const clampedSize = Math.min(baseSize, maxTokens);
1057
+ const clampedOverlap = Math.min(baseOverlap, Math.max(0, clampedSize - 1));
1058
+ return {
1059
+ ...chunking,
1060
+ size: clampedSize,
1061
+ overlap: clampedOverlap
1062
+ };
1063
+ }
1064
+ /**
1065
+ * Embed arbitrary text directly.
1066
+ * Standalone function for embedding text outside of scrape().
1067
+ */
1068
+ async function embed(text, options) {
1069
+ return generateEmbeddings({ textContent: text }, {
1070
+ ...options,
1071
+ input: {
1072
+ ...options.input,
1073
+ type: "textContent"
1074
+ }
1075
+ });
1076
+ }
1077
+ /**
1078
+ * Embed from existing ScrapedData.
1079
+ * Useful when you've already scraped and want to add embeddings later.
1080
+ */
1081
+ async function embedScrapedData(data, options) {
1082
+ return generateEmbeddings(data, options);
1083
+ }
1084
+ /**
1085
+ * Create a skipped result with reason.
1086
+ */
1087
+ function createSkippedResult(reason, partialSource) {
1088
+ return {
1089
+ status: "skipped",
1090
+ reason,
1091
+ source: partialSource ?? {}
1092
+ };
1093
+ }
1094
+
84
1095
  //#endregion
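End to end, the pipeline hangs off the new `embeddings` option on scrape() (see the wiring further down in this diff). A sketch, assuming OPENAI_API_KEY is set and using the custom-provider wrapping shown earlier:

```ts
import { scrape, embed, createOpenAIEmbedding, InMemoryEmbeddingCache } from 'scrapex';

const embeddings = {
  provider: { type: 'custom', provider: createOpenAIEmbedding() }, // throws without OPENAI_API_KEY
  input: { type: 'title+summary' },
  chunking: { size: 400, overlap: 40 },
  output: { aggregation: 'average' },
  safety: { piiRedaction: { email: true }, maxTokens: 2000 },
  cache: { store: new InMemoryEmbeddingCache(), ttlMs: 60_000 },
  onMetrics: (m) => console.log(m.provider, m.latencyMs, m.cached),
} as const;

const result = await scrape('https://example.com/article', { embeddings });
if (result.embeddings?.status === 'success' && result.embeddings.aggregation !== 'all') {
  console.log(result.embeddings.vector.length); // single pooled document vector
}

// Or embed standalone text without scraping.
const direct = await embed('some standalone text', { provider: embeddings.provider });
```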
85
1096
  //#region src/extractors/content.ts
86
1097
  const turndown$1 = new turndown.default({
@@ -109,7 +1120,7 @@ var ContentExtractor = class {
109
1120
  async extract(context) {
110
1121
  const { options } = context;
111
1122
  if (options.extractContent === false) return {};
112
- const article = new __mozilla_readability.Readability(context.getDocument().cloneNode(true)).parse();
1123
+ const article = new _mozilla_readability.Readability(context.getDocument().cloneNode(true)).parse();
113
1124
  if (!article || !article.content) return this.extractFallback(context);
114
1125
  let content = turndown$1.turndown(article.content);
115
1126
  const maxLength = options.maxContentLength ?? 5e4;
@@ -557,7 +1568,9 @@ var NativeFetcher = class {
557
1568
  throw new require_enhancer.ScrapeError(`HTTP error ${response.status}: ${url}`, "FETCH_FAILED", response.status);
558
1569
  }
559
1570
  const contentType = response.headers.get("content-type") || "";
560
- if (!contentType.includes("text/html") && !contentType.includes("application/xhtml")) throw new require_enhancer.ScrapeError(`Unexpected content type: ${contentType}`, "PARSE_ERROR");
1571
+ if (options.allowedContentTypes) {
1572
+ if (!options.allowedContentTypes.some((type) => contentType.toLowerCase().includes(type.toLowerCase()))) throw new require_enhancer.ScrapeError(`Unexpected content type: ${contentType}`, "PARSE_ERROR");
1573
+ } else if (!contentType.includes("text/html") && !contentType.includes("application/xhtml")) throw new require_enhancer.ScrapeError(`Unexpected content type: ${contentType}`, "PARSE_ERROR");
561
1574
  const html = await response.text();
562
1575
  const responseHeaders = {};
563
1576
  response.headers.forEach((value, key) => {
@@ -748,6 +1761,7 @@ async function scrape(url, options = {}) {
748
1761
  console.error("LLM extraction failed:", error);
749
1762
  intermediateResult.error = intermediateResult.error ? `${intermediateResult.error}; LLM extraction: ${error instanceof Error ? error.message : String(error)}` : `LLM extraction: ${error instanceof Error ? error.message : String(error)}`;
750
1763
  }
1764
+ if (options.embeddings) intermediateResult.embeddings = await generateEmbeddings(intermediateResult, options.embeddings);
751
1765
  const scrapeTimeMs = Date.now() - startTime;
752
1766
  return {
753
1767
  ...intermediateResult,
@@ -788,9 +1802,8 @@ async function scrapeHtml(html, url, options = {}) {
788
1802
  console.error(`Extractor "${extractor.name}" failed:`, error);
789
1803
  context = mergeResults(context, { error: context.results.error ? `${context.results.error}; ${extractor.name}: ${error instanceof Error ? error.message : String(error)}` : `${extractor.name}: ${error instanceof Error ? error.message : String(error)}` });
790
1804
  }
791
- const scrapeTimeMs = Date.now() - startTime;
792
1805
  const domain = extractDomain(normalizedUrl);
793
- return {
1806
+ const intermediateResult = {
794
1807
  url: normalizedUrl,
795
1808
  canonicalUrl: context.results.canonicalUrl || normalizedUrl,
796
1809
  domain,
@@ -817,9 +1830,127 @@ async function scrapeHtml(html, url, options = {}) {
817
1830
  extracted: context.results.extracted,
818
1831
  custom: context.results.custom,
819
1832
  scrapedAt: (/* @__PURE__ */ new Date()).toISOString(),
820
- scrapeTimeMs,
1833
+ scrapeTimeMs: 0,
821
1834
  error: context.results.error
822
1835
  };
1836
+ if (options.embeddings) intermediateResult.embeddings = await generateEmbeddings(intermediateResult, options.embeddings);
1837
+ const scrapeTimeMs = Date.now() - startTime;
1838
+ return {
1839
+ ...intermediateResult,
1840
+ scrapeTimeMs
1841
+ };
1842
+ }
1843
+
1844
+ //#endregion
1845
+ //#region src/utils/feed.ts
1846
+ /**
1847
+ * Fetch and parse an RSS/Atom feed from a URL.
1848
+ * Uses scrapex's fetcher infrastructure for consistent behavior.
1849
+ */
1850
+ async function fetchFeed(url, options) {
1851
+ const result = await (options?.fetcher || defaultFetcher).fetch(url, {
1852
+ timeout: options?.timeout,
1853
+ userAgent: options?.userAgent,
1854
+ allowedContentTypes: [
1855
+ "application/rss+xml",
1856
+ "application/atom+xml",
1857
+ "application/rdf+xml",
1858
+ "application/xml",
1859
+ "text/xml",
1860
+ "text/html"
1861
+ ]
1862
+ });
1863
+ return new require_parsers.RSSParser(options?.parserOptions).parse(result.html, url);
1864
+ }
1865
+ /**
1866
+ * Detect RSS/Atom feed URLs from HTML.
1867
+ * Supports RSS, Atom, and RDF feed types.
1868
+ */
1869
+ function discoverFeeds(html, baseUrl) {
1870
+ const $ = cheerio.load(html);
1871
+ const feeds = [];
1872
+ const seen = /* @__PURE__ */ new Set();
1873
+ $([
1874
+ "link[type=\"application/rss+xml\"]",
1875
+ "link[type=\"application/atom+xml\"]",
1876
+ "link[type=\"application/rdf+xml\"]",
1877
+ "link[rel=\"alternate\"][type*=\"xml\"]"
1878
+ ].join(", ")).each((_, el) => {
1879
+ const href = $(el).attr("href");
1880
+ if (href) try {
1881
+ const resolved = new URL(href, baseUrl).href;
1882
+ if (!seen.has(resolved)) {
1883
+ seen.add(resolved);
1884
+ feeds.push(resolved);
1885
+ }
1886
+ } catch {}
1887
+ });
1888
+ return feeds;
1889
+ }
1890
+ /**
1891
+ * Filter feed items by date range.
1892
+ * Items without publishedAt are included by default.
1893
+ */
1894
+ function filterByDate(items, options) {
1895
+ const { after, before, includeUndated = true } = options;
1896
+ return items.filter((item) => {
1897
+ if (!item.publishedAt) return includeUndated;
1898
+ const date = new Date(item.publishedAt);
1899
+ if (after && date < after) return false;
1900
+ if (before && date > before) return false;
1901
+ return true;
1902
+ });
1903
+ }
1904
+ /**
1905
+ * Convert feed items to markdown for LLM consumption.
1906
+ * Uses ISO 8601 date format for consistency across environments.
1907
+ */
1908
+ function feedToMarkdown(feed, options) {
1909
+ const { includeContent = false, maxItems } = options || {};
1910
+ const lines = [`# ${feed.title}`, ""];
1911
+ if (feed.description) lines.push(feed.description, "");
1912
+ const items = maxItems ? feed.items.slice(0, maxItems) : feed.items;
1913
+ for (const item of items) {
1914
+ lines.push(`## ${item.title}`);
1915
+ if (item.publishedAt) {
1916
+ const date = item.publishedAt.split("T")[0];
1917
+ lines.push(`*${date}*`);
1918
+ }
1919
+ lines.push("");
1920
+ if (includeContent && item.content) lines.push(item.content);
1921
+ else if (item.description) lines.push(item.description);
1922
+ if (item.link) lines.push(`[Read more](${item.link})`, "");
1923
+ else lines.push("");
1924
+ }
1925
+ return lines.join("\n");
1926
+ }
1927
+ /**
1928
+ * Extract plain text from feed items for LLM processing.
1929
+ * Concatenates title, description, and content.
1930
+ */
1931
+ function feedToText(feed, options) {
1932
+ const { maxItems, separator = "\n\n---\n\n" } = options || {};
1933
+ return (maxItems ? feed.items.slice(0, maxItems) : feed.items).map((item) => {
1934
+ const parts = [item.title];
1935
+ if (item.description) parts.push(item.description);
1936
+ if (item.content) parts.push(item.content);
1937
+ return parts.join("\n\n");
1938
+ }).join(separator);
1939
+ }
1940
+ /**
1941
+ * Paginate through a feed using rel="next" links (RFC 5005).
1942
+ * Returns an async generator that yields each page.
1943
+ */
1944
+ async function* paginateFeed(url, options) {
1945
+ const { maxPages = 10, ...fetchOptions } = options || {};
1946
+ let currentUrl = url;
1947
+ let pageCount = 0;
1948
+ while (currentUrl && pageCount < maxPages) {
1949
+ const result = await fetchFeed(currentUrl, fetchOptions);
1950
+ yield result.data;
1951
+ currentUrl = result.data.next;
1952
+ pageCount++;
1953
+ }
823
1954
  }
824
1955
 
825
1956
  //#endregion
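A feed-utilities sketch; `html` stands in for an already-fetched page.

```ts
import { discoverFeeds, fetchFeed, filterByDate, feedToMarkdown, paginateFeed } from 'scrapex';

declare const html: string; // placeholder page markup

// Find advertised feeds, then fetch and render the first one.
const [firstFeed] = discoverFeeds(html, 'https://blog.example.com');
if (!firstFeed) throw new Error('no feed advertised');
const { data: feed } = await fetchFeed(firstFeed);

// Last seven days only (undated items are kept by default).
const recent = filterByDate(feed.items, { after: new Date(Date.now() - 7 * 864e5) });
console.log(feedToMarkdown({ ...feed, items: recent }, { maxItems: 10 }));

// Walk RFC 5005 rel="next" pagination, capped at three pages.
for await (const page of paginateFeed(firstFeed, { maxPages: 3 })) {
  console.log(page.items.length);
}
```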
@@ -827,17 +1958,39 @@ exports.ContentExtractor = ContentExtractor;
827
1958
  exports.DEFAULT_TIMEOUT = DEFAULT_TIMEOUT;
828
1959
  exports.DEFAULT_USER_AGENT = DEFAULT_USER_AGENT;
829
1960
  exports.FaviconExtractor = FaviconExtractor;
1961
+ exports.InMemoryEmbeddingCache = InMemoryEmbeddingCache;
830
1962
  exports.JsonLdExtractor = JsonLdExtractor;
831
1963
  exports.LinksExtractor = LinksExtractor;
832
1964
  exports.MetaExtractor = MetaExtractor;
833
1965
  exports.NativeFetcher = NativeFetcher;
1966
+ exports.RSSParser = require_parsers.RSSParser;
834
1967
  exports.ScrapeError = require_enhancer.ScrapeError;
835
- exports.__toESM = __toESM;
1968
+ exports.TRANSFORMERS_MODELS = TRANSFORMERS_MODELS;
1969
+ exports.aggregateVectors = aggregateVectors;
836
1970
  exports.checkRobotsTxt = checkRobotsTxt;
1971
+ exports.chunkText = chunkText;
1972
+ exports.cosineSimilarity = cosineSimilarity;
1973
+ exports.createAzureEmbedding = createAzureEmbedding;
837
1974
  exports.createDefaultExtractors = createDefaultExtractors;
1975
+ exports.createEmbeddingProvider = createEmbeddingProvider;
838
1976
  exports.createExtractionContext = createExtractionContext;
1977
+ exports.createHttpEmbedding = createHttpEmbedding;
1978
+ exports.createHuggingFaceEmbedding = createHuggingFaceEmbedding;
1979
+ exports.createOllamaEmbedding = createOllamaEmbedding;
1980
+ exports.createOpenAIEmbedding = createOpenAIEmbedding;
1981
+ exports.createPiiRedactor = createPiiRedactor;
1982
+ exports.createTransformersEmbedding = createTransformersEmbedding;
839
1983
  exports.defaultFetcher = defaultFetcher;
1984
+ exports.discoverFeeds = discoverFeeds;
1985
+ exports.embed = embed;
1986
+ exports.embedScrapedData = embedScrapedData;
1987
+ exports.estimateTokens = estimateTokens;
840
1988
  exports.extractDomain = extractDomain;
1989
+ exports.feedToMarkdown = feedToMarkdown;
1990
+ exports.feedToText = feedToText;
1991
+ exports.fetchFeed = fetchFeed;
1992
+ exports.filterByDate = filterByDate;
1993
+ exports.generateEmbeddings = generateEmbeddings;
841
1994
  exports.getPath = getPath;
842
1995
  exports.getProtocol = getProtocol;
843
1996
  exports.isExternalUrl = isExternalUrl;
@@ -845,6 +1998,8 @@ exports.isValidUrl = isValidUrl;
845
1998
  exports.matchesUrlPattern = matchesUrlPattern;
846
1999
  exports.mergeResults = mergeResults;
847
2000
  exports.normalizeUrl = normalizeUrl;
2001
+ exports.paginateFeed = paginateFeed;
2002
+ exports.redactPii = redactPii;
848
2003
  exports.resolveUrl = resolveUrl;
849
2004
  exports.scrape = scrape;
850
2005
  exports.scrapeHtml = scrapeHtml;