scrapex 0.5.3 → 1.0.0-beta.1
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- package/LICENSE +1 -1
- package/README.md +551 -145
- package/dist/enhancer-ByjRD-t5.mjs +769 -0
- package/dist/enhancer-ByjRD-t5.mjs.map +1 -0
- package/dist/enhancer-j0xqKDJm.cjs +847 -0
- package/dist/enhancer-j0xqKDJm.cjs.map +1 -0
- package/dist/index-CDgcRnig.d.cts +268 -0
- package/dist/index-CDgcRnig.d.cts.map +1 -0
- package/dist/index-piS5wtki.d.mts +268 -0
- package/dist/index-piS5wtki.d.mts.map +1 -0
- package/dist/index.cjs +2007 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +580 -0
- package/dist/index.d.cts.map +1 -0
- package/dist/index.d.mts +580 -0
- package/dist/index.d.mts.map +1 -0
- package/dist/index.mjs +1956 -0
- package/dist/index.mjs.map +1 -0
- package/dist/llm/index.cjs +334 -0
- package/dist/llm/index.cjs.map +1 -0
- package/dist/llm/index.d.cts +258 -0
- package/dist/llm/index.d.cts.map +1 -0
- package/dist/llm/index.d.mts +258 -0
- package/dist/llm/index.d.mts.map +1 -0
- package/dist/llm/index.mjs +317 -0
- package/dist/llm/index.mjs.map +1 -0
- package/dist/parsers/index.cjs +11 -0
- package/dist/parsers/index.d.cts +2 -0
- package/dist/parsers/index.d.mts +2 -0
- package/dist/parsers/index.mjs +3 -0
- package/dist/parsers-Bneuws8x.cjs +569 -0
- package/dist/parsers-Bneuws8x.cjs.map +1 -0
- package/dist/parsers-CwkYnyWY.mjs +482 -0
- package/dist/parsers-CwkYnyWY.mjs.map +1 -0
- package/dist/types-CadAXrme.d.mts +674 -0
- package/dist/types-CadAXrme.d.mts.map +1 -0
- package/dist/types-DPEtPihB.d.cts +674 -0
- package/dist/types-DPEtPihB.d.cts.map +1 -0
- package/package.json +79 -100
- package/dist/index.d.ts +0 -45
- package/dist/index.js +0 -8
- package/dist/scrapex.cjs.development.js +0 -1130
- package/dist/scrapex.cjs.development.js.map +0 -1
- package/dist/scrapex.cjs.production.min.js +0 -2
- package/dist/scrapex.cjs.production.min.js.map +0 -1
- package/dist/scrapex.esm.js +0 -1122
- package/dist/scrapex.esm.js.map +0 -1
package/dist/index.cjs
ADDED
@@ -0,0 +1,2007 @@
const require_parsers = require('./parsers-Bneuws8x.cjs');
const require_enhancer = require('./enhancer-j0xqKDJm.cjs');
let cheerio = require("cheerio");
cheerio = require_parsers.__toESM(cheerio);
let node_crypto = require("node:crypto");
let _mozilla_readability = require("@mozilla/readability");
let turndown = require("turndown");
turndown = require_parsers.__toESM(turndown);

//#region src/core/context.ts
let jsdomModule = null;
/**
 * Preload JSDOM module (called once during scrape initialization)
 */
async function preloadJsdom() {
	if (!jsdomModule) jsdomModule = await import("jsdom");
}
/**
 * Create an extraction context with lazy JSDOM loading.
 *
 * Cheerio is always available for fast DOM queries.
 * JSDOM is only loaded when getDocument() is called (for Readability).
 */
function createExtractionContext(url, finalUrl, html, options) {
	let document = null;
	return {
		url,
		finalUrl,
		html,
		$: cheerio.load(html),
		options,
		results: {},
		getDocument() {
			if (!document) {
				if (!jsdomModule) throw new Error("JSDOM not preloaded. Call preloadJsdom() before using getDocument().");
				document = new jsdomModule.JSDOM(html, { url: finalUrl }).window.document;
			}
			return document;
		}
	};
}
/**
 * Merge partial results into the context
 */
function mergeResults(context, extracted) {
	return {
		...context,
		results: {
			...context.results,
			...extracted,
			custom: extracted.custom || context.results.custom ? {
				...context.results.custom,
				...extracted.custom
			} : void 0
		}
	};
}

//#endregion
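A minimal sketch of how these two context helpers combine, using the functions defined in the hunk above (the URL, HTML, and option values are hypothetical):

```ts
// Preload JSDOM once per scrape run, then build a context per page.
await preloadJsdom();
const ctx = createExtractionContext(
  'https://example.com',                      // requested URL
  'https://example.com/',                     // final URL after redirects
  '<html><body><h1>Hi</h1></body></html>',    // fetched HTML
  { extractContent: true },
);
ctx.$('h1').text();       // fast Cheerio query; JSDOM is never touched
ctx.getDocument().title;  // first call lazily instantiates JSDOM
```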
//#region src/embeddings/aggregation.ts
/**
 * Aggregate multiple embedding vectors into a single vector or return all.
 *
 * @param vectors - Array of embedding vectors (must all have same dimensions)
 * @param strategy - Aggregation strategy
 * @returns Aggregated result based on strategy
 */
function aggregateVectors(vectors, strategy = "average") {
	if (vectors.length === 0) throw new Error("Cannot aggregate empty vector array");
	const firstVector = vectors[0];
	if (!firstVector) throw new Error("Cannot aggregate empty vector array");
	const dimensions = firstVector.length;
	for (let i = 1; i < vectors.length; i++) {
		const vec = vectors[i];
		if (!vec || vec.length !== dimensions) throw new Error(`Vector dimension mismatch: expected ${dimensions}, got ${vec?.length ?? 0} at index ${i}`);
	}
	switch (strategy) {
		case "average": return {
			type: "single",
			vector: averageVectors(vectors),
			dimensions
		};
		case "max": return {
			type: "single",
			vector: maxPoolVectors(vectors),
			dimensions
		};
		case "first": return {
			type: "single",
			vector: firstVector,
			dimensions
		};
		case "all": return {
			type: "multiple",
			vectors,
			dimensions
		};
		default: {
			const _exhaustive = strategy;
			throw new Error(`Unknown aggregation strategy: ${_exhaustive}`);
		}
	}
}
/**
 * Compute element-wise average of vectors.
 */
function averageVectors(vectors) {
	const first = vectors[0];
	if (!first || vectors.length === 1) return first ?? [];
	const dimensions = first.length;
	const count = vectors.length;
	const result = new Array(dimensions).fill(0);
	for (const vector of vectors) for (let i = 0; i < dimensions; i++) {
		const val = result[i];
		if (val !== void 0) result[i] = val + (vector[i] ?? 0);
	}
	for (let i = 0; i < dimensions; i++) {
		const val = result[i];
		if (val !== void 0) result[i] = val / count;
	}
	return result;
}
/**
 * Compute element-wise maximum of vectors (max pooling).
 */
function maxPoolVectors(vectors) {
	const first = vectors[0];
	if (!first || vectors.length === 1) return first ?? [];
	const dimensions = first.length;
	const result = [...first];
	for (let v = 1; v < vectors.length; v++) {
		const vec = vectors[v];
		if (!vec) continue;
		for (let i = 0; i < dimensions; i++) {
			const val = vec[i] ?? 0;
			if (val > (result[i] ?? 0)) result[i] = val;
		}
	}
	return result;
}
/**
 * Compute cosine similarity between two vectors.
 * Both vectors should be normalized for accurate results.
 */
function cosineSimilarity(a, b) {
	if (a.length !== b.length) throw new Error(`Vector dimension mismatch: ${a.length} vs ${b.length}`);
	let dot = 0;
	let magnitudeA = 0;
	let magnitudeB = 0;
	for (let i = 0; i < a.length; i++) {
		const aVal = a[i] ?? 0;
		const bVal = b[i] ?? 0;
		dot += aVal * bVal;
		magnitudeA += aVal * aVal;
		magnitudeB += bVal * bVal;
	}
	const magnitude = Math.sqrt(magnitudeA) * Math.sqrt(magnitudeB);
	if (magnitude === 0) return 0;
	return dot / magnitude;
}
/**
 * Get the dimensions of a vector or set of vectors.
 */
function getDimensions(vectors) {
	if (vectors.length === 0) return 0;
	const first = vectors[0];
	if (typeof first === "number") return vectors.length;
	return first?.length ?? 0;
}

//#endregion
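A worked sketch of the aggregation strategies and the similarity helper, using tiny 3-dimensional vectors in place of real embeddings:

```ts
const chunks = [[1, 0, 0], [0, 1, 0]];

aggregateVectors(chunks, 'average'); // { type: 'single', vector: [0.5, 0.5, 0], dimensions: 3 }
aggregateVectors(chunks, 'max');     // { type: 'single', vector: [1, 1, 0], dimensions: 3 }
aggregateVectors(chunks, 'all');     // { type: 'multiple', vectors: chunks, dimensions: 3 }

cosineSimilarity([1, 0, 0], [1, 0, 0]); // 1 (identical direction)
cosineSimilarity([1, 0, 0], [0, 1, 0]); // 0 (orthogonal)
```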
//#region src/embeddings/cache.ts
/**
 * Default maximum cache entries.
 */
const DEFAULT_MAX_ENTRIES = 1e3;
/**
 * Default TTL in milliseconds (1 hour).
 */
const DEFAULT_TTL_MS = 3600 * 1e3;
/**
 * Generate a content-addressable cache key.
 * Key is based on content hash and embedding configuration.
 * Note: custom RegExp patterns are serialized by source+flags; different
 * constructions can yield different cache keys even if equivalent.
 */
function generateCacheKey(params) {
	const hash = (0, node_crypto.createHash)("sha256");
	const fingerprint = stableStringify({
		providerKey: params.providerKey,
		model: params.model ?? "provider-default",
		dimensions: params.dimensions ?? "default",
		aggregation: params.aggregation ?? "average",
		input: serializeInputConfig(params.input),
		chunking: serializeChunkingConfig(params.chunking),
		safety: serializeSafetyConfig(params.safety),
		cacheKeySalt: params.cacheKeySalt
	});
	hash.update(fingerprint);
	hash.update("\0");
	hash.update(params.content);
	return hash.digest("hex");
}
/**
 * Generate a checksum for content verification.
 */
function generateChecksum(content) {
	return (0, node_crypto.createHash)("sha256").update(content).digest("hex").slice(0, 16);
}
function serializeInputConfig(config) {
	if (!config) return void 0;
	return normalizeObject({
		type: config.type ?? "textContent",
		hasTransform: Boolean(config.transform),
		hasCustomText: Boolean(config.customText)
	});
}
function serializeChunkingConfig(config) {
	if (!config) return void 0;
	return normalizeObject({
		size: config.size,
		overlap: config.overlap,
		tokenizer: getTokenizerId(config.tokenizer),
		maxInputLength: config.maxInputLength
	});
}
function serializeSafetyConfig(config) {
	if (!config) return void 0;
	return normalizeObject({
		piiRedaction: serializePiiConfig(config.piiRedaction),
		minTextLength: config.minTextLength,
		maxTokens: config.maxTokens
	});
}
function serializePiiConfig(config) {
	if (!config) return void 0;
	return normalizeObject({
		email: config.email ?? false,
		phone: config.phone ?? false,
		creditCard: config.creditCard ?? false,
		ssn: config.ssn ?? false,
		ipAddress: config.ipAddress ?? false,
		customPatterns: config.customPatterns?.map((pattern) => `${pattern.source}/${pattern.flags}`)
	});
}
function getTokenizerId(tokenizer) {
	if (!tokenizer || tokenizer === "heuristic") return "heuristic";
	if (tokenizer === "tiktoken") return "tiktoken";
	return "custom";
}
function stableStringify(value) {
	return stringifyNormalized(normalizeValue(value));
}
function normalizeValue(value) {
	if (value === void 0) return void 0;
	if (value === null) return null;
	if (Array.isArray(value)) return value.map((entry) => normalizeValue(entry)).filter((entry) => entry !== void 0);
	if (typeof value === "object") return normalizeObject(value);
	return value;
}
function normalizeObject(value) {
	const normalized = {};
	for (const key of Object.keys(value).sort()) {
		const entry = normalizeValue(value[key]);
		if (entry !== void 0) normalized[key] = entry;
	}
	return normalized;
}
function stringifyNormalized(value) {
	if (value === void 0) return "undefined";
	if (value === null) return "null";
	if (typeof value === "string") return JSON.stringify(value);
	if (typeof value === "number" || typeof value === "boolean") return String(value);
	if (Array.isArray(value)) return `[${value.map((entry) => stringifyNormalized(entry)).join(",")}]`;
	if (typeof value === "object") {
		const obj = value;
		return `{${Object.keys(obj).sort().map((key) => `${JSON.stringify(key)}:${stringifyNormalized(obj[key])}`).join(",")}}`;
	}
	return JSON.stringify(value);
}
/**
 * In-memory LRU cache with TTL support.
 * Content-addressable: uses content hash as key, not URL.
 */
var InMemoryEmbeddingCache = class {
	cache;
	maxEntries;
	defaultTtlMs;
	constructor(options) {
		this.cache = /* @__PURE__ */ new Map();
		this.maxEntries = options?.maxEntries ?? DEFAULT_MAX_ENTRIES;
		this.defaultTtlMs = options?.ttlMs ?? DEFAULT_TTL_MS;
	}
	async get(key) {
		const entry = this.cache.get(key);
		if (!entry) return;
		const now = Date.now();
		if (now > entry.expiresAt) {
			this.cache.delete(key);
			return;
		}
		entry.accessedAt = now;
		return entry.value;
	}
	async set(key, value, options) {
		const now = Date.now();
		const ttl = options?.ttlMs ?? this.defaultTtlMs;
		if (this.cache.size >= this.maxEntries && !this.cache.has(key)) this.evictLRU();
		this.cache.set(key, {
			value,
			createdAt: now,
			expiresAt: now + ttl,
			accessedAt: now
		});
	}
	async delete(key) {
		return this.cache.delete(key);
	}
	async clear() {
		this.cache.clear();
	}
	/**
	 * Get cache statistics.
	 */
	getStats() {
		const now = Date.now();
		let expired = 0;
		for (const entry of this.cache.values()) if (now > entry.expiresAt) expired++;
		return {
			size: this.cache.size,
			maxEntries: this.maxEntries,
			expired,
			utilization: this.cache.size / this.maxEntries
		};
	}
	/**
	 * Evict expired entries.
	 */
	cleanup() {
		const now = Date.now();
		let evicted = 0;
		for (const [key, entry] of this.cache.entries()) if (now > entry.expiresAt) {
			this.cache.delete(key);
			evicted++;
		}
		return evicted;
	}
	/**
	 * Evict least recently used entry.
	 */
	evictLRU() {
		let oldestKey = null;
		let oldestAccess = Number.POSITIVE_INFINITY;
		for (const [key, entry] of this.cache.entries()) if (entry.accessedAt < oldestAccess) {
			oldestAccess = entry.accessedAt;
			oldestKey = key;
		}
		if (oldestKey) this.cache.delete(oldestKey);
	}
};
/**
 * Default in-memory cache instance.
 * Optimized for moderate cache sizes (default 1000 entries).
 */
let defaultCache = null;
/**
 * Get or create the default cache instance.
 */
function getDefaultCache() {
	if (!defaultCache) defaultCache = new InMemoryEmbeddingCache();
	return defaultCache;
}

//#endregion
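A sketch of the content-addressable caching above: the key is derived from content plus configuration, not URL, so identical text re-scraped from a mirror still hits the cache. The provider key, model name, and stored value here are hypothetical placeholders:

```ts
const cache = new InMemoryEmbeddingCache({ maxEntries: 2, ttlMs: 60_000 });

const key = generateCacheKey({
  providerKey: 'custom:my-provider', // hypothetical provider identifier
  model: 'my-model',
  content: 'Hello world',
});
await cache.set(key, { status: 'success' } as any); // stands in for a real result
await cache.get(key); // hit: same content + same config → same key
cache.getStats();     // { size: 1, maxEntries: 2, expired: 0, utilization: 0.5 }
// A third distinct set() would evict the least-recently-accessed entry.
```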
//#region src/embeddings/chunking.ts
/**
 * Default chunk size in tokens.
 */
const DEFAULT_CHUNK_SIZE$1 = 500;
/**
 * Default overlap in tokens.
 */
const DEFAULT_OVERLAP = 50;
/**
 * Default maximum input length in characters.
 */
const DEFAULT_MAX_INPUT_LENGTH = 1e5;
/**
 * Heuristic token counting: approximately 4 characters per token.
 * This is a reasonable approximation for English text.
 */
function heuristicTokenCount(text) {
	return Math.ceil(text.length / 4);
}
/**
 * Convert token count to approximate character count.
 */
function tokensToChars(tokens) {
	return tokens * 4;
}
/**
 * Create a tokenizer function based on configuration.
 */
function createTokenizer(config) {
	if (!config || config === "heuristic") return heuristicTokenCount;
	if (config === "tiktoken") return heuristicTokenCount;
	return config;
}
/**
 * Find a natural break point in text (sentence or word boundary).
 * Prefers common sentence boundaries (Latin + CJK), falls back to word boundaries.
 */
function findBreakPoint(text, targetIndex) {
	const searchStart = Math.max(0, targetIndex - Math.floor(targetIndex * .2));
	const searchEnd = Math.min(text.length, targetIndex + Math.floor(targetIndex * .2));
	const searchText = text.slice(searchStart, searchEnd);
	const sentenceMatch = /[.!?。!?]\s*/g;
	let lastSentenceEnd = -1;
	for (const match of searchText.matchAll(sentenceMatch)) {
		const absolutePos = searchStart + match.index + match[0].length;
		if (absolutePos <= targetIndex) lastSentenceEnd = absolutePos;
	}
	if (lastSentenceEnd !== -1) return lastSentenceEnd;
	const wordBoundary = text.lastIndexOf(" ", targetIndex);
	if (wordBoundary > searchStart) return wordBoundary + 1;
	return targetIndex;
}
/**
 * Split text into overlapping chunks optimized for embedding.
 * Respects sentence boundaries when possible.
 */
function chunkText(text, config) {
	const chunkSize = config?.size ?? DEFAULT_CHUNK_SIZE$1;
	const rawOverlap = config?.overlap ?? DEFAULT_OVERLAP;
	const safeOverlap = Math.max(0, rawOverlap);
	const overlap = Math.min(safeOverlap, Math.max(0, chunkSize - 1));
	const maxInputLength = config?.maxInputLength ?? DEFAULT_MAX_INPUT_LENGTH;
	const tokenizer = createTokenizer(config?.tokenizer);
	const normalizedText = (text.length > maxInputLength ? text.slice(0, maxInputLength) : text).replace(/\s+/g, " ").trim();
	if (!normalizedText) return [];
	const totalTokens = tokenizer(normalizedText);
	if (totalTokens <= chunkSize) return [{
		text: normalizedText,
		startIndex: 0,
		endIndex: normalizedText.length,
		tokens: totalTokens
	}];
	const chunks = [];
	const chunkSizeChars = tokensToChars(chunkSize);
	const overlapChars = tokensToChars(overlap);
	let startIndex = 0;
	while (startIndex < normalizedText.length) {
		const targetEnd = Math.min(startIndex + chunkSizeChars, normalizedText.length);
		const endIndex = targetEnd < normalizedText.length ? findBreakPoint(normalizedText, targetEnd) : targetEnd;
		const chunkText$1 = normalizedText.slice(startIndex, endIndex).trim();
		if (chunkText$1) chunks.push({
			text: chunkText$1,
			startIndex,
			endIndex,
			tokens: tokenizer(chunkText$1)
		});
		if (endIndex >= normalizedText.length) break;
		const nextStart = endIndex - overlapChars;
		startIndex = Math.max(nextStart, startIndex + 1);
		if (startIndex < normalizedText.length) {
			const spaceIndex = normalizedText.indexOf(" ", startIndex);
			if (spaceIndex !== -1 && spaceIndex < startIndex + overlapChars) startIndex = spaceIndex + 1;
		}
	}
	return chunks;
}
/**
 * Estimate total tokens for a text without chunking.
 */
function estimateTokens(text, tokenizer) {
	return createTokenizer(tokenizer)(text);
}

//#endregion
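A sketch of the chunker above with the 4-chars-per-token heuristic: roughly 4,200 characters ≈ 1,050 tokens, which exceeds a 500-token chunk size and so splits into overlapping chunks that end near sentence boundaries. The sample text is illustrative:

```ts
const text = 'Sentence one. '.repeat(300); // ≈ 4,200 chars ≈ 1,050 heuristic tokens

const chunks = chunkText(text, { size: 500, overlap: 50 });
// Each chunk is ~500 tokens (~2,000 chars) and overlaps the next by ~50 tokens (~200 chars).
for (const c of chunks) {
  console.log(c.startIndex, c.endIndex, c.tokens);
}
```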
//#region src/embeddings/input.ts
/**
 * Select and prepare input text for embedding based on configuration.
 *
 * @param data - Scraped data to extract input from
 * @param config - Input configuration
 * @returns Selected and prepared text, or undefined if no valid input
 */
function selectInput(data, config) {
	if (config?.transform) return normalizeText(config.transform(data));
	if (config?.type === "custom" && config.customText) return normalizeText(config.customText);
	const type = config?.type ?? "textContent";
	switch (type) {
		case "textContent": return selectTextContent(data);
		case "title+summary": return selectTitleSummary(data);
		case "custom": return selectTextContent(data);
		default: {
			const _exhaustive = type;
			throw new Error(`Unknown input type: ${_exhaustive}`);
		}
	}
}
/**
 * Select textContent as input.
 */
function selectTextContent(data) {
	if (data.textContent) return normalizeText(data.textContent);
	if (data.content) return normalizeText(stripMarkdown(data.content));
	if (data.excerpt) return normalizeText(data.excerpt);
	if (data.description) return normalizeText(data.description);
}
/**
 * Select title + summary (or fallbacks) as input.
 * Optimized for semantic search and classification.
 */
function selectTitleSummary(data) {
	const parts = [];
	if (data.title) parts.push(data.title);
	if (data.summary) parts.push(data.summary);
	else if (data.excerpt) parts.push(data.excerpt);
	else if (data.description) parts.push(data.description);
	if (parts.length === 0) return;
	return normalizeText(parts.join("\n\n"));
}
/**
 * Normalize text for embedding:
 * - Collapse whitespace
 * - Trim leading/trailing whitespace
 * - Remove control characters
 */
function normalizeText(text) {
	if (!text) return "";
	return text.replace(/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/g, "").replace(/[ \t]+/g, " ").replace(/\n{3,}/g, "\n\n").split("\n").map((line) => line.trim()).join("\n").trim();
}
/**
 * Basic markdown stripping for when we need plain text from content.
 * Not comprehensive, but handles common cases.
 */
function stripMarkdown(markdown) {
	return markdown.replace(/```[\s\S]*?```/g, "").replace(/`[^`]+`/g, "").replace(/\[([^\]]+)\]\([^)]+\)/g, "$1").replace(/!\[([^\]]*)\]\([^)]+\)/g, "$1").replace(/^#{1,6}\s+/gm, "").replace(/\*\*([^*]+)\*\*/g, "$1").replace(/\*([^*]+)\*/g, "$1").replace(/__([^_]+)__/g, "$1").replace(/_([^_]+)_/g, "$1").replace(/^>\s+/gm, "").replace(/^[-*_]{3,}$/gm, "").replace(/^[\s]*[-*+]\s+/gm, "").replace(/^[\s]*\d+\.\s+/gm, "");
}
/**
 * Check if the selected input meets minimum requirements.
 */
function validateInput(text, minLength = 10) {
	if (!text) return {
		valid: false,
		reason: "No input text available"
	};
	if (text.length < minLength) return {
		valid: false,
		reason: `Input too short (${text.length} < ${minLength} characters)`
	};
	const wordCount = text.split(/\s+/).filter((w) => w.length > 1).length;
	if (wordCount < 3) return {
		valid: false,
		reason: `Input has too few words (${wordCount} < 3)`
	};
	return {
		valid: true,
		text,
		wordCount,
		charCount: text.length
	};
}

//#endregion
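A sketch of input selection and validation, using a hypothetical scraped record whose fields mirror the data shape the functions above read:

```ts
const data = {
  title: 'Example Post',
  summary: 'A short summary.',
  textContent: 'Full   plain\n\n\ntext.',
};

selectInput(data);                            // → 'Full plain\n\ntext.' (normalized textContent)
selectInput(data, { type: 'title+summary' }); // → 'Example Post\n\nA short summary.'
validateInput('hi');                          // → { valid: false, reason: 'Input too short (2 < 10 characters)' }
```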
//#region src/embeddings/providers/base.ts
/**
 * Generate a stable cache key identifier for provider configuration.
 */
function getProviderCacheKey(config) {
	switch (config.type) {
		case "http": return `http:${config.config.baseUrl.replace(/\/$/, "")}:${config.config.model}`;
		case "custom": return `custom:${config.provider.name}`;
		default: {
			const _exhaustive = config;
			return String(_exhaustive);
		}
	}
}

//#endregion
//#region src/embeddings/providers/http.ts
/**
 * HTTP-based Embedding Provider using native fetch.
 * Provides a unified interface for any REST-based embedding API.
 */
/**
 * HTTP-based embedding provider.
 * Works with any REST API using native fetch.
 */
var HttpEmbeddingProvider = class extends require_enhancer.BaseHttpProvider {
	name = "http-embedding";
	requestBuilder;
	responseMapper;
	constructor(config) {
		super(config);
		this.requestBuilder = config.requestBuilder ?? ((texts, model) => ({
			input: texts,
			model
		}));
		this.responseMapper = config.responseMapper ?? ((response) => {
			const resp = response;
			if (Array.isArray(resp.data)) return resp.data.map((item) => item.embedding);
			if (Array.isArray(resp.embeddings)) return resp.embeddings;
			if (Array.isArray(resp.embedding)) return [resp.embedding];
			if (Array.isArray(response)) return response;
			throw new require_enhancer.ScrapeError("Unable to parse embedding response. Provide a custom responseMapper.", "VALIDATION_ERROR");
		});
	}
	/**
	 * Generate embeddings for one or more texts.
	 */
	async embed(texts, options) {
		const model = options.model || this.model;
		const body = this.requestBuilder(texts, model);
		const { data } = await this.fetch(this.baseUrl, {
			body,
			signal: options.signal
		});
		const embeddings = this.responseMapper(data);
		if (embeddings.length !== texts.length) throw new require_enhancer.ScrapeError(`Embedding count mismatch: expected ${texts.length}, got ${embeddings.length}`, "VALIDATION_ERROR");
		return { embeddings };
	}
};
/**
 * Create a generic HTTP embedding provider.
 */
function createHttpEmbedding(config) {
	return new HttpEmbeddingProvider(config);
}

//#endregion
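A sketch of wiring the generic HTTP provider above to a non-standard REST API by supplying both hooks; the endpoint, token, model id, and field names (`documents`, `results`, `vector`) are all hypothetical:

```ts
const provider = createHttpEmbedding({
  baseUrl: 'https://embeddings.internal.example/v1/encode', // hypothetical endpoint
  model: 'my-model',
  headers: { Authorization: 'Bearer <token>' },
  requestBuilder: (texts, model) => ({ documents: texts, model }), // shape the request body
  responseMapper: (res: any) => res.results.map((r: any) => r.vector), // extract number[][]
});
const { embeddings } = await provider.embed(['hello'], { model: 'my-model' });
```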
//#region src/embeddings/providers/presets.ts
/**
 * Create an OpenAI embedding provider.
 *
 * @example
 * ```ts
 * const provider = createOpenAIEmbedding({ apiKey: 'sk-...' });
 * const { embeddings } = await provider.embed(['Hello'], { model: 'text-embedding-3-small' });
 * ```
 */
function createOpenAIEmbedding(options) {
	const apiKey = options?.apiKey ?? process.env.OPENAI_API_KEY;
	if (!apiKey) throw new Error("OpenAI API key required. Set OPENAI_API_KEY env var or pass apiKey option.");
	const headers = { Authorization: `Bearer ${apiKey}` };
	if (options?.organization) headers["OpenAI-Organization"] = options.organization;
	return new HttpEmbeddingProvider({
		baseUrl: options?.baseUrl ?? "https://api.openai.com/v1/embeddings",
		model: options?.model ?? "text-embedding-3-small",
		headers,
		requestBuilder: (texts, model) => ({
			input: texts,
			model
		}),
		responseMapper: (res) => res.data.map((item) => item.embedding)
	});
}
/**
 * Create an Azure OpenAI embedding provider.
 *
 * @example
 * ```ts
 * const provider = createAzureEmbedding({
 *   endpoint: 'https://my-resource.openai.azure.com',
 *   deploymentName: 'text-embedding-ada-002',
 *   apiVersion: '2023-05-15',
 * });
 * ```
 */
function createAzureEmbedding(options) {
	const apiKey = options.apiKey ?? process.env.AZURE_OPENAI_API_KEY;
	if (!apiKey) throw new Error("Azure OpenAI API key required. Set AZURE_OPENAI_API_KEY env var or pass apiKey option.");
	return new HttpEmbeddingProvider({
		baseUrl: `${options.endpoint.replace(/\/$/, "")}/openai/deployments/${options.deploymentName}/embeddings?api-version=${options.apiVersion}`,
		model: options.deploymentName,
		headers: { "api-key": apiKey },
		requestBuilder: (texts) => ({ input: texts }),
		responseMapper: (res) => res.data.map((item) => item.embedding)
	});
}
/**
 * Create an Ollama embedding provider for local models.
 *
 * LIMITATION: Ollama's /api/embeddings endpoint processes one text at a time,
 * not batches. When multiple chunks are embedded, each chunk triggers a
 * separate HTTP request. This is handled transparently by the pipeline's
 * sequential chunk processing, but may be slower than batch-capable providers.
 * For high-throughput scenarios, consider using OpenAI, Cohere, or HuggingFace
 * which support batch embedding in a single request.
 *
 * @example
 * ```ts
 * const provider = createOllamaEmbedding({ model: 'nomic-embed-text' });
 * ```
 */
function createOllamaEmbedding(options) {
	return new HttpEmbeddingProvider({
		baseUrl: options?.baseUrl ?? "http://localhost:11434/api/embeddings",
		model: options?.model ?? "nomic-embed-text",
		requireHttps: false,
		allowPrivate: true,
		requestBuilder: (texts, model) => ({
			model,
			prompt: texts[0]
		}),
		responseMapper: (res) => [res.embedding]
	});
}
/**
 * Create a HuggingFace Inference API embedding provider.
 *
 * @example
 * ```ts
 * const provider = createHuggingFaceEmbedding({
 *   model: 'sentence-transformers/all-MiniLM-L6-v2',
 * });
 * ```
 */
function createHuggingFaceEmbedding(options) {
	const apiKey = options.apiKey ?? process.env.HF_TOKEN ?? process.env.HUGGINGFACE_API_KEY;
	const headers = {};
	if (apiKey) headers.Authorization = `Bearer ${apiKey}`;
	return new HttpEmbeddingProvider({
		baseUrl: `https://api-inference.huggingface.co/models/${options.model}`,
		model: options.model,
		headers,
		requestBuilder: (texts) => ({ inputs: texts }),
		responseMapper: (response) => {
			if (Array.isArray(response)) {
				if (Array.isArray(response[0]) && typeof response[0][0] === "number") return response;
				return [response];
			}
			throw new Error("Unexpected HuggingFace response format");
		}
	});
}
/**
 * Create a local Transformers.js embedding provider.
 * Uses dependency injection - user provides the imported transformers module.
 *
 * @example
 * ```typescript
 * import * as transformers from '@huggingface/transformers';
 * import { createTransformersEmbedding } from 'scrapex/embeddings';
 *
 * const provider = createTransformersEmbedding(transformers, {
 *   model: 'Xenova/all-MiniLM-L6-v2',
 * });
 * ```
 *
 * Required Node.js dependencies:
 * ```
 * npm install @huggingface/transformers onnxruntime-node
 * ```
 */
function createTransformersEmbedding(transformers, options) {
	let pipeline = null;
	let currentModel = null;
	const config = {
		model: options?.model ?? "Xenova/all-MiniLM-L6-v2",
		quantized: options?.quantized ?? true,
		pooling: options?.pooling ?? "mean",
		normalize: options?.normalize ?? true
	};
	return {
		name: "transformers",
		async embed(texts, request) {
			const model = request.model || config.model;
			if (!pipeline || currentModel !== model) {
				const cacheDir = options?.cacheDir;
				const env = transformers.env;
				const priorCacheDir = env?.cacheDir;
				if (cacheDir && env) env.cacheDir = cacheDir;
				try {
					pipeline = await transformers.pipeline("feature-extraction", model, { quantized: config.quantized });
				} finally {
					if (cacheDir && env) if (priorCacheDir === void 0) delete env.cacheDir;
					else env.cacheDir = priorCacheDir;
				}
				currentModel = model;
			}
			const embeddings = [];
			for (const text of texts) {
				const output = await pipeline(text, {
					pooling: config.pooling,
					normalize: config.normalize
				});
				embeddings.push(Array.from(output.data));
			}
			return { embeddings };
		}
	};
}
/** Recommended models for Transformers.js */
const TRANSFORMERS_MODELS = {
	DEFAULT: "Xenova/all-MiniLM-L6-v2",
	QUALITY: "Xenova/all-mpnet-base-v2",
	RETRIEVAL: "Xenova/bge-small-en-v1.5",
	MULTILINGUAL: "Xenova/multilingual-e5-small"
};

//#endregion
//#region src/embeddings/providers/index.ts
/**
 * Create an embedding provider from configuration.
 * This is the main factory function for creating providers.
 */
function createEmbeddingProvider(config) {
	switch (config.type) {
		case "http": return createHttpEmbedding(config.config);
		case "custom": return config.provider;
		default: throw new require_enhancer.ScrapeError(`Unknown embedding provider type: ${config.type}`, "VALIDATION_ERROR");
	}
}

//#endregion
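The factory above takes a discriminated-union config: `'http'` wraps a REST endpoint via HttpEmbeddingProvider, while `'custom'` passes a user-supplied provider straight through. A sketch of both branches; the custom provider here is a hypothetical no-op that just satisfies the `name`/`embed` shape the pipeline calls:

```ts
const viaHttp = createEmbeddingProvider({
  type: 'http',
  config: { baseUrl: 'http://localhost:11434/api/embeddings', model: 'nomic-embed-text' },
});

const viaCustom = createEmbeddingProvider({
  type: 'custom',
  provider: {
    name: 'noop', // hypothetical stub provider
    embed: async (texts: string[]) => ({ embeddings: texts.map(() => [0]) }),
  },
});
```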
//#region src/embeddings/safety.ts
/**
 * PII redaction patterns with high precision to minimize false positives.
 * Patterns are designed to match common formats while avoiding over-matching.
 */
const EMAIL_PATTERN = /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b/g;
const PHONE_PATTERN = /(?:\+?1[-.\s]?)?\(?[0-9]{3}\)?[-.\s]?[0-9]{3}[-.\s]?[0-9]{4}\b/g;
const CREDIT_CARD_PATTERN = /\b(?:4[0-9]{12}(?:[0-9]{3})?|5[1-5][0-9]{14}|3[47][0-9]{13}|6(?:011|5[0-9]{2})[0-9]{12}|(?:[0-9]{4}[-\s]){3}[0-9]{4}|[0-9]{13,19})\b/g;
const SSN_PATTERN = /\b[0-9]{3}-[0-9]{2}-[0-9]{4}\b/g;
const IPV4_PATTERN = /\b(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b/g;
const REDACTED = "[REDACTED]";
/**
 * Create a redaction function based on configuration.
 * Returns a function that applies all configured PII patterns.
 */
function createPiiRedactor(config) {
	const patterns = [];
	if (config.creditCard) patterns.push({
		name: "creditCard",
		pattern: CREDIT_CARD_PATTERN
	});
	if (config.email) patterns.push({
		name: "email",
		pattern: EMAIL_PATTERN
	});
	if (config.phone) patterns.push({
		name: "phone",
		pattern: PHONE_PATTERN
	});
	if (config.ssn) patterns.push({
		name: "ssn",
		pattern: SSN_PATTERN
	});
	if (config.ipAddress) patterns.push({
		name: "ipAddress",
		pattern: IPV4_PATTERN
	});
	if (config.customPatterns) for (let i = 0; i < config.customPatterns.length; i++) {
		const customPattern = config.customPatterns[i];
		if (customPattern) patterns.push({
			name: `custom_${i}`,
			pattern: customPattern
		});
	}
	return (text) => {
		let redactedText = text;
		let totalRedactions = 0;
		const redactionsByType = {};
		for (const { name, pattern } of patterns) {
			pattern.lastIndex = 0;
			const matchCount = text.match(pattern)?.length ?? 0;
			if (matchCount > 0) {
				redactedText = redactedText.replace(pattern, REDACTED);
				totalRedactions += matchCount;
				redactionsByType[name] = (redactionsByType[name] ?? 0) + matchCount;
			}
		}
		return {
			text: redactedText,
			redacted: totalRedactions > 0,
			redactionCount: totalRedactions,
			redactionsByType
		};
	};
}
/**
 * Simple redaction that applies all default patterns.
 * Use createPiiRedactor() for fine-grained control.
 */
function redactPii(text) {
	return createPiiRedactor({
		email: true,
		phone: true,
		creditCard: true,
		ssn: true,
		ipAddress: true
	})(text);
}

//#endregion
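A worked sketch of the redactor above, combining a built-in pattern with a custom one (the `order-\d{6}` pattern is an invented example):

```ts
const redact = createPiiRedactor({ email: true, customPatterns: [/order-\d{6}/g] });

redact('Contact a@b.co about order-123456');
// → { text: 'Contact [REDACTED] about [REDACTED]',
//     redacted: true,
//     redactionCount: 2,
//     redactionsByType: { email: 1, custom_0: 1 } }
```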
//#region src/embeddings/pipeline.ts
const DEFAULT_CHUNK_SIZE = 500;
/**
 * Get the effective model for embedding.
 * Prioritizes: explicit options.model > provider config model
 */
function getEffectiveModel(providerConfig, explicitModel) {
	if (explicitModel) return explicitModel;
	if (providerConfig.type === "http") return providerConfig.config.model;
}
/**
 * Generate embeddings for scraped data.
 * This is the main entry point for the embedding pipeline.
 */
async function generateEmbeddings(data, options) {
	const startTime = Date.now();
	try {
		const provider = createEmbeddingProvider(options.provider);
		const model = getEffectiveModel(options.provider, options.model);
		const validation = validateInput(selectInput(data, options.input), options.safety?.minTextLength ?? 10);
		if (!validation.valid) return createSkippedResult(validation.reason, { model });
		const originalInput = validation.text;
		let inputText = validation.text;
		let piiRedacted = false;
		if (options.safety?.piiRedaction) {
			const redactionResult = createPiiRedactor(options.safety.piiRedaction)(inputText);
			inputText = redactionResult.text;
			piiRedacted = redactionResult.redacted;
		}
		const effectiveChunking = applyMaxTokensToChunking(options.chunking, options.safety?.maxTokens);
		const cacheKey = generateCacheKey({
			providerKey: getProviderCacheKey(options.provider),
			model,
			dimensions: options.output?.dimensions,
			aggregation: options.output?.aggregation,
			input: options.input,
			chunking: effectiveChunking,
			safety: options.safety,
			cacheKeySalt: options.cache?.cacheKeySalt,
			content: inputText
		});
		const cache = options.cache?.store ?? getDefaultCache();
		const cachedResult = await cache.get(cacheKey);
		if (cachedResult && cachedResult.status === "success") {
			if (options.onMetrics) options.onMetrics({
				provider: provider.name,
				model,
				inputTokens: estimateTokens(inputText),
				outputDimensions: getDimensions(cachedResult.aggregation === "all" ? cachedResult.vectors : cachedResult.vector),
				chunks: cachedResult.source.chunks,
				latencyMs: Date.now() - startTime,
				cached: true,
				retries: 0,
				piiRedacted
			});
			return {
				...cachedResult,
				source: {
					...cachedResult.source,
					cached: true
				}
			};
		}
		const chunks = chunkText(inputText, effectiveChunking);
		const callbackChunks = options.onChunk && options.safety?.allowSensitiveCallbacks ? chunkText(originalInput, effectiveChunking) : null;
		if (chunks.length === 0) return createSkippedResult("No content after chunking", { model });
		const sharedState = options.resilience?.state;
		const rateLimiter = sharedState?.rateLimiter ?? (options.resilience?.rateLimit ? new require_enhancer.RateLimiter(options.resilience.rateLimit) : null);
		const circuitBreaker = sharedState?.circuitBreaker ?? (options.resilience?.circuitBreaker ? new require_enhancer.CircuitBreaker(options.resilience.circuitBreaker) : null);
		const concurrency = options.resilience?.concurrency ?? 1;
		const semaphore = sharedState?.semaphore ?? new require_enhancer.Semaphore(concurrency);
		const embeddings = [];
		let totalTokens = 0;
		let retryCount = 0;
		for (let i = 0; i < chunks.length; i++) {
			const chunk = chunks[i];
			if (!chunk) continue;
			if (rateLimiter) await rateLimiter.acquire();
			if (circuitBreaker?.isOpen()) return createSkippedResult("Circuit breaker is open", {
				model,
				chunks: i
			});
			await semaphore.execute(async () => {
				const { result: result$1 } = await require_enhancer.withResilience(async (signal) => {
					return provider.embed([chunk.text], {
						model,
						dimensions: options.output?.dimensions,
						signal
					});
				}, options.resilience, {
					circuitBreaker: circuitBreaker ?? void 0,
					rateLimiter: void 0,
					semaphore: void 0
				}, { onRetry: () => {
					retryCount++;
				} });
				if (result$1.usage) totalTokens += result$1.usage.totalTokens;
				else totalTokens += chunk.tokens;
				const embedding = result$1.embeddings[0];
				if (embedding) {
					embeddings.push(embedding);
					if (options.onChunk) {
						const callbackText = callbackChunks?.[i]?.text ?? chunk.text;
						options.onChunk(callbackText, embedding);
					}
				}
			});
		}
		const aggregation = options.output?.aggregation ?? "average";
		const aggregated = aggregateVectors(embeddings, aggregation);
		const source = {
			model,
			chunks: chunks.length,
			tokens: totalTokens || estimateTokens(inputText),
			checksum: generateChecksum(inputText),
			cached: false,
			latencyMs: Date.now() - startTime
		};
		let result;
		if (aggregated.type === "single") result = {
			status: "success",
			aggregation,
			vector: aggregated.vector,
			source
		};
		else result = {
			status: "success",
			aggregation: "all",
			vectors: aggregated.vectors,
			source
		};
		await cache.set(cacheKey, result, { ttlMs: options.cache?.ttlMs });
		if (options.onMetrics) {
			const metrics = {
				provider: provider.name,
				model,
				inputTokens: source.tokens,
				outputDimensions: aggregated.dimensions,
				chunks: chunks.length,
				latencyMs: source.latencyMs,
				cached: false,
				retries: retryCount,
				piiRedacted
			};
			options.onMetrics(metrics);
		}
		return result;
	} catch (error) {
		const reason = error instanceof Error ? error.message : String(error);
		if (error instanceof require_enhancer.ScrapeError && ["INVALID_URL", "BLOCKED"].includes(error.code)) throw error;
		return createSkippedResult(reason, { latencyMs: Date.now() - startTime });
	}
}
function applyMaxTokensToChunking(chunking, maxTokens) {
	if (!maxTokens || maxTokens <= 0) return chunking;
	const baseSize = chunking?.size ?? DEFAULT_CHUNK_SIZE;
	const baseOverlap = chunking?.overlap ?? 50;
	const clampedSize = Math.min(baseSize, maxTokens);
	const clampedOverlap = Math.min(baseOverlap, Math.max(0, clampedSize - 1));
	return {
		...chunking,
		size: clampedSize,
		overlap: clampedOverlap
	};
}
/**
 * Embed arbitrary text directly.
 * Standalone function for embedding text outside of scrape().
 */
async function embed(text, options) {
	return generateEmbeddings({ textContent: text }, {
		...options,
		input: {
			...options.input,
			type: "textContent"
		}
	});
}
/**
 * Embed from existing ScrapedData.
 * Useful when you've already scraped and want to add embeddings later.
 */
async function embedScrapedData(data, options) {
	return generateEmbeddings(data, options);
}
/**
 * Create a skipped result with reason.
 */
function createSkippedResult(reason, partialSource) {
	return {
		status: "skipped",
		reason,
		source: partialSource ?? {}
	};
}

//#endregion
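An end-to-end sketch of the pipeline via the standalone `embed()` entry point: validate → redact → chunk → embed per chunk → aggregate → cache. The option values follow the fields the pipeline reads above; the text is hypothetical:

```ts
const result = await embed('Some page text worth indexing.', {
  provider: { type: 'custom', provider: createOllamaEmbedding() }, // any provider with name + embed()
  chunking: { size: 500, overlap: 50 },
  safety: { piiRedaction: { email: true }, minTextLength: 10 },
  output: { aggregation: 'average' },
});

if (result.status === 'success' && result.aggregation !== 'all') {
  result.vector;        // single aggregated embedding
  result.source.chunks; // number of chunks that were embedded
}
```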
//#region src/extractors/content.ts
const turndown$1 = new turndown.default({
	headingStyle: "atx",
	codeBlockStyle: "fenced",
	bulletListMarker: "-",
	emDelimiter: "_",
	strongDelimiter: "**",
	linkStyle: "inlined"
});
turndown$1.remove([
	"script",
	"style",
	"noscript",
	"iframe",
	"nav",
	"footer"
]);
/**
 * Extracts main content using Mozilla Readability.
 * Converts HTML to Markdown for LLM consumption.
 */
var ContentExtractor = class {
	name = "content";
	priority = 50;
	async extract(context) {
		const { options } = context;
		if (options.extractContent === false) return {};
		const article = new _mozilla_readability.Readability(context.getDocument().cloneNode(true)).parse();
		if (!article || !article.content) return this.extractFallback(context);
		let content = turndown$1.turndown(article.content);
		const maxLength = options.maxContentLength ?? 5e4;
		if (content.length > maxLength) content = `${content.slice(0, maxLength)}\n\n[Content truncated...]`;
		const textContent = (article.textContent ?? "").trim();
		const excerpt = this.createExcerpt(textContent);
		const wordCount = textContent.split(/\s+/).filter(Boolean).length;
		const contentType = this.detectContentType(context);
		return {
			content,
			textContent,
			excerpt: article.excerpt || excerpt,
			wordCount,
			contentType,
			title: article.title || void 0,
			author: article.byline || void 0,
			siteName: article.siteName || void 0
		};
	}
	extractFallback(context) {
		const { $ } = context;
		const bodyHtml = $("body").html() || "";
		const content = turndown$1.turndown(bodyHtml);
		const textContent = $("body").text().replace(/\s+/g, " ").trim();
		return {
			content: content.slice(0, context.options.maxContentLength ?? 5e4),
			textContent,
			excerpt: this.createExcerpt(textContent),
			wordCount: textContent.split(/\s+/).filter(Boolean).length,
			contentType: "unknown"
		};
	}
	createExcerpt(text, maxLength = 300) {
		if (text.length <= maxLength) return text;
		const truncated = text.slice(0, maxLength);
		const lastSpace = truncated.lastIndexOf(" ");
		return `${lastSpace > 0 ? truncated.slice(0, lastSpace) : truncated}...`;
	}
	detectContentType(context) {
		const { $, finalUrl } = context;
		const url = finalUrl.toLowerCase();
		if (url.includes("github.com") && !url.includes("/blob/") && !url.includes("/issues/")) {
			if ($("meta[property=\"og:type\"]").attr("content") === "object" || url.match(/github\.com\/[^/]+\/[^/]+\/?$/)) return "repo";
		}
		if (url.includes("npmjs.com/package/")) return "package";
		if (url.includes("pypi.org/project/")) return "package";
		if (url.includes("/docs/") || url.includes(".readthedocs.") || url.includes("/documentation/")) return "docs";
		if (url.includes("youtube.com") || url.includes("vimeo.com") || url.includes("youtu.be")) return "video";
		const hasPrice = $("[class*=\"price\"], [data-price], [itemprop=\"price\"]").length > 0;
		const hasAddToCart = $("[class*=\"cart\"], [class*=\"buy\"], button:contains(\"Add\")").length > 0;
		if (hasPrice || hasAddToCart) return "product";
		const ogType = $("meta[property=\"og:type\"]").attr("content")?.toLowerCase();
		if (ogType === "article" || ogType === "blog" || ogType === "news") return "article";
		const hasArticleTag = $("article").length > 0;
		const hasDateline = $("time[datetime], [class*=\"date\"], [class*=\"byline\"]").length > 0;
		if (hasArticleTag && hasDateline) return "article";
		return "unknown";
	}
};

//#endregion
//#region src/utils/url.ts
/**
 * Common tracking parameters to remove from URLs
 */
const TRACKING_PARAMS = [
	"utm_source",
	"utm_medium",
	"utm_campaign",
	"utm_term",
	"utm_content",
	"utm_id",
	"ref",
	"fbclid",
	"gclid",
	"gclsrc",
	"dclid",
	"msclkid",
	"mc_cid",
	"mc_eid",
	"_ga",
	"_gl",
	"source",
	"referrer"
];
/**
 * Validate if a string is a valid URL
 */
function isValidUrl(url) {
	try {
		const parsed = new URL(url);
		return ["http:", "https:"].includes(parsed.protocol);
	} catch {
		return false;
	}
}
/**
 * Normalize URL by removing tracking params and trailing slashes
 */
function normalizeUrl(url) {
	try {
		const parsed = new URL(url);
		for (const param of TRACKING_PARAMS) parsed.searchParams.delete(param);
		let normalized = parsed.toString();
		if (normalized.endsWith("/") && parsed.pathname !== "/") normalized = normalized.slice(0, -1);
		return normalized;
	} catch {
		return url;
	}
}
/**
 * Extract domain from URL (without www prefix)
 */
function extractDomain(url) {
	try {
		return new URL(url).hostname.replace(/^www\./, "");
	} catch {
		return "";
	}
}
/**
 * Resolve a potentially relative URL against a base URL
 */
function resolveUrl(url, baseUrl) {
	if (!url) return void 0;
	try {
		return new URL(url, baseUrl).href;
	} catch {
		return url;
	}
}
/**
 * Check if a URL is external relative to a domain
 */
function isExternalUrl(url, baseDomain) {
	try {
		return new URL(url).hostname.replace(/^www\./, "") !== baseDomain;
	} catch {
		return false;
	}
}
/**
 * Extract protocol from URL
 */
function getProtocol(url) {
	try {
		return new URL(url).protocol;
	} catch {
		return "";
	}
}
/**
 * Get the path portion of a URL
 */
function getPath(url) {
	try {
		return new URL(url).pathname;
	} catch {
		return "";
	}
}
/**
 * Check if URL matches a pattern (supports * wildcard)
 */
function matchesUrlPattern(url, pattern) {
	if (!pattern.includes("*")) return url === pattern || url.startsWith(pattern);
	const regexPattern = pattern.replace(/[.+?^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
	return (/* @__PURE__ */ new RegExp(`^${regexPattern}`)).test(url);
}

//#endregion
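A quick sketch of the URL helpers above on hypothetical inputs:

```ts
normalizeUrl('https://example.com/post/?utm_source=x&utm_medium=y');
// → 'https://example.com/post' (tracking params and trailing slash removed)

extractDomain('https://www.example.com/a'); // → 'example.com'

matchesUrlPattern('https://example.com/docs/intro', 'https://example.com/docs/*');
// → true (the '*' wildcard becomes '.*' in an anchored regex)
```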
|
|
1295
|
+
//#region src/extractors/favicon.ts
|
|
1296
|
+
/**
|
|
1297
|
+
* Extracts favicon URL from the page.
|
|
1298
|
+
* Checks multiple sources in order of preference.
|
|
1299
|
+
*/
|
|
1300
|
+
var FaviconExtractor = class {
|
|
1301
|
+
name = "favicon";
|
|
1302
|
+
priority = 70;
|
|
1303
|
+
async extract(context) {
|
|
1304
|
+
const { $, finalUrl } = context;
|
|
1305
|
+
for (const selector of [
|
|
1306
|
+
"link[rel=\"icon\"][type=\"image/svg+xml\"]",
|
|
1307
|
+
"link[rel=\"icon\"][sizes=\"192x192\"]",
|
|
1308
|
+
"link[rel=\"icon\"][sizes=\"180x180\"]",
|
|
1309
|
+
"link[rel=\"icon\"][sizes=\"128x128\"]",
|
|
1310
|
+
"link[rel=\"icon\"][sizes=\"96x96\"]",
|
|
1311
|
+
"link[rel=\"apple-touch-icon\"][sizes=\"180x180\"]",
|
|
1312
|
+
"link[rel=\"apple-touch-icon\"]",
|
|
1313
|
+
"link[rel=\"icon\"][sizes=\"32x32\"]",
|
|
1314
|
+
"link[rel=\"icon\"]",
|
|
1315
|
+
"link[rel=\"shortcut icon\"]"
|
|
1316
|
+
]) {
|
|
1317
|
+
const href = $(selector).first().attr("href");
|
|
1318
|
+
if (href) return { favicon: resolveUrl(finalUrl, href) };
|
|
1319
|
+
}
|
|
1320
|
+
try {
|
|
1321
|
+
const url = new URL(finalUrl);
|
|
1322
|
+
return { favicon: `${url.protocol}//${url.host}/favicon.ico` };
|
|
1323
|
+
} catch {
|
|
1324
|
+
return {};
|
|
1325
|
+
}
|
|
1326
|
+
}
|
|
1327
|
+
};
|
|
1328
|
+
|
|
1329
|
+
//#endregion
|
|
1330
|
+
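// Extractor-contract sketch (HTML snippet illustrative): every extractor above and
// below follows the same shape: a `name`, a numeric `priority` (higher runs first),
// and an async `extract(context)` returning a partial result for mergeResults.
// Fed a page containing <link rel="icon" href="/icon.svg">, FaviconExtractor
// resolves the href against context.finalUrl and returns
//   { favicon: "https://example.com/icon.svg" }
// falling back to the conventional /favicon.ico when no <link> matches.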
//#region src/extractors/jsonld.ts
/**
 * Extracts JSON-LD structured data from the page.
 * Also extracts additional metadata from structured data.
 */
var JsonLdExtractor = class {
	name = "jsonld";
	priority = 80;
	async extract(context) {
		const { $ } = context;
		const jsonLd = [];
		$("script[type=\"application/ld+json\"]").each((_, el) => {
			const content = $(el).html();
			if (!content) return;
			try {
				const parsed = JSON.parse(content);
				if (Array.isArray(parsed)) jsonLd.push(...parsed);
				else if (typeof parsed === "object" && parsed !== null) jsonLd.push(parsed);
			} catch {}
		});
		if (jsonLd.length === 0) return {};
		return {
			jsonLd,
			...this.extractMetadata(jsonLd)
		};
	}
	extractMetadata(jsonLd) {
		const result = {};
		for (const item of jsonLd) {
			const type = this.getType(item);
			if (type?.match(/Article|BlogPosting|NewsArticle|WebPage/i)) {
				result.title = result.title || this.getString(item, "headline", "name");
				result.description = result.description || this.getString(item, "description");
				result.author = result.author || this.getAuthor(item);
				result.publishedAt = result.publishedAt || this.getString(item, "datePublished");
				result.modifiedAt = result.modifiedAt || this.getString(item, "dateModified");
				result.image = result.image || this.getImage(item);
			}
			if (type === "Organization") result.siteName = result.siteName || this.getString(item, "name");
			if (type === "Product") {
				result.title = result.title || this.getString(item, "name");
				result.description = result.description || this.getString(item, "description");
				result.image = result.image || this.getImage(item);
			}
			if (type === "SoftwareApplication") {
				result.title = result.title || this.getString(item, "name");
				result.description = result.description || this.getString(item, "description");
			}
			const keywords = this.getKeywords(item);
			if (keywords.length > 0) result.keywords = [...result.keywords || [], ...keywords];
		}
		if (result.keywords) result.keywords = [...new Set(result.keywords)];
		return result;
	}
	getType(item) {
		const type = item["@type"];
		if (typeof type === "string") return type;
		if (Array.isArray(type)) return type[0];
	}
	getString(item, ...keys) {
		for (const key of keys) {
			const value = item[key];
			if (typeof value === "string") return value;
			if (typeof value === "object" && value !== null && "@value" in value) return String(value["@value"]);
		}
	}
	getAuthor(item) {
		const author = item.author;
		if (typeof author === "string") return author;
		if (Array.isArray(author)) return author.map((a) => typeof a === "string" ? a : this.getString(a, "name")).filter(Boolean).join(", ") || void 0;
		if (typeof author === "object" && author !== null) {
			const authorObj = author;
			return this.getString(authorObj, "name") || void 0;
		}
	}
	getImage(item) {
		const image = item.image;
		if (typeof image === "string") return image;
		if (Array.isArray(image) && image.length > 0) return this.getImage({ image: image[0] });
		if (typeof image === "object" && image !== null) {
			const imageObj = image;
			return this.getString(imageObj, "url", "contentUrl") || void 0;
		}
	}
	getKeywords(item) {
		const keywords = item.keywords;
		if (typeof keywords === "string") return keywords.split(",").map((k) => k.trim()).filter(Boolean);
		if (Array.isArray(keywords)) return keywords.filter((k) => typeof k === "string");
		return [];
	}
};

//#endregion
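// Mapping sketch (input illustrative, not from any real page): given
//   <script type="application/ld+json">
//     {"@type": "NewsArticle", "headline": "Hello", "datePublished": "2024-01-02",
//      "author": {"@type": "Person", "name": "Ada"}}
//   </script>
// the extractor above returns
//   { jsonLd: [ ...the parsed object... ], title: "Hello",
//     publishedAt: "2024-01-02", author: "Ada" }
// leaving precedence against MetaExtractor's values to mergeResults.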
//#region src/extractors/links.ts
/**
 * Extracts links from the page content.
 * Filters out navigation/footer links and focuses on content links.
 */
var LinksExtractor = class {
	name = "links";
	priority = 30;
	async extract(context) {
		const { $, finalUrl } = context;
		const links = [];
		const seen = /* @__PURE__ */ new Set();
		const contentArea = $("article, main, [role=\"main\"]").first();
		const container = contentArea.length > 0 ? contentArea : $("body");
		const skipSelectors = "nav, header, footer, aside, [role=\"navigation\"], [class*=\"nav\"], [class*=\"footer\"], [class*=\"header\"], [class*=\"sidebar\"], [class*=\"menu\"]";
		container.find("a[href]").each((_, el) => {
			const $el = $(el);
			if ($el.closest(skipSelectors).length > 0) return;
			const href = $el.attr("href");
			if (!href) return;
			if (href.startsWith("#") || href.startsWith("javascript:") || href.startsWith("mailto:") || href.startsWith("tel:")) return;
			const resolvedUrl = resolveUrl(href, finalUrl);
			if (!resolvedUrl || !isValidUrl(resolvedUrl)) return;
			if (seen.has(resolvedUrl)) return;
			seen.add(resolvedUrl);
			const text = $el.text().trim() || $el.attr("title") || $el.attr("aria-label") || "";
			if (text.length < 2) return;
			const baseDomain = extractDomain(finalUrl);
			links.push({
				url: resolvedUrl,
				text: text.slice(0, 200),
				isExternal: isExternalUrl(resolvedUrl, baseDomain)
			});
		});
		return { links: links.slice(0, 100) };
	}
};

//#endregion
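// Result-shape sketch (values illustrative): each collected entry looks like
//   { url: "https://other.example/post", text: "Further reading", isExternal: true }
// capped at 100 per page; fragment, javascript:, mailto: and tel: hrefs, links
// inside nav/header/footer/sidebar containers, and anchors with under 2
// characters of text are all skipped by the guards above.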
//#region src/extractors/meta.ts
/**
 * Extracts metadata from HTML meta tags, Open Graph, and Twitter cards.
 * Runs first to provide basic metadata for other extractors.
 */
var MetaExtractor = class {
	name = "meta";
	priority = 100;
	async extract(context) {
		const { $ } = context;
		const getMeta = (nameOrProperty) => {
			return ($(`meta[name="${nameOrProperty}"]`).attr("content") || $(`meta[property="${nameOrProperty}"]`).attr("content") || $(`meta[itemprop="${nameOrProperty}"]`).attr("content"))?.trim() || void 0;
		};
		const title = getMeta("og:title") || getMeta("twitter:title") || $("title").first().text().trim() || "";
		const description = getMeta("og:description") || getMeta("twitter:description") || getMeta("description") || "";
		const image = getMeta("og:image") || getMeta("twitter:image") || getMeta("twitter:image:src") || void 0;
		const canonicalUrl = $("link[rel=\"canonical\"]").attr("href") || getMeta("og:url") || context.finalUrl;
		const author = getMeta("author") || getMeta("article:author") || getMeta("twitter:creator") || $("[rel=\"author\"]").first().text().trim() || void 0;
		const siteName = getMeta("og:site_name") || getMeta("application-name") || void 0;
		const publishedAt = getMeta("article:published_time") || getMeta("datePublished") || getMeta("date") || $("time[datetime]").first().attr("datetime") || void 0;
		const modifiedAt = getMeta("article:modified_time") || getMeta("dateModified") || void 0;
		const language = $("html").attr("lang") || getMeta("og:locale") || getMeta("language") || void 0;
		const keywordsRaw = getMeta("keywords") || getMeta("article:tag") || "";
		return {
			title,
			description,
			image,
			canonicalUrl,
			author,
			siteName,
			publishedAt,
			modifiedAt,
			language,
			keywords: keywordsRaw ? keywordsRaw.split(",").map((k) => k.trim()).filter(Boolean) : []
		};
	}
};

//#endregion
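// Fallback-chain sketch (markup illustrative): for a document containing
//   <meta property="og:title" content="OG Title">
//   <title>Doc Title</title>
// `title` resolves to "OG Title"; absent og:/twitter: tags it falls back to the
// <title> element. Each field above walks its own chain the same way, which is
// why this extractor runs first at priority 100.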
//#region src/extractors/index.ts
/**
 * Default extractors in priority order.
 * Higher priority runs first.
 */
function createDefaultExtractors() {
	return [
		new MetaExtractor(),
		new JsonLdExtractor(),
		new FaviconExtractor(),
		new ContentExtractor(),
		new LinksExtractor()
	];
}
/**
 * Sort extractors by priority (higher first).
 */
function sortExtractors(extractors) {
	return [...extractors].sort((a, b) => (b.priority ?? 0) - (a.priority ?? 0));
}

//#endregion
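// Custom-extractor sketch (the reading-time logic and 200-wpm figure are
// illustrative, not part of the library): user extractors plug into the same
// pipeline and are sorted by sortExtractors alongside the defaults.
//
//   const readingTime = {
//     name: "reading-time",
//     priority: 10, // lower than the defaults above, so it sees their results
//     async extract(context) {
//       const words = context.results.wordCount || 0;
//       return { custom: { readingTimeMin: Math.ceil(words / 200) } };
//     }
//   };
//   // scrape(url, { extractors: [readingTime] }) appends it to the defaults;
//   // scrape(url, { extractors: [readingTime], replaceDefaultExtractors: true })
//   // runs it alone.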
//#region src/fetchers/types.ts
/**
 * Default user agent string
 */
const DEFAULT_USER_AGENT = "Scrapex-Bot/2.0 (+https://github.com/developer-rakeshpaul/scrapex)";
/**
 * Default timeout in milliseconds
 */
const DEFAULT_TIMEOUT = 1e4;

//#endregion
//#region src/fetchers/fetch.ts
/**
 * Default fetcher using native fetch API.
 * Works in Node.js 18+ without polyfills.
 */
var NativeFetcher = class {
	name = "native-fetch";
	async fetch(url, options = {}) {
		const { timeout = DEFAULT_TIMEOUT, userAgent = DEFAULT_USER_AGENT, headers = {} } = options;
		let parsedUrl;
		try {
			parsedUrl = new URL(url);
		} catch {
			throw new require_enhancer.ScrapeError(`Invalid URL: ${url}`, "INVALID_URL");
		}
		if (!["http:", "https:"].includes(parsedUrl.protocol)) throw new require_enhancer.ScrapeError(`Invalid protocol: ${parsedUrl.protocol}`, "INVALID_URL");
		const controller = new AbortController();
		const timeoutId = setTimeout(() => controller.abort(), timeout);
		try {
			const response = await fetch(url, {
				signal: controller.signal,
				headers: {
					"User-Agent": userAgent,
					Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
					"Accept-Language": "en-US,en;q=0.5",
					...headers
				},
				redirect: "follow"
			});
			clearTimeout(timeoutId);
			if (!response.ok) {
				if (response.status === 404) throw new require_enhancer.ScrapeError(`Page not found: ${url}`, "NOT_FOUND", 404);
				if (response.status === 403 || response.status === 401) throw new require_enhancer.ScrapeError(`Access blocked: ${url}`, "BLOCKED", response.status);
				if (response.status === 429) throw new require_enhancer.ScrapeError(`Rate limited: ${url}`, "BLOCKED", 429);
				throw new require_enhancer.ScrapeError(`HTTP error ${response.status}: ${url}`, "FETCH_FAILED", response.status);
			}
			const contentType = response.headers.get("content-type") || "";
			if (options.allowedContentTypes) {
				if (!options.allowedContentTypes.some((type) => contentType.toLowerCase().includes(type.toLowerCase()))) throw new require_enhancer.ScrapeError(`Unexpected content type: ${contentType}`, "PARSE_ERROR");
			} else if (!contentType.includes("text/html") && !contentType.includes("application/xhtml")) throw new require_enhancer.ScrapeError(`Unexpected content type: ${contentType}`, "PARSE_ERROR");
			const html = await response.text();
			const responseHeaders = {};
			response.headers.forEach((value, key) => {
				responseHeaders[key] = value;
			});
			return {
				html,
				finalUrl: response.url,
				statusCode: response.status,
				contentType,
				headers: responseHeaders
			};
		} catch (error) {
			clearTimeout(timeoutId);
			if (error instanceof require_enhancer.ScrapeError) throw error;
			if (error instanceof Error && error.name === "AbortError") throw new require_enhancer.ScrapeError(`Request timed out after ${timeout}ms`, "TIMEOUT");
			if (error instanceof Error) throw new require_enhancer.ScrapeError(`Fetch failed: ${error.message}`, "FETCH_FAILED", void 0, error);
			throw new require_enhancer.ScrapeError("Unknown fetch error", "FETCH_FAILED");
		}
	}
};
/**
 * Default fetcher instance
 */
const defaultFetcher = new NativeFetcher();

//#endregion
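// Custom-fetcher sketch (illustrative; assumes only the contract visible above):
// any object with a `name` and a `fetch(url, options)` resolving to
// { html, finalUrl, statusCode, contentType, headers } can stand in for the
// default, e.g. to serve cached HTML:
//
//   const cachedFetcher = {
//     name: "cached-fetch",
//     async fetch(url, options = {}) {
//       const cached = myCache.get(url); // `myCache` is hypothetical
//       if (cached) return { html: cached, finalUrl: url, statusCode: 200,
//         contentType: "text/html", headers: {} };
//       return defaultFetcher.fetch(url, options);
//     }
//   };
//   // scrape(url, { fetcher: cachedFetcher })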
//#region src/fetchers/robots.ts
/**
 * Check if URL is allowed by robots.txt
 *
 * @param url - The URL to check
 * @param userAgent - User agent to check rules for
 * @returns Whether the URL is allowed and optional reason
 */
async function checkRobotsTxt(url, userAgent = DEFAULT_USER_AGENT) {
	try {
		const parsedUrl = new URL(url);
		const robotsUrl = `${parsedUrl.protocol}//${parsedUrl.host}/robots.txt`;
		const response = await fetch(robotsUrl, {
			headers: { "User-Agent": userAgent },
			signal: AbortSignal.timeout(5e3)
		});
		if (!response.ok) return { allowed: true };
		const allowed = isPathAllowed(parseRobotsTxt(await response.text(), userAgent), parsedUrl.pathname + parsedUrl.search);
		return {
			allowed,
			reason: allowed ? void 0 : "Blocked by robots.txt"
		};
	} catch {
		return { allowed: true };
	}
}
/**
 * Parse robots.txt content for a specific user agent
 */
function parseRobotsTxt(content, userAgent) {
	const rules = {
		disallow: [],
		allow: []
	};
	const lines = content.split("\n");
	const botName = userAgent.split(/[\s/]/)[0]?.toLowerCase() || "";
	let currentAgent = "";
	let isMatchingAgent = false;
	let hasFoundSpecificAgent = false;
	for (const rawLine of lines) {
		const line = rawLine.trim();
		if (!line || line.startsWith("#")) continue;
		const colonIndex = line.indexOf(":");
		if (colonIndex === -1) continue;
		const directive = line.slice(0, colonIndex).trim().toLowerCase();
		const value = line.slice(colonIndex + 1).trim();
		if (directive === "user-agent") {
			currentAgent = value.toLowerCase();
			isMatchingAgent = currentAgent === "*" || currentAgent === botName || botName.includes(currentAgent);
			if (currentAgent !== "*" && isMatchingAgent) {
				hasFoundSpecificAgent = true;
				rules.disallow = [];
				rules.allow = [];
			}
		} else if (isMatchingAgent && (!hasFoundSpecificAgent || currentAgent !== "*")) {
			if (directive === "disallow" && value) rules.disallow.push(value);
			else if (directive === "allow" && value) rules.allow.push(value);
		}
	}
	return rules;
}
/**
 * Check if a path is allowed based on robots.txt rules
 */
function isPathAllowed(rules, path) {
	if (rules.disallow.length === 0 && rules.allow.length === 0) return true;
	for (const pattern of rules.allow) if (matchesPattern(path, pattern)) return true;
	for (const pattern of rules.disallow) if (matchesPattern(path, pattern)) return false;
	return true;
}
/**
 * Check if a path matches a robots.txt pattern
 */
function matchesPattern(path, pattern) {
	if (!pattern) return false;
	if (pattern.endsWith("*")) return path.startsWith(pattern.slice(0, -1));
	if (pattern.endsWith("$")) return path === pattern.slice(0, -1);
	if (pattern.includes("*")) return (/* @__PURE__ */ new RegExp(`^${pattern.replace(/\*/g, ".*").replace(/\?/g, "\\?")}.*`)).test(path);
	return path.startsWith(pattern);
}

//#endregion
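// Walk-through (illustrative robots.txt, evaluated by the functions above):
//
//   User-agent: *
//   Disallow: /private/
//   Allow: /private/press/
//
// parseRobotsTxt(..., "Scrapex-Bot/2.0") keeps both rules under the `*` group;
// isPathAllowed then checks Allow patterns before Disallow, so
//   "/private/press/kit" -> allowed, "/private/notes" -> blocked.
// Note the matcher is prefix-based, per the simplified grammar implemented here.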
//#region src/core/scrape.ts
/**
 * Scrape a URL and extract metadata and content.
 *
 * @param url - The URL to scrape
 * @param options - Scraping options
 * @returns Scraped data with metadata and content
 *
 * @example
 * ```ts
 * const result = await scrape('https://example.com/article');
 * console.log(result.title, result.content);
 * ```
 */
async function scrape(url, options = {}) {
	const startTime = Date.now();
	if (!isValidUrl(url)) throw new require_enhancer.ScrapeError("Invalid URL provided", "INVALID_URL");
	const normalizedUrl = normalizeUrl(url);
	if (options.respectRobots) {
		const robotsResult = await checkRobotsTxt(normalizedUrl, options.userAgent);
		if (!robotsResult.allowed) throw new require_enhancer.ScrapeError(`URL blocked by robots.txt: ${robotsResult.reason || "disallowed"}`, "ROBOTS_BLOCKED");
	}
	const fetchResult = await (options.fetcher ?? defaultFetcher).fetch(normalizedUrl, {
		timeout: options.timeout,
		userAgent: options.userAgent
	});
	await preloadJsdom();
	let context = createExtractionContext(normalizedUrl, fetchResult.finalUrl, fetchResult.html, options);
	let extractors;
	if (options.replaceDefaultExtractors) extractors = options.extractors ?? [];
	else {
		const defaults = createDefaultExtractors();
		extractors = options.extractors ? [...defaults, ...options.extractors] : defaults;
	}
	extractors = sortExtractors(extractors);
	for (const extractor of extractors) try {
		const extracted = await extractor.extract(context);
		context = mergeResults(context, extracted);
	} catch (error) {
		console.error(`Extractor "${extractor.name}" failed:`, error);
		context = mergeResults(context, { error: context.results.error ? `${context.results.error}; ${extractor.name}: ${error instanceof Error ? error.message : String(error)}` : `${extractor.name}: ${error instanceof Error ? error.message : String(error)}` });
	}
	const intermediateResult = {
		url: normalizedUrl,
		canonicalUrl: context.results.canonicalUrl || fetchResult.finalUrl,
		domain: extractDomain(fetchResult.finalUrl),
		title: context.results.title || "",
		description: context.results.description || "",
		image: context.results.image,
		favicon: context.results.favicon,
		content: context.results.content || "",
		textContent: context.results.textContent || "",
		excerpt: context.results.excerpt || "",
		wordCount: context.results.wordCount || 0,
		author: context.results.author,
		publishedAt: context.results.publishedAt,
		modifiedAt: context.results.modifiedAt,
		siteName: context.results.siteName,
		language: context.results.language,
		contentType: context.results.contentType || "unknown",
		keywords: context.results.keywords || [],
		jsonLd: context.results.jsonLd,
		links: context.results.links,
		custom: context.results.custom,
		scrapedAt: (/* @__PURE__ */ new Date()).toISOString(),
		scrapeTimeMs: 0,
		error: context.results.error
	};
	if (options.llm && options.enhance && options.enhance.length > 0) try {
		const enhanced = await require_enhancer.enhance(intermediateResult, options.llm, options.enhance);
		Object.assign(intermediateResult, enhanced);
	} catch (error) {
		console.error("LLM enhancement failed:", error);
		intermediateResult.error = intermediateResult.error ? `${intermediateResult.error}; LLM: ${error instanceof Error ? error.message : String(error)}` : `LLM: ${error instanceof Error ? error.message : String(error)}`;
	}
	if (options.llm && options.extract) try {
		intermediateResult.extracted = await require_enhancer.extract(intermediateResult, options.llm, options.extract);
	} catch (error) {
		console.error("LLM extraction failed:", error);
		intermediateResult.error = intermediateResult.error ? `${intermediateResult.error}; LLM extraction: ${error instanceof Error ? error.message : String(error)}` : `LLM extraction: ${error instanceof Error ? error.message : String(error)}`;
	}
	if (options.embeddings) intermediateResult.embeddings = await generateEmbeddings(intermediateResult, options.embeddings);
	const scrapeTimeMs = Date.now() - startTime;
	return {
		...intermediateResult,
		scrapeTimeMs
	};
}
/**
 * Scrape from raw HTML string (no fetch).
 *
 * @param html - The HTML content
 * @param url - The URL (for resolving relative links)
 * @param options - Scraping options
 * @returns Scraped data with metadata and content
 *
 * @example
 * ```ts
 * const html = await fetchSomehow('https://example.com');
 * const result = await scrapeHtml(html, 'https://example.com');
 * ```
 */
async function scrapeHtml(html, url, options = {}) {
	const startTime = Date.now();
	if (!isValidUrl(url)) throw new require_enhancer.ScrapeError("Invalid URL provided", "INVALID_URL");
	const normalizedUrl = normalizeUrl(url);
	await preloadJsdom();
	let context = createExtractionContext(normalizedUrl, normalizedUrl, html, options);
	let extractors;
	if (options.replaceDefaultExtractors) extractors = options.extractors ?? [];
	else {
		const defaults = createDefaultExtractors();
		extractors = options.extractors ? [...defaults, ...options.extractors] : defaults;
	}
	extractors = sortExtractors(extractors);
	for (const extractor of extractors) try {
		const extracted = await extractor.extract(context);
		context = mergeResults(context, extracted);
	} catch (error) {
		console.error(`Extractor "${extractor.name}" failed:`, error);
		context = mergeResults(context, { error: context.results.error ? `${context.results.error}; ${extractor.name}: ${error instanceof Error ? error.message : String(error)}` : `${extractor.name}: ${error instanceof Error ? error.message : String(error)}` });
	}
	const domain = extractDomain(normalizedUrl);
	const intermediateResult = {
		url: normalizedUrl,
		canonicalUrl: context.results.canonicalUrl || normalizedUrl,
		domain,
		title: context.results.title || "",
		description: context.results.description || "",
		image: context.results.image,
		favicon: context.results.favicon,
		content: context.results.content || "",
		textContent: context.results.textContent || "",
		excerpt: context.results.excerpt || "",
		wordCount: context.results.wordCount || 0,
		author: context.results.author,
		publishedAt: context.results.publishedAt,
		modifiedAt: context.results.modifiedAt,
		siteName: context.results.siteName,
		language: context.results.language,
		contentType: context.results.contentType || "unknown",
		keywords: context.results.keywords || [],
		jsonLd: context.results.jsonLd,
		links: context.results.links,
		summary: context.results.summary,
		suggestedTags: context.results.suggestedTags,
		entities: context.results.entities,
		extracted: context.results.extracted,
		custom: context.results.custom,
		scrapedAt: (/* @__PURE__ */ new Date()).toISOString(),
		scrapeTimeMs: 0,
		error: context.results.error
	};
	if (options.embeddings) intermediateResult.embeddings = await generateEmbeddings(intermediateResult, options.embeddings);
	const scrapeTimeMs = Date.now() - startTime;
	return {
		...intermediateResult,
		scrapeTimeMs
	};
}

//#endregion
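// Usage sketch (values illustrative; option names are exactly those read by
// scrape() above):
//
//   const result = await scrape("https://example.com/article", {
//     respectRobots: true, // consult robots.txt via checkRobotsTxt first
//     timeout: 15000,      // forwarded to the fetcher
//   });
//   console.log(result.title, result.wordCount, result.scrapeTimeMs);
//
// scrapeHtml(html, url, options) runs the same extractor pipeline on HTML you
// already have, skipping the fetch and robots.txt steps.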
//#region src/utils/feed.ts
/**
 * Fetch and parse an RSS/Atom feed from a URL.
 * Uses scrapex's fetcher infrastructure for consistent behavior.
 */
async function fetchFeed(url, options) {
	const result = await (options?.fetcher || defaultFetcher).fetch(url, {
		timeout: options?.timeout,
		userAgent: options?.userAgent,
		allowedContentTypes: [
			"application/rss+xml",
			"application/atom+xml",
			"application/rdf+xml",
			"application/xml",
			"text/xml",
			"text/html"
		]
	});
	return new require_parsers.RSSParser(options?.parserOptions).parse(result.html, url);
}
/**
 * Detect RSS/Atom feed URLs from HTML.
 * Supports RSS, Atom, and RDF feed types.
 */
function discoverFeeds(html, baseUrl) {
	const $ = cheerio.load(html);
	const feeds = [];
	const seen = /* @__PURE__ */ new Set();
	$([
		"link[type=\"application/rss+xml\"]",
		"link[type=\"application/atom+xml\"]",
		"link[type=\"application/rdf+xml\"]",
		"link[rel=\"alternate\"][type*=\"xml\"]"
	].join(", ")).each((_, el) => {
		const href = $(el).attr("href");
		if (href) try {
			const resolved = new URL(href, baseUrl).href;
			if (!seen.has(resolved)) {
				seen.add(resolved);
				feeds.push(resolved);
			}
		} catch {}
	});
	return feeds;
}
/**
 * Filter feed items by date range.
 * Items without publishedAt are included by default.
 */
function filterByDate(items, options) {
	const { after, before, includeUndated = true } = options;
	return items.filter((item) => {
		if (!item.publishedAt) return includeUndated;
		const date = new Date(item.publishedAt);
		if (after && date < after) return false;
		if (before && date > before) return false;
		return true;
	});
}
/**
 * Convert feed items to markdown for LLM consumption.
 * Uses ISO 8601 date format for consistency across environments.
 */
function feedToMarkdown(feed, options) {
	const { includeContent = false, maxItems } = options || {};
	const lines = [`# ${feed.title}`, ""];
	if (feed.description) lines.push(feed.description, "");
	const items = maxItems ? feed.items.slice(0, maxItems) : feed.items;
	for (const item of items) {
		lines.push(`## ${item.title}`);
		if (item.publishedAt) {
			const date = item.publishedAt.split("T")[0];
			lines.push(`*${date}*`);
		}
		lines.push("");
		if (includeContent && item.content) lines.push(item.content);
		else if (item.description) lines.push(item.description);
		if (item.link) lines.push(`[Read more](${item.link})`, "");
		else lines.push("");
	}
	return lines.join("\n");
}
/**
 * Extract plain text from feed items for LLM processing.
 * Concatenates title, description, and content.
 */
function feedToText(feed, options) {
	const { maxItems, separator = "\n\n---\n\n" } = options || {};
	return (maxItems ? feed.items.slice(0, maxItems) : feed.items).map((item) => {
		const parts = [item.title];
		if (item.description) parts.push(item.description);
		if (item.content) parts.push(item.content);
		return parts.join("\n\n");
	}).join(separator);
}
/**
 * Paginate through a feed using rel="next" links (RFC 5005).
 * Returns an async generator that yields each page.
 */
async function* paginateFeed(url, options) {
	const { maxPages = 10, ...fetchOptions } = options || {};
	let currentUrl = url;
	let pageCount = 0;
	while (currentUrl && pageCount < maxPages) {
		const result = await fetchFeed(currentUrl, fetchOptions);
		yield result.data;
		currentUrl = result.data.next;
		pageCount++;
	}
}

//#endregion
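// Pipeline sketch (feed URL illustrative; relies only on the feed helpers above
// and on fetchFeed returning a `{ data }` wrapper, as paginateFeed assumes):
//
//   const { data: feed } = await fetchFeed("https://example.com/feed.xml");
//   const recent = filterByDate(feed.items, { after: new Date("2024-01-01") });
//   const md = feedToMarkdown({ ...feed, items: recent }, { maxItems: 20 });
//
//   // or stream archived pages (RFC 5005 rel="next"):
//   for await (const page of paginateFeed("https://example.com/feed.xml", { maxPages: 3 })) {
//     console.log(page.items.length);
//   }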
exports.ContentExtractor = ContentExtractor;
exports.DEFAULT_TIMEOUT = DEFAULT_TIMEOUT;
exports.DEFAULT_USER_AGENT = DEFAULT_USER_AGENT;
exports.FaviconExtractor = FaviconExtractor;
exports.InMemoryEmbeddingCache = InMemoryEmbeddingCache;
exports.JsonLdExtractor = JsonLdExtractor;
exports.LinksExtractor = LinksExtractor;
exports.MetaExtractor = MetaExtractor;
exports.NativeFetcher = NativeFetcher;
exports.RSSParser = require_parsers.RSSParser;
exports.ScrapeError = require_enhancer.ScrapeError;
exports.TRANSFORMERS_MODELS = TRANSFORMERS_MODELS;
exports.aggregateVectors = aggregateVectors;
exports.checkRobotsTxt = checkRobotsTxt;
exports.chunkText = chunkText;
exports.cosineSimilarity = cosineSimilarity;
exports.createAzureEmbedding = createAzureEmbedding;
exports.createDefaultExtractors = createDefaultExtractors;
exports.createEmbeddingProvider = createEmbeddingProvider;
exports.createExtractionContext = createExtractionContext;
exports.createHttpEmbedding = createHttpEmbedding;
exports.createHuggingFaceEmbedding = createHuggingFaceEmbedding;
exports.createOllamaEmbedding = createOllamaEmbedding;
exports.createOpenAIEmbedding = createOpenAIEmbedding;
exports.createPiiRedactor = createPiiRedactor;
exports.createTransformersEmbedding = createTransformersEmbedding;
exports.defaultFetcher = defaultFetcher;
exports.discoverFeeds = discoverFeeds;
exports.embed = embed;
exports.embedScrapedData = embedScrapedData;
exports.estimateTokens = estimateTokens;
exports.extractDomain = extractDomain;
exports.feedToMarkdown = feedToMarkdown;
exports.feedToText = feedToText;
exports.fetchFeed = fetchFeed;
exports.filterByDate = filterByDate;
exports.generateEmbeddings = generateEmbeddings;
exports.getPath = getPath;
exports.getProtocol = getProtocol;
exports.isExternalUrl = isExternalUrl;
exports.isValidUrl = isValidUrl;
exports.matchesUrlPattern = matchesUrlPattern;
exports.mergeResults = mergeResults;
exports.normalizeUrl = normalizeUrl;
exports.paginateFeed = paginateFeed;
exports.redactPii = redactPii;
exports.resolveUrl = resolveUrl;
exports.scrape = scrape;
exports.scrapeHtml = scrapeHtml;
exports.sortExtractors = sortExtractors;
//# sourceMappingURL=index.cjs.map