scrapex 0.5.2 → 1.0.0-alpha.1

This diff shows the content of publicly available package versions as published to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.
Files changed (43)
  1. package/LICENSE +1 -1
  2. package/README.md +392 -145
  3. package/dist/enhancer-Q6CSc1gA.mjs +220 -0
  4. package/dist/enhancer-Q6CSc1gA.mjs.map +1 -0
  5. package/dist/enhancer-oM4BhYYS.cjs +268 -0
  6. package/dist/enhancer-oM4BhYYS.cjs.map +1 -0
  7. package/dist/index.cjs +852 -0
  8. package/dist/index.cjs.map +1 -0
  9. package/dist/index.d.cts +264 -0
  10. package/dist/index.d.cts.map +1 -0
  11. package/dist/index.d.mts +264 -0
  12. package/dist/index.d.mts.map +1 -0
  13. package/dist/index.mjs +798 -0
  14. package/dist/index.mjs.map +1 -0
  15. package/dist/llm/index.cjs +316 -0
  16. package/dist/llm/index.cjs.map +1 -0
  17. package/dist/llm/index.d.cts +211 -0
  18. package/dist/llm/index.d.cts.map +1 -0
  19. package/dist/llm/index.d.mts +211 -0
  20. package/dist/llm/index.d.mts.map +1 -0
  21. package/dist/llm/index.mjs +310 -0
  22. package/dist/llm/index.mjs.map +1 -0
  23. package/dist/parsers/index.cjs +200 -0
  24. package/dist/parsers/index.cjs.map +1 -0
  25. package/dist/parsers/index.d.cts +133 -0
  26. package/dist/parsers/index.d.cts.map +1 -0
  27. package/dist/parsers/index.d.mts +133 -0
  28. package/dist/parsers/index.d.mts.map +1 -0
  29. package/dist/parsers/index.mjs +192 -0
  30. package/dist/parsers/index.mjs.map +1 -0
  31. package/dist/types-CNQZVW36.d.mts +150 -0
  32. package/dist/types-CNQZVW36.d.mts.map +1 -0
  33. package/dist/types-D0HYR95H.d.cts +150 -0
  34. package/dist/types-D0HYR95H.d.cts.map +1 -0
  35. package/package.json +80 -100
  36. package/dist/index.d.ts +0 -45
  37. package/dist/index.js +0 -8
  38. package/dist/scrapex.cjs.development.js +0 -1128
  39. package/dist/scrapex.cjs.development.js.map +0 -1
  40. package/dist/scrapex.cjs.production.min.js +0 -2
  41. package/dist/scrapex.cjs.production.min.js.map +0 -1
  42. package/dist/scrapex.esm.js +0 -1120
  43. package/dist/scrapex.esm.js.map +0 -1
package/dist/index.cjs ADDED
@@ -0,0 +1,852 @@
1
+ //#region rolldown:runtime
2
+ var __create = Object.create;
3
+ var __defProp = Object.defineProperty;
4
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
5
+ var __getOwnPropNames = Object.getOwnPropertyNames;
6
+ var __getProtoOf = Object.getPrototypeOf;
7
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
8
+ var __copyProps = (to, from, except, desc) => {
9
+ if (from && typeof from === "object" || typeof from === "function") {
10
+ for (var keys = __getOwnPropNames(from), i = 0, n = keys.length, key; i < n; i++) {
11
+ key = keys[i];
12
+ if (!__hasOwnProp.call(to, key) && key !== except) {
13
+ __defProp(to, key, {
14
+ get: ((k) => from[k]).bind(null, key),
15
+ enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable
16
+ });
17
+ }
18
+ }
19
+ }
20
+ return to;
21
+ };
22
+ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", {
23
+ value: mod,
24
+ enumerable: true
25
+ }) : target, mod));
26
+
27
+ //#endregion
28
+ const require_enhancer = require('./enhancer-oM4BhYYS.cjs');
29
+ let cheerio = require("cheerio");
30
+ cheerio = __toESM(cheerio);
31
+ let __mozilla_readability = require("@mozilla/readability");
32
+ let turndown = require("turndown");
33
+ turndown = __toESM(turndown);
34
+
35
+ //#region src/core/context.ts
36
+ let jsdomModule = null;
37
+ /**
38
+ * Preload JSDOM module (called once during scrape initialization)
39
+ */
40
+ async function preloadJsdom() {
41
+ if (!jsdomModule) jsdomModule = await import("jsdom");
42
+ }
43
+ /**
44
+ * Create an extraction context with lazy JSDOM loading.
45
+ *
46
+ * Cheerio is always available for fast DOM queries.
47
+ * JSDOM is only loaded when getDocument() is called (for Readability).
48
+ */
49
+ function createExtractionContext(url, finalUrl, html, options) {
50
+ let document = null;
51
+ return {
52
+ url,
53
+ finalUrl,
54
+ html,
55
+ $: cheerio.load(html),
56
+ options,
57
+ results: {},
58
+ getDocument() {
59
+ if (!document) {
60
+ if (!jsdomModule) throw new Error("JSDOM not preloaded. Call preloadJsdom() before using getDocument().");
61
+ document = new jsdomModule.JSDOM(html, { url: finalUrl }).window.document;
62
+ }
63
+ return document;
64
+ }
65
+ };
66
+ }
67
+ /**
68
+ * Merge partial results into the context
69
+ */
70
+ function mergeResults(context, extracted) {
71
+ return {
72
+ ...context,
73
+ results: {
74
+ ...context.results,
75
+ ...extracted,
76
+ custom: extracted.custom || context.results.custom ? {
77
+ ...context.results.custom,
78
+ ...extracted.custom
79
+ } : void 0
80
+ }
81
+ };
82
+ }
83
+
84
+ //#endregion
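The context helpers above keep JSDOM off the hot path: Cheerio parses the HTML up front, while `getDocument()` only touches JSDOM after `preloadJsdom()` has run (which `scrape()`/`scrapeHtml()` do before running extractors). A rough sketch of the Cheerio-only path, assuming the package entry re-exports `createExtractionContext` and `mergeResults` as the exports block at the end of this file suggests:

```ts
import { createExtractionContext, mergeResults } from 'scrapex';

const html = '<html><head><title>Hi</title></head><body><h1>Hi</h1></body></html>';

// Cheerio is available immediately via ctx.$; no JSDOM is loaded here.
let ctx = createExtractionContext('https://example.com', 'https://example.com', html, {});

const title = ctx.$('title').text();   // fast DOM query via Cheerio
ctx = mergeResults(ctx, { title });    // results are merged into a new context
console.log(ctx.results.title);        // "Hi"
```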
85
+ //#region src/extractors/content.ts
86
+ const turndown$1 = new turndown.default({
87
+ headingStyle: "atx",
88
+ codeBlockStyle: "fenced",
89
+ bulletListMarker: "-",
90
+ emDelimiter: "_",
91
+ strongDelimiter: "**",
92
+ linkStyle: "inlined"
93
+ });
94
+ turndown$1.remove([
95
+ "script",
96
+ "style",
97
+ "noscript",
98
+ "iframe",
99
+ "nav",
100
+ "footer"
101
+ ]);
102
+ /**
103
+ * Extracts main content using Mozilla Readability.
104
+ * Converts HTML to Markdown for LLM consumption.
105
+ */
106
+ var ContentExtractor = class {
107
+ name = "content";
108
+ priority = 50;
109
+ async extract(context) {
110
+ const { options } = context;
111
+ if (options.extractContent === false) return {};
112
+ const article = new __mozilla_readability.Readability(context.getDocument().cloneNode(true)).parse();
113
+ if (!article || !article.content) return this.extractFallback(context);
114
+ let content = turndown$1.turndown(article.content);
115
+ const maxLength = options.maxContentLength ?? 5e4;
116
+ if (content.length > maxLength) content = `${content.slice(0, maxLength)}\n\n[Content truncated...]`;
117
+ const textContent = (article.textContent ?? "").trim();
118
+ const excerpt = this.createExcerpt(textContent);
119
+ const wordCount = textContent.split(/\s+/).filter(Boolean).length;
120
+ const contentType = this.detectContentType(context);
121
+ return {
122
+ content,
123
+ textContent,
124
+ excerpt: article.excerpt || excerpt,
125
+ wordCount,
126
+ contentType,
127
+ title: article.title || void 0,
128
+ author: article.byline || void 0,
129
+ siteName: article.siteName || void 0
130
+ };
131
+ }
132
+ extractFallback(context) {
133
+ const { $ } = context;
134
+ const bodyHtml = $("body").html() || "";
135
+ const content = turndown$1.turndown(bodyHtml);
136
+ const textContent = $("body").text().replace(/\s+/g, " ").trim();
137
+ return {
138
+ content: content.slice(0, context.options.maxContentLength ?? 5e4),
139
+ textContent,
140
+ excerpt: this.createExcerpt(textContent),
141
+ wordCount: textContent.split(/\s+/).filter(Boolean).length,
142
+ contentType: "unknown"
143
+ };
144
+ }
145
+ createExcerpt(text, maxLength = 300) {
146
+ if (text.length <= maxLength) return text;
147
+ const truncated = text.slice(0, maxLength);
148
+ const lastSpace = truncated.lastIndexOf(" ");
149
+ return `${lastSpace > 0 ? truncated.slice(0, lastSpace) : truncated}...`;
150
+ }
151
+ detectContentType(context) {
152
+ const { $, finalUrl } = context;
153
+ const url = finalUrl.toLowerCase();
154
+ if (url.includes("github.com") && !url.includes("/blob/") && !url.includes("/issues/")) {
155
+ if ($("meta[property=\"og:type\"]").attr("content") === "object" || url.match(/github\.com\/[^/]+\/[^/]+\/?$/)) return "repo";
156
+ }
157
+ if (url.includes("npmjs.com/package/")) return "package";
158
+ if (url.includes("pypi.org/project/")) return "package";
159
+ if (url.includes("/docs/") || url.includes(".readthedocs.") || url.includes("/documentation/")) return "docs";
160
+ if (url.includes("youtube.com") || url.includes("vimeo.com") || url.includes("youtu.be")) return "video";
161
+ const hasPrice = $("[class*=\"price\"], [data-price], [itemprop=\"price\"]").length > 0;
162
+ const hasAddToCart = $("[class*=\"cart\"], [class*=\"buy\"], button:contains(\"Add\")").length > 0;
163
+ if (hasPrice || hasAddToCart) return "product";
164
+ const ogType = $("meta[property=\"og:type\"]").attr("content")?.toLowerCase();
165
+ if (ogType === "article" || ogType === "blog" || ogType === "news") return "article";
166
+ const hasArticleTag = $("article").length > 0;
167
+ const hasDateline = $("time[datetime], [class*=\"date\"], [class*=\"byline\"]").length > 0;
168
+ if (hasArticleTag && hasDateline) return "article";
169
+ return "unknown";
170
+ }
171
+ };
172
+
173
+ //#endregion
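The content extractor above drives the `content`, `excerpt`, `wordCount`, and `contentType` fields; the options it reads are `extractContent` and `maxContentLength`. A short sketch, assuming `scrape` is imported from the package entry:

```ts
import { scrape } from 'scrapex';

// Skip Readability/Turndown entirely: ContentExtractor returns {} and adds nothing.
const metadataOnly = await scrape('https://example.com/article', {
  extractContent: false,
});

// Or cap the Markdown length (the default cap in the code above is 50_000 characters).
const capped = await scrape('https://example.com/article', {
  maxContentLength: 10_000,
});
console.log(capped.wordCount, capped.contentType, capped.excerpt);
```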
174
+ //#region src/utils/url.ts
175
+ /**
176
+ * Common tracking parameters to remove from URLs
177
+ */
178
+ const TRACKING_PARAMS = [
179
+ "utm_source",
180
+ "utm_medium",
181
+ "utm_campaign",
182
+ "utm_term",
183
+ "utm_content",
184
+ "utm_id",
185
+ "ref",
186
+ "fbclid",
187
+ "gclid",
188
+ "gclsrc",
189
+ "dclid",
190
+ "msclkid",
191
+ "mc_cid",
192
+ "mc_eid",
193
+ "_ga",
194
+ "_gl",
195
+ "source",
196
+ "referrer"
197
+ ];
198
+ /**
199
+ * Validate if a string is a valid URL
200
+ */
201
+ function isValidUrl(url) {
202
+ try {
203
+ const parsed = new URL(url);
204
+ return ["http:", "https:"].includes(parsed.protocol);
205
+ } catch {
206
+ return false;
207
+ }
208
+ }
209
+ /**
210
+ * Normalize URL by removing tracking params and trailing slashes
211
+ */
212
+ function normalizeUrl(url) {
213
+ try {
214
+ const parsed = new URL(url);
215
+ for (const param of TRACKING_PARAMS) parsed.searchParams.delete(param);
216
+ let normalized = parsed.toString();
217
+ if (normalized.endsWith("/") && parsed.pathname !== "/") normalized = normalized.slice(0, -1);
218
+ return normalized;
219
+ } catch {
220
+ return url;
221
+ }
222
+ }
223
+ /**
224
+ * Extract domain from URL (without www prefix)
225
+ */
226
+ function extractDomain(url) {
227
+ try {
228
+ return new URL(url).hostname.replace(/^www\./, "");
229
+ } catch {
230
+ return "";
231
+ }
232
+ }
233
+ /**
234
+ * Resolve a potentially relative URL against a base URL
235
+ */
236
+ function resolveUrl(url, baseUrl) {
237
+ if (!url) return void 0;
238
+ try {
239
+ return new URL(url, baseUrl).href;
240
+ } catch {
241
+ return url;
242
+ }
243
+ }
244
+ /**
245
+ * Check if a URL is external relative to a domain
246
+ */
247
+ function isExternalUrl(url, baseDomain) {
248
+ try {
249
+ return new URL(url).hostname.replace(/^www\./, "") !== baseDomain;
250
+ } catch {
251
+ return false;
252
+ }
253
+ }
254
+ /**
255
+ * Extract protocol from URL
256
+ */
257
+ function getProtocol(url) {
258
+ try {
259
+ return new URL(url).protocol;
260
+ } catch {
261
+ return "";
262
+ }
263
+ }
264
+ /**
265
+ * Get the path portion of a URL
266
+ */
267
+ function getPath(url) {
268
+ try {
269
+ return new URL(url).pathname;
270
+ } catch {
271
+ return "";
272
+ }
273
+ }
274
+ /**
275
+ * Check if URL matches a pattern (supports * wildcard)
276
+ */
277
+ function matchesUrlPattern(url, pattern) {
278
+ if (!pattern.includes("*")) return url === pattern || url.startsWith(pattern);
279
+ const regexPattern = pattern.replace(/[.+?^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
280
+ return (/* @__PURE__ */ new RegExp(`^${regexPattern}`)).test(url);
281
+ }
282
+
283
+ //#endregion
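All of the URL helpers above appear in the exports list at the end of this file, so they can be used standalone. A few expected results, taken directly from the code above:

```ts
import { normalizeUrl, matchesUrlPattern, extractDomain, isExternalUrl } from 'scrapex';

normalizeUrl('https://example.com/post/?utm_source=x&fbclid=abc');
// -> 'https://example.com/post'  (tracking params and the trailing slash removed)

extractDomain('https://www.example.com/docs');         // -> 'example.com'
isExternalUrl('https://other.org/a', 'example.com');   // -> true

// '*' acts as a wildcard; without one, prefix matching is used.
matchesUrlPattern('https://example.com/docs/api', 'https://example.com/docs/*'); // -> true
```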
284
+ //#region src/extractors/favicon.ts
285
+ /**
286
+ * Extracts favicon URL from the page.
287
+ * Checks multiple sources in order of preference.
288
+ */
289
+ var FaviconExtractor = class {
290
+ name = "favicon";
291
+ priority = 70;
292
+ async extract(context) {
293
+ const { $, finalUrl } = context;
294
+ for (const selector of [
295
+ "link[rel=\"icon\"][type=\"image/svg+xml\"]",
296
+ "link[rel=\"icon\"][sizes=\"192x192\"]",
297
+ "link[rel=\"icon\"][sizes=\"180x180\"]",
298
+ "link[rel=\"icon\"][sizes=\"128x128\"]",
299
+ "link[rel=\"icon\"][sizes=\"96x96\"]",
300
+ "link[rel=\"apple-touch-icon\"][sizes=\"180x180\"]",
301
+ "link[rel=\"apple-touch-icon\"]",
302
+ "link[rel=\"icon\"][sizes=\"32x32\"]",
303
+ "link[rel=\"icon\"]",
304
+ "link[rel=\"shortcut icon\"]"
305
+ ]) {
306
+ const href = $(selector).first().attr("href");
307
+ if (href) return { favicon: resolveUrl(href, finalUrl) };
308
+ }
309
+ try {
310
+ const url = new URL(finalUrl);
311
+ return { favicon: `${url.protocol}//${url.host}/favicon.ico` };
312
+ } catch {
313
+ return {};
314
+ }
315
+ }
316
+ };
317
+
318
+ //#endregion
319
+ //#region src/extractors/jsonld.ts
320
+ /**
321
+ * Extracts JSON-LD structured data from the page.
322
+ * Also extracts additional metadata from structured data.
323
+ */
324
+ var JsonLdExtractor = class {
325
+ name = "jsonld";
326
+ priority = 80;
327
+ async extract(context) {
328
+ const { $ } = context;
329
+ const jsonLd = [];
330
+ $("script[type=\"application/ld+json\"]").each((_, el) => {
331
+ const content = $(el).html();
332
+ if (!content) return;
333
+ try {
334
+ const parsed = JSON.parse(content);
335
+ if (Array.isArray(parsed)) jsonLd.push(...parsed);
336
+ else if (typeof parsed === "object" && parsed !== null) jsonLd.push(parsed);
337
+ } catch {}
338
+ });
339
+ if (jsonLd.length === 0) return {};
340
+ return {
341
+ jsonLd,
342
+ ...this.extractMetadata(jsonLd)
343
+ };
344
+ }
345
+ extractMetadata(jsonLd) {
346
+ const result = {};
347
+ for (const item of jsonLd) {
348
+ const type = this.getType(item);
349
+ if (type?.match(/Article|BlogPosting|NewsArticle|WebPage/i)) {
350
+ result.title = result.title || this.getString(item, "headline", "name");
351
+ result.description = result.description || this.getString(item, "description");
352
+ result.author = result.author || this.getAuthor(item);
353
+ result.publishedAt = result.publishedAt || this.getString(item, "datePublished");
354
+ result.modifiedAt = result.modifiedAt || this.getString(item, "dateModified");
355
+ result.image = result.image || this.getImage(item);
356
+ }
357
+ if (type === "Organization") result.siteName = result.siteName || this.getString(item, "name");
358
+ if (type === "Product") {
359
+ result.title = result.title || this.getString(item, "name");
360
+ result.description = result.description || this.getString(item, "description");
361
+ result.image = result.image || this.getImage(item);
362
+ }
363
+ if (type === "SoftwareApplication") {
364
+ result.title = result.title || this.getString(item, "name");
365
+ result.description = result.description || this.getString(item, "description");
366
+ }
367
+ const keywords = this.getKeywords(item);
368
+ if (keywords.length > 0) result.keywords = [...result.keywords || [], ...keywords];
369
+ }
370
+ if (result.keywords) result.keywords = [...new Set(result.keywords)];
371
+ return result;
372
+ }
373
+ getType(item) {
374
+ const type = item["@type"];
375
+ if (typeof type === "string") return type;
376
+ if (Array.isArray(type)) return type[0];
377
+ }
378
+ getString(item, ...keys) {
379
+ for (const key of keys) {
380
+ const value = item[key];
381
+ if (typeof value === "string") return value;
382
+ if (typeof value === "object" && value !== null && "@value" in value) return String(value["@value"]);
383
+ }
384
+ }
385
+ getAuthor(item) {
386
+ const author = item.author;
387
+ if (typeof author === "string") return author;
388
+ if (Array.isArray(author)) return author.map((a) => typeof a === "string" ? a : this.getString(a, "name")).filter(Boolean).join(", ") || void 0;
389
+ if (typeof author === "object" && author !== null) {
390
+ const authorObj = author;
391
+ return this.getString(authorObj, "name") || void 0;
392
+ }
393
+ }
394
+ getImage(item) {
395
+ const image = item.image;
396
+ if (typeof image === "string") return image;
397
+ if (Array.isArray(image) && image.length > 0) return this.getImage({ image: image[0] });
398
+ if (typeof image === "object" && image !== null) {
399
+ const imageObj = image;
400
+ return this.getString(imageObj, "url", "contentUrl") || void 0;
401
+ }
402
+ }
403
+ getKeywords(item) {
404
+ const keywords = item.keywords;
405
+ if (typeof keywords === "string") return keywords.split(",").map((k) => k.trim()).filter(Boolean);
406
+ if (Array.isArray(keywords)) return keywords.filter((k) => typeof k === "string");
407
+ return [];
408
+ }
409
+ };
410
+
411
+ //#endregion
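The JSON-LD extractor above parses `<script type="application/ld+json">` blocks and folds Article/Product/Organization fields into the result when nothing else has set them. A sketch using `scrapeHtml`, assuming jsdom is installed (the content extractor above needs it):

```ts
import { scrapeHtml } from 'scrapex';

const html = `
  <html><head>
    <script type="application/ld+json">
      {"@context":"https://schema.org","@type":"Article",
       "headline":"Hello","author":{"@type":"Person","name":"Ada"},
       "datePublished":"2024-01-01","keywords":"web, scraping"}
    </script>
  </head><body><article><p>Hello world</p></article></body></html>`;

const result = await scrapeHtml(html, 'https://example.com/hello');
console.log(result.jsonLd?.length);  // 1 (the parsed Article object)
console.log(result.publishedAt);     // "2024-01-01" (from datePublished)
console.log(result.keywords);        // ["web", "scraping"]
```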
412
+ //#region src/extractors/links.ts
413
+ /**
414
+ * Extracts links from the page content.
415
+ * Filters out navigation/footer links and focuses on content links.
416
+ */
417
+ var LinksExtractor = class {
418
+ name = "links";
419
+ priority = 30;
420
+ async extract(context) {
421
+ const { $, finalUrl } = context;
422
+ const links = [];
423
+ const seen = /* @__PURE__ */ new Set();
424
+ const contentArea = $("article, main, [role=\"main\"]").first();
425
+ const container = contentArea.length > 0 ? contentArea : $("body");
426
+ const skipSelectors = "nav, header, footer, aside, [role=\"navigation\"], [class*=\"nav\"], [class*=\"footer\"], [class*=\"header\"], [class*=\"sidebar\"], [class*=\"menu\"]";
427
+ container.find("a[href]").each((_, el) => {
428
+ const $el = $(el);
429
+ if ($el.closest(skipSelectors).length > 0) return;
430
+ const href = $el.attr("href");
431
+ if (!href) return;
432
+ if (href.startsWith("#") || href.startsWith("javascript:") || href.startsWith("mailto:") || href.startsWith("tel:")) return;
433
+ const resolvedUrl = resolveUrl(href, finalUrl);
434
+ if (!resolvedUrl || !isValidUrl(resolvedUrl)) return;
435
+ if (seen.has(resolvedUrl)) return;
436
+ seen.add(resolvedUrl);
437
+ const text = $el.text().trim() || $el.attr("title") || $el.attr("aria-label") || "";
438
+ if (text.length < 2) return;
439
+ const baseDomain = extractDomain(finalUrl);
440
+ links.push({
441
+ url: resolvedUrl,
442
+ text: text.slice(0, 200),
443
+ isExternal: isExternalUrl(resolvedUrl, baseDomain)
444
+ });
445
+ });
446
+ return { links: links.slice(0, 100) };
447
+ }
448
+ };
449
+
450
+ //#endregion
451
+ //#region src/extractors/meta.ts
452
+ /**
453
+ * Extracts metadata from HTML meta tags, Open Graph, and Twitter cards.
454
+ * Runs first to provide basic metadata for other extractors.
455
+ */
456
+ var MetaExtractor = class {
457
+ name = "meta";
458
+ priority = 100;
459
+ async extract(context) {
460
+ const { $ } = context;
461
+ const getMeta = (nameOrProperty) => {
462
+ return ($(`meta[name="${nameOrProperty}"]`).attr("content") || $(`meta[property="${nameOrProperty}"]`).attr("content") || $(`meta[itemprop="${nameOrProperty}"]`).attr("content"))?.trim() || void 0;
463
+ };
464
+ const title = getMeta("og:title") || getMeta("twitter:title") || $("title").first().text().trim() || "";
465
+ const description = getMeta("og:description") || getMeta("twitter:description") || getMeta("description") || "";
466
+ const image = getMeta("og:image") || getMeta("twitter:image") || getMeta("twitter:image:src") || void 0;
467
+ const canonicalUrl = $("link[rel=\"canonical\"]").attr("href") || getMeta("og:url") || context.finalUrl;
468
+ const author = getMeta("author") || getMeta("article:author") || getMeta("twitter:creator") || $("[rel=\"author\"]").first().text().trim() || void 0;
469
+ const siteName = getMeta("og:site_name") || getMeta("application-name") || void 0;
470
+ const publishedAt = getMeta("article:published_time") || getMeta("datePublished") || getMeta("date") || $("time[datetime]").first().attr("datetime") || void 0;
471
+ const modifiedAt = getMeta("article:modified_time") || getMeta("dateModified") || void 0;
472
+ const language = $("html").attr("lang") || getMeta("og:locale") || getMeta("language") || void 0;
473
+ const keywordsRaw = getMeta("keywords") || getMeta("article:tag") || "";
474
+ return {
475
+ title,
476
+ description,
477
+ image,
478
+ canonicalUrl,
479
+ author,
480
+ siteName,
481
+ publishedAt,
482
+ modifiedAt,
483
+ language,
484
+ keywords: keywordsRaw ? keywordsRaw.split(",").map((k) => k.trim()).filter(Boolean) : []
485
+ };
486
+ }
487
+ };
488
+
489
+ //#endregion
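The meta extractor tries Open Graph, then Twitter, then plain meta tags (so `og:title` wins over `<title>`), and also picks up the document language and publication dates. A small sketch with `scrapeHtml`, again assuming jsdom is available:

```ts
import { scrapeHtml } from 'scrapex';

const html = `
  <html lang="en"><head>
    <title>Fallback title</title>
    <meta property="og:title" content="OG title">
    <meta name="description" content="Plain description">
    <meta property="article:published_time" content="2024-03-01T00:00:00Z">
  </head><body><p>Short page body</p></body></html>`;

const page = await scrapeHtml(html, 'https://example.com/');
console.log(page.description);  // "Plain description"
console.log(page.language);     // "en"
console.log(page.publishedAt);  // "2024-03-01T00:00:00Z"
```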
490
+ //#region src/extractors/index.ts
491
+ /**
492
+ * Default extractors in priority order.
493
+ * Higher priority runs first.
494
+ */
495
+ function createDefaultExtractors() {
496
+ return [
497
+ new MetaExtractor(),
498
+ new JsonLdExtractor(),
499
+ new FaviconExtractor(),
500
+ new ContentExtractor(),
501
+ new LinksExtractor()
502
+ ];
503
+ }
504
+ /**
505
+ * Sort extractors by priority (higher first).
506
+ */
507
+ function sortExtractors(extractors) {
508
+ return [...extractors].sort((a, b) => (b.priority ?? 0) - (a.priority ?? 0));
509
+ }
510
+
511
+ //#endregion
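Custom extractors can be appended to (or replace) the defaults created above; higher `priority` runs first. The required shape (`name`, optional `priority`, `extract(context)`) is inferred from the extractor classes in this file; the exported TypeScript interface name is not shown here, so `any` is used for the context parameter in this sketch:

```ts
import { scrape } from 'scrapex';

const headingCounter = {
  name: 'heading-counter',
  priority: 10,                        // lower than the defaults, so it runs after them
  async extract(context: any) {
    // Anything returned under `custom` is merged into result.custom.
    return { custom: { h2Count: context.$('h2').length } };
  },
};

const result = await scrape('https://example.com/article', {
  extractors: [headingCounter],        // appended to the default extractors
  // replaceDefaultExtractors: true,   // set this to run only your own extractors
});
console.log(result.custom?.h2Count);
```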
512
+ //#region src/fetchers/types.ts
513
+ /**
514
+ * Default user agent string
515
+ */
516
+ const DEFAULT_USER_AGENT = "Scrapex-Bot/2.0 (+https://github.com/developer-rakeshpaul/scrapex)";
517
+ /**
518
+ * Default timeout in milliseconds
519
+ */
520
+ const DEFAULT_TIMEOUT = 1e4;
521
+
522
+ //#endregion
523
+ //#region src/fetchers/fetch.ts
524
+ /**
525
+ * Default fetcher using native fetch API.
526
+ * Works in Node.js 18+ without polyfills.
527
+ */
528
+ var NativeFetcher = class {
529
+ name = "native-fetch";
530
+ async fetch(url, options = {}) {
531
+ const { timeout = DEFAULT_TIMEOUT, userAgent = DEFAULT_USER_AGENT, headers = {} } = options;
532
+ let parsedUrl;
533
+ try {
534
+ parsedUrl = new URL(url);
535
+ } catch {
536
+ throw new require_enhancer.ScrapeError(`Invalid URL: ${url}`, "INVALID_URL");
537
+ }
538
+ if (!["http:", "https:"].includes(parsedUrl.protocol)) throw new require_enhancer.ScrapeError(`Invalid protocol: ${parsedUrl.protocol}`, "INVALID_URL");
539
+ const controller = new AbortController();
540
+ const timeoutId = setTimeout(() => controller.abort(), timeout);
541
+ try {
542
+ const response = await fetch(url, {
543
+ signal: controller.signal,
544
+ headers: {
545
+ "User-Agent": userAgent,
546
+ Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
547
+ "Accept-Language": "en-US,en;q=0.5",
548
+ ...headers
549
+ },
550
+ redirect: "follow"
551
+ });
552
+ clearTimeout(timeoutId);
553
+ if (!response.ok) {
554
+ if (response.status === 404) throw new require_enhancer.ScrapeError(`Page not found: ${url}`, "NOT_FOUND", 404);
555
+ if (response.status === 403 || response.status === 401) throw new require_enhancer.ScrapeError(`Access blocked: ${url}`, "BLOCKED", response.status);
556
+ if (response.status === 429) throw new require_enhancer.ScrapeError(`Rate limited: ${url}`, "BLOCKED", 429);
557
+ throw new require_enhancer.ScrapeError(`HTTP error ${response.status}: ${url}`, "FETCH_FAILED", response.status);
558
+ }
559
+ const contentType = response.headers.get("content-type") || "";
560
+ if (!contentType.includes("text/html") && !contentType.includes("application/xhtml")) throw new require_enhancer.ScrapeError(`Unexpected content type: ${contentType}`, "PARSE_ERROR");
561
+ const html = await response.text();
562
+ const responseHeaders = {};
563
+ response.headers.forEach((value, key) => {
564
+ responseHeaders[key] = value;
565
+ });
566
+ return {
567
+ html,
568
+ finalUrl: response.url,
569
+ statusCode: response.status,
570
+ contentType,
571
+ headers: responseHeaders
572
+ };
573
+ } catch (error) {
574
+ clearTimeout(timeoutId);
575
+ if (error instanceof require_enhancer.ScrapeError) throw error;
576
+ if (error instanceof Error && error.name === "AbortError") throw new require_enhancer.ScrapeError(`Request timed out after ${timeout}ms`, "TIMEOUT");
577
+ if (error instanceof Error) throw new require_enhancer.ScrapeError(`Fetch failed: ${error.message}`, "FETCH_FAILED", void 0, error);
578
+ throw new require_enhancer.ScrapeError("Unknown fetch error", "FETCH_FAILED");
579
+ }
580
+ }
581
+ };
582
+ /**
583
+ * Default fetcher instance
584
+ */
585
+ const defaultFetcher = new NativeFetcher();
586
+
587
+ //#endregion
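The default fetcher uses the global `fetch` API (Node.js 18+); the timeout, User-Agent, and extra headers come straight from the options read above, and failures surface as `ScrapeError` with the codes used in this file. A sketch:

```ts
import { scrape, ScrapeError } from 'scrapex';

try {
  const page = await scrape('https://example.com/', {
    timeout: 15_000,                                   // default is 10_000 ms
    userAgent: 'MyBot/1.0 (+https://example.com/bot)', // replaces the default UA
  });
  console.log(page.title);
} catch (err) {
  if (err instanceof ScrapeError) {
    // Codes thrown above: INVALID_URL, NOT_FOUND, BLOCKED, TIMEOUT, FETCH_FAILED, PARSE_ERROR
    console.error(err.message);
  }
}
```

A custom fetcher passed via `options.fetcher` only needs a `name` and a `fetch(url, options)` method returning `{ html, finalUrl, statusCode, contentType, headers }`, per the shape returned by `NativeFetcher` above.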
588
+ //#region src/fetchers/robots.ts
589
+ /**
590
+ * Check if URL is allowed by robots.txt
591
+ *
592
+ * @param url - The URL to check
593
+ * @param userAgent - User agent to check rules for
594
+ * @returns Whether the URL is allowed and optional reason
595
+ */
596
+ async function checkRobotsTxt(url, userAgent = DEFAULT_USER_AGENT) {
597
+ try {
598
+ const parsedUrl = new URL(url);
599
+ const robotsUrl = `${parsedUrl.protocol}//${parsedUrl.host}/robots.txt`;
600
+ const response = await fetch(robotsUrl, {
601
+ headers: { "User-Agent": userAgent },
602
+ signal: AbortSignal.timeout(5e3)
603
+ });
604
+ if (!response.ok) return { allowed: true };
605
+ const allowed = isPathAllowed(parseRobotsTxt(await response.text(), userAgent), parsedUrl.pathname + parsedUrl.search);
606
+ return {
607
+ allowed,
608
+ reason: allowed ? void 0 : "Blocked by robots.txt"
609
+ };
610
+ } catch {
611
+ return { allowed: true };
612
+ }
613
+ }
614
+ /**
615
+ * Parse robots.txt content for a specific user agent
616
+ */
617
+ function parseRobotsTxt(content, userAgent) {
618
+ const rules = {
619
+ disallow: [],
620
+ allow: []
621
+ };
622
+ const lines = content.split("\n");
623
+ const botName = userAgent.split(/[\s/]/)[0]?.toLowerCase() || "";
624
+ let currentAgent = "";
625
+ let isMatchingAgent = false;
626
+ let hasFoundSpecificAgent = false;
627
+ for (const rawLine of lines) {
628
+ const line = rawLine.trim();
629
+ if (!line || line.startsWith("#")) continue;
630
+ const colonIndex = line.indexOf(":");
631
+ if (colonIndex === -1) continue;
632
+ const directive = line.slice(0, colonIndex).trim().toLowerCase();
633
+ const value = line.slice(colonIndex + 1).trim();
634
+ if (directive === "user-agent") {
635
+ currentAgent = value.toLowerCase();
636
+ isMatchingAgent = currentAgent === "*" || currentAgent === botName || botName.includes(currentAgent);
637
+ if (currentAgent !== "*" && isMatchingAgent) {
638
+ hasFoundSpecificAgent = true;
639
+ rules.disallow = [];
640
+ rules.allow = [];
641
+ }
642
+ } else if (isMatchingAgent && (!hasFoundSpecificAgent || currentAgent !== "*")) {
643
+ if (directive === "disallow" && value) rules.disallow.push(value);
644
+ else if (directive === "allow" && value) rules.allow.push(value);
645
+ }
646
+ }
647
+ return rules;
648
+ }
649
+ /**
650
+ * Check if a path is allowed based on robots.txt rules
651
+ */
652
+ function isPathAllowed(rules, path) {
653
+ if (rules.disallow.length === 0 && rules.allow.length === 0) return true;
654
+ for (const pattern of rules.allow) if (matchesPattern(path, pattern)) return true;
655
+ for (const pattern of rules.disallow) if (matchesPattern(path, pattern)) return false;
656
+ return true;
657
+ }
658
+ /**
659
+ * Check if a path matches a robots.txt pattern
660
+ */
661
+ function matchesPattern(path, pattern) {
662
+ if (!pattern) return false;
663
+ if (pattern.endsWith("*")) return path.startsWith(pattern.slice(0, -1));
664
+ if (pattern.endsWith("$")) return path === pattern.slice(0, -1);
665
+ if (pattern.includes("*")) return (/* @__PURE__ */ new RegExp(`^${pattern.replace(/\*/g, ".*").replace(/\?/g, "\\?")}.*`)).test(path);
666
+ return path.startsWith(pattern);
667
+ }
668
+
669
+ //#endregion
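robots.txt is only consulted when `respectRobots` is set, and the check fails open (a missing robots.txt or a network error counts as allowed). It can also be called directly:

```ts
import { scrape, checkRobotsTxt } from 'scrapex';

const { allowed, reason } = await checkRobotsTxt('https://example.com/private/page');
if (!allowed) console.warn(reason);   // "Blocked by robots.txt"

// Or let scrape() run the check and throw a ROBOTS_BLOCKED ScrapeError instead:
const page = await scrape('https://example.com/', { respectRobots: true });
```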
670
+ //#region src/core/scrape.ts
671
+ /**
672
+ * Scrape a URL and extract metadata and content.
673
+ *
674
+ * @param url - The URL to scrape
675
+ * @param options - Scraping options
676
+ * @returns Scraped data with metadata and content
677
+ *
678
+ * @example
679
+ * ```ts
680
+ * const result = await scrape('https://example.com/article');
681
+ * console.log(result.title, result.content);
682
+ * ```
683
+ */
684
+ async function scrape(url, options = {}) {
685
+ const startTime = Date.now();
686
+ if (!isValidUrl(url)) throw new require_enhancer.ScrapeError("Invalid URL provided", "INVALID_URL");
687
+ const normalizedUrl = normalizeUrl(url);
688
+ if (options.respectRobots) {
689
+ const robotsResult = await checkRobotsTxt(normalizedUrl, options.userAgent);
690
+ if (!robotsResult.allowed) throw new require_enhancer.ScrapeError(`URL blocked by robots.txt: ${robotsResult.reason || "disallowed"}`, "ROBOTS_BLOCKED");
691
+ }
692
+ const fetchResult = await (options.fetcher ?? defaultFetcher).fetch(normalizedUrl, {
693
+ timeout: options.timeout,
694
+ userAgent: options.userAgent
695
+ });
696
+ await preloadJsdom();
697
+ let context = createExtractionContext(normalizedUrl, fetchResult.finalUrl, fetchResult.html, options);
698
+ let extractors;
699
+ if (options.replaceDefaultExtractors) extractors = options.extractors ?? [];
700
+ else {
701
+ const defaults = createDefaultExtractors();
702
+ extractors = options.extractors ? [...defaults, ...options.extractors] : defaults;
703
+ }
704
+ extractors = sortExtractors(extractors);
705
+ for (const extractor of extractors) try {
706
+ const extracted = await extractor.extract(context);
707
+ context = mergeResults(context, extracted);
708
+ } catch (error) {
709
+ console.error(`Extractor "${extractor.name}" failed:`, error);
710
+ context = mergeResults(context, { error: context.results.error ? `${context.results.error}; ${extractor.name}: ${error instanceof Error ? error.message : String(error)}` : `${extractor.name}: ${error instanceof Error ? error.message : String(error)}` });
711
+ }
712
+ const intermediateResult = {
713
+ url: normalizedUrl,
714
+ canonicalUrl: context.results.canonicalUrl || fetchResult.finalUrl,
715
+ domain: extractDomain(fetchResult.finalUrl),
716
+ title: context.results.title || "",
717
+ description: context.results.description || "",
718
+ image: context.results.image,
719
+ favicon: context.results.favicon,
720
+ content: context.results.content || "",
721
+ textContent: context.results.textContent || "",
722
+ excerpt: context.results.excerpt || "",
723
+ wordCount: context.results.wordCount || 0,
724
+ author: context.results.author,
725
+ publishedAt: context.results.publishedAt,
726
+ modifiedAt: context.results.modifiedAt,
727
+ siteName: context.results.siteName,
728
+ language: context.results.language,
729
+ contentType: context.results.contentType || "unknown",
730
+ keywords: context.results.keywords || [],
731
+ jsonLd: context.results.jsonLd,
732
+ links: context.results.links,
733
+ custom: context.results.custom,
734
+ scrapedAt: (/* @__PURE__ */ new Date()).toISOString(),
735
+ scrapeTimeMs: 0,
736
+ error: context.results.error
737
+ };
738
+ if (options.llm && options.enhance && options.enhance.length > 0) try {
739
+ const enhanced = await require_enhancer.enhance(intermediateResult, options.llm, options.enhance);
740
+ Object.assign(intermediateResult, enhanced);
741
+ } catch (error) {
742
+ console.error("LLM enhancement failed:", error);
743
+ intermediateResult.error = intermediateResult.error ? `${intermediateResult.error}; LLM: ${error instanceof Error ? error.message : String(error)}` : `LLM: ${error instanceof Error ? error.message : String(error)}`;
744
+ }
745
+ if (options.llm && options.extract) try {
746
+ intermediateResult.extracted = await require_enhancer.extract(intermediateResult, options.llm, options.extract);
747
+ } catch (error) {
748
+ console.error("LLM extraction failed:", error);
749
+ intermediateResult.error = intermediateResult.error ? `${intermediateResult.error}; LLM extraction: ${error instanceof Error ? error.message : String(error)}` : `LLM extraction: ${error instanceof Error ? error.message : String(error)}`;
750
+ }
751
+ const scrapeTimeMs = Date.now() - startTime;
752
+ return {
753
+ ...intermediateResult,
754
+ scrapeTimeMs
755
+ };
756
+ }
757
+ /**
758
+ * Scrape from raw HTML string (no fetch).
759
+ *
760
+ * @param html - The HTML content
761
+ * @param url - The URL (for resolving relative links)
762
+ * @param options - Scraping options
763
+ * @returns Scraped data with metadata and content
764
+ *
765
+ * @example
766
+ * ```ts
767
+ * const html = await fetchSomehow('https://example.com');
768
+ * const result = await scrapeHtml(html, 'https://example.com');
769
+ * ```
770
+ */
771
+ async function scrapeHtml(html, url, options = {}) {
772
+ const startTime = Date.now();
773
+ if (!isValidUrl(url)) throw new require_enhancer.ScrapeError("Invalid URL provided", "INVALID_URL");
774
+ const normalizedUrl = normalizeUrl(url);
775
+ await preloadJsdom();
776
+ let context = createExtractionContext(normalizedUrl, normalizedUrl, html, options);
777
+ let extractors;
778
+ if (options.replaceDefaultExtractors) extractors = options.extractors ?? [];
779
+ else {
780
+ const defaults = createDefaultExtractors();
781
+ extractors = options.extractors ? [...defaults, ...options.extractors] : defaults;
782
+ }
783
+ extractors = sortExtractors(extractors);
784
+ for (const extractor of extractors) try {
785
+ const extracted = await extractor.extract(context);
786
+ context = mergeResults(context, extracted);
787
+ } catch (error) {
788
+ console.error(`Extractor "${extractor.name}" failed:`, error);
789
+ context = mergeResults(context, { error: context.results.error ? `${context.results.error}; ${extractor.name}: ${error instanceof Error ? error.message : String(error)}` : `${extractor.name}: ${error instanceof Error ? error.message : String(error)}` });
790
+ }
791
+ const scrapeTimeMs = Date.now() - startTime;
792
+ const domain = extractDomain(normalizedUrl);
793
+ return {
794
+ url: normalizedUrl,
795
+ canonicalUrl: context.results.canonicalUrl || normalizedUrl,
796
+ domain,
797
+ title: context.results.title || "",
798
+ description: context.results.description || "",
799
+ image: context.results.image,
800
+ favicon: context.results.favicon,
801
+ content: context.results.content || "",
802
+ textContent: context.results.textContent || "",
803
+ excerpt: context.results.excerpt || "",
804
+ wordCount: context.results.wordCount || 0,
805
+ author: context.results.author,
806
+ publishedAt: context.results.publishedAt,
807
+ modifiedAt: context.results.modifiedAt,
808
+ siteName: context.results.siteName,
809
+ language: context.results.language,
810
+ contentType: context.results.contentType || "unknown",
811
+ keywords: context.results.keywords || [],
812
+ jsonLd: context.results.jsonLd,
813
+ links: context.results.links,
814
+ summary: context.results.summary,
815
+ suggestedTags: context.results.suggestedTags,
816
+ entities: context.results.entities,
817
+ extracted: context.results.extracted,
818
+ custom: context.results.custom,
819
+ scrapedAt: (/* @__PURE__ */ new Date()).toISOString(),
820
+ scrapeTimeMs,
821
+ error: context.results.error
822
+ };
823
+ }
824
+
825
+ //#endregion
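A slightly fuller sketch than the JSDoc examples above, combining the options this file actually reads. The LLM-related options (`llm`, `enhance`, `extract`) are forwarded to the enhancer chunk (`enhancer-*.cjs`), whose API is not shown in this file, so they are omitted here:

```ts
import { scrape, scrapeHtml } from 'scrapex';

const article = await scrape('https://example.com/blog/post', {
  respectRobots: true,
  timeout: 15_000,
  maxContentLength: 20_000,
});
console.log(article.title, article.wordCount, article.scrapeTimeMs);

// If you already have the HTML (e.g. from a headless browser), skip the fetch:
const fromHtml = await scrapeHtml('<html>…</html>', 'https://example.com/blog/post');
console.log(fromHtml.domain);   // 'example.com'
```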
826
+ exports.ContentExtractor = ContentExtractor;
827
+ exports.DEFAULT_TIMEOUT = DEFAULT_TIMEOUT;
828
+ exports.DEFAULT_USER_AGENT = DEFAULT_USER_AGENT;
829
+ exports.FaviconExtractor = FaviconExtractor;
830
+ exports.JsonLdExtractor = JsonLdExtractor;
831
+ exports.LinksExtractor = LinksExtractor;
832
+ exports.MetaExtractor = MetaExtractor;
833
+ exports.NativeFetcher = NativeFetcher;
834
+ exports.ScrapeError = require_enhancer.ScrapeError;
835
+ exports.__toESM = __toESM;
836
+ exports.checkRobotsTxt = checkRobotsTxt;
837
+ exports.createDefaultExtractors = createDefaultExtractors;
838
+ exports.createExtractionContext = createExtractionContext;
839
+ exports.defaultFetcher = defaultFetcher;
840
+ exports.extractDomain = extractDomain;
841
+ exports.getPath = getPath;
842
+ exports.getProtocol = getProtocol;
843
+ exports.isExternalUrl = isExternalUrl;
844
+ exports.isValidUrl = isValidUrl;
845
+ exports.matchesUrlPattern = matchesUrlPattern;
846
+ exports.mergeResults = mergeResults;
847
+ exports.normalizeUrl = normalizeUrl;
848
+ exports.resolveUrl = resolveUrl;
849
+ exports.scrape = scrape;
850
+ exports.scrapeHtml = scrapeHtml;
851
+ exports.sortExtractors = sortExtractors;
852
+ //# sourceMappingURL=index.cjs.map