scrapex 0.5.3 → 1.0.0-alpha.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +1 -1
- package/README.md +392 -145
- package/dist/enhancer-Q6CSc1gA.mjs +220 -0
- package/dist/enhancer-Q6CSc1gA.mjs.map +1 -0
- package/dist/enhancer-oM4BhYYS.cjs +268 -0
- package/dist/enhancer-oM4BhYYS.cjs.map +1 -0
- package/dist/index.cjs +852 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +264 -0
- package/dist/index.d.cts.map +1 -0
- package/dist/index.d.mts +264 -0
- package/dist/index.d.mts.map +1 -0
- package/dist/index.mjs +798 -0
- package/dist/index.mjs.map +1 -0
- package/dist/llm/index.cjs +316 -0
- package/dist/llm/index.cjs.map +1 -0
- package/dist/llm/index.d.cts +211 -0
- package/dist/llm/index.d.cts.map +1 -0
- package/dist/llm/index.d.mts +211 -0
- package/dist/llm/index.d.mts.map +1 -0
- package/dist/llm/index.mjs +310 -0
- package/dist/llm/index.mjs.map +1 -0
- package/dist/parsers/index.cjs +200 -0
- package/dist/parsers/index.cjs.map +1 -0
- package/dist/parsers/index.d.cts +133 -0
- package/dist/parsers/index.d.cts.map +1 -0
- package/dist/parsers/index.d.mts +133 -0
- package/dist/parsers/index.d.mts.map +1 -0
- package/dist/parsers/index.mjs +192 -0
- package/dist/parsers/index.mjs.map +1 -0
- package/dist/types-CNQZVW36.d.mts +150 -0
- package/dist/types-CNQZVW36.d.mts.map +1 -0
- package/dist/types-D0HYR95H.d.cts +150 -0
- package/dist/types-D0HYR95H.d.cts.map +1 -0
- package/package.json +80 -100
- package/dist/index.d.ts +0 -45
- package/dist/index.js +0 -8
- package/dist/scrapex.cjs.development.js +0 -1130
- package/dist/scrapex.cjs.development.js.map +0 -1
- package/dist/scrapex.cjs.production.min.js +0 -2
- package/dist/scrapex.cjs.production.min.js.map +0 -1
- package/dist/scrapex.esm.js +0 -1122
- package/dist/scrapex.esm.js.map +0 -1
package/dist/index.cjs
ADDED
@@ -0,0 +1,852 @@

//#region rolldown:runtime
var __create = Object.create;
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __getProtoOf = Object.getPrototypeOf;
var __hasOwnProp = Object.prototype.hasOwnProperty;
var __copyProps = (to, from, except, desc) => {
  if (from && typeof from === "object" || typeof from === "function") {
    for (var keys = __getOwnPropNames(from), i = 0, n = keys.length, key; i < n; i++) {
      key = keys[i];
      if (!__hasOwnProp.call(to, key) && key !== except) {
        __defProp(to, key, {
          get: ((k) => from[k]).bind(null, key),
          enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable
        });
      }
    }
  }
  return to;
};
var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", {
  value: mod,
  enumerable: true
}) : target, mod));

//#endregion
const require_enhancer = require('./enhancer-oM4BhYYS.cjs');
let cheerio = require("cheerio");
cheerio = __toESM(cheerio);
let __mozilla_readability = require("@mozilla/readability");
let turndown = require("turndown");
turndown = __toESM(turndown);

//#region src/core/context.ts
let jsdomModule = null;
/**
 * Preload JSDOM module (called once during scrape initialization)
 */
async function preloadJsdom() {
  if (!jsdomModule) jsdomModule = await import("jsdom");
}
/**
 * Create an extraction context with lazy JSDOM loading.
 *
 * Cheerio is always available for fast DOM queries.
 * JSDOM is only loaded when getDocument() is called (for Readability).
 */
function createExtractionContext(url, finalUrl, html, options) {
  let document = null;
  return {
    url,
    finalUrl,
    html,
    $: cheerio.load(html),
    options,
    results: {},
    getDocument() {
      if (!document) {
        if (!jsdomModule) throw new Error("JSDOM not preloaded. Call preloadJsdom() before using getDocument().");
        document = new jsdomModule.JSDOM(html, { url: finalUrl }).window.document;
      }
      return document;
    }
  };
}
/**
 * Merge partial results into the context
 */
function mergeResults(context, extracted) {
  return {
    ...context,
    results: {
      ...context.results,
      ...extracted,
      custom: extracted.custom || context.results.custom ? {
        ...context.results.custom,
        ...extracted.custom
      } : void 0
    }
  };
}

//#endregion
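The context above is the seam between fetching and extraction: every extractor shares one cheerio handle immediately, while a full JSDOM document is built at most once and only on demand. A minimal sketch of an extractor consuming it — the `{ name, priority, extract }` shape is taken from the extractor classes later in this bundle; the extractor itself is illustrative, not part of the package:

```js
// Illustrative custom extractor ("heading-counter" is not shipped by scrapex).
const headingCounter = {
  name: "heading-counter",
  priority: 10, // sorted below the built-ins; higher priority runs first
  async extract(context) {
    // Cheap path: cheerio is already loaded for every page.
    const h2Count = context.$("h2").length;
    // context.getDocument() would lazily build a full JSDOM document;
    // only Readability-style extraction needs that.
    return { custom: { h2Count } }; // merged via mergeResults() above
  }
};
```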
//#region src/extractors/content.ts
const turndown$1 = new turndown.default({
  headingStyle: "atx",
  codeBlockStyle: "fenced",
  bulletListMarker: "-",
  emDelimiter: "_",
  strongDelimiter: "**",
  linkStyle: "inlined"
});
turndown$1.remove([
  "script",
  "style",
  "noscript",
  "iframe",
  "nav",
  "footer"
]);
/**
 * Extracts main content using Mozilla Readability.
 * Converts HTML to Markdown for LLM consumption.
 */
var ContentExtractor = class {
  name = "content";
  priority = 50;
  async extract(context) {
    const { options } = context;
    if (options.extractContent === false) return {};
    const article = new __mozilla_readability.Readability(context.getDocument().cloneNode(true)).parse();
    if (!article || !article.content) return this.extractFallback(context);
    let content = turndown$1.turndown(article.content);
    const maxLength = options.maxContentLength ?? 5e4;
    if (content.length > maxLength) content = `${content.slice(0, maxLength)}\n\n[Content truncated...]`;
    const textContent = (article.textContent ?? "").trim();
    const excerpt = this.createExcerpt(textContent);
    const wordCount = textContent.split(/\s+/).filter(Boolean).length;
    const contentType = this.detectContentType(context);
    return {
      content,
      textContent,
      excerpt: article.excerpt || excerpt,
      wordCount,
      contentType,
      title: article.title || void 0,
      author: article.byline || void 0,
      siteName: article.siteName || void 0
    };
  }
  extractFallback(context) {
    const { $ } = context;
    const bodyHtml = $("body").html() || "";
    const content = turndown$1.turndown(bodyHtml);
    const textContent = $("body").text().replace(/\s+/g, " ").trim();
    return {
      content: content.slice(0, context.options.maxContentLength ?? 5e4),
      textContent,
      excerpt: this.createExcerpt(textContent),
      wordCount: textContent.split(/\s+/).filter(Boolean).length,
      contentType: "unknown"
    };
  }
  createExcerpt(text, maxLength = 300) {
    if (text.length <= maxLength) return text;
    const truncated = text.slice(0, maxLength);
    const lastSpace = truncated.lastIndexOf(" ");
    return `${lastSpace > 0 ? truncated.slice(0, lastSpace) : truncated}...`;
  }
  detectContentType(context) {
    const { $, finalUrl } = context;
    const url = finalUrl.toLowerCase();
    if (url.includes("github.com") && !url.includes("/blob/") && !url.includes("/issues/")) {
      if ($("meta[property=\"og:type\"]").attr("content") === "object" || url.match(/github\.com\/[^/]+\/[^/]+\/?$/)) return "repo";
    }
    if (url.includes("npmjs.com/package/")) return "package";
    if (url.includes("pypi.org/project/")) return "package";
    if (url.includes("/docs/") || url.includes(".readthedocs.") || url.includes("/documentation/")) return "docs";
    if (url.includes("youtube.com") || url.includes("vimeo.com") || url.includes("youtu.be")) return "video";
    const hasPrice = $("[class*=\"price\"], [data-price], [itemprop=\"price\"]").length > 0;
    const hasAddToCart = $("[class*=\"cart\"], [class*=\"buy\"], button:contains(\"Add\")").length > 0;
    if (hasPrice || hasAddToCart) return "product";
    const ogType = $("meta[property=\"og:type\"]").attr("content")?.toLowerCase();
    if (ogType === "article" || ogType === "blog" || ogType === "news") return "article";
    const hasArticleTag = $("article").length > 0;
    const hasDateline = $("time[datetime], [class*=\"date\"], [class*=\"byline\"]").length > 0;
    if (hasArticleTag && hasDateline) return "article";
    return "unknown";
  }
};

//#endregion
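Two options govern this extractor: `extractContent: false` skips it entirely, and `maxContentLength` (the `5e4` above, i.e. 50,000 characters by default) caps the Markdown before the `[Content truncated...]` marker is appended. A usage sketch, assuming `scrape` from this package's main entry point:

```js
import { scrape } from "scrapex";

const result = await scrape("https://example.com/article", {
  maxContentLength: 10000 // cap the Markdown at 10k characters instead of 50k
});
console.log(result.contentType); // "article", "repo", "package", "docs",
                                 // "video", "product", or "unknown"
```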
//#region src/utils/url.ts
/**
 * Common tracking parameters to remove from URLs
 */
const TRACKING_PARAMS = [
  "utm_source",
  "utm_medium",
  "utm_campaign",
  "utm_term",
  "utm_content",
  "utm_id",
  "ref",
  "fbclid",
  "gclid",
  "gclsrc",
  "dclid",
  "msclkid",
  "mc_cid",
  "mc_eid",
  "_ga",
  "_gl",
  "source",
  "referrer"
];
/**
 * Validate if a string is a valid URL
 */
function isValidUrl(url) {
  try {
    const parsed = new URL(url);
    return ["http:", "https:"].includes(parsed.protocol);
  } catch {
    return false;
  }
}
/**
 * Normalize URL by removing tracking params and trailing slashes
 */
function normalizeUrl(url) {
  try {
    const parsed = new URL(url);
    for (const param of TRACKING_PARAMS) parsed.searchParams.delete(param);
    let normalized = parsed.toString();
    if (normalized.endsWith("/") && parsed.pathname !== "/") normalized = normalized.slice(0, -1);
    return normalized;
  } catch {
    return url;
  }
}
/**
 * Extract domain from URL (without www prefix)
 */
function extractDomain(url) {
  try {
    return new URL(url).hostname.replace(/^www\./, "");
  } catch {
    return "";
  }
}
/**
 * Resolve a potentially relative URL against a base URL
 */
function resolveUrl(url, baseUrl) {
  if (!url) return void 0;
  try {
    return new URL(url, baseUrl).href;
  } catch {
    return url;
  }
}
/**
 * Check if a URL is external relative to a domain
 */
function isExternalUrl(url, baseDomain) {
  try {
    return new URL(url).hostname.replace(/^www\./, "") !== baseDomain;
  } catch {
    return false;
  }
}
/**
 * Extract protocol from URL
 */
function getProtocol(url) {
  try {
    return new URL(url).protocol;
  } catch {
    return "";
  }
}
/**
 * Get the path portion of a URL
 */
function getPath(url) {
  try {
    return new URL(url).pathname;
  } catch {
    return "";
  }
}
/**
 * Check if URL matches a pattern (supports * wildcard)
 */
function matchesUrlPattern(url, pattern) {
  if (!pattern.includes("*")) return url === pattern || url.startsWith(pattern);
  const regexPattern = pattern.replace(/[.+?^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
  return (/* @__PURE__ */ new RegExp(`^${regexPattern}`)).test(url);
}

//#endregion
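All of these helpers are exported (see the `exports` block at the end of the file). For instance, `normalizeUrl` strips the tracking parameters listed above and drops the trailing slash on non-root paths, while `matchesUrlPattern` anchors its pattern at the start of the URL:

```js
import { normalizeUrl, matchesUrlPattern } from "scrapex";

normalizeUrl("https://example.com/post/?utm_source=news&fbclid=abc");
// -> "https://example.com/post"

matchesUrlPattern("https://example.com/docs/intro", "https://example.com/docs/*");
// -> true ("*" is the only wildcard; matching is anchored at the start)
```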
//#region src/extractors/favicon.ts
/**
 * Extracts favicon URL from the page.
 * Checks multiple sources in order of preference.
 */
var FaviconExtractor = class {
  name = "favicon";
  priority = 70;
  async extract(context) {
    const { $, finalUrl } = context;
    for (const selector of [
      "link[rel=\"icon\"][type=\"image/svg+xml\"]",
      "link[rel=\"icon\"][sizes=\"192x192\"]",
      "link[rel=\"icon\"][sizes=\"180x180\"]",
      "link[rel=\"icon\"][sizes=\"128x128\"]",
      "link[rel=\"icon\"][sizes=\"96x96\"]",
      "link[rel=\"apple-touch-icon\"][sizes=\"180x180\"]",
      "link[rel=\"apple-touch-icon\"]",
      "link[rel=\"icon\"][sizes=\"32x32\"]",
      "link[rel=\"icon\"]",
      "link[rel=\"shortcut icon\"]"
    ]) {
      const href = $(selector).first().attr("href");
      if (href) return { favicon: resolveUrl(finalUrl, href) };
    }
    try {
      const url = new URL(finalUrl);
      return { favicon: `${url.protocol}//${url.host}/favicon.ico` };
    } catch {
      return {};
    }
  }
};

//#endregion
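Note an apparent argument swap here: the helper is defined as `resolveUrl(url, baseUrl)` in src/utils/url.ts above, but this extractor calls `resolveUrl(finalUrl, href)`. Because `finalUrl` is already absolute (and a relative `href` is not a valid base), that call returns the page URL rather than the icon URL whenever `href` is relative; LinksExtractor below uses the expected order. For illustration:

```js
// resolveUrl(url, baseUrl) resolves its FIRST argument against the second.
const href = "/icons/favicon.svg";
const finalUrl = "https://example.com/post";

resolveUrl(href, finalUrl); // "https://example.com/icons/favicon.svg" (expected)
resolveUrl(finalUrl, href); // "https://example.com/post" (what the code above returns)
```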
//#region src/extractors/jsonld.ts
/**
 * Extracts JSON-LD structured data from the page.
 * Also extracts additional metadata from structured data.
 */
var JsonLdExtractor = class {
  name = "jsonld";
  priority = 80;
  async extract(context) {
    const { $ } = context;
    const jsonLd = [];
    $("script[type=\"application/ld+json\"]").each((_, el) => {
      const content = $(el).html();
      if (!content) return;
      try {
        const parsed = JSON.parse(content);
        if (Array.isArray(parsed)) jsonLd.push(...parsed);
        else if (typeof parsed === "object" && parsed !== null) jsonLd.push(parsed);
      } catch {}
    });
    if (jsonLd.length === 0) return {};
    return {
      jsonLd,
      ...this.extractMetadata(jsonLd)
    };
  }
  extractMetadata(jsonLd) {
    const result = {};
    for (const item of jsonLd) {
      const type = this.getType(item);
      if (type?.match(/Article|BlogPosting|NewsArticle|WebPage/i)) {
        result.title = result.title || this.getString(item, "headline", "name");
        result.description = result.description || this.getString(item, "description");
        result.author = result.author || this.getAuthor(item);
        result.publishedAt = result.publishedAt || this.getString(item, "datePublished");
        result.modifiedAt = result.modifiedAt || this.getString(item, "dateModified");
        result.image = result.image || this.getImage(item);
      }
      if (type === "Organization") result.siteName = result.siteName || this.getString(item, "name");
      if (type === "Product") {
        result.title = result.title || this.getString(item, "name");
        result.description = result.description || this.getString(item, "description");
        result.image = result.image || this.getImage(item);
      }
      if (type === "SoftwareApplication") {
        result.title = result.title || this.getString(item, "name");
        result.description = result.description || this.getString(item, "description");
      }
      const keywords = this.getKeywords(item);
      if (keywords.length > 0) result.keywords = [...result.keywords || [], ...keywords];
    }
    if (result.keywords) result.keywords = [...new Set(result.keywords)];
    return result;
  }
  getType(item) {
    const type = item["@type"];
    if (typeof type === "string") return type;
    if (Array.isArray(type)) return type[0];
  }
  getString(item, ...keys) {
    for (const key of keys) {
      const value = item[key];
      if (typeof value === "string") return value;
      if (typeof value === "object" && value !== null && "@value" in value) return String(value["@value"]);
    }
  }
  getAuthor(item) {
    const author = item.author;
    if (typeof author === "string") return author;
    if (Array.isArray(author)) return author.map((a) => typeof a === "string" ? a : this.getString(a, "name")).filter(Boolean).join(", ") || void 0;
    if (typeof author === "object" && author !== null) {
      const authorObj = author;
      return this.getString(authorObj, "name") || void 0;
    }
  }
  getImage(item) {
    const image = item.image;
    if (typeof image === "string") return image;
    if (Array.isArray(image) && image.length > 0) return this.getImage({ image: image[0] });
    if (typeof image === "object" && image !== null) {
      const imageObj = image;
      return this.getString(imageObj, "url", "contentUrl") || void 0;
    }
  }
  getKeywords(item) {
    const keywords = item.keywords;
    if (typeof keywords === "string") return keywords.split(",").map((k) => k.trim()).filter(Boolean);
    if (Array.isArray(keywords)) return keywords.filter((k) => typeof k === "string");
    return [];
  }
};

//#endregion
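For reference, an Article-family node like the following (as it would appear inside a `<script type="application/ld+json">` tag) feeds the Article branch of `extractMetadata` above; the payload is illustrative:

```js
const node = {
  "@type": "NewsArticle",
  headline: "Release notes",
  datePublished: "2024-01-01",
  author: { "@type": "Person", name: "Jane Doe" },
  keywords: "scraping, metadata"
};
// -> title: "Release notes", author: "Jane Doe", publishedAt: "2024-01-01",
//    keywords: ["scraping", "metadata"] (deduplicated across all nodes)
```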
//#region src/extractors/links.ts
/**
 * Extracts links from the page content.
 * Filters out navigation/footer links and focuses on content links.
 */
var LinksExtractor = class {
  name = "links";
  priority = 30;
  async extract(context) {
    const { $, finalUrl } = context;
    const links = [];
    const seen = /* @__PURE__ */ new Set();
    const contentArea = $("article, main, [role=\"main\"]").first();
    const container = contentArea.length > 0 ? contentArea : $("body");
    const skipSelectors = "nav, header, footer, aside, [role=\"navigation\"], [class*=\"nav\"], [class*=\"footer\"], [class*=\"header\"], [class*=\"sidebar\"], [class*=\"menu\"]";
    container.find("a[href]").each((_, el) => {
      const $el = $(el);
      if ($el.closest(skipSelectors).length > 0) return;
      const href = $el.attr("href");
      if (!href) return;
      if (href.startsWith("#") || href.startsWith("javascript:") || href.startsWith("mailto:") || href.startsWith("tel:")) return;
      const resolvedUrl = resolveUrl(href, finalUrl);
      if (!resolvedUrl || !isValidUrl(resolvedUrl)) return;
      if (seen.has(resolvedUrl)) return;
      seen.add(resolvedUrl);
      const text = $el.text().trim() || $el.attr("title") || $el.attr("aria-label") || "";
      if (text.length < 2) return;
      const baseDomain = extractDomain(finalUrl);
      links.push({
        url: resolvedUrl,
        text: text.slice(0, 200),
        isExternal: isExternalUrl(resolvedUrl, baseDomain)
      });
    });
    return { links: links.slice(0, 100) };
  }
};

//#endregion
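Each collected link carries `{ url, text, isExternal }`, with anchor text capped at 200 characters, the list capped at 100 entries, and navigation chrome excluded via `skipSelectors`. For example:

```js
import { scrape } from "scrapex";

const { links = [] } = await scrape("https://example.com/post");
const outbound = links.filter((link) => link.isExternal).map((link) => link.url);
```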
//#region src/extractors/meta.ts
/**
 * Extracts metadata from HTML meta tags, Open Graph, and Twitter cards.
 * Runs first to provide basic metadata for other extractors.
 */
var MetaExtractor = class {
  name = "meta";
  priority = 100;
  async extract(context) {
    const { $ } = context;
    const getMeta = (nameOrProperty) => {
      return ($(`meta[name="${nameOrProperty}"]`).attr("content") || $(`meta[property="${nameOrProperty}"]`).attr("content") || $(`meta[itemprop="${nameOrProperty}"]`).attr("content"))?.trim() || void 0;
    };
    const title = getMeta("og:title") || getMeta("twitter:title") || $("title").first().text().trim() || "";
    const description = getMeta("og:description") || getMeta("twitter:description") || getMeta("description") || "";
    const image = getMeta("og:image") || getMeta("twitter:image") || getMeta("twitter:image:src") || void 0;
    const canonicalUrl = $("link[rel=\"canonical\"]").attr("href") || getMeta("og:url") || context.finalUrl;
    const author = getMeta("author") || getMeta("article:author") || getMeta("twitter:creator") || $("[rel=\"author\"]").first().text().trim() || void 0;
    const siteName = getMeta("og:site_name") || getMeta("application-name") || void 0;
    const publishedAt = getMeta("article:published_time") || getMeta("datePublished") || getMeta("date") || $("time[datetime]").first().attr("datetime") || void 0;
    const modifiedAt = getMeta("article:modified_time") || getMeta("dateModified") || void 0;
    const language = $("html").attr("lang") || getMeta("og:locale") || getMeta("language") || void 0;
    const keywordsRaw = getMeta("keywords") || getMeta("article:tag") || "";
    return {
      title,
      description,
      image,
      canonicalUrl,
      author,
      siteName,
      publishedAt,
      modifiedAt,
      language,
      keywords: keywordsRaw ? keywordsRaw.split(",").map((k) => k.trim()).filter(Boolean) : []
    };
  }
};

//#endregion
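Each field resolves through a fallback chain: Open Graph first, then Twitter cards, then generic meta tags. One subtlety: because `mergeResults` spreads every key a later extractor returns, an explicitly returned `undefined` (such as ContentExtractor's `title: article.title || void 0`) replaces an earlier value rather than leaving it in place:

```js
// Spread semantics behind mergeResults(): an explicit undefined overwrites.
const merged = { ...{ title: "From og:title" }, ...{ title: void 0 } };
console.log(merged.title); // undefined
```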
//#region src/extractors/index.ts
/**
 * Default extractors in priority order.
 * Higher priority runs first.
 */
function createDefaultExtractors() {
  return [
    new MetaExtractor(),
    new JsonLdExtractor(),
    new FaviconExtractor(),
    new ContentExtractor(),
    new LinksExtractor()
  ];
}
/**
 * Sort extractors by priority (higher first).
 */
function sortExtractors(extractors) {
  return [...extractors].sort((a, b) => (b.priority ?? 0) - (a.priority ?? 0));
}

//#endregion
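Custom extractors passed via options are appended to these defaults unless `replaceDefaultExtractors` is set, then everything is ordered by `sortExtractors` — higher priority first (meta 100, jsonld 80, favicon 70, content 50, links 30). A sketch, with an illustrative extractor name:

```js
import { scrape } from "scrapex";

const result = await scrape("https://example.com", {
  extractors: [{
    name: "canonical-host", // illustrative, not shipped by the package
    priority: 5,            // runs last, after the links extractor
    async extract(ctx) {
      return { custom: { host: new URL(ctx.finalUrl).host } };
    }
  }]
});
console.log(result.custom?.host);
```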
//#region src/fetchers/types.ts
/**
 * Default user agent string
 */
const DEFAULT_USER_AGENT = "Scrapex-Bot/2.0 (+https://github.com/developer-rakeshpaul/scrapex)";
/**
 * Default timeout in milliseconds
 */
const DEFAULT_TIMEOUT = 1e4;

//#endregion
//#region src/fetchers/fetch.ts
/**
 * Default fetcher using native fetch API.
 * Works in Node.js 18+ without polyfills.
 */
var NativeFetcher = class {
  name = "native-fetch";
  async fetch(url, options = {}) {
    const { timeout = DEFAULT_TIMEOUT, userAgent = DEFAULT_USER_AGENT, headers = {} } = options;
    let parsedUrl;
    try {
      parsedUrl = new URL(url);
    } catch {
      throw new require_enhancer.ScrapeError(`Invalid URL: ${url}`, "INVALID_URL");
    }
    if (!["http:", "https:"].includes(parsedUrl.protocol)) throw new require_enhancer.ScrapeError(`Invalid protocol: ${parsedUrl.protocol}`, "INVALID_URL");
    const controller = new AbortController();
    const timeoutId = setTimeout(() => controller.abort(), timeout);
    try {
      const response = await fetch(url, {
        signal: controller.signal,
        headers: {
          "User-Agent": userAgent,
          Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
          "Accept-Language": "en-US,en;q=0.5",
          ...headers
        },
        redirect: "follow"
      });
      clearTimeout(timeoutId);
      if (!response.ok) {
        if (response.status === 404) throw new require_enhancer.ScrapeError(`Page not found: ${url}`, "NOT_FOUND", 404);
        if (response.status === 403 || response.status === 401) throw new require_enhancer.ScrapeError(`Access blocked: ${url}`, "BLOCKED", response.status);
        if (response.status === 429) throw new require_enhancer.ScrapeError(`Rate limited: ${url}`, "BLOCKED", 429);
        throw new require_enhancer.ScrapeError(`HTTP error ${response.status}: ${url}`, "FETCH_FAILED", response.status);
      }
      const contentType = response.headers.get("content-type") || "";
      if (!contentType.includes("text/html") && !contentType.includes("application/xhtml")) throw new require_enhancer.ScrapeError(`Unexpected content type: ${contentType}`, "PARSE_ERROR");
      const html = await response.text();
      const responseHeaders = {};
      response.headers.forEach((value, key) => {
        responseHeaders[key] = value;
      });
      return {
        html,
        finalUrl: response.url,
        statusCode: response.status,
        contentType,
        headers: responseHeaders
      };
    } catch (error) {
      clearTimeout(timeoutId);
      if (error instanceof require_enhancer.ScrapeError) throw error;
      if (error instanceof Error && error.name === "AbortError") throw new require_enhancer.ScrapeError(`Request timed out after ${timeout}ms`, "TIMEOUT");
      if (error instanceof Error) throw new require_enhancer.ScrapeError(`Fetch failed: ${error.message}`, "FETCH_FAILED", void 0, error);
      throw new require_enhancer.ScrapeError("Unknown fetch error", "FETCH_FAILED");
    }
  }
};
/**
 * Default fetcher instance
 */
const defaultFetcher = new NativeFetcher();

//#endregion
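The only fetcher contract visible in this bundle is `{ name, fetch(url, options) }` returning `{ html, finalUrl, statusCode, contentType, headers }`, so a drop-in replacement (say, one wrapping a headless browser) can be sketched as a plain object; this example is illustrative:

```js
const myFetcher = {
  name: "my-fetcher",
  async fetch(url) {
    const res = await fetch(url, { headers: { "User-Agent": "MyBot/1.0" } });
    return {
      html: await res.text(),
      finalUrl: res.url,
      statusCode: res.status,
      contentType: res.headers.get("content-type") || "",
      headers: {}
    };
  }
};
// Wired in through options: see `options.fetcher ?? defaultFetcher` in scrape() below.
```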
//#region src/fetchers/robots.ts
/**
 * Check if URL is allowed by robots.txt
 *
 * @param url - The URL to check
 * @param userAgent - User agent to check rules for
 * @returns Whether the URL is allowed and optional reason
 */
async function checkRobotsTxt(url, userAgent = DEFAULT_USER_AGENT) {
  try {
    const parsedUrl = new URL(url);
    const robotsUrl = `${parsedUrl.protocol}//${parsedUrl.host}/robots.txt`;
    const response = await fetch(robotsUrl, {
      headers: { "User-Agent": userAgent },
      signal: AbortSignal.timeout(5e3)
    });
    if (!response.ok) return { allowed: true };
    const allowed = isPathAllowed(parseRobotsTxt(await response.text(), userAgent), parsedUrl.pathname + parsedUrl.search);
    return {
      allowed,
      reason: allowed ? void 0 : "Blocked by robots.txt"
    };
  } catch {
    return { allowed: true };
  }
}
/**
 * Parse robots.txt content for a specific user agent
 */
function parseRobotsTxt(content, userAgent) {
  const rules = {
    disallow: [],
    allow: []
  };
  const lines = content.split("\n");
  const botName = userAgent.split(/[\s/]/)[0]?.toLowerCase() || "";
  let currentAgent = "";
  let isMatchingAgent = false;
  let hasFoundSpecificAgent = false;
  for (const rawLine of lines) {
    const line = rawLine.trim();
    if (!line || line.startsWith("#")) continue;
    const colonIndex = line.indexOf(":");
    if (colonIndex === -1) continue;
    const directive = line.slice(0, colonIndex).trim().toLowerCase();
    const value = line.slice(colonIndex + 1).trim();
    if (directive === "user-agent") {
      currentAgent = value.toLowerCase();
      isMatchingAgent = currentAgent === "*" || currentAgent === botName || botName.includes(currentAgent);
      if (currentAgent !== "*" && isMatchingAgent) {
        hasFoundSpecificAgent = true;
        rules.disallow = [];
        rules.allow = [];
      }
    } else if (isMatchingAgent && (!hasFoundSpecificAgent || currentAgent !== "*")) {
      if (directive === "disallow" && value) rules.disallow.push(value);
      else if (directive === "allow" && value) rules.allow.push(value);
    }
  }
  return rules;
}
/**
 * Check if a path is allowed based on robots.txt rules
 */
function isPathAllowed(rules, path) {
  if (rules.disallow.length === 0 && rules.allow.length === 0) return true;
  for (const pattern of rules.allow) if (matchesPattern(path, pattern)) return true;
  for (const pattern of rules.disallow) if (matchesPattern(path, pattern)) return false;
  return true;
}
/**
 * Check if a path matches a robots.txt pattern
 */
function matchesPattern(path, pattern) {
  if (!pattern) return false;
  if (pattern.endsWith("*")) return path.startsWith(pattern.slice(0, -1));
  if (pattern.endsWith("$")) return path === pattern.slice(0, -1);
  if (pattern.includes("*")) return (/* @__PURE__ */ new RegExp(`^${pattern.replace(/\*/g, ".*").replace(/\?/g, "\\?")}.*`)).test(path);
  return path.startsWith(pattern);
}

//#endregion
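The design is fail-open: a missing or unreadable robots.txt, a non-OK response, or the 5-second timeout all yield `{ allowed: true }`, and `Allow` rules win over `Disallow`. The checker is exported, so it can be used on its own:

```js
import { checkRobotsTxt } from "scrapex";

const { allowed, reason } = await checkRobotsTxt("https://example.com/private/page");
if (!allowed) console.warn(reason); // "Blocked by robots.txt"
```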
//#region src/core/scrape.ts
/**
 * Scrape a URL and extract metadata and content.
 *
 * @param url - The URL to scrape
 * @param options - Scraping options
 * @returns Scraped data with metadata and content
 *
 * @example
 * ```ts
 * const result = await scrape('https://example.com/article');
 * console.log(result.title, result.content);
 * ```
 */
async function scrape(url, options = {}) {
  const startTime = Date.now();
  if (!isValidUrl(url)) throw new require_enhancer.ScrapeError("Invalid URL provided", "INVALID_URL");
  const normalizedUrl = normalizeUrl(url);
  if (options.respectRobots) {
    const robotsResult = await checkRobotsTxt(normalizedUrl, options.userAgent);
    if (!robotsResult.allowed) throw new require_enhancer.ScrapeError(`URL blocked by robots.txt: ${robotsResult.reason || "disallowed"}`, "ROBOTS_BLOCKED");
  }
  const fetchResult = await (options.fetcher ?? defaultFetcher).fetch(normalizedUrl, {
    timeout: options.timeout,
    userAgent: options.userAgent
  });
  await preloadJsdom();
  let context = createExtractionContext(normalizedUrl, fetchResult.finalUrl, fetchResult.html, options);
  let extractors;
  if (options.replaceDefaultExtractors) extractors = options.extractors ?? [];
  else {
    const defaults = createDefaultExtractors();
    extractors = options.extractors ? [...defaults, ...options.extractors] : defaults;
  }
  extractors = sortExtractors(extractors);
  for (const extractor of extractors) try {
    const extracted = await extractor.extract(context);
    context = mergeResults(context, extracted);
  } catch (error) {
    console.error(`Extractor "${extractor.name}" failed:`, error);
    context = mergeResults(context, { error: context.results.error ? `${context.results.error}; ${extractor.name}: ${error instanceof Error ? error.message : String(error)}` : `${extractor.name}: ${error instanceof Error ? error.message : String(error)}` });
  }
  const intermediateResult = {
    url: normalizedUrl,
    canonicalUrl: context.results.canonicalUrl || fetchResult.finalUrl,
    domain: extractDomain(fetchResult.finalUrl),
    title: context.results.title || "",
    description: context.results.description || "",
    image: context.results.image,
    favicon: context.results.favicon,
    content: context.results.content || "",
    textContent: context.results.textContent || "",
    excerpt: context.results.excerpt || "",
    wordCount: context.results.wordCount || 0,
    author: context.results.author,
    publishedAt: context.results.publishedAt,
    modifiedAt: context.results.modifiedAt,
    siteName: context.results.siteName,
    language: context.results.language,
    contentType: context.results.contentType || "unknown",
    keywords: context.results.keywords || [],
    jsonLd: context.results.jsonLd,
    links: context.results.links,
    custom: context.results.custom,
    scrapedAt: (/* @__PURE__ */ new Date()).toISOString(),
    scrapeTimeMs: 0,
    error: context.results.error
  };
  if (options.llm && options.enhance && options.enhance.length > 0) try {
    const enhanced = await require_enhancer.enhance(intermediateResult, options.llm, options.enhance);
    Object.assign(intermediateResult, enhanced);
  } catch (error) {
    console.error("LLM enhancement failed:", error);
    intermediateResult.error = intermediateResult.error ? `${intermediateResult.error}; LLM: ${error instanceof Error ? error.message : String(error)}` : `LLM: ${error instanceof Error ? error.message : String(error)}`;
  }
  if (options.llm && options.extract) try {
    intermediateResult.extracted = await require_enhancer.extract(intermediateResult, options.llm, options.extract);
  } catch (error) {
    console.error("LLM extraction failed:", error);
    intermediateResult.error = intermediateResult.error ? `${intermediateResult.error}; LLM extraction: ${error instanceof Error ? error.message : String(error)}` : `LLM extraction: ${error instanceof Error ? error.message : String(error)}`;
  }
  const scrapeTimeMs = Date.now() - startTime;
  return {
    ...intermediateResult,
    scrapeTimeMs
  };
}
/**
 * Scrape from raw HTML string (no fetch).
 *
 * @param html - The HTML content
 * @param url - The URL (for resolving relative links)
 * @param options - Scraping options
 * @returns Scraped data with metadata and content
 *
 * @example
 * ```ts
 * const html = await fetchSomehow('https://example.com');
 * const result = await scrapeHtml(html, 'https://example.com');
 * ```
 */
async function scrapeHtml(html, url, options = {}) {
  const startTime = Date.now();
  if (!isValidUrl(url)) throw new require_enhancer.ScrapeError("Invalid URL provided", "INVALID_URL");
  const normalizedUrl = normalizeUrl(url);
  await preloadJsdom();
  let context = createExtractionContext(normalizedUrl, normalizedUrl, html, options);
  let extractors;
  if (options.replaceDefaultExtractors) extractors = options.extractors ?? [];
  else {
    const defaults = createDefaultExtractors();
    extractors = options.extractors ? [...defaults, ...options.extractors] : defaults;
  }
  extractors = sortExtractors(extractors);
  for (const extractor of extractors) try {
    const extracted = await extractor.extract(context);
    context = mergeResults(context, extracted);
  } catch (error) {
    console.error(`Extractor "${extractor.name}" failed:`, error);
    context = mergeResults(context, { error: context.results.error ? `${context.results.error}; ${extractor.name}: ${error instanceof Error ? error.message : String(error)}` : `${extractor.name}: ${error instanceof Error ? error.message : String(error)}` });
  }
  const scrapeTimeMs = Date.now() - startTime;
  const domain = extractDomain(normalizedUrl);
  return {
    url: normalizedUrl,
    canonicalUrl: context.results.canonicalUrl || normalizedUrl,
    domain,
    title: context.results.title || "",
    description: context.results.description || "",
    image: context.results.image,
    favicon: context.results.favicon,
    content: context.results.content || "",
    textContent: context.results.textContent || "",
    excerpt: context.results.excerpt || "",
    wordCount: context.results.wordCount || 0,
    author: context.results.author,
    publishedAt: context.results.publishedAt,
    modifiedAt: context.results.modifiedAt,
    siteName: context.results.siteName,
    language: context.results.language,
    contentType: context.results.contentType || "unknown",
    keywords: context.results.keywords || [],
    jsonLd: context.results.jsonLd,
    links: context.results.links,
    summary: context.results.summary,
    suggestedTags: context.results.suggestedTags,
    entities: context.results.entities,
    extracted: context.results.extracted,
    custom: context.results.custom,
    scrapedAt: (/* @__PURE__ */ new Date()).toISOString(),
    scrapeTimeMs,
    error: context.results.error
  };
}

//#endregion
exports.ContentExtractor = ContentExtractor;
exports.DEFAULT_TIMEOUT = DEFAULT_TIMEOUT;
exports.DEFAULT_USER_AGENT = DEFAULT_USER_AGENT;
exports.FaviconExtractor = FaviconExtractor;
exports.JsonLdExtractor = JsonLdExtractor;
exports.LinksExtractor = LinksExtractor;
exports.MetaExtractor = MetaExtractor;
exports.NativeFetcher = NativeFetcher;
exports.ScrapeError = require_enhancer.ScrapeError;
exports.__toESM = __toESM;
exports.checkRobotsTxt = checkRobotsTxt;
exports.createDefaultExtractors = createDefaultExtractors;
exports.createExtractionContext = createExtractionContext;
exports.defaultFetcher = defaultFetcher;
exports.extractDomain = extractDomain;
exports.getPath = getPath;
exports.getProtocol = getProtocol;
exports.isExternalUrl = isExternalUrl;
exports.isValidUrl = isValidUrl;
exports.matchesUrlPattern = matchesUrlPattern;
exports.mergeResults = mergeResults;
exports.normalizeUrl = normalizeUrl;
exports.resolveUrl = resolveUrl;
exports.scrape = scrape;
exports.scrapeHtml = scrapeHtml;
exports.sortExtractors = sortExtractors;
//# sourceMappingURL=index.cjs.map
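Taken together, the public surface of this entry point is `scrape`/`scrapeHtml` plus the extractor classes, fetcher, and URL helpers in the `exports` block. An end-to-end sketch using only option names that appear in the code above:

```js
import { scrape, scrapeHtml, ScrapeError } from "scrapex";

try {
  const page = await scrape("https://example.com/article?utm_source=mail", {
    respectRobots: true, // gate through checkRobotsTxt(); throws ROBOTS_BLOCKED
    timeout: 15000       // overrides DEFAULT_TIMEOUT (10s)
  });
  console.log(page.title, page.wordCount, page.scrapeTimeMs);
} catch (err) {
  if (err instanceof ScrapeError) console.error(err.message);
}

// Or skip the network layer when the HTML is already in hand:
const offline = await scrapeHtml("<html><head><title>Hi</title></head><body></body></html>", "https://example.com");
```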