defuddle-js 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1968 @@
1
+ var DefuddleLib = (() => {
2
+ var __defProp = Object.defineProperty;
3
+ var __defProps = Object.defineProperties;
4
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
5
+ var __getOwnPropDescs = Object.getOwnPropertyDescriptors;
6
+ var __getOwnPropNames = Object.getOwnPropertyNames;
7
+ var __getOwnPropSymbols = Object.getOwnPropertySymbols;
8
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
9
+ var __propIsEnum = Object.prototype.propertyIsEnumerable;
10
+ var __defNormalProp = (obj, key, value) => key in obj ? __defProp(obj, key, { enumerable: true, configurable: true, writable: true, value }) : obj[key] = value;
11
+ var __spreadValues = (a, b) => {
12
+ for (var prop in b || (b = {}))
13
+ if (__hasOwnProp.call(b, prop))
14
+ __defNormalProp(a, prop, b[prop]);
15
+ if (__getOwnPropSymbols)
16
+ for (var prop of __getOwnPropSymbols(b)) {
17
+ if (__propIsEnum.call(b, prop))
18
+ __defNormalProp(a, prop, b[prop]);
19
+ }
20
+ return a;
21
+ };
22
+ var __spreadProps = (a, b) => __defProps(a, __getOwnPropDescs(b));
23
+ var __export = (target, all) => {
24
+ for (var name in all)
25
+ __defProp(target, name, { get: all[name], enumerable: true });
26
+ };
27
+ var __copyProps = (to, from, except, desc) => {
28
+ if (from && typeof from === "object" || typeof from === "function") {
29
+ for (let key of __getOwnPropNames(from))
30
+ if (!__hasOwnProp.call(to, key) && key !== except)
31
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
32
+ }
33
+ return to;
34
+ };
35
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
36
+
37
+ // src/index.js
38
+ var index_exports = {};
39
+ __export(index_exports, {
40
+ Defuddle: () => Defuddle,
41
+ DefuddleResult: () => DefuddleResult
42
+ });
43
+
44
+ // src/schema-org.js
45
/**
 * Gathers every JSON-LD item found in `<script type="application/ld+json">`
 * elements of the document.
 *
 * A payload that carries an `@graph` array contributes each graph member; a
 * top-level array contributes each element; anything else contributes itself.
 * Every collected item is passed through `decodeStrings`.
 *
 * @param {Document} doc - Document to scan.
 * @returns {Array|null} The collected items, or null when nothing usable was found.
 */
function extractSchemaOrg(doc) {
  const ldScripts = doc.querySelectorAll('script[type="application/ld+json"]');
  if (!ldScripts.length) return null;

  const collected = [];
  for (const node of ldScripts) {
    const rawText = node.textContent || "";
    if (!rawText.trim()) continue;

    const data = parseJsonLd(rawText);
    if (!data) continue;

    // Decide which list of entries this payload contributes.
    let batch;
    if (Array.isArray(data["@graph"])) {
      batch = data["@graph"];
    } else if (Array.isArray(data)) {
      batch = data;
    } else {
      batch = [data];
    }

    for (const entry of batch) {
      collected.push(decodeStrings(entry));
    }
  }

  return collected.length > 0 ? collected : null;
}
64
/**
 * Parses the text of a JSON-LD script element.
 *
 * The text is first parsed as-is. Only when that fails is it cleaned of
 * block comments, `//` line comments and CDATA wrappers and parsed again.
 * Cleaning unconditionally would corrupt valid JSON whose string values
 * happen to contain comment-like sequences (for example glob patterns such as
 * "src/*.js and lib/*\/index.js").
 *
 * @param {string} content - Raw script text.
 * @returns {*} The parsed JSON value, or null when the text is empty or unparseable.
 */
function parseJsonLd(content) {
  const pristine = content.trim();
  if (!pristine) return null;
  try {
    return JSON.parse(pristine);
  } catch (e) {
    // Not valid JSON as written; fall through to the tolerant cleanup path.
  }

  let cleaned = content.replace(/\/\*[\s\S]*?\*\//g, "");
  cleaned = cleaned.replace(/^\s*\/\/.*$/gm, "");
  cleaned = cleaned.replace(/^\s*\/\/<!\[CDATA\[|\]\]>\/\/\s*$/gm, "");
  cleaned = cleaned.replace(/^\s*<!\[CDATA\[([\s\S]*?)\]\]>\s*$/, "$1");
  cleaned = cleaned.trim();
  if (!cleaned) return null;
  try {
    return JSON.parse(cleaned);
  } catch (e) {
    // Best-effort extraction: malformed structured data is simply ignored.
    return null;
  }
}
77
/**
 * Recursively decodes a small set of HTML entities (`&amp;`, `&lt;`, `&gt;`,
 * `&quot;`, `&#39;`, `&apos;`) in every string of a JSON-like value.
 *
 * Decoding is done in a single pass so that text is unescaped exactly once:
 * `&amp;lt;` becomes `&lt;`, not `<`.
 *
 * @param {*} item - A string, array, plain object or other primitive.
 * @returns {*} A decoded copy for strings/arrays/objects; other values are returned as-is.
 */
function decodeStrings(item) {
  if (typeof item === "string") {
    const entityText = { amp: "&", lt: "<", gt: ">", quot: '"', "#39": "'", apos: "'" };
    return item.replace(/&(amp|lt|gt|quot|#39|apos);/g, (whole, name) => entityText[name]);
  }
  if (Array.isArray(item)) {
    return item.map((entry) => decodeStrings(entry));
  }
  if (item && typeof item === "object") {
    const result = {};
    for (const [key, val] of Object.entries(item)) {
      result[key] = decodeStrings(val);
    }
    return result;
  }
  return item;
}
91
/**
 * Returns the first non-empty value of `property` among the schema items whose
 * type is acceptable.
 *
 * An item is acceptable when `types` is empty, when the item declares no
 * `@type`, or when any of its declared types is listed in `types`. JSON-LD
 * allows `@type` to be either a single string or an array of strings; both
 * forms are handled.
 *
 * @param {Array|null} items - Schema items, or null/undefined.
 * @param {string[]} types - Accepted `@type` values; an empty list accepts all.
 * @param {string} property - Property name to read from the item.
 * @returns {*} The first value that is not undefined, null or "", else null.
 */
function getSchemaProperty(items, types, property) {
  if (!items) return null;
  for (const item of items) {
    if (!item || typeof item !== "object") continue;

    // Normalise "@type" to a list so multi-typed items can match.
    const declared = [].concat(item["@type"] || []);
    const typeRejected =
      types.length > 0 &&
      declared.length > 0 &&
      !declared.some((type) => types.includes(type));
    if (typeRejected) continue;

    const val = item[property];
    if (val !== void 0 && val !== null && val !== "") return val;
  }
  return null;
}
102
+
103
+ // src/url-resolver.js
104
+ function resolveUrls(root, doc, url) {
105
+ if (!url) return;
106
+ let baseUrl = url;
107
+ const baseEl = doc.querySelector("base[href]");
108
+ if (baseEl) {
109
+ const baseHref = baseEl.getAttribute("href");
110
+ if (baseHref) {
111
+ const resolved = resolveUrl(baseHref, url);
112
+ if (resolved) baseUrl = resolved;
113
+ }
114
+ }
115
+ for (const el of root.querySelectorAll("[href]")) {
116
+ const href = el.getAttribute("href");
117
+ if (href && !href.startsWith("#")) {
118
+ el.setAttribute("href", resolveUrl(href, baseUrl));
119
+ }
120
+ }
121
+ for (const el of root.querySelectorAll("[src]")) {
122
+ const src = el.getAttribute("src");
123
+ if (src) el.setAttribute("src", resolveUrl(src, baseUrl));
124
+ }
125
+ for (const el of root.querySelectorAll("[srcset]")) {
126
+ const srcset = el.getAttribute("srcset");
127
+ if (srcset) el.setAttribute("srcset", resolveSrcset(srcset, baseUrl));
128
+ }
129
+ for (const el of root.querySelectorAll("[poster]")) {
130
+ const poster = el.getAttribute("poster");
131
+ if (poster) el.setAttribute("poster", resolveUrl(poster, baseUrl));
132
+ }
133
+ }
134
/**
 * Resolves a possibly-relative URL against a base.
 *
 * The trimmed input is returned unchanged when it uses a `javascript:`,
 * `data:` or `vbscript:` scheme, when it already has a `scheme://` prefix,
 * when it is a pure fragment ("#..."), or when the URL constructor rejects it.
 *
 * @param {string} url - URL to resolve.
 * @param {string} base - Base URL used for relative input.
 * @returns {string} The absolute URL, or the trimmed input when not resolvable.
 */
function resolveUrl(url, base) {
  const candidate = url.trim();

  const isInlineScheme = /^(javascript|data|vbscript):/i.test(candidate);
  const hasAuthorityScheme = /^[a-z][a-z0-9+\-.]*:\/\//i.test(candidate);
  if (isInlineScheme || hasAuthorityScheme || candidate.startsWith("#")) {
    return candidate;
  }

  try {
    return new URL(candidate, base).href;
  } catch (e) {
    return candidate;
  }
}
145
/**
 * Resolves every URL inside a `srcset` attribute value against `base`.
 *
 * Candidates are first located with a pattern that requires a trailing
 * width/density descriptor (e.g. "480w", "2x"), which lets URLs that contain
 * commas survive. When that pattern finds nothing, the value is split on
 * commas and the first whitespace-separated token of each piece is resolved.
 *
 * @param {string} srcset - Raw `srcset` attribute value.
 * @param {string} base - Base URL for resolution.
 * @returns {string} The rewritten value, candidates joined with ", ".
 */
function resolveSrcset(srcset, base) {
  const candidateRe = /(.+?)\s+(\d+(?:\.\d+)?[wx])(?:,|$)/g;
  const resolved = [];
  for (let hit = candidateRe.exec(srcset); hit !== null; hit = candidateRe.exec(srcset)) {
    // A match after the first one may begin with the separating comma.
    const location = hit[1].trim().replace(/^,\s*/, "");
    resolved.push(`${resolveUrl(location, base)} ${hit[2]}`);
  }
  if (resolved.length > 0) return resolved.join(", ");

  // Fallback: no descriptor-bearing candidate was recognised.
  const pieces = [];
  for (const piece of srcset.split(",")) {
    const tokens = piece.trim().split(/\s+/);
    if (tokens[0]) tokens[0] = resolveUrl(tokens[0], base);
    pieces.push(tokens.join(" "));
  }
  return pieces.join(", ");
}
161
+
162
+ // src/metadata.js
163
// schema.org "@type" values accepted when reading article-level properties.
var ARTICLE_TYPES = [
  "Article",
  "NewsArticle",
  "BlogPosting",
  "WebPage"
];
// Matches an English month name or its common abbreviation as a whole word.
var DATE_RE = /\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\b/i;
165
/**
 * Builds a flat list describing every `<meta>` element that has a non-empty
 * `content` attribute.
 *
 * @param {Document} doc - Document to scan.
 * @returns {Array<{name: (string|null), property: (string|null), content: string}>}
 *   One record per element; `content` is passed through `decodeHtmlEntities`,
 *   and a missing or empty `name`/`property` is reported as null.
 */
function collectMetaTags(doc) {
  const metaNodes = Array.from(doc.querySelectorAll("meta"));
  return metaNodes
    .filter((node) => Boolean(node.getAttribute("content")))
    .map((node) => ({
      name: node.getAttribute("name") || null,
      property: node.getAttribute("property") || null,
      content: decodeHtmlEntities(node.getAttribute("content"))
    }));
}
178
/**
 * Assembles the page metadata record from meta tags, schema.org data and the
 * document itself by delegating to the individual `extract*` helpers.
 *
 * @param {Document} doc - Document being processed.
 * @param {string} url - URL of the page (may be empty).
 * @param {Array|null} schemaOrgData - Schema items, as passed to `getSchemaProperty`.
 * @param {Array} metaTags - Records with `name`, `property` and `content` fields.
 * @returns {{title, description, author, published, site, domain, favicon, image, language}}
 */
function extractMetadata(doc, url, schemaOrgData, metaTags) {
  // Case-insensitive lookup of the first meta tag whose `attr` equals `value`.
  const getMeta = (value, attr) => {
    const wanted = value.toLowerCase();
    const hit = metaTags.find((tag) => tag[attr] && tag[attr].toLowerCase() === wanted);
    return hit ? hit.content || null : null;
  };
  const getSchema = (property) => getSchemaProperty(schemaOrgData, ARTICLE_TYPES, property);

  // The site name is computed first because title cleaning depends on it.
  const site = extractSiteName(doc, metaTags, schemaOrgData, getMeta);
  const title = extractTitle(doc, metaTags, schemaOrgData, getMeta, getSchema, site);
  const description = extractDescription(metaTags, getSchema, getMeta);
  const author = extractAuthor(doc, metaTags, schemaOrgData, getMeta);
  const published = extractPublished(doc, metaTags, schemaOrgData, getMeta, getSchema);
  const domain = extractDomain(doc, url, getMeta);
  const favicon = extractFavicon(doc, url);
  const image = extractImage(doc, metaTags, getSchema, getMeta);
  const language = extractLanguage(doc, metaTags, getMeta);

  return { title, description, author, published, site, domain, favicon, image, language };
}
202
/**
 * Picks the page title from the first source that yields a value and strips
 * the site name from it via `cleanTitle`.
 *
 * Sources, in priority order: `og:title`, `twitter:title`, schema `headline`,
 * `<meta name="title">`, then the trimmed text of the `<title>` element.
 *
 * @param {Document} doc
 * @param {Array} metaTags - Unused here; kept for signature parity with sibling extractors.
 * @param {Array|null} schemaOrgData - Unused here; kept for signature parity.
 * @param {function(string, string): (string|null)} getMeta - Meta tag lookup.
 * @param {function(string): *} getSchema - Schema property lookup.
 * @param {string} siteName - Site name to strip from the title.
 * @returns {string} The cleaned title, or "" when no source has one.
 */
function extractTitle(doc, metaTags, schemaOrgData, getMeta, getSchema, siteName) {
  const sources = [
    () => getMeta("og:title", "property"),
    () => getMeta("twitter:title", "name"),
    () => getSchema("headline"),
    () => getMeta("title", "name"),
    () => {
      const titleNode = doc.querySelector("title");
      return titleNode ? titleNode.textContent.trim() : "";
    }
  ];

  let raw = "";
  for (const readSource of sources) {
    const value = readSource();
    if (value) {
      raw = value;
      break;
    }
  }
  return cleanTitle(raw, siteName);
}
206
/**
 * Removes the site name from a title such as "Headline | Site".
 *
 * For each known separator, the title is cut at the separator's first
 * occurrence; if either side fuzzily matches the site name, the other side is
 * returned. The title is returned unchanged when nothing matches or when
 * either argument is empty.
 *
 * @param {string} title - Raw title.
 * @param {string} siteName - Site name to remove.
 * @returns {string} The title without the site name portion.
 */
function cleanTitle(title, siteName) {
  if (!title || !siteName) return title;

  const separators = ["|", " / ", " \xB7 ", " \u2013 ", " \u2014 ", " - ", ": "];
  for (const sep of separators) {
    const cutAt = title.indexOf(sep);
    if (cutAt === -1) continue;

    const before = title.slice(0, cutAt).trim();
    const after = title.slice(cutAt + sep.length).trim();
    if (fuzzyMatch(after, siteName)) return before;
    if (fuzzyMatch(before, siteName)) return after;
  }
  return title;
}
219
/**
 * Loosely compares two labels, ignoring case and anything that is not an
 * ASCII letter or digit.
 *
 * Returns true when the normalised forms are equal, or when `b` normalises to
 * more than two characters and one normalised form contains the other.
 *
 * When either side has no ASCII alphanumerics at all (punctuation only, or
 * text in a non-Latin script) the normalised form is empty and carries no
 * information: an empty string equals every other empty string and is
 * contained in every string. In that case the comparison falls back to
 * case-insensitive equality of the trimmed raw text.
 *
 * @param {string} a
 * @param {string} b
 * @returns {boolean}
 */
function fuzzyMatch(a, b) {
  const norm = (s) => s.toLowerCase().replace(/[^a-z0-9]/g, "");
  const na = norm(a);
  const nb = norm(b);

  if (!na || !nb) {
    const rawA = a.trim().toLowerCase();
    const rawB = b.trim().toLowerCase();
    return rawA !== "" && rawA === rawB;
  }

  if (na === nb) return true;
  return nb.length > 2 && (na.includes(nb) || nb.includes(na));
}
226
/**
 * Returns the page description from the first source that has one:
 * `og:description`, `twitter:description`, `<meta name="description">`, then
 * the schema `description` property.
 *
 * @param {Array} metaTags - Unused here; kept for signature parity with sibling extractors.
 * @param {function(string): *} getSchema - Schema property lookup.
 * @param {function(string, string): (string|null)} getMeta - Meta tag lookup.
 * @returns {*} The description, or "" when none is available.
 */
function extractDescription(metaTags, getSchema, getMeta) {
  const fromMeta =
    getMeta("og:description", "property") ||
    getMeta("twitter:description", "name") ||
    getMeta("description", "name");
  if (fromMeta) return fromMeta;
  return getSchema("description") || "";
}
229
/**
 * Determines the article author.
 *
 * Sources are tried in order: a list of author-related `<meta name>` tags,
 * the schema.org author, the first element matching each byline-like selector
 * (accepted only when its text has at most six words), and finally a
 * "By ..." line among the siblings of the first `<h1>`.
 *
 * @param {Document} doc
 * @param {Array} metaTags - Unused here; kept for signature parity with sibling extractors.
 * @param {Array|null} schemaOrgData - Schema items passed to `getSchemaAuthor`.
 * @param {function(string, string): (string|null)} getMeta - Meta tag lookup.
 * @returns {string} The author, or "" when none is found.
 */
function extractAuthor(doc, metaTags, schemaOrgData, getMeta) {
  const metaNames = ["author", "sailthru.author", "citation_author", "dc.creator", "byl"];
  for (const metaName of metaNames) {
    const fromMeta = getMeta(metaName, "name");
    if (fromMeta) return fromMeta;
  }

  const fromSchema = getSchemaAuthor(schemaOrgData);
  if (fromSchema) return fromSchema;

  const bylineSelectors = [
    '[class*="author"]:not([class*="author-bio"])',
    '[rel="author"]',
    '[class*="byline"]',
    '[itemprop="author"]'
  ];
  for (const selector of bylineSelectors) {
    const node = doc.querySelector(selector);
    if (!node) continue;

    const text = (node.textContent || "").trim();
    if (!text) continue;

    // Longer text is more likely a bio or paragraph than a name.
    const wordCount = text.split(/\s+/).filter(Boolean).length;
    if (wordCount <= 6) return stripByPrefix(text);
  }

  return extractBylineNearH1(doc);
}
252
/**
 * Reads the author name from schema.org items.
 *
 * The `author` property may be a plain string, an object with a `name`, or an
 * array mixing both forms. For arrays the first entry that yields a name is
 * used, so lists of bare strings and lists whose first entry is only an
 * `@id` reference are handled as well.
 *
 * @param {Array|null} schemaOrgData - Schema items, or null/undefined.
 * @returns {string} The first author name found, or "" when there is none.
 */
function getSchemaAuthor(schemaOrgData) {
  if (!schemaOrgData) return "";

  // Extracts a display name from one author entry; "" when it has none.
  const nameOf = (entry) => {
    if (typeof entry === "string") return entry;
    if (entry && typeof entry === "object" && entry.name) return String(entry.name);
    return "";
  };

  for (const item of schemaOrgData) {
    if (!item || typeof item !== "object") continue;
    const author = item.author;
    if (!author) continue;

    const candidates = Array.isArray(author) ? author : [author];
    for (const candidate of candidates) {
      const name = nameOf(candidate);
      if (name) return name;
    }
  }
  return "";
}
267
/**
 * Looks for a short "By <name>" line among the siblings of the first `<h1>`.
 *
 * Children of the heading's parent are visited in document order, skipping
 * the heading itself; the scan stops after five non-matching siblings. A
 * sibling qualifies when its trimmed text starts with "by" followed by
 * whitespace and has at most eight words in total.
 *
 * @param {Document} doc
 * @returns {string} The text after "by", or "" when no byline is found.
 */
function extractBylineNearH1(doc) {
  const heading = doc.querySelector("h1");
  const container = heading ? heading.parentElement : null;
  if (!container) return "";

  let inspected = 0;
  for (const node of container.children) {
    if (node === heading) continue;

    const text = (node.textContent || "").trim();
    const byline = /^by\s+(.+)/i.exec(text);
    if (byline !== null && text.split(/\s+/).filter(Boolean).length <= 8) {
      return byline[1].trim();
    }

    inspected += 1;
    if (inspected >= 5) break;
  }
  return "";
}
283
/**
 * Drops a leading "by " (any case, any amount of whitespace) from a byline.
 *
 * @param {string} text - Byline text.
 * @returns {string} The trimmed remainder, or the input unchanged when it has no such prefix.
 */
function stripByPrefix(text) {
  const found = /^by\s+(.+)/i.exec(text);
  if (found === null) return text;
  return found[1].trim();
}
287
/**
 * Determines the publication date.
 *
 * Sources are tried in order: a list of date-related meta tags, the schema
 * `datePublished` property, the first `<time datetime>` element, and finally
 * any meta tag whose content mentions a month name and parses as a date.
 *
 * @param {Document} doc
 * @param {Array} metaTags - Meta tag records with a `content` field.
 * @param {Array|null} schemaOrgData - Unused here; kept for signature parity.
 * @param {function(string, string): (string|null)} getMeta - Meta tag lookup.
 * @param {function(string): *} getSchema - Schema property lookup.
 * @returns {string} The date as produced by `normalizeDate`/`parseNaturalDate`, or "".
 */
function extractPublished(doc, metaTags, schemaOrgData, getMeta, getSchema) {
  const metaSources = [
    ["article:published_time", "property"],
    ["article:published", "property"],
    ["date", "name"],
    ["citation_date", "name"],
    ["DC.date", "name"],
    ["pubdate", "name"]
  ];
  for (const [key, attr] of metaSources) {
    const fromMeta = getMeta(key, attr);
    if (fromMeta) return normalizeDate(fromMeta);
  }

  const fromSchema = getSchema("datePublished");
  if (fromSchema) return normalizeDate(String(fromSchema));

  const timeNode = doc.querySelector("time[datetime]");
  const stamp = timeNode ? timeNode.getAttribute("datetime") : null;
  if (stamp) return normalizeDate(stamp);

  // Last resort: any meta content that looks like a written-out date.
  for (const tag of metaTags) {
    if (!tag.content || !DATE_RE.test(tag.content)) continue;
    const parsed = parseNaturalDate(tag.content);
    if (parsed) return parsed;
  }
  return "";
}
305
/**
 * Converts a date string to ISO 8601 where possible.
 *
 * Input that starts with `YYYY-MM-DD` is handed to the Date constructor
 * directly; if it turns out to be invalid the trimmed input is returned.
 * Anything else goes through `parseNaturalDate`, again falling back to the
 * trimmed input.
 *
 * @param {string} raw - Date text.
 * @returns {string} An ISO timestamp, the trimmed input, or "" for blank input.
 */
function normalizeDate(raw) {
  const value = raw.trim();
  if (value === "") return "";

  const looksIso = /^\d{4}-\d{2}-\d{2}/.test(value);
  if (!looksIso) {
    return parseNaturalDate(value) || value;
  }

  try {
    // toISOString throws a RangeError for an invalid date.
    return new Date(value).toISOString();
  } catch (e) {
    return value;
  }
}
317
/**
 * Parses free-form date text with the built-in Date parser.
 *
 * @param {string} input - Date text.
 * @returns {string|null} An ISO timestamp when the text parses to a valid date
 *   whose (local) year is after 1900, otherwise null.
 */
function parseNaturalDate(input) {
  const parsed = new Date(input.trim());
  const isValid = !Number.isNaN(parsed.getTime());
  return isValid && parsed.getFullYear() > 1900 ? parsed.toISOString() : null;
}
325
/**
 * Determines the representative image URL for the page.
 *
 * Sources are tried in order: Open Graph / Twitter image meta tags, the
 * schema `image` property, and finally the first `<img>` in the body that
 * either declares no dimensions or is at least 200x100.
 *
 * The schema `image` may be a URL string, an object with a `url`, or an array
 * of either; for arrays the first entry that yields a URL is used.
 *
 * @param {Document} doc
 * @param {Array} metaTags - Unused here; kept for signature parity with sibling extractors.
 * @param {function(string): *} getSchema - Schema property lookup.
 * @param {function(string, string): (string|null)} getMeta - Meta tag lookup.
 * @returns {string} The image URL, or "" when none is found.
 */
function extractImage(doc, metaTags, getSchema, getMeta) {
  const metaImage =
    getMeta("og:image", "property") ||
    getMeta("og:image:url", "property") ||
    getMeta("twitter:image", "name") ||
    getMeta("twitter:image:src", "name") ||
    null;
  if (metaImage) return metaImage;

  // Extracts a URL from one schema image value; "" when it has none.
  const urlOf = (value) => {
    if (typeof value === "string") return value;
    if (Array.isArray(value)) {
      for (const entry of value) {
        const nested = urlOf(entry);
        if (nested) return nested;
      }
      return "";
    }
    if (value && value.url) return String(value.url);
    return "";
  };
  const schemaImage = urlOf(getSchema("image"));
  if (schemaImage) return schemaImage;

  for (const img of doc.querySelectorAll("body img[src]")) {
    const w = parseInt(img.getAttribute("width") || "0", 10);
    const h = parseInt(img.getAttribute("height") || "0", 10);
    // Accept images that are large enough or that declare no usable size.
    if (w >= 200 && h >= 100 || !w && !h) {
      return img.getAttribute("src") || "";
    }
  }
  return "";
}
343
/**
 * Determines the site name.
 *
 * Sources are tried in order: the `og:site_name` meta tag, the schema
 * `publisher` (a string, or an object with a `name`), and finally the last
 * segment of the `<title>` text after a known separator, accepted only when
 * it has at most four words.
 *
 * @param {Document} doc
 * @param {Array} metaTags - Unused here; kept for signature parity with sibling extractors.
 * @param {Array|null} schemaOrgData - Schema items passed to `getSchemaProperty`.
 * @param {function(string, string): (string|null)} getMeta - Meta tag lookup.
 * @returns {string} The site name, or "" when none is found.
 */
function extractSiteName(doc, metaTags, schemaOrgData, getMeta) {
  const ogSite = getMeta("og:site_name", "property");
  if (ogSite) return ogSite;

  const publisher = getSchemaProperty(schemaOrgData, ARTICLE_TYPES, "publisher");
  if (publisher) {
    if (typeof publisher === "string") return publisher;
    if (publisher.name) return String(publisher.name);
  }

  const titleNode = doc.querySelector("title");
  const fullTitle = titleNode ? titleNode.textContent || "" : "";
  for (const sep of ["|", " / ", " \xB7 ", " \u2013 ", " \u2014 "]) {
    if (!fullTitle.includes(sep)) continue;
    const tail = fullTitle.split(sep).pop().trim();
    if (tail && tail.split(/\s+/).length <= 4) return tail;
  }
  return "";
}
364
/**
 * Returns the page's host name without a leading "www.".
 *
 * The URL is taken from the `url` argument, else the `og:url` meta tag, else
 * the canonical `<link>`.
 *
 * @param {Document} doc
 * @param {string} url - URL of the page (may be empty).
 * @param {function(string, string): (string|null)} getMeta - Meta tag lookup.
 * @returns {string} The host name, or "" when no URL is available or it cannot be parsed.
 */
function extractDomain(doc, url, getMeta) {
  let pageUrl = url || getMeta("og:url", "property");
  if (!pageUrl) {
    const canonicalLink = doc.querySelector('link[rel="canonical"][href]');
    if (canonicalLink) pageUrl = canonicalLink.getAttribute("href");
  }
  if (!pageUrl) return "";

  try {
    const hostname = new URL(pageUrl).hostname;
    return hostname.startsWith("www.") ? hostname.slice(4) : hostname;
  } catch (e) {
    return "";
  }
}
382
/**
 * Determines the favicon URL.
 *
 * Uses the `href` of the first `<link rel="shortcut icon">` or
 * `<link rel="icon">`, resolved against `url` when one is given. Without such
 * a link, falls back to `/favicon.ico` on the page's origin.
 *
 * @param {Document} doc
 * @param {string} url - URL of the page (may be empty).
 * @returns {string} The favicon URL, or "" when it cannot be determined.
 */
function extractFavicon(doc, url) {
  for (const rel of ["shortcut icon", "icon"]) {
    const link = doc.querySelector(`link[rel="${rel}"][href]`);
    const href = link ? link.getAttribute("href") : null;
    if (href) return url ? resolveUrl(href, url) : href;
  }

  if (!url) return "";
  try {
    const origin = new URL(url).origin;
    return `${origin}/favicon.ico`;
  } catch (e) {
    // An unparseable page URL leaves no origin to build the default from.
    return "";
  }
}
399
/**
 * Determines the page language.
 *
 * Sources are tried in order: the `lang` attribute of the root element, the
 * `og:locale` meta tag (with its first "_" turned into "-"), and the
 * `<meta http-equiv="Content-Language">` element.
 *
 * @param {Document} doc
 * @param {Array} metaTags - Unused here; kept for signature parity with sibling extractors.
 * @param {function(string, string): (string|null)} getMeta - Meta tag lookup.
 * @returns {string} The language tag, or "" when none is declared.
 */
function extractLanguage(doc, metaTags, getMeta) {
  const rootEl = doc.documentElement;
  const declared = rootEl ? rootEl.getAttribute("lang") : null;
  if (declared) return declared;

  const locale = getMeta("og:locale", "property");
  if (locale) return locale.replace("_", "-");

  const httpEquiv = doc.querySelector('meta[http-equiv="Content-Language"]');
  return httpEquiv ? httpEquiv.getAttribute("content") || "" : "";
}
410
/**
 * Decodes the entities `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&#39;` and
 * `&apos;` in a string.
 *
 * All entities are replaced in one pass so the text is unescaped exactly
 * once. Chained replacements that handle `&amp;` first would turn
 * `&amp;lt;` into `<` instead of the intended `&lt;`.
 *
 * @param {string} text - Text that may contain entities.
 * @returns {string} The decoded text.
 */
function decodeHtmlEntities(text) {
  const entityText = { amp: "&", lt: "<", gt: ">", quot: '"', "#39": "'", apos: "'" };
  return text.replace(/&(amp|lt|gt|quot|#39|apos);/g, (whole, name) => entityText[name]);
}
413
+
414
+ // src/constants.js
415
+ var ENTRY_POINT_SELECTORS = [
416
+ "#post",
417
+ ".post-content",
418
+ ".post-body",
419
+ ".article-content",
420
+ "#article-content",
421
+ ".article_post",
422
+ ".article-wrapper",
423
+ ".entry-content",
424
+ ".content-article",
425
+ ".instapaper_body",
426
+ ".post",
427
+ ".markdown-body",
428
+ "article",
429
+ '[role="article"]',
430
+ "main",
431
+ '[role="main"]',
432
+ "#content",
433
+ "body"
434
+ ];
435
+ var BLOCK_ELEMENTS_SELECTOR = "div,section,article,main,aside,header,footer,nav";
436
+ var CONTENT_INDICATORS = /* @__PURE__ */ new Set([
437
+ "article",
438
+ "body",
439
+ "content",
440
+ "entry",
441
+ "hentry",
442
+ "main",
443
+ "page",
444
+ "post",
445
+ "text",
446
+ "blog",
447
+ "story"
448
+ ]);
449
+ var NAV_INDICATORS = [
450
+ "nav",
451
+ "navigation",
452
+ "menu",
453
+ "sidebar",
454
+ "header",
455
+ "footer",
456
+ "breadcrumb",
457
+ "pagination",
458
+ "pager",
459
+ "tags",
460
+ "categories"
461
+ ];
462
+ var SOCIAL_MEDIA_PATTERNS = [
463
+ "twitter.com/",
464
+ "facebook.com/",
465
+ "instagram.com/",
466
+ "linkedin.com/",
467
+ "youtube.com/",
468
+ "tiktok.com/",
469
+ "pinterest.com/",
470
+ "reddit.com/"
471
+ ];
472
+ var MIN_IMAGE_DIMENSION = 33;
473
+ var EXACT_SELECTORS = [
474
+ "noscript",
475
+ "style",
476
+ "meta",
477
+ "link",
478
+ ".promo",
479
+ ".Promo",
480
+ ".alert",
481
+ "#barrier-page",
482
+ '[id="comments"]',
483
+ '[id="comment"]',
484
+ 'div[class*="cover-"]',
485
+ 'div[id*="cover-"]',
486
+ "header",
487
+ ".header",
488
+ "#header",
489
+ "#Header",
490
+ "#banner",
491
+ "#Banner",
492
+ "nav",
493
+ ".navigation",
494
+ "#navigation",
495
+ '[role="navigation"]',
496
+ '[role="dialog"]',
497
+ '[role="complementary"]',
498
+ '[class*="pagination"]',
499
+ ".menu",
500
+ "#siteSub",
501
+ ".previous",
502
+ ".Author",
503
+ '[class$="_bio"]',
504
+ "#categories",
505
+ ".contributor",
506
+ ".date",
507
+ "#date",
508
+ "[data-date]",
509
+ ".entry-meta",
510
+ ".meta",
511
+ ".tags",
512
+ "#tags",
513
+ '[rel="tag"]',
514
+ ".toc",
515
+ ".Toc",
516
+ "#toc",
517
+ ".headline",
518
+ "#headline",
519
+ "#title",
520
+ "#Title",
521
+ "#articleTag",
522
+ '[href*="/tag/"]',
523
+ '[href*="/tags/"]',
524
+ '[href*="/author/"]',
525
+ '[href*="/author?"]',
526
+ '[href$="/author"]',
527
+ 'a[href*="copyright.com"]',
528
+ 'a[href*="google.com/preferences"]',
529
+ '[href*="#toc"]',
530
+ '[href="#top"]',
531
+ '[href="#Top"]',
532
+ '[href="#page-header"]',
533
+ '[href="#content"]',
534
+ '[href="#site-content"]',
535
+ '[href="#main-content"]',
536
+ '[href^="#main"]',
537
+ '[src*="author"]',
538
+ "footer",
539
+ ".aside",
540
+ "button",
541
+ "canvas",
542
+ "dialog",
543
+ "fieldset",
544
+ "form",
545
+ "label",
546
+ "option",
547
+ "select",
548
+ '[role="listbox"]',
549
+ '[role="option"]',
550
+ "textarea",
551
+ "[hidden]",
552
+ '[aria-hidden="true"]',
553
+ ".hidden",
554
+ ".invisible",
555
+ "#logo",
556
+ "#Logo",
557
+ "#newsletter",
558
+ "#Newsletter",
559
+ ".subscribe",
560
+ ".noprint",
561
+ '[data-print-layout="hide"]',
562
+ '[data-block="donotprint"]',
563
+ ".sidebar",
564
+ ".Sidebar",
565
+ "#sidebar",
566
+ "#Sidebar",
567
+ "#side-bar",
568
+ "#sitesub",
569
+ ".copyright",
570
+ "#copyright",
571
+ ".licensebox",
572
+ "#page-info",
573
+ "#rss",
574
+ "#feed",
575
+ ".gutter",
576
+ "#primaryaudio",
577
+ "table.infobox",
578
+ ".gh-header-sticky"
579
+ ];
580
+ var PARTIAL_SELECTORS = [
581
+ "a-statement",
582
+ "access-wall",
583
+ "activitypub",
584
+ "actioncall",
585
+ "addcomment",
586
+ "addtoany",
587
+ "advert",
588
+ "adlayout",
589
+ "ad-tldr",
590
+ "ad-placement",
591
+ "ads-container",
592
+ "_ad_",
593
+ "AdBlock_",
594
+ "AdUnit",
595
+ "after_content",
596
+ "after_main_article",
597
+ "afterpost",
598
+ "allterms",
599
+ "-alert-",
600
+ "alert-box",
601
+ "_archive",
602
+ "around-the-web",
603
+ "aroundpages",
604
+ "article-author",
605
+ "article-badges",
606
+ "article-banner",
607
+ "article-bottom-section",
608
+ "article-bottom",
609
+ "article-category",
610
+ "article-card",
611
+ "article-citation",
612
+ "article__copy",
613
+ "article_date",
614
+ "article-date",
615
+ "article-end ",
616
+ "article_header",
617
+ "article-header",
618
+ "article__header",
619
+ "article__hero",
620
+ "article__info",
621
+ "article-info",
622
+ "article-meta",
623
+ "article_meta",
624
+ "article__meta",
625
+ "articlename",
626
+ "article-subject",
627
+ "article_subject",
628
+ "article-snippet",
629
+ "article-separator",
630
+ "article--share",
631
+ "article--topics",
632
+ "articletags",
633
+ "article-tags",
634
+ "article_tags",
635
+ "articletitle",
636
+ "article-title",
637
+ "article_title",
638
+ "articletopics",
639
+ "article-topics",
640
+ "article-actions",
641
+ "article--lede",
642
+ "articlewell",
643
+ "associated-people",
644
+ "audio-card",
645
+ "author-bio",
646
+ "author-box",
647
+ "author-info",
648
+ "author_info",
649
+ "authorm",
650
+ "author-mini-bio",
651
+ "author-name",
652
+ "author-publish-info",
653
+ "authored-by",
654
+ "avatar",
655
+ "back-to-top",
656
+ "backlink_container",
657
+ "backlinks-section",
658
+ "bio-block",
659
+ "biobox",
660
+ "blog-pager",
661
+ "bookmark-",
662
+ "-bookmark",
663
+ "bottominfo",
664
+ "bottomnav",
665
+ "bottom-of-article",
666
+ "bottom-wrapper",
667
+ "brand-bar",
668
+ "bcrumb",
669
+ "breadcrumb",
670
+ "brdcrumb",
671
+ "button-wrapper",
672
+ "buttons-container",
673
+ "btn-",
674
+ "-btn",
675
+ "byline",
676
+ "captcha",
677
+ "card-text",
678
+ "card-media",
679
+ "card-post",
680
+ "carouselcontainer",
681
+ "carousel-container",
682
+ "cat_header",
683
+ "catlinks",
684
+ "_categories",
685
+ "card-author",
686
+ "card-content",
687
+ "chapter-list",
688
+ "collections",
689
+ "comments",
690
+ "-comment",
691
+ "commentbox",
692
+ "comment-button",
693
+ "commentcomp",
694
+ "comment-content",
695
+ "comment-count",
696
+ "comment-form",
697
+ "comment-number",
698
+ "comment-respond",
699
+ "comment-thread",
700
+ "comment-wrap",
701
+ "complementary",
702
+ "consent",
703
+ "contact-",
704
+ "content-card",
705
+ "copycontent",
706
+ "content-topics",
707
+ "contentpromo",
708
+ "context-bar",
709
+ "context-widget",
710
+ "core-collateral",
711
+ "cover-image",
712
+ "cover-photo",
713
+ "cover-wrap",
714
+ "created-date",
715
+ "creative-commons_",
716
+ "c-subscribe",
717
+ "_cta",
718
+ "-cta",
719
+ "cta-",
720
+ "cta_",
721
+ "current-issue",
722
+ "custom-list-number",
723
+ "dateline",
724
+ "dateheader",
725
+ "date-header",
726
+ "date-pub",
727
+ "disclaimer",
728
+ "disclosure",
729
+ "discussion",
730
+ "discuss_",
731
+ "-dismiss",
732
+ "disqus",
733
+ "donate",
734
+ "donation",
735
+ "dropdown",
736
+ "editorial_contact",
737
+ "editorial-contact",
738
+ "element-invisible",
739
+ "eletters",
740
+ "emailsignup",
741
+ "emoji-bar",
742
+ "engagement-widget",
743
+ "enhancement-",
744
+ "entry-author-info",
745
+ "entry-categories",
746
+ "entry-date",
747
+ "entry-title",
748
+ "entry-utility",
749
+ "-error",
750
+ "error-",
751
+ "eyebrow",
752
+ "expand-reduce",
753
+ "external-anchor",
754
+ "externallinkembedwrapper",
755
+ "extra-services",
756
+ "extra-title",
757
+ "facebook",
758
+ "fancy-box",
759
+ "favorite",
760
+ "featured-content",
761
+ "feature_feed",
762
+ "feedback",
763
+ "feed-links",
764
+ "field-site-sections",
765
+ "fixheader",
766
+ "floating-vid",
767
+ "follower",
768
+ "footer",
769
+ "footnote-back",
770
+ "footnoteback",
771
+ "form-group",
772
+ "for-you",
773
+ "frontmatter",
774
+ "further-reading",
775
+ "fullbleedheader",
776
+ "gallery-count",
777
+ "gated-",
778
+ "gh-feed",
779
+ "gist-meta",
780
+ "goog-",
781
+ "graph-view",
782
+ "hamburger",
783
+ "header_logo",
784
+ "header-logo",
785
+ "header-pattern",
786
+ "hero-list",
787
+ "hide-for-print",
788
+ "hide-print",
789
+ "hide-when-no-script",
790
+ "hidden-print",
791
+ "hidden-sidenote",
792
+ "hidden-accessibility",
793
+ "infoline",
794
+ "inline-topic",
795
+ "instacartIntegration",
796
+ "interlude",
797
+ "interaction",
798
+ "itemendrow",
799
+ "intro-date",
800
+ "invisible",
801
+ "jp-no-solution",
802
+ "jp-relatedposts",
803
+ "jswarning",
804
+ "js-warning",
805
+ "jumplink",
806
+ "jumpto",
807
+ "jump-to-",
808
+ "js-skip-to-content",
809
+ "keepreading",
810
+ "keep-reading",
811
+ "keep_reading",
812
+ "keyword_wrap",
813
+ "kicker",
814
+ "labstab",
815
+ "-labels",
816
+ "language-name",
817
+ "lastupdated",
818
+ "latest-content",
819
+ "-ledes-",
820
+ "-license",
821
+ "license-",
822
+ "lightbox-popup",
823
+ "like-button",
824
+ "link-box",
825
+ "links-grid",
826
+ "links-title",
827
+ "listing-dynamic-terms",
828
+ "list-tags",
829
+ "listinks",
830
+ "loading",
831
+ "loa-info",
832
+ "logo_container",
833
+ "masthead",
834
+ "marketing",
835
+ "media-inquiry",
836
+ "-menu",
837
+ "menu-",
838
+ "metadata",
839
+ "meta-date",
840
+ "meta-row",
841
+ "might-like",
842
+ "minibio",
843
+ "more-about",
844
+ "mod-paywall",
845
+ "_modal",
846
+ "-modal",
847
+ "more-",
848
+ "morenews",
849
+ "morestories",
850
+ "more_wrapper",
851
+ "most-read",
852
+ "mw-editsection",
853
+ "mw-cite-backlink",
854
+ "mw-indicators",
855
+ "mw-jump-link",
856
+ "nav-",
857
+ "nav_",
858
+ "navigation-post",
859
+ "next-",
860
+ "newsgallery",
861
+ "news-story-title",
862
+ "newsletter_",
863
+ "newsletterbanner",
864
+ "newslettercontainer",
865
+ "newsletter-form",
866
+ "newsletter-signup",
867
+ "newslettersignup",
868
+ "newsletterwidget",
869
+ "newsletterwrapper",
870
+ "not-found",
871
+ "notessection",
872
+ "nomobile",
873
+ "noprint",
874
+ "open-slideshow",
875
+ "originally-published",
876
+ "other-blogs",
877
+ "outline-view",
878
+ "pagehead",
879
+ "page-header",
880
+ "page-title",
881
+ "paywall_message",
882
+ "-partners",
883
+ "permission-",
884
+ "plea",
885
+ "popular",
886
+ "popup_links",
887
+ "pop_stories",
888
+ "pop-up",
889
+ "post__author",
890
+ "post-author",
891
+ "post-bottom",
892
+ "post__category",
893
+ "postcomment",
894
+ "postdate",
895
+ "post-date",
896
+ "post_date",
897
+ "post-details",
898
+ "post-feeds",
899
+ "postinfo",
900
+ "post-info",
901
+ "post_info",
902
+ "post-inline-date",
903
+ "post-links",
904
+ "postlist",
905
+ "post_list",
906
+ "post_meta",
907
+ "post-meta",
908
+ "postmeta",
909
+ "post_more",
910
+ "postnavi",
911
+ "post-navigation",
912
+ "postpath",
913
+ "post-preview",
914
+ "postsnippet",
915
+ "post_snippet",
916
+ "post-snippet",
917
+ "post-subject",
918
+ "posttax",
919
+ "post-tax",
920
+ "post_tax",
921
+ "posttag",
922
+ "post_tag",
923
+ "post-tag",
924
+ "post_time",
925
+ "posttitle",
926
+ "post-title",
927
+ "post_title",
928
+ "post__title",
929
+ "post-ufi-button",
930
+ "prev-post",
931
+ "prevnext",
932
+ "prev_next",
933
+ "prev-next",
934
+ "previousnext",
935
+ "press-inquiries",
936
+ "print-none",
937
+ "print-header",
938
+ "privacy-notice",
939
+ "privacy-settings",
940
+ "profile",
941
+ "promo_article",
942
+ "promo-bar",
943
+ "promo-box",
944
+ "pubdate",
945
+ "pub_date",
946
+ "pub-date",
947
+ "publish_date",
948
+ "publish-date",
949
+ "publication-date",
950
+ "publicationName",
951
+ "qr-code",
952
+ "qr_code",
953
+ "quick_up",
954
+ "_rail",
955
+ "ratingssection",
956
+ "read_also",
957
+ "readmore",
958
+ "read-next",
959
+ "read_next",
960
+ "read_time",
961
+ "read-time",
962
+ "reading_time",
963
+ "reading-time",
964
+ "reading-list",
965
+ "recent-",
966
+ "recent-articles",
967
+ "recentpost",
968
+ "recent_post",
969
+ "recent-post",
970
+ "recommend",
971
+ "redirectedfrom",
972
+ "recirc",
973
+ "register",
974
+ "related",
975
+ "relevant",
976
+ "reversefootnote",
977
+ "robots-nocontent",
978
+ "_rss",
979
+ "rss-link",
980
+ "screen-reader-text",
981
+ "scroll_to",
982
+ "scroll-to",
983
+ "_search",
984
+ "-search",
985
+ "section-nav",
986
+ "series-banner",
987
+ "share-box",
988
+ "sharedaddy",
989
+ "share-icons",
990
+ "sharelinks",
991
+ "share-post",
992
+ "share-print",
993
+ "share-section",
994
+ "sharing_",
995
+ "shariff-",
996
+ "show-for-print",
997
+ "sidebartitle",
998
+ "sidebar-content",
999
+ "sidebar-wrapper",
1000
+ "sideitems",
1001
+ "sidebar-author",
1002
+ "sidebar-item",
1003
+ "side-box",
1004
+ "side-logo",
1005
+ "sign-in-gate",
1006
+ "similar-",
1007
+ "similar_",
1008
+ "similars-",
1009
+ "site-index",
1010
+ "site-header",
1011
+ "siteheader",
1012
+ "site-logo",
1013
+ "site-name",
1014
+ "site-wordpress",
1015
+ "skip-content",
1016
+ "skip-to-content",
1017
+ "skip-link",
1018
+ "c-skip-link",
1019
+ "_skip-link",
1020
+ "-slider",
1021
+ "slug-wrap",
1022
+ "social-author",
1023
+ "social-shar",
1024
+ "social-date",
1025
+ "speechify-ignore",
1026
+ "speedbump",
1027
+ "sponsor",
1028
+ "springercitation",
1029
+ "sr-only",
1030
+ "_stats",
1031
+ "story-date",
1032
+ "story-navigation",
1033
+ "storyreadtime",
1034
+ "storysmall",
1035
+ "storypublishdate",
1036
+ "subject-label",
1037
+ "subhead",
1038
+ "submenu",
1039
+ "-subscribe-",
1040
+ "subscriber-drive",
1041
+ "subscription-",
1042
+ "_tags",
1043
+ "tags__item",
1044
+ "tag_list",
1045
+ "taxonomy",
1046
+ "table-of-contents",
1047
+ "tabs-",
1048
+ "terminaltout",
1049
+ "time-rubric",
1050
+ "timestamp",
1051
+ "time-read",
1052
+ "time-to-read",
1053
+ "tip_off",
1054
+ "tiptout",
1055
+ "-tout-",
1056
+ "toc-container",
1057
+ "toggle-caption",
1058
+ "tooltip-content",
1059
+ "topbar",
1060
+ "topic-authors",
1061
+ "topic-footer",
1062
+ "topic-list",
1063
+ "topic-subnav",
1064
+ "top-wrapper",
1065
+ "tree-item",
1066
+ "trending",
1067
+ "trust-feat",
1068
+ "trust-badge",
1069
+ "trust-project",
1070
+ "twitter",
1071
+ "twiblock",
1072
+ "u-hide",
1073
+ "upsell",
1074
+ "viewbottom",
1075
+ "yarpp-related",
1076
+ "visually-hidden",
1077
+ "welcomebox",
1078
+ "widget_pages"
1079
+ ];
1080
// Case-insensitive pattern matching any PARTIAL_SELECTORS entry as a literal
// substring; every entry is regex-escaped before being joined into one
// alternation.
var PARTIAL_SELECTORS_REGEX = (() => {
  const escapeForRegex = (fragment) => fragment.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const alternation = PARTIAL_SELECTORS.map((fragment) => escapeForRegex(fragment)).join("|");
  return new RegExp(`(?:${alternation})`, "i");
})();
1084
+ var FOOTNOTE_SELECTORS = [
1085
+ "div.footnote ol",
1086
+ "div.footnotes ol",
1087
+ 'div[role="doc-endnotes"]',
1088
+ 'div[role="doc-footnotes"]',
1089
+ "ol.footnotes-list",
1090
+ "ol.footnotes",
1091
+ "ol.references",
1092
+ "section.footnotes ol",
1093
+ 'section[role="doc-endnotes"]',
1094
+ 'section[role="doc-footnotes"]',
1095
+ 'section[role="doc-bibliography"]',
1096
+ "ul.footnotes-list",
1097
+ "#footnotes"
1098
+ ];
1099
+ var PARTIAL_MATCH_ATTRIBUTES = [
1100
+ "class",
1101
+ "id",
1102
+ "data-test",
1103
+ "data-testid",
1104
+ "data-test-id",
1105
+ "data-qa",
1106
+ "data-cy"
1107
+ ];
1108
+ var VIDEO_EMBED_PATTERNS = [
1109
+ "youtube.com",
1110
+ "youtu.be",
1111
+ "vimeo.com",
1112
+ "twitter.com",
1113
+ "x.com",
1114
+ "datawrapper.de"
1115
+ ];
1116
// Attribute names (lower-case) that survive attribute stripping.
var ALLOWED_ATTRIBUTES = /* @__PURE__ */ new Set([
  "alt", "allow", "allowfullscreen", "aria-label", "checked",
  "colspan", "controls", "data-latex", "data-src", "data-srcset",
  "data-callout", "data-callout-title", "data-lang", "dir", "frameborder",
  "headers", "height", "href", "kind", "label",
  "lang", "role", "rowspan", "src", "srclang",
  "srcset", "title", "type", "width", "datetime"
]);
1148
// Tag names that are allowed to remain in the output even when they have no content.
var ALLOWED_EMPTY_TAGS = /* @__PURE__ */ new Set([
  "area", "audio", "br", "col", "embed",
  "figure", "hr", "iframe", "img", "input",
  "link", "meta", "picture", "source", "svg",
  "td", "th", "track", "video", "wbr"
]);
1170
+
1171
+ // src/utils.js
1172
/**
 * Count the words in a plain-text string.
 *
 * Every character in the kana, Hangul and CJK-ideograph ranges below counts as
 * one word on its own; whatever remains is split on whitespace.
 *
 * @param {string} text
 * @returns {number} 0 for empty, blank or falsy input.
 */
function countWords(text) {
  if (!text || text.trim() === "") {
    return 0;
  }
  let cjkTotal = 0;
  // Replace each CJK character with a space while tallying it.
  const remainder = text.replace(/[\u3040-\u309F\u30A0-\u30FF\uAC00-\uD7FF\u4E00-\u9FFF]/g, () => {
    cjkTotal += 1;
    return " ";
  });
  let wordTotal = 0;
  for (const token of remainder.trim().split(/\s+/)) {
    if (token.length > 0) {
      wordTotal += 1;
    }
  }
  return cjkTotal + wordTotal;
}
1182
/**
 * Count words in an HTML string after crudely reducing it to text.
 *
 * Tags become spaces, a handful of named entities are decoded, and any other
 * numeric or named entity becomes a space. Substitutions run in the listed order.
 *
 * @param {string} html
 * @returns {number}
 */
function countHtmlWords(html) {
  const substitutions = [
    [/<[^>]*>/g, " "],
    [/&nbsp;/gi, " "],
    [/&amp;/gi, "&"],
    [/&lt;/gi, "<"],
    [/&gt;/gi, ">"],
    [/&quot;/gi, '"'],
    [/&#\d+;/g, " "],
    [/&\w+;/g, " "]
  ];
  let plain = html;
  for (const [pattern, replacement] of substitutions) {
    plain = plain.replace(pattern, replacement);
  }
  return countWords(plain);
}
1186
/**
 * Join an element's `class` and `id` attribute values with a single space.
 * Missing attributes contribute an empty string.
 *
 * @param {Element} el
 * @returns {string}
 */
function getClassId(el) {
  const className = el.getAttribute("class") || "";
  const id = el.getAttribute("id") || "";
  return `${className} ${id}`;
}
1189
/**
 * Report whether `el` is `target` itself or one of its ancestors, found by
 * walking `parentElement` upward from `target`.
 *
 * @param {Element} el      Candidate ancestor.
 * @param {Element} target  Element whose ancestor chain is walked.
 * @returns {boolean}
 */
function isAncestorOrSelf(el, target) {
  for (let cursor = target; cursor; cursor = cursor.parentElement) {
    if (cursor === el) {
      return true;
    }
  }
  return false;
}
1197
/**
 * Detach a node from its parent. Falsy or already-detached nodes are ignored.
 *
 * @param {Node|null|undefined} el
 */
function removeNode(el) {
  if (!el) {
    return;
  }
  const parent = el.parentNode;
  if (parent) {
    parent.removeChild(el);
  }
}
1202
/**
 * Detach every node in a collection. The collection is copied first, so a
 * live list can be passed safely.
 *
 * @param {Iterable<Node>|ArrayLike<Node>} nodes
 */
function removeAll(nodes) {
  for (const node of Array.from(nodes)) {
    removeNode(node);
  }
}
1205
+
1206
+ // src/content-scorer.js
1207
/**
 * Heuristic "how much article content is in here" score for an element.
 *
 * Sum of: word count, 10 per descendant <p>, 1 per comma, plus a one-off 15
 * when the class/id text contains any CONTENT_INDICATORS entry. The sum is
 * then scaled by (1 - link density), with the factor floored at 0.5.
 *
 * @param {Element} el
 * @returns {number}
 */
function scoreElement(el) {
  const text = el.textContent || "";
  const wordPoints = countWords(text);
  const paragraphPoints = el.getElementsByTagName("p").length * 10;
  const commaPoints = (text.match(/,/g) || []).length;
  let score = wordPoints + paragraphPoints + commaPoints;

  const classId = getClassId(el).toLowerCase();
  for (const indicator of CONTENT_INDICATORS) {
    if (!classId.includes(indicator)) continue;
    score += 15; // bonus is granted at most once
    break;
  }

  const densityFactor = Math.max(0.5, 1 - getLinkDensity(el));
  return score * densityFactor;
}
1224
/**
 * Penalty score for a block that may be navigation or other page chrome.
 * The result is 0 or negative; callers treat a negative value as "remove".
 *
 * @param {Element} el
 * @returns {number}
 */
function scoreNonContentBlock(el) {
  let score = 0;
  const classId = getClassId(el).toLowerCase();
  // Every matching indicator costs 10 (penalties stack).
  for (const indicator of NAV_INDICATORS) {
    if (classId.includes(indicator)) {
      score -= 10;
    }
  }
  const penalties = [
    [getLinkDensity(el) > 0.5, 15],
    [hasSocialMediaLinks(el), 15],
    [isAuthorDateByline(el), 10],
    [isCardGrid(el), 15]
  ];
  for (const [applies, amount] of penalties) {
    if (applies) {
      score -= amount;
    }
  }
  return score;
}
1236
/**
 * Remove structural blocks that score negatively as non-content.
 *
 * Candidates are all div/section/article/main/aside/header/footer/nav
 * elements under the body (or the document element when there is no body).
 * `mainContent` and its ancestors are never removed.
 *
 * @param {Document} doc
 * @param {Element} mainContent
 * @param {boolean} [debug=false]  When true, a record is returned per flagged element.
 * @returns {Array<object>} Debug records; empty unless `debug` is set.
 */
function scoreAndRemove(doc, mainContent, debug = false) {
  const removed = [];
  const body = doc.body || doc.documentElement;
  if (!body) {
    return removed;
  }

  // Snapshot all candidates up front so removals cannot disturb iteration.
  const candidates = [];
  for (const tag of ["div", "section", "article", "main", "aside", "header", "footer", "nav"]) {
    for (const el of Array.from(body.getElementsByTagName(tag))) {
      candidates.push(el);
    }
  }

  const flagged = [];
  for (const el of candidates) {
    if (isAncestorOrSelf(el, mainContent)) continue;
    const score = scoreNonContentBlock(el);
    if (!(score < 0)) continue;
    flagged.push(el);
    if (debug) {
      removed.push({ step: "scoreAndRemove", score, tag: el.tagName, class: el.className });
    }
  }

  flagged.forEach((el) => {
    const parent = el.parentNode;
    if (parent) parent.removeChild(el);
  });
  return removed;
}
1261
/**
 * Return the element with the highest scoreElement() value.
 * Ties keep the earliest element; an empty input yields null.
 *
 * @param {Iterable<Element>} elements
 * @returns {Element|null}
 */
function findBestElement(elements) {
  let leader = { element: null, score: -Infinity };
  for (const candidate of elements) {
    const candidateScore = scoreElement(candidate);
    if (candidateScore > leader.score) {
      leader = { element: candidate, score: candidateScore };
    }
  }
  return leader.element;
}
1273
/**
 * Fraction of an element's trimmed text length that sits inside <a>
 * descendants, capped at 1. Elements with no text have density 0.
 *
 * @param {Element} el
 * @returns {number} Value in [0, 1].
 */
function getLinkDensity(el) {
  const totalLength = (el.textContent || "").trim().length;
  if (totalLength === 0) {
    return 0;
  }
  const anchorLength = Array.from(el.getElementsByTagName("a")).reduce(
    (sum, anchor) => sum + (anchor.textContent || "").trim().length,
    0
  );
  return Math.min(1, anchorLength / totalLength);
}
1282
/**
 * Detect a "card grid": at least three h2/h3/h4 headings, at least two images,
 * fewer than 500 words overall and fewer than 20 words per heading.
 *
 * @param {Element} el
 * @returns {boolean}
 */
function isCardGrid(el) {
  const headingCount = el.querySelectorAll("h2,h3,h4").length;
  if (headingCount < 3) {
    return false;
  }
  if (el.getElementsByTagName("img").length < 2) {
    return false;
  }
  const wordCount = countWords(el.textContent || "");
  if (wordCount >= 500) {
    return false;
  }
  // headingCount is at least 3 here, so the division is safe.
  return wordCount / headingCount < 20;
}
1291
/**
 * Report whether a short block (under 15 words) looks like an author/date line:
 * its text either matches the date pattern below or starts with "by ".
 *
 * @param {Element} el
 * @returns {boolean}
 */
function isAuthorDateByline(el) {
  const text = (el.textContent || "").trim();
  if (countWords(text) >= 15) {
    return false;
  }
  // Alternatives: "<month-prefix>... <1-2 digits>", any four digits, or "d/d".
  const datePattern = /\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\.?\s+\d{1,2}|\d{4}|\b\d{1,2}\/\d{1,2}/i;
  const bylinePattern = /^by\s+/i;
  return datePattern.test(text) || bylinePattern.test(text);
}
1299
/**
 * Report whether more than half of an element's links point at social media,
 * i.e. their lower-cased href contains a SOCIAL_MEDIA_PATTERNS entry.
 *
 * @param {Element} el
 * @returns {boolean} false when the element contains no links.
 */
function hasSocialMediaLinks(el) {
  const anchors = Array.from(el.getElementsByTagName("a"));
  if (anchors.length === 0) {
    return false;
  }
  let socialCount = 0;
  for (const anchor of anchors) {
    const href = (anchor.getAttribute("href") || "").toLowerCase();
    if (SOCIAL_MEDIA_PATTERNS.some((pattern) => href.includes(pattern))) {
      socialCount += 1;
    }
  }
  return socialCount > 0 && socialCount / anchors.length > 0.5;
}
1308
+
1309
+ // src/content-finder.js
1310
/**
 * Locate the element most likely to hold the page's main content.
 *
 * Every match of every ENTRY_POINT_SELECTORS entry becomes a candidate scored
 * as scoreElement() plus a bonus of 40 per position the selector sits ahead of
 * the end of the list. The top-scoring candidate wins unless a candidate
 * nested inside it has a lower selector index, more than 50 words, and is the
 * only nested match for its selector; then the nested one is preferred.
 *
 * Fallbacks: with no candidates at all, findByScoring(); when the sole
 * candidate is <body>, findTableBasedContent() is tried first.
 *
 * @param {Document} doc
 * @returns {Element|null}
 */
function findMainContent(doc) {
  const total = ENTRY_POINT_SELECTORS.length;
  const ranked = [];
  for (let selectorIndex = 0; selectorIndex < total; selectorIndex++) {
    let matches;
    try {
      matches = doc.querySelectorAll(ENTRY_POINT_SELECTORS[selectorIndex]);
    } catch (e) {
      // Selector rejected by this DOM implementation: skip it.
      continue;
    }
    const priorityBonus = (total - selectorIndex) * 40;
    for (const element of matches) {
      ranked.push({
        element,
        score: priorityBonus + scoreElement(element),
        selectorIndex
      });
    }
  }

  if (ranked.length === 0) {
    return findByScoring(doc);
  }

  ranked.sort((a, b) => b.score - a.score);
  const [top] = ranked;

  if (ranked.length === 1 && top.element.tagName.toLowerCase() === "body") {
    const cell = findTableBasedContent(doc);
    if (cell) return cell;
  }

  const insideTop = (candidate) => top.element.contains(candidate.element);
  let best = top;
  for (const candidate of ranked) {
    const eligible =
      candidate !== top &&
      candidate.selectorIndex < best.selectorIndex &&
      insideTop(candidate) &&
      countWords(candidate.element.textContent || "") > 50;
    if (!eligible) continue;
    // Several nested matches for the same selector are ambiguous; skip them.
    const peers = ranked.filter(
      (other) => other.selectorIndex === candidate.selectorIndex && insideTop(other)
    ).length;
    if (peers <= 1) {
      best = candidate;
    }
  }
  return best.element;
}
1355
/**
 * Fallback content finder: pick the BLOCK_ELEMENTS_SELECTOR match with the
 * highest positive scoreElement() value.
 *
 * Each element is scored exactly once. Ties keep the earliest element in
 * document order, and elements scoring 0 or less are never chosen.
 *
 * @param {Document} doc
 * @returns {Element|null} The best element, else `doc.body`, else null.
 */
function findByScoring(doc) {
  let best = null;
  let bestScore = 0; // only strictly positive scores may win
  for (const el of Array.from(doc.querySelectorAll(BLOCK_ELEMENTS_SELECTOR))) {
    const score = scoreElement(el);
    if (score > bestScore) {
      best = el;
      bestScore = score;
    }
  }
  return best || doc.body || null;
}
1364
/**
 * Content finder for table-layout pages.
 *
 * A table counts as a layout table when its `width` attribute parses to more
 * than 400, its class contains "content" or "article", or it has
 * align="center". If any table qualifies, the best-scoring <td> is returned.
 *
 * @param {Document} doc
 * @returns {Element|null} null when no layout table exists or there are no cells.
 */
function findTableBasedContent(doc) {
  const looksLikeLayout = (table) => {
    const width = parseInt(table.getAttribute("width") || "0");
    const cls = (table.className || "").toLowerCase();
    if (width > 400) return true;
    if (cls.includes("content") || cls.includes("article")) return true;
    return table.getAttribute("align") === "center";
  };
  const tables = Array.from(doc.getElementsByTagName("table"));
  if (!tables.some((table) => looksLikeLayout(table))) {
    return null;
  }
  return findBestElement(Array.from(doc.getElementsByTagName("td")));
}
1375
+
1376
+ // src/removals/hidden.js
1377
/**
 * Remove every element that isHidden() flags, except those containsMath()
 * identifies as math.
 *
 * @param {Document} doc
 * @param {boolean} [debug=false]
 * @returns {Array<object>} Debug records; empty unless `debug` is set.
 */
function removeHiddenElements(doc, debug = false) {
  const hidden = Array.from(doc.querySelectorAll("*")).filter(
    (el) => isHidden(el) && !containsMath(el)
  );
  const removed = [];
  if (debug) {
    for (const el of hidden) {
      removed.push({ step: "removeHidden", tag: el.tagName, class: el.className });
    }
  }
  removeAll(hidden);
  return removed;
}
1390
/**
 * Decide from markup alone whether an element is hidden.
 *
 * An element is hidden when it has the `hidden` attribute, has
 * aria-hidden="true", carries an inline `display:none`, `visibility:hidden`
 * or `opacity:0` declaration (with or without `!important`), or has a class
 * token exactly equal to "hidden" or "invisible".
 *
 * @param {Element} el
 * @returns {boolean}
 */
function isHidden(el) {
  if (el.hasAttribute("hidden")) return true;
  if (el.getAttribute("aria-hidden") === "true") return true;

  const style = el.getAttribute("style") || "";
  // The declaration must start at the beginning or after ';' and end at ';' or
  // end-of-string, so "opacity:0.5" does not match. An optional "!important"
  // is accepted.
  const hidingDeclaration =
    /(?:^|;)\s*(?:display\s*:\s*none|visibility\s*:\s*hidden|opacity\s*:\s*0)\s*(?:!\s*important\s*)?(?:;|$)/i;
  if (style && hidingDeclaration.test(style)) {
    return true;
  }

  const cls = el.getAttribute("class") || "";
  if (cls) {
    const tokens = cls.trim().split(/\s+/);
    if (tokens.includes("hidden") || tokens.includes("invisible")) return true;
  }
  return false;
}
1404
/**
 * Report whether an element is math-related: its class contains
 * "katex-mathml" or "math", or it has a <math> descendant.
 *
 * @param {Element} el
 * @returns {boolean}
 */
function containsMath(el) {
  const className = (el.getAttribute("class") || "").toLowerCase();
  return (
    className.includes("katex-mathml") ||
    className.includes("math") ||
    el.querySelector("math") !== null
  );
}
1409
+
1410
+ // src/removals/selector-remover.js
1411
/**
 * Remove elements matched by EXACT_SELECTORS, plus scripts, iframes and asides.
 *
 * Never removed: `mainContent` and its ancestors. Additional exemptions:
 *  - selector matches inside <code>/<pre> or that are footnote containers;
 *  - <script> whose type starts with "math/";
 *  - <iframe> whose src contains a VIDEO_EMBED_PATTERNS entry;
 *  - <aside> whose class contains "callout".
 *
 * @param {Document} doc
 * @param {Element|null} [mainContent=null]
 * @param {boolean} [debug=false]
 * @returns {Array<object>} Debug records; empty unless `debug` is set.
 */
function removeExact(doc, mainContent = null, debug = false) {
  const removed = [];
  const doomed = /* @__PURE__ */ new Set();
  const guardsMain = (el) => Boolean(mainContent) && isAncestorOrSelf(el, mainContent);
  const lowerAttr = (el, name) => (el.getAttribute(name) || "").toLowerCase();

  for (const selector of EXACT_SELECTORS) {
    try {
      for (const el of doc.querySelectorAll(selector)) {
        if (guardsMain(el) || isInsideCodeBlock(el) || isFootnoteContainer(doc, el)) continue;
        doomed.add(el);
      }
    } catch (e) {
      // Selector rejected by this DOM implementation: best-effort, move on.
    }
  }

  for (const script of doc.querySelectorAll("script")) {
    const isMathScript = lowerAttr(script, "type").startsWith("math/");
    if (!isMathScript && !guardsMain(script)) doomed.add(script);
  }

  for (const frame of doc.querySelectorAll("iframe")) {
    const src = lowerAttr(frame, "src");
    const isVideoEmbed = VIDEO_EMBED_PATTERNS.some((pattern) => src.includes(pattern));
    if (!isVideoEmbed && !guardsMain(frame)) doomed.add(frame);
  }

  for (const aside of doc.querySelectorAll("aside")) {
    const isCallout = lowerAttr(aside, "class").includes("callout");
    if (!isCallout && !guardsMain(aside)) doomed.add(aside);
  }

  doomed.forEach((el) => {
    // Skip nodes already detached along with a removed ancestor's parent.
    if (!el.parentNode) return;
    if (debug) removed.push({ step: "removeExact", tag: el.tagName, class: el.className });
    el.parentNode.removeChild(el);
  });
  return removed;
}
1451
/**
 * Remove elements whose PARTIAL_MATCH_ATTRIBUTES values match
 * PARTIAL_SELECTORS_REGEX.
 *
 * Exempt: <code>/<pre> elements and anything inside them, `mainContent` and
 * its ancestors, and footnote containers.
 *
 * @param {Document} doc
 * @param {Element|null} [mainContent=null]
 * @param {boolean} [debug=false]
 * @returns {Array<object>} Debug records; empty unless `debug` is set, or when
 *   the attribute selector is rejected by the DOM implementation.
 */
function removePartial(doc, mainContent = null, debug = false) {
  const removed = [];
  const attrSelector = PARTIAL_MATCH_ATTRIBUTES.map((a) => `[${a}]`).join(",");

  let elements;
  try {
    elements = Array.from(doc.querySelectorAll(attrSelector));
  } catch (e) {
    return removed;
  }

  const shouldRemove = (el) => {
    const tag = el.tagName.toLowerCase();
    if (tag === "code" || tag === "pre" || isInsideCodeBlock(el)) return false;
    const combined = PARTIAL_MATCH_ATTRIBUTES.map((a) => el.getAttribute(a) || "").join(" ");
    if (!PARTIAL_SELECTORS_REGEX.test(combined)) return false;
    if (mainContent && isAncestorOrSelf(el, mainContent)) return false;
    return !isFootnoteContainer(doc, el);
  };

  // Decide everything first, then mutate the tree.
  const toRemove = elements.filter((el) => shouldRemove(el));
  toRemove.forEach((el) => {
    if (!el.parentNode) return;
    if (debug) removed.push({ step: "removePartial", tag: el.tagName, class: el.className });
    el.parentNode.removeChild(el);
  });
  return removed;
}
1479
/**
 * Report whether an element has a <code> or <pre> ancestor.
 * The element's own tag is not considered.
 *
 * @param {Element} el
 * @returns {boolean}
 */
function isInsideCodeBlock(el) {
  for (let ancestor = el.parentElement; ancestor; ancestor = ancestor.parentElement) {
    const tag = ancestor.tagName.toLowerCase();
    if (tag === "code" || tag === "pre") {
      return true;
    }
  }
  return false;
}
1488
/**
 * Report whether `el` matches any FOOTNOTE_SELECTORS entry.
 *
 * When the element supports `matches()`, each selector is tested directly
 * against it rather than re-querying the whole document per selector.
 * Otherwise the function falls back to scanning `doc.querySelectorAll()`
 * results for identity. Both paths agree for an element that belongs to `doc`.
 * NOTE(review): callers in view always pass elements obtained from `doc`.
 *
 * @param {Document} doc
 * @param {Element} el
 * @returns {boolean}
 */
function isFootnoteContainer(doc, el) {
  const canMatch = Boolean(el) && typeof el.matches === "function";
  for (const sel of FOOTNOTE_SELECTORS) {
    try {
      if (canMatch) {
        if (el.matches(sel)) return true;
        continue;
      }
      for (const m of doc.querySelectorAll(sel)) {
        if (m === el) return true;
      }
    } catch (e) {
      // Selector rejected by this DOM implementation: treat as "no match".
    }
  }
  return false;
}
1500
+
1501
+ // src/removals/small-images.js
1502
/**
 * Remove <img> and <svg> elements whose declared width or height is below
 * MIN_IMAGE_DIMENSION. Elements with no known dimensions are kept.
 *
 * @param {Document} doc
 * @param {boolean} [debug=false]  Accepted but not used by this function.
 * @returns {number} How many elements were flagged as too small.
 */
function removeSmallImages(doc, debug = false) {
  const isTooSmall = (img) => {
    const [w, h] = getDimensions(img);
    const narrow = w !== null && w < MIN_IMAGE_DIMENSION;
    const short = h !== null && h < MIN_IMAGE_DIMENSION;
    return narrow || short;
  };
  const flagged = Array.from(doc.querySelectorAll("img, svg")).filter((img) => isTooSmall(img));
  flagged.forEach((img) => {
    const parent = img.parentNode;
    if (parent) parent.removeChild(img);
  });
  return flagged.length;
}
1515
/**
 * Read an element's declared pixel size from its markup.
 *
 * The `width`/`height` attributes are used when they consist solely of digits.
 * An inline-style `width: Npx` / `height: Npx` declaration overrides the
 * attribute. Only declarations for exactly those properties count:
 * `border-width`, `max-width`, `line-height` and similar are ignored.
 * Fractional pixel values are truncated to their integer part.
 *
 * @param {Element} el
 * @returns {[number|null, number|null]} `[width, height]`; null when unknown.
 */
function getDimensions(el) {
  const readAttribute = (name) => {
    const raw = el.getAttribute(name);
    return raw && /^\d+$/.test(raw) ? Number.parseInt(raw, 10) : null;
  };
  let w = readAttribute("width");
  let h = readAttribute("height");

  const style = el.getAttribute("style") || "";
  if (style) {
    // Anchor on start-of-string or ';' so that only the exact property name
    // matches (a bare \b would also match after the '-' in "border-width").
    const wm = style.match(/(?:^|;)\s*width\s*:\s*(\d+)(?:\.\d+)?px/i);
    const hm = style.match(/(?:^|;)\s*height\s*:\s*(\d+)(?:\.\d+)?px/i);
    if (wm) w = Number.parseInt(wm[1], 10);
    if (hm) h = Number.parseInt(hm[1], 10);
  }
  return [w, h];
}
1530
+
1531
+ // src/removals/content-patterns.js
1532
// "<digits> min read", "<digits> minutes read", ... (case-insensitive).
var READ_TIME_PATTERN = /\d+\s*min(?:ute)?s?\s+read\b/i;

// Text that begins with "by" followed by whitespace and a non-space character.
var STARTS_WITH_BY = /^by\s+\S/i;

// Sentence openings that mark republication / licensing boilerplate paragraphs.
var BOILERPLATE_PATTERNS = [
  /^This (?:article|story|piece) (?:appeared|was published|originally appeared) in\b/i,
  /^A version of this (?:article|story) (?:appeared|was published) in\b/i,
  /^Originally (?:published|appeared) (?:in|on|at)\b/i,
  /^Any re-?use permitted\b/i
];
1540
/**
 * Run every content-pattern removal pass over `root`, in a fixed order.
 *
 * @param {Element} root
 * @param {boolean} [debug=false]  Forwarded to each pass.
 * @param {string} [url=""]        Accepted but not used by this function.
 * @returns {Array<object>} Debug records accumulated by the passes.
 */
function removeByContentPattern(root, debug = false, url = "") {
  const removed = [];
  const passes = [
    removeHeroHeader,
    removeBreadcrumbs,
    removeAuthorBylines,
    removeReadTimeBlocks,
    removeBoilerplateSentences,
    removeTrailingThinSections
  ];
  for (const pass of passes) {
    pass(root, debug, removed);
  }
  return removed;
}
1550
/**
 * Remove header-like containers built around a <time> element.
 *
 * For each <time> preceded by at most 400 characters of text, climb up to four
 * ancestors (stopping at `root`). The first ancestor that contains an h1/h2/h3
 * and fewer than 30 prose words is removed.
 *
 * @param {Element} root
 * @param {boolean} debug
 * @param {Array<object>} removed  Debug sink, appended to in place.
 */
function removeHeroHeader(root, debug, removed) {
  for (const time of Array.from(root.querySelectorAll("time"))) {
    if (getTextBefore(root, time).length > 400) continue;

    let container = time.parentElement;
    for (let depth = 0; depth < 4 && container && container !== root; depth++) {
      const hasHeading = container.querySelectorAll("h1,h2,h3").length > 0;
      if (hasHeading && countProseWords(container) < 30) {
        safeRemove(container, root, debug, removed, "removeHeroHeader");
        break;
      }
      container = container.parentElement;
    }
  }
}
1571
/**
 * Remove breadcrumb trails from `root`.
 *
 * Pass 1: any <ul>, <ol> or <nav> preceded by at most 600 characters of text
 * that isBreadcrumbList() accepts. Pass 2: any element whose aria-label or
 * class attribute contains "breadcrumb".
 *
 * @param {Element} root
 * @param {boolean} debug
 * @param {Array<object>} removed  Debug sink, appended to in place.
 */
function removeBreadcrumbs(root, debug, removed) {
  const drop = (el) => safeRemove(el, root, debug, removed, "removeBreadcrumbs");

  ["ul", "ol", "nav"].forEach((tag) => {
    Array.from(root.querySelectorAll(tag)).forEach((list) => {
      if (getTextBefore(root, list).length > 600) return;
      if (isBreadcrumbList(list)) drop(list);
    });
  });

  Array.from(root.querySelectorAll('[aria-label*="breadcrumb"],[class*="breadcrumb"]')).forEach(
    (el) => drop(el)
  );
}
1585
/**
 * Heuristic: does this list look like a breadcrumb trail?
 *
 * Requires 2-8 direct <li>/<a> children, at least half of which are short
 * (4 words or fewer) and are, or contain, a link.
 *
 * @param {Element} list
 * @returns {boolean}
 */
function isBreadcrumbList(list) {
  const items = Array.from(list.children).filter((el) => {
    const tag = el.tagName.toLowerCase();
    return tag === "li" || tag === "a";
  });
  if (items.length < 2 || items.length > 8) {
    return false;
  }
  let shortLinkItems = 0;
  for (const item of items) {
    const text = (item.textContent || "").trim();
    if (countWords(text) > 4) continue;
    if (item.querySelector("a") !== null || item.tagName.toLowerCase() === "a") {
      shortLinkItems += 1;
    }
  }
  return shortLinkItems >= items.length * 0.5;
}
1598
/**
 * Remove short "By <name>" elements near the top of `root`.
 *
 * Considers every element that is not nested inside a <p>, is preceded by at
 * most 600 characters of text, and holds 2-15 words starting with "by ".
 *
 * @param {Element} root
 * @param {boolean} debug
 * @param {Array<object>} removed  Debug sink, appended to in place.
 */
function removeAuthorBylines(root, debug, removed) {
  Array.from(root.querySelectorAll("*:not(p *)")).forEach((el) => {
    if (getTextBefore(root, el).length > 600) return;
    const text = (el.textContent || "").trim();
    const wordCount = countWords(text);
    const plausibleLength = !(wordCount < 2 || wordCount > 15);
    if (plausibleLength && STARTS_WITH_BY.test(text)) {
      safeRemove(el, root, debug, removed, "removeAuthorBylines");
    }
  });
}
1611
/**
 * Remove "N min read" leaf elements near the top of `root`.
 *
 * Only elements without child elements, preceded by at most 400 characters of
 * text, and holding at most 10 words are tested against READ_TIME_PATTERN.
 *
 * @param {Element} root
 * @param {boolean} debug
 * @param {Array<object>} removed  Debug sink, appended to in place.
 */
function removeReadTimeBlocks(root, debug, removed) {
  const leaves = Array.from(root.querySelectorAll("*"));
  leaves.forEach((el) => {
    if (el.children.length > 0) return;
    if (getTextBefore(root, el).length > 400) return;
    const text = (el.textContent || "").trim();
    if (countWords(text) > 10) return;
    if (READ_TIME_PATTERN.test(text)) {
      safeRemove(el, root, debug, removed, "removeReadTimeBlocks");
    }
  });
}
1623
/**
 * Remove paragraphs whose trimmed text matches any BOILERPLATE_PATTERNS entry.
 *
 * @param {Element} root
 * @param {boolean} debug
 * @param {Array<object>} removed  Debug sink, appended to in place.
 */
function removeBoilerplateSentences(root, debug, removed) {
  Array.from(root.querySelectorAll("p")).forEach((paragraph) => {
    const text = (paragraph.textContent || "").trim();
    const isBoilerplate = BOILERPLATE_PATTERNS.some((pattern) => pattern.test(text));
    if (isBoilerplate) {
      safeRemove(paragraph, root, debug, removed, "removeBoilerplateSentences");
    }
  });
}
1634
/**
 * Remove thin, headed sections from the end of `root`.
 *
 * Skipped entirely when `root` has fewer than 100 words. Otherwise the last
 * three direct children are examined from last to first; a div/section/aside
 * is removed when it contains an h2/h3/h4 and holds less than 15% of the
 * word count measured before any removal.
 *
 * @param {Element} root
 * @param {boolean} debug
 * @param {Array<object>} removed  Debug sink, appended to in place.
 */
function removeTrailingThinSections(root, debug, removed) {
  const totalWords = countWords(root.textContent || "");
  if (totalWords < 100) {
    return;
  }
  const threshold = totalWords * 0.15;
  const children = Array.from(root.children);
  const firstIndex = Math.max(0, children.length - 3);
  for (let i = children.length - 1; i >= firstIndex; i--) {
    const child = children[i];
    const tag = child.tagName.toLowerCase();
    if (tag !== "div" && tag !== "section" && tag !== "aside") continue;
    const isThin = countWords(child.textContent || "") < threshold;
    if (isThin && child.querySelectorAll("h2,h3,h4").length > 0) {
      safeRemove(child, root, debug, removed, "removeTrailingThinSections");
    }
  }
}
1649
/**
 * Detach `el` unless it is `root` itself or already has no parent.
 *
 * @param {Element} el
 * @param {Element} root
 * @param {boolean} debug           When true, a record is appended to `removed`.
 * @param {Array<object>} removed   Debug sink.
 * @param {string} step             Name of the calling pass, stored in the record.
 */
function safeRemove(el, root, debug, removed, step) {
  if (el === root) {
    return;
  }
  const parent = el.parentNode;
  if (!parent) {
    return;
  }
  if (debug) {
    // Keep only the first 80 characters of text in the record.
    const text = (el.textContent || "").slice(0, 80);
    removed.push({ step, tag: el.tagName, text });
  }
  parent.removeChild(el);
}
1654
/**
 * Approximate the text that precedes `target` within `root`.
 *
 * The first 30 characters of the target's trimmed text are searched for in the
 * root's text; everything before the first occurrence is returned.
 *
 * @param {Element} root
 * @param {Element} target
 * @returns {string} Empty when the target's text is not found in the root's text.
 */
function getTextBefore(root, target) {
  const haystack = root.textContent || "";
  const needle = (target.textContent || "").trim().slice(0, 30);
  const index = haystack.indexOf(needle);
  if (index === -1) {
    return "";
  }
  return haystack.slice(0, index);
}
1660
/**
 * Count the words held in an element's <p>, <li> and <dd> descendants.
 *
 * @param {Element} el
 * @returns {number}
 */
function countProseWords(el) {
  const pieces = [];
  for (const tag of ["p", "li", "dd"]) {
    for (const node of el.getElementsByTagName(tag)) {
      pieces.push(" " + (node.textContent || ""));
    }
  }
  return countWords(pieces.join(""));
}
1669
+
1670
+ // src/standardizer.js
1671
/**
 * Normalise the extracted content in place by running each clean-up pass,
 * in order, over `root`.
 *
 * @param {Element} root
 */
function standardize(root) {
  const passes = [
    removeComments,
    normalizeNbsp,
    convertH1ToH2,
    stripAttributes,
    removeEmptyElements
  ];
  for (const pass of passes) {
    pass(root);
  }
}
1678
/**
 * Remove every comment node beneath `root`.
 *
 * Uses only `childNodes`, `nodeType`, `parentNode` and `removeChild`, and
 * never touches the global `document`, so it also works with DOM
 * implementations used outside a browser.
 *
 * @param {Node} root
 */
function removeComments(root) {
  const COMMENT_NODE = 8;
  const comments = [];
  // Explicit stack instead of recursion; comments are collected first and
  // removed afterwards so the traversal never sees a mutating tree.
  const pending = [root];
  while (pending.length > 0) {
    const node = pending.pop();
    for (const child of Array.from(node.childNodes)) {
      if (child.nodeType === COMMENT_NODE) {
        comments.push(child);
      } else if (child.childNodes && child.childNodes.length) {
        pending.push(child);
      }
    }
  }
  for (const comment of comments) {
    if (comment.parentNode) comment.parentNode.removeChild(comment);
  }
}
1695
/**
 * Replace every non-breaking space (U+00A0) in the text nodes beneath `root`
 * with a regular space.
 *
 * @param {Node} root
 */
function normalizeNbsp(root) {
  const TEXT_NODE = 3;
  const pending = [root];
  while (pending.length > 0) {
    const node = pending.pop();
    for (const child of Array.from(node.childNodes)) {
      if (child.nodeType === TEXT_NODE) {
        const value = child.nodeValue;
        // Only write back when something actually changes.
        if (value && value.includes("\xA0")) {
          child.nodeValue = value.replace(/\u00A0/g, " ");
        }
      } else if (child.childNodes && child.childNodes.length) {
        pending.push(child);
      }
    }
  }
}
1709
/**
 * Replace every <h1> beneath `root` with an <h2> carrying the same inner HTML
 * and attributes.
 *
 * The replacement element is created through `root.ownerDocument`; when that
 * is unavailable, a shallow clone of the original heading is used instead.
 *
 * @param {Element} root
 */
function convertH1ToH2(root) {
  Array.from(root.querySelectorAll("h1")).forEach((heading) => {
    const ownerDoc = root.ownerDocument;
    const replacement = ownerDoc ? ownerDoc.createElement("h2") : heading.cloneNode(false);
    replacement.innerHTML = heading.innerHTML;
    Array.from(heading.attributes).forEach((attr) => {
      replacement.setAttribute(attr.name, attr.value);
    });
    const parent = heading.parentNode;
    if (parent) {
      parent.replaceChild(replacement, heading);
    }
  });
}
1720
/**
 * Strip every attribute that is not explicitly kept from the descendants of
 * `root`; `root`'s own attributes are not touched.
 *
 * Kept: names starting with "aria-", names in ALLOWED_ATTRIBUTES, and `id`
 * attributes whose value starts with "fn", "ref" or "footnote".
 *
 * @param {Element} root
 */
function stripAttributes(root) {
  const isKept = (attr) => {
    const name = attr.name.toLowerCase();
    if (name.startsWith("aria-") || ALLOWED_ATTRIBUTES.has(name)) return true;
    if (name !== "id") return false;
    const id = (attr.value || "").toLowerCase();
    return id.startsWith("fn") || id.startsWith("ref") || id.startsWith("footnote");
  };
  for (const el of root.querySelectorAll("*")) {
    // Decide on a snapshot first; removing while iterating a live list is unsafe.
    const doomed = Array.from(el.attributes)
      .filter((attr) => !isKept(attr))
      .map((attr) => attr.name);
    doomed.forEach((name) => el.removeAttribute(name));
  }
}
1736
/**
 * Repeatedly remove effectively-empty p/div/span/section/aside/li elements
 * beneath `root` until a full sweep removes nothing. Repeating lets a parent
 * that becomes empty after its children are removed get removed as well.
 *
 * @param {Element} root
 */
function removeEmptyElements(root) {
  const emptyTags = ["p", "div", "span", "section", "aside", "li"];
  let changed;
  do {
    changed = false;
    for (const tag of emptyTags) {
      const allowedEmpty = ALLOWED_EMPTY_TAGS.has(tag);
      for (const el of Array.from(root.getElementsByTagName(tag))) {
        if (el === root || allowedEmpty) continue;
        if (!isEffectivelyEmpty(el)) continue;
        const parent = el.parentNode;
        if (parent) {
          parent.removeChild(el);
          changed = true;
        }
      }
    }
  } while (changed);
}
1755
/**
 * An element is effectively empty when it has no non-whitespace text and no
 * img/video/audio/iframe/svg/picture/canvas descendant.
 *
 * @param {Element} el
 * @returns {boolean}
 */
function isEffectivelyEmpty(el) {
  const hasText = Boolean((el.textContent || "").trim());
  if (hasText) {
    return false;
  }
  const mediaTags = ["img", "video", "audio", "iframe", "svg", "picture", "canvas"];
  return mediaTags.every((tag) => !(el.getElementsByTagName(tag).length > 0));
}
1760
/**
 * Placeholder: no tree walker is ever constructed.
 *
 * @param {Node} root  Ignored.
 * @returns {null} Always null.
 */
function createTreeWalker(root) {
  return null;
}
1763
+
1764
+ // src/defuddle.js
1765
/**
 * Extracts the main content and metadata from a parsed document.
 */
var Defuddle = class _Defuddle {
  /**
   * @param {Document} doc A parsed DOM Document
   * @param {object} [options]
   * @param {string} [options.url] Page URL for relative URL resolution and domain extraction
   * @param {boolean} [options.debug]
   */
  constructor(doc, options = {}) {
    Object.assign(this, {
      doc,
      url: options.url || null,
      options,
      // Lazily computed on the first parse and reused by later attempts.
      _schemaOrgData: void 0,
      _metaTags: void 0,
      _metadata: void 0
    });
  }

  /**
   * Parse the document and return a DefuddleResult.
   *
   * When the first attempt yields little text, progressively more lenient
   * attempts are made; a retry replaces the current result only when it
   * produces sufficiently more words.
   *
   * @param {object} [opts] Override default options for this parse call
   * @returns {DefuddleResult}
   */
  parse(opts = {}) {
    const options = Object.assign({}, this.options, opts);
    // [word-count threshold, option overrides, required growth factor]
    const fallbacks = [
      [200, { removePartialSelectors: false }, 2],
      [50, { removeHidden: false }, 2],
      [50, { scoreContent: false, removePartialSelectors: false, removeContentPatterns: false }, 1]
    ];
    let result = this._parseInternal(options);
    for (const [threshold, overrides, factor] of fallbacks) {
      if (result.wordCount < threshold) {
        const retry = this._parseInternal(Object.assign({}, options, overrides));
        if (retry.wordCount > result.wordCount * factor) {
          result = retry;
        }
      }
    }
    return result;
  }

  /**
   * Internal parse: clones the document for each attempt.
   * @private
   */
  _parseInternal(opts = {}) {
    const startTime = Date.now();
    const {
      removeExactSelectors = true,
      removePartialSelectors = true,
      removeHidden = true,
      removeSmallImages: doSmallImages = true,
      scoreContent = true,
      removeContentPatterns = true,
      standardizeContent = true,
      debug = false
    } = opts;

    const doc = this.doc;
    if (!doc || !doc.documentElement) {
      return this._emptyResult(startTime);
    }

    // Metadata is read from the untouched source document, once.
    if (this._schemaOrgData === void 0) this._schemaOrgData = extractSchemaOrg(doc);
    if (this._metaTags === void 0) this._metaTags = collectMetaTags(doc);
    if (this._metadata === void 0) {
      this._metadata = extractMetadata(doc, this.url, this._schemaOrgData, this._metaTags);
    }

    const clone = doc.cloneNode(true);

    let mainContent = null;
    if (opts.contentSelector) {
      try {
        mainContent = clone.querySelector(opts.contentSelector);
      } catch (e) {
        // Invalid caller-supplied selector: fall back to automatic detection.
      }
    }
    if (!mainContent) {
      mainContent = findMainContent(clone);
    }
    if (!mainContent) {
      return this._buildResult(clone.body ? clone.body.innerHTML : "", startTime);
    }

    // Clean-up pipeline; each stage runs only when its flag is truthy.
    const stages = [
      [doSmallImages, () => removeSmallImages(clone, debug)],
      [removeHidden, () => removeHiddenElements(clone, debug)],
      [removeExactSelectors, () => removeExact(clone, mainContent, debug)],
      [removePartialSelectors, () => removePartial(clone, mainContent, debug)],
      [scoreContent, () => scoreAndRemove(clone, mainContent, debug)],
      [removeContentPatterns, () => removeByContentPattern(mainContent, debug, this.url || "")],
      [standardizeContent, () => standardize(mainContent)],
      [this.url, () => resolveUrls(mainContent, clone, this.url)]
    ];
    for (const [enabled, run] of stages) {
      if (enabled) run();
    }
    return this._buildResult(mainContent.outerHTML, startTime);
  }

  /**
   * Wrap extracted HTML together with the cached metadata in a DefuddleResult.
   * @private
   */
  _buildResult(content, startTime) {
    const meta = this._metadata || {};
    const fields = Object.assign({ content }, meta, {
      schemaOrgData: this._schemaOrgData || null,
      metaTags: this._metaTags || [],
      wordCount: countHtmlWords(content),
      parseTime: Date.now() - startTime
    });
    return new DefuddleResult(fields);
  }

  /**
   * Result used when there is no usable document; only `domain` (derived from
   * the URL, without a leading "www.") and `parseTime` may be non-empty.
   * @private
   */
  _emptyResult(startTime) {
    let domain = "";
    if (this.url) {
      try {
        const host = new URL(this.url).hostname;
        domain = host.startsWith("www.") ? host.slice(4) : host;
      } catch (e) {
        // Unparseable URL: leave the domain empty.
      }
    }
    return new DefuddleResult({
      content: "",
      title: "",
      description: "",
      author: "",
      published: "",
      site: "",
      domain,
      favicon: "",
      image: "",
      language: "",
      schemaOrgData: null,
      metaTags: [],
      wordCount: 0,
      parseTime: Date.now() - startTime
    });
  }

  /**
   * Convenience static method: parse an HTML string.
   * In browser: uses DOMParser automatically.
   * In Node.js: requires passing a `parseHtml` function that returns a Document.
   *
   * @param {string|Document} input HTML string or existing Document
   * @param {object} [options]
   * @param {string} [options.url]
   * @param {Function} [options.parseHtml] Custom HTML parser: (html) => Document
   * @returns {DefuddleResult}
   * @throws {Error} When given a string and neither `parseHtml` nor DOMParser is available.
   */
  static parse(input, options = {}) {
    if (typeof input !== "string") {
      return new _Defuddle(input, options).parse();
    }
    let doc;
    if (options.parseHtml) {
      doc = options.parseHtml(input);
    } else if (typeof DOMParser !== "undefined") {
      doc = new DOMParser().parseFromString(input, "text/html");
    } else {
      throw new Error(
        'Defuddle.parse() requires a DOM environment. In Node.js, pass a parseHtml function: Defuddle.parse(html, { parseHtml: html => require("linkedom").parseHTML(html).document })'
      );
    }
    return new _Defuddle(doc, options).parse();
  }
};
1926
/**
 * Plain value object holding the outcome of a parse.
 */
var DefuddleResult = class {
  /**
   * @param {object} data  Source fields. Missing or falsy values fall back to
   *   "" for text fields, 0 for counters, null for `schemaOrgData` and [] for
   *   `metaTags`.
   */
  constructor(data) {
    const textFields = [
      "content", "title", "description", "author", "published",
      "site", "domain", "favicon", "image", "language"
    ];
    for (const field of textFields) {
      this[field] = data[field] || "";
    }
    for (const field of ["wordCount", "parseTime"]) {
      this[field] = data[field] || 0;
    }
    this.schemaOrgData = data.schemaOrgData || null;
    this.metaTags = data.metaTags || [];
  }

  /**
   * @returns {object} A plain-object copy of all fields, used by JSON.stringify.
   */
  toJSON() {
    const {
      content, title, description, author, published,
      site, domain, favicon, image, language,
      wordCount, parseTime, schemaOrgData, metaTags
    } = this;
    return {
      content, title, description, author, published,
      site, domain, favicon, image, language,
      wordCount, parseTime, schemaOrgData, metaTags
    };
  }
};
1962
+ return __toCommonJS(index_exports);
1963
+ })();
1964
// Module-system footer: publish DefuddleLib through CommonJS when
// `module.exports` exists, otherwise through an AMD loader when one is present.
if (typeof module !== "undefined" && module.exports) {
  module.exports = DefuddleLib;
} else if (typeof define === "function" && define.amd) {
  define(() => DefuddleLib);
}