defuddle-js 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +102 -0
- package/dist/defuddle.cjs.js +1950 -0
- package/dist/defuddle.umd.js +1968 -0
- package/package.json +41 -0
- package/src/constants.js +297 -0
- package/src/content-finder.js +116 -0
- package/src/content-scorer.js +194 -0
- package/src/defuddle.js +252 -0
- package/src/index.js +1 -0
- package/src/metadata.js +371 -0
- package/src/removals/content-patterns.js +174 -0
- package/src/removals/hidden.js +51 -0
- package/src/removals/selector-remover.js +137 -0
- package/src/removals/small-images.js +45 -0
- package/src/schema-org.js +102 -0
- package/src/standardizer.js +116 -0
- package/src/url-resolver.js +101 -0
- package/src/utils.js +95 -0
package/package.json
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "defuddle-js",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "Extract main content and metadata from HTML pages. Works in browser and Node.js.",
|
|
5
|
+
"license": "MIT",
|
|
6
|
+
"main": "dist/defuddle.cjs.js",
|
|
7
|
+
"module": "src/index.js",
|
|
8
|
+
"browser": "dist/defuddle.umd.js",
|
|
9
|
+
"exports": {
|
|
10
|
+
".": {
|
|
11
|
+
"import": "./src/index.js",
|
|
12
|
+
"require": "./dist/defuddle.cjs.js",
|
|
13
|
+
"browser": "./dist/defuddle.umd.js"
|
|
14
|
+
}
|
|
15
|
+
},
|
|
16
|
+
"files": [
|
|
17
|
+
"src",
|
|
18
|
+
"dist"
|
|
19
|
+
],
|
|
20
|
+
"type": "module",
|
|
21
|
+
"scripts": {
|
|
22
|
+
"build": "node build.cjs",
|
|
23
|
+
"test": "NODE_OPTIONS=--experimental-vm-modules jest",
|
|
24
|
+
"test:watch": "NODE_OPTIONS=--experimental-vm-modules jest --watch"
|
|
25
|
+
},
|
|
26
|
+
"devDependencies": {
|
|
27
|
+
"esbuild": "^0.25.0",
|
|
28
|
+
"jest": "^29.0.0",
|
|
29
|
+
"jest-environment-jsdom": "^29.0.0"
|
|
30
|
+
},
|
|
31
|
+
"keywords": [
|
|
32
|
+
"html",
|
|
33
|
+
"content-extraction",
|
|
34
|
+
"readability",
|
|
35
|
+
"article",
|
|
36
|
+
"metadata",
|
|
37
|
+
"parser",
|
|
38
|
+
"browser",
|
|
39
|
+
"nodejs"
|
|
40
|
+
]
|
|
41
|
+
}
|
package/src/constants.js
ADDED
|
@@ -0,0 +1,297 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Constants for defuddle-js content extraction.
|
|
3
|
+
* Ported from defuddle/src/constants.ts and defuddle-php.
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
/**
 * CSS selectors probed by findMainContent (content-finder.js) to collect
 * candidate content containers.
 *
 * Order is significant: findMainContent gives every match a base score of
 * (list length - index) * 40, so earlier entries outrank later ones, and its
 * "prefer more-specific child" pass only promotes candidates whose selector
 * sits earlier in this list than the current best. 'body' is last and thus
 * receives the smallest base score.
 */
export const ENTRY_POINT_SELECTORS = [
  '#post',
  '.post-content',
  '.post-body',
  '.article-content',
  '#article-content',
  '.article_post',
  '.article-wrapper',
  '.entry-content',
  '.content-article',
  '.instapaper_body',
  '.post',
  '.markdown-body',
  'article',
  '[role="article"]',
  'main',
  '[role="main"]',
  '#content',
  'body',
];
|
|
26
|
+
|
|
27
|
+
/**
 * Comma-separated selector for the container tags that findByScoring
 * (content-finder.js) scores when no entry-point selector matched anything.
 * scoreAndRemove (content-scorer.js) walks the same eight tag names through
 * its own local list, so the two should be kept in sync.
 */
export const BLOCK_ELEMENTS_SELECTOR =
  'div,section,article,main,aside,header,footer,nav';
|
|
29
|
+
|
|
30
|
+
/**
 * Lowercase substrings that hint at a content container.
 * scoreElement (content-scorer.js) adds a single flat bonus of 15 when the
 * lowercased getClassId(el) string contains at least one of them; several
 * hits do not stack.
 */
export const CONTENT_INDICATORS = new Set([
  'article', 'body', 'content', 'entry', 'hentry', 'main',
  'page', 'post', 'text', 'blog', 'story',
]);
|
|
34
|
+
|
|
35
|
+
/**
 * Lowercase substrings that hint at navigation / page chrome.
 * scoreNonContentBlock (content-scorer.js) subtracts 10 for EVERY entry found
 * in the lowercased getClassId(el) string, so overlapping entries stack
 * (e.g. a value containing "navigation" also contains "nav" and costs 20).
 */
export const NAV_INDICATORS = [
  'nav', 'navigation', 'menu', 'sidebar', 'header', 'footer',
  'breadcrumb', 'pagination', 'pager', 'tags', 'categories',
];
|
|
39
|
+
|
|
40
|
+
/**
 * Substrings identifying social-network URLs. hasSocialMediaLinks
 * (content-scorer.js) checks each anchor's lowercased href for any of these
 * with String.prototype.includes; the trailing slash is part of the pattern.
 */
export const SOCIAL_MEDIA_PATTERNS = [
  'twitter.com/', 'facebook.com/', 'instagram.com/', 'linkedin.com/',
  'youtube.com/', 'tiktok.com/', 'pinterest.com/', 'reddit.com/',
];
|
|
44
|
+
|
|
45
|
+
export const MIN_IMAGE_DIMENSION = 33;
|
|
46
|
+
|
|
47
|
+
/** Exact CSS selectors for known non-content elements */
|
|
48
|
+
export const EXACT_SELECTORS = [
|
|
49
|
+
'noscript',
|
|
50
|
+
'style',
|
|
51
|
+
'meta',
|
|
52
|
+
'link',
|
|
53
|
+
'.promo',
|
|
54
|
+
'.Promo',
|
|
55
|
+
'.alert',
|
|
56
|
+
'#barrier-page',
|
|
57
|
+
'[id="comments"]',
|
|
58
|
+
'[id="comment"]',
|
|
59
|
+
'div[class*="cover-"]',
|
|
60
|
+
'div[id*="cover-"]',
|
|
61
|
+
'header',
|
|
62
|
+
'.header',
|
|
63
|
+
'#header',
|
|
64
|
+
'#Header',
|
|
65
|
+
'#banner',
|
|
66
|
+
'#Banner',
|
|
67
|
+
'nav',
|
|
68
|
+
'.navigation',
|
|
69
|
+
'#navigation',
|
|
70
|
+
'[role="navigation"]',
|
|
71
|
+
'[role="dialog"]',
|
|
72
|
+
'[role="complementary"]',
|
|
73
|
+
'[class*="pagination"]',
|
|
74
|
+
'.menu',
|
|
75
|
+
'#siteSub',
|
|
76
|
+
'.previous',
|
|
77
|
+
'.Author',
|
|
78
|
+
'[class$="_bio"]',
|
|
79
|
+
'#categories',
|
|
80
|
+
'.contributor',
|
|
81
|
+
'.date',
|
|
82
|
+
'#date',
|
|
83
|
+
'[data-date]',
|
|
84
|
+
'.entry-meta',
|
|
85
|
+
'.meta',
|
|
86
|
+
'.tags',
|
|
87
|
+
'#tags',
|
|
88
|
+
'[rel="tag"]',
|
|
89
|
+
'.toc',
|
|
90
|
+
'.Toc',
|
|
91
|
+
'#toc',
|
|
92
|
+
'.headline',
|
|
93
|
+
'#headline',
|
|
94
|
+
'#title',
|
|
95
|
+
'#Title',
|
|
96
|
+
'#articleTag',
|
|
97
|
+
'[href*="/tag/"]',
|
|
98
|
+
'[href*="/tags/"]',
|
|
99
|
+
'[href*="/author/"]',
|
|
100
|
+
'[href*="/author?"]',
|
|
101
|
+
'[href$="/author"]',
|
|
102
|
+
'a[href*="copyright.com"]',
|
|
103
|
+
'a[href*="google.com/preferences"]',
|
|
104
|
+
'[href*="#toc"]',
|
|
105
|
+
'[href="#top"]',
|
|
106
|
+
'[href="#Top"]',
|
|
107
|
+
'[href="#page-header"]',
|
|
108
|
+
'[href="#content"]',
|
|
109
|
+
'[href="#site-content"]',
|
|
110
|
+
'[href="#main-content"]',
|
|
111
|
+
'[href^="#main"]',
|
|
112
|
+
'[src*="author"]',
|
|
113
|
+
'footer',
|
|
114
|
+
'.aside',
|
|
115
|
+
'button',
|
|
116
|
+
'canvas',
|
|
117
|
+
'dialog',
|
|
118
|
+
'fieldset',
|
|
119
|
+
'form',
|
|
120
|
+
'label',
|
|
121
|
+
'option',
|
|
122
|
+
'select',
|
|
123
|
+
'[role="listbox"]',
|
|
124
|
+
'[role="option"]',
|
|
125
|
+
'textarea',
|
|
126
|
+
'[hidden]',
|
|
127
|
+
'[aria-hidden="true"]',
|
|
128
|
+
'.hidden',
|
|
129
|
+
'.invisible',
|
|
130
|
+
'#logo',
|
|
131
|
+
'#Logo',
|
|
132
|
+
'#newsletter',
|
|
133
|
+
'#Newsletter',
|
|
134
|
+
'.subscribe',
|
|
135
|
+
'.noprint',
|
|
136
|
+
'[data-print-layout="hide"]',
|
|
137
|
+
'[data-block="donotprint"]',
|
|
138
|
+
'.sidebar',
|
|
139
|
+
'.Sidebar',
|
|
140
|
+
'#sidebar',
|
|
141
|
+
'#Sidebar',
|
|
142
|
+
'#side-bar',
|
|
143
|
+
'#sitesub',
|
|
144
|
+
'.copyright',
|
|
145
|
+
'#copyright',
|
|
146
|
+
'.licensebox',
|
|
147
|
+
'#page-info',
|
|
148
|
+
'#rss',
|
|
149
|
+
'#feed',
|
|
150
|
+
'.gutter',
|
|
151
|
+
'#primaryaudio',
|
|
152
|
+
'table.infobox',
|
|
153
|
+
'.gh-header-sticky',
|
|
154
|
+
];
|
|
155
|
+
|
|
156
|
+
/** Partial substring patterns tested against class + id attribute values */
|
|
157
|
+
export const PARTIAL_SELECTORS = [
|
|
158
|
+
'a-statement', 'access-wall', 'activitypub', 'actioncall', 'addcomment', 'addtoany',
|
|
159
|
+
'advert', 'adlayout', 'ad-tldr', 'ad-placement', 'ads-container', '_ad_', 'AdBlock_',
|
|
160
|
+
'AdUnit', 'after_content', 'after_main_article', 'afterpost', 'allterms', '-alert-',
|
|
161
|
+
'alert-box', '_archive', 'around-the-web', 'aroundpages', 'article-author',
|
|
162
|
+
'article-badges', 'article-banner', 'article-bottom-section', 'article-bottom',
|
|
163
|
+
'article-category', 'article-card', 'article-citation', 'article__copy',
|
|
164
|
+
'article_date', 'article-date', 'article-end ', 'article_header', 'article-header',
|
|
165
|
+
'article__header', 'article__hero', 'article__info', 'article-info', 'article-meta',
|
|
166
|
+
'article_meta', 'article__meta', 'articlename', 'article-subject', 'article_subject',
|
|
167
|
+
'article-snippet', 'article-separator', 'article--share', 'article--topics',
|
|
168
|
+
'articletags', 'article-tags', 'article_tags', 'articletitle', 'article-title',
|
|
169
|
+
'article_title', 'articletopics', 'article-topics', 'article-actions',
|
|
170
|
+
'article--lede', 'articlewell', 'associated-people', 'audio-card', 'author-bio',
|
|
171
|
+
'author-box', 'author-info', 'author_info', 'authorm', 'author-mini-bio',
|
|
172
|
+
'author-name', 'author-publish-info', 'authored-by', 'avatar',
|
|
173
|
+
'back-to-top', 'backlink_container', 'backlinks-section', 'bio-block', 'biobox',
|
|
174
|
+
'blog-pager', 'bookmark-', '-bookmark', 'bottominfo', 'bottomnav',
|
|
175
|
+
'bottom-of-article', 'bottom-wrapper', 'brand-bar', 'bcrumb', 'breadcrumb',
|
|
176
|
+
'brdcrumb', 'button-wrapper', 'buttons-container', 'btn-', '-btn', 'byline',
|
|
177
|
+
'captcha', 'card-text', 'card-media', 'card-post', 'carouselcontainer',
|
|
178
|
+
'carousel-container', 'cat_header', 'catlinks', '_categories', 'card-author',
|
|
179
|
+
'card-content', 'chapter-list', 'collections', 'comments', '-comment',
|
|
180
|
+
'commentbox', 'comment-button', 'commentcomp', 'comment-content', 'comment-count',
|
|
181
|
+
'comment-form', 'comment-number', 'comment-respond', 'comment-thread',
|
|
182
|
+
'comment-wrap', 'complementary', 'consent', 'contact-', 'content-card',
|
|
183
|
+
'copycontent', 'content-topics', 'contentpromo', 'context-bar', 'context-widget',
|
|
184
|
+
'core-collateral', 'cover-image', 'cover-photo', 'cover-wrap', 'created-date',
|
|
185
|
+
'creative-commons_', 'c-subscribe', '_cta', '-cta', 'cta-', 'cta_',
|
|
186
|
+
'current-issue', 'custom-list-number',
|
|
187
|
+
'dateline', 'dateheader', 'date-header', 'date-pub', 'disclaimer', 'disclosure',
|
|
188
|
+
'discussion', 'discuss_', '-dismiss', 'disqus', 'donate', 'donation', 'dropdown',
|
|
189
|
+
'editorial_contact', 'editorial-contact', 'element-invisible', 'eletters',
|
|
190
|
+
'emailsignup', 'emoji-bar', 'engagement-widget', 'enhancement-', 'entry-author-info',
|
|
191
|
+
'entry-categories', 'entry-date', 'entry-title', 'entry-utility', '-error', 'error-',
|
|
192
|
+
'eyebrow', 'expand-reduce', 'external-anchor', 'externallinkembedwrapper',
|
|
193
|
+
'extra-services', 'extra-title',
|
|
194
|
+
'facebook', 'fancy-box', 'favorite', 'featured-content', 'feature_feed', 'feedback',
|
|
195
|
+
'feed-links', 'field-site-sections', 'fixheader', 'floating-vid', 'follower',
|
|
196
|
+
'footer', 'footnote-back', 'footnoteback', 'form-group', 'for-you', 'frontmatter',
|
|
197
|
+
'further-reading', 'fullbleedheader',
|
|
198
|
+
'gallery-count', 'gated-', 'gh-feed', 'gist-meta', 'goog-', 'graph-view',
|
|
199
|
+
'hamburger', 'header_logo', 'header-logo', 'header-pattern', 'hero-list',
|
|
200
|
+
'hide-for-print', 'hide-print', 'hide-when-no-script', 'hidden-print',
|
|
201
|
+
'hidden-sidenote', 'hidden-accessibility',
|
|
202
|
+
'infoline', 'inline-topic', 'instacartIntegration', 'interlude', 'interaction',
|
|
203
|
+
'itemendrow', 'intro-date', 'invisible',
|
|
204
|
+
'jp-no-solution', 'jp-relatedposts', 'jswarning', 'js-warning', 'jumplink',
|
|
205
|
+
'jumpto', 'jump-to-', 'js-skip-to-content',
|
|
206
|
+
'keepreading', 'keep-reading', 'keep_reading', 'keyword_wrap', 'kicker',
|
|
207
|
+
'labstab', '-labels', 'language-name', 'lastupdated', 'latest-content', '-ledes-',
|
|
208
|
+
'-license', 'license-', 'lightbox-popup', 'like-button', 'link-box', 'links-grid',
|
|
209
|
+
'links-title', 'listing-dynamic-terms', 'list-tags', 'listinks', 'loading',
|
|
210
|
+
'loa-info', 'logo_container',
|
|
211
|
+
'masthead', 'marketing', 'media-inquiry', '-menu', 'menu-', 'metadata', 'meta-date',
|
|
212
|
+
'meta-row', 'might-like', 'minibio', 'more-about', 'mod-paywall', '_modal',
|
|
213
|
+
'-modal', 'more-', 'morenews', 'morestories', 'more_wrapper', 'most-read',
|
|
214
|
+
'mw-editsection', 'mw-cite-backlink', 'mw-indicators', 'mw-jump-link',
|
|
215
|
+
'nav-', 'nav_', 'navigation-post', 'next-', 'newsgallery', 'news-story-title',
|
|
216
|
+
'newsletter_', 'newsletterbanner', 'newslettercontainer', 'newsletter-form',
|
|
217
|
+
'newsletter-signup', 'newslettersignup', 'newsletterwidget', 'newsletterwrapper',
|
|
218
|
+
'not-found', 'notessection', 'nomobile', 'noprint',
|
|
219
|
+
'open-slideshow', 'originally-published', 'other-blogs', 'outline-view',
|
|
220
|
+
'pagehead', 'page-header', 'page-title', 'paywall_message', '-partners',
|
|
221
|
+
'permission-', 'plea', 'popular', 'popup_links', 'pop_stories', 'pop-up',
|
|
222
|
+
'post__author', 'post-author', 'post-bottom', 'post__category', 'postcomment',
|
|
223
|
+
'postdate', 'post-date', 'post_date', 'post-details', 'post-feeds', 'postinfo',
|
|
224
|
+
'post-info', 'post_info', 'post-inline-date', 'post-links', 'postlist',
|
|
225
|
+
'post_list', 'post_meta', 'post-meta', 'postmeta', 'post_more', 'postnavi',
|
|
226
|
+
'post-navigation', 'postpath', 'post-preview', 'postsnippet', 'post_snippet',
|
|
227
|
+
'post-snippet', 'post-subject', 'posttax', 'post-tax', 'post_tax', 'posttag',
|
|
228
|
+
'post_tag', 'post-tag', 'post_time', 'posttitle', 'post-title', 'post_title',
|
|
229
|
+
'post__title', 'post-ufi-button', 'prev-post', 'prevnext', 'prev_next',
|
|
230
|
+
'prev-next', 'previousnext', 'press-inquiries', 'print-none', 'print-header',
|
|
231
|
+
'privacy-notice', 'privacy-settings', 'profile', 'promo_article', 'promo-bar',
|
|
232
|
+
'promo-box', 'pubdate', 'pub_date', 'pub-date', 'publish_date', 'publish-date',
|
|
233
|
+
'publication-date', 'publicationName',
|
|
234
|
+
'qr-code', 'qr_code', 'quick_up',
|
|
235
|
+
'_rail', 'ratingssection', 'read_also', 'readmore', 'read-next', 'read_next',
|
|
236
|
+
'read_time', 'read-time', 'reading_time', 'reading-time', 'reading-list',
|
|
237
|
+
'recent-', 'recent-articles', 'recentpost', 'recent_post', 'recent-post',
|
|
238
|
+
'recommend', 'redirectedfrom', 'recirc', 'register', 'related', 'relevant',
|
|
239
|
+
'reversefootnote', 'robots-nocontent', '_rss', 'rss-link',
|
|
240
|
+
'screen-reader-text', 'scroll_to', 'scroll-to', '_search', '-search',
|
|
241
|
+
'section-nav', 'series-banner', 'share-box', 'sharedaddy', 'share-icons',
|
|
242
|
+
'sharelinks', 'share-post', 'share-print', 'share-section', 'sharing_',
|
|
243
|
+
'shariff-', 'show-for-print', 'sidebartitle', 'sidebar-content', 'sidebar-wrapper',
|
|
244
|
+
'sideitems', 'sidebar-author', 'sidebar-item', 'side-box', 'side-logo',
|
|
245
|
+
'sign-in-gate', 'similar-', 'similar_', 'similars-', 'site-index', 'site-header',
|
|
246
|
+
'siteheader', 'site-logo', 'site-name', 'site-wordpress', 'skip-content',
|
|
247
|
+
'skip-to-content', 'skip-link', 'c-skip-link', '_skip-link', '-slider',
|
|
248
|
+
'slug-wrap', 'social-author', 'social-shar', 'social-date', 'speechify-ignore',
|
|
249
|
+
'speedbump', 'sponsor', 'springercitation', 'sr-only', '_stats', 'story-date',
|
|
250
|
+
'story-navigation', 'storyreadtime', 'storysmall', 'storypublishdate',
|
|
251
|
+
'subject-label', 'subhead', 'submenu', '-subscribe-', 'subscriber-drive',
|
|
252
|
+
'subscription-',
|
|
253
|
+
'_tags', 'tags__item', 'tag_list', 'taxonomy', 'table-of-contents', 'tabs-',
|
|
254
|
+
'terminaltout', 'time-rubric', 'timestamp', 'time-read', 'time-to-read',
|
|
255
|
+
'tip_off', 'tiptout', '-tout-', 'toc-container', 'toggle-caption',
|
|
256
|
+
'tooltip-content', 'topbar', 'topic-authors', 'topic-footer', 'topic-list',
|
|
257
|
+
'topic-subnav', 'top-wrapper', 'tree-item', 'trending', 'trust-feat',
|
|
258
|
+
'trust-badge', 'trust-project', 'twitter', 'twiblock',
|
|
259
|
+
'u-hide', 'upsell',
|
|
260
|
+
'viewbottom', 'yarpp-related', 'visually-hidden', 'welcomebox', 'widget_pages',
|
|
261
|
+
];
|
|
262
|
+
|
|
263
|
+
/**
 * One case-insensitive regular expression that matches any PARTIAL_SELECTORS
 * entry as a literal substring. Each entry is regex-escaped and the results
 * are OR-ed together inside a single non-capturing group.
 */
export const PARTIAL_SELECTORS_REGEX = (() => {
  // Backslash-escape every regex metacharacter so entries match literally.
  const escapeForRegExp = (literal) => literal.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  const alternation = PARTIAL_SELECTORS.map((entry) => escapeForRegExp(entry)).join('|');
  return new RegExp('(?:' + alternation + ')', 'i');
})();
|
|
267
|
+
|
|
268
|
+
export const FOOTNOTE_SELECTORS = [
|
|
269
|
+
'div.footnote ol', 'div.footnotes ol', 'div[role="doc-endnotes"]',
|
|
270
|
+
'div[role="doc-footnotes"]', 'ol.footnotes-list', 'ol.footnotes',
|
|
271
|
+
'ol.references', 'section.footnotes ol', 'section[role="doc-endnotes"]',
|
|
272
|
+
'section[role="doc-footnotes"]', 'section[role="doc-bibliography"]',
|
|
273
|
+
'ul.footnotes-list', '#footnotes',
|
|
274
|
+
];
|
|
275
|
+
|
|
276
|
+
export const PARTIAL_MATCH_ATTRIBUTES = [
|
|
277
|
+
'class', 'id', 'data-test', 'data-testid', 'data-test-id', 'data-qa', 'data-cy',
|
|
278
|
+
];
|
|
279
|
+
|
|
280
|
+
export const VIDEO_EMBED_PATTERNS = [
|
|
281
|
+
'youtube.com', 'youtu.be', 'vimeo.com', 'twitter.com', 'x.com', 'datawrapper.de',
|
|
282
|
+
];
|
|
283
|
+
|
|
284
|
+
export const ALLOWED_ATTRIBUTES = new Set([
|
|
285
|
+
'alt', 'allow', 'allowfullscreen', 'aria-label', 'checked',
|
|
286
|
+
'colspan', 'controls', 'data-latex', 'data-src', 'data-srcset',
|
|
287
|
+
'data-callout', 'data-callout-title', 'data-lang', 'dir',
|
|
288
|
+
'frameborder', 'headers', 'height', 'href', 'kind', 'label',
|
|
289
|
+
'lang', 'role', 'rowspan', 'src', 'srclang', 'srcset',
|
|
290
|
+
'title', 'type', 'width', 'datetime',
|
|
291
|
+
]);
|
|
292
|
+
|
|
293
|
+
export const ALLOWED_EMPTY_TAGS = new Set([
|
|
294
|
+
'area', 'audio', 'br', 'col', 'embed', 'figure', 'hr',
|
|
295
|
+
'iframe', 'img', 'input', 'link', 'meta', 'picture',
|
|
296
|
+
'source', 'svg', 'td', 'th', 'track', 'video', 'wbr',
|
|
297
|
+
]);
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Find the main content element in a document.
|
|
3
|
+
* Ported from defuddle-php/src/Content/ContentFinder.php
|
|
4
|
+
*/
|
|
5
|
+
import { ENTRY_POINT_SELECTORS, BLOCK_ELEMENTS_SELECTOR } from './constants.js';
|
|
6
|
+
import { scoreElement, findBestElement } from './content-scorer.js';
|
|
7
|
+
import { countWords, isAncestorOrSelf } from './utils.js';
|
|
8
|
+
|
|
9
|
+
/**
 * Locate the element most likely to hold the page's main content.
 *
 * Every ENTRY_POINT_SELECTORS match becomes a candidate scored as
 * (selector rank bonus) + scoreElement(el), where the rank bonus is
 * (list length - selector index) * 40. The top-scoring candidate wins unless
 * it contains a single, sufficiently long (> 50 words) candidate that matched
 * an earlier, more specific selector, in which case that descendant is used.
 *
 * @param {Document} doc
 * @returns {Element|null} the chosen element; falls back to findByScoring
 *   when no selector matches anything
 */
export function findMainContent(doc) {
  const total = ENTRY_POINT_SELECTORS.length;
  const ranked = [];

  ENTRY_POINT_SELECTORS.forEach((selector, position) => {
    let matches;
    try {
      matches = doc.querySelectorAll(selector);
    } catch (err) {
      // A selector that makes querySelectorAll throw is simply skipped.
      return;
    }

    const rankBonus = (total - position) * 40;
    for (const node of matches) {
      ranked.push({
        element: node,
        score: rankBonus + scoreElement(node),
        selectorIndex: position,
      });
    }
  });

  if (ranked.length === 0) {
    return findByScoring(doc);
  }

  // Highest score first.
  ranked.sort((left, right) => right.score - left.score);
  const [leader] = ranked;

  // <body> being the sole candidate suggests an old table-based layout.
  const onlyBodyMatched =
    ranked.length === 1 && leader.element.tagName.toLowerCase() === 'body';
  if (onlyBodyMatched) {
    const cell = findTableBasedContent(doc);
    if (cell) return cell;
  }

  // Prefer a more specific descendant over the generic winner.
  const insideLeader = (entry) => leader.element.contains(entry.element);
  let chosen = leader;
  for (const entry of ranked) {
    const eligible =
      entry !== leader &&
      entry.selectorIndex < chosen.selectorIndex &&
      insideLeader(entry) &&
      countWords(entry.element.textContent || '') > 50;
    if (!eligible) continue;

    // Several matches for the same selector inside the winner look like a
    // listing page, so none of them is promoted.
    const peers = ranked.filter(
      (other) => other.selectorIndex === entry.selectorIndex && insideLeader(other)
    );
    if (peers.length <= 1) {
      chosen = entry;
    }
  }

  return chosen.element;
}
|
|
77
|
+
|
|
78
|
+
/**
 * Fallback: score every block-level container once and return the first one
 * with the highest strictly positive score.
 *
 * Each element is passed to scoreElement exactly once; scoring reads the
 * whole subtree, so repeated scoring of the same element is avoided.
 *
 * @param {Document} doc
 * @returns {Element|null} best-scoring element, else doc.body, else null
 */
function findByScoring(doc) {
  let best = null;
  let bestScore = 0; // only scores > 0 qualify

  for (const el of Array.from(doc.querySelectorAll(BLOCK_ELEMENTS_SELECTOR))) {
    const score = scoreElement(el);
    // Strict comparison keeps the earliest element on ties.
    if (score > bestScore) {
      bestScore = score;
      best = el;
    }
  }

  return best || doc.body || null;
}
|
|
95
|
+
|
|
96
|
+
/**
 * For old-style table-based layouts, find the highest-scoring <td>.
 *
 * The document counts as table-laid-out when at least one <table> has a
 * width attribute above 400, a class containing "content" or "article"
 * (case-insensitive), or align="center".
 *
 * @param {Document} doc
 * @returns {Element|null} result of findBestElement over all <td> cells, or
 *   null when no table looks like a layout table
 */
function findTableBasedContent(doc) {
  const tables = Array.from(doc.getElementsByTagName('table'));

  const hasTableLayout = tables.some((table) => {
    // Explicit radix: without it a value such as "0x258" is read as hex.
    const width = Number.parseInt(table.getAttribute('width') || '0', 10);
    const cls = (table.className || '').toLowerCase();
    return (
      width > 400 ||
      cls.includes('content') ||
      cls.includes('article') ||
      table.getAttribute('align') === 'center'
    );
  });

  if (!hasTableLayout) return null;

  const cells = Array.from(doc.getElementsByTagName('td'));
  return findBestElement(cells);
}
|
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Content scoring algorithm for defuddle-js.
|
|
3
|
+
* Ported from defuddle/src/removals/scoring.ts and defuddle-php.
|
|
4
|
+
*/
|
|
5
|
+
import { CONTENT_INDICATORS, NAV_INDICATORS, SOCIAL_MEDIA_PATTERNS } from './constants.js';
|
|
6
|
+
import { countWords, getClassId, isAncestorOrSelf } from './utils.js';
|
|
7
|
+
|
|
8
|
+
/**
 * Rate how likely an element is to be the main content container.
 *
 * Base score = word count + 10 per descendant <p> + 1 per comma
 * + 15 if the lowercased class/id string contains any CONTENT_INDICATORS
 * entry. The base is then scaled by (1 - link density), floored at 0.5.
 *
 * @param {Element} el
 * @returns {number} higher means more content-like
 */
export function scoreElement(el) {
  const content = el.textContent || '';

  const wordTotal = countWords(content);
  // Paragraphs are a strong positive signal.
  const paragraphBonus = el.getElementsByTagName('p').length * 10;
  // Commas suggest prose.
  const commaBonus = content.split(',').length - 1;

  const identity = getClassId(el).toLowerCase();
  const looksLikeContent = [...CONTENT_INDICATORS].some((word) => identity.includes(word));
  const indicatorBonus = looksLikeContent ? 15 : 0;

  const base = wordTotal + paragraphBonus + commaBonus + indicatorBonus;

  // Link-heavy blocks are discounted, but never by more than half.
  const linkFactor = Math.max(0.5, 1 - getLinkDensity(el));
  return base * linkFactor;
}
|
|
40
|
+
|
|
41
|
+
/**
 * Rate an element as a potential non-content block.
 *
 * Starts at 0 and only ever subtracts:
 *   - 10 for each NAV_INDICATORS entry found in the lowercased class/id string
 *   - 15 when link density exceeds 0.5
 *   - 15 when most of its links point at social networks
 *   - 10 when it looks like a short author/date byline
 *   - 15 when it looks like a card grid (listing page)
 *
 * @param {Element} el
 * @returns {number} 0 or negative; callers remove elements scoring below 0
 */
export function scoreNonContentBlock(el) {
  const identity = getClassId(el).toLowerCase();
  const navHits = NAV_INDICATORS.filter((word) => identity.includes(word)).length;

  const penalties = [
    navHits * 10,
    getLinkDensity(el) > 0.5 ? 15 : 0,
    hasSocialMediaLinks(el) ? 15 : 0,
    isAuthorDateByline(el) ? 10 : 0,
    isCardGrid(el) ? 15 : 0,
  ];

  return penalties.reduce((running, penalty) => running - penalty, 0);
}
|
|
70
|
+
|
|
71
|
+
/**
 * Score every block-level container under the document body with
 * scoreNonContentBlock and detach those scoring below 0.
 *
 * Elements for which isAncestorOrSelf(el, mainContent) is true are never
 * scored or removed (intended to protect mainContent and its ancestors).
 * All candidates are scored before anything is detached, so earlier
 * removals cannot influence later scores.
 *
 * @param {Document} doc
 * @param {Element} mainContent element that must survive
 * @param {boolean} debug when true, one record per removed element is
 *   returned ({ step, score, tag, class }); otherwise the array stays empty
 * @returns {Array} debug records (empty unless debug is true)
 */
export function scoreAndRemove(doc, mainContent, debug = false) {
  const removed = [];
  const body = doc.body || doc.documentElement;
  if (!body) return removed;

  const tags = ['div', 'section', 'article', 'main', 'aside', 'header', 'footer', 'nav'];
  const candidates = [];
  for (const tag of tags) {
    // Array.from snapshots the collection. Elements are appended one at a
    // time: push(...collection) passes every element as a call argument and
    // throws RangeError on very large documents.
    for (const el of Array.from(body.getElementsByTagName(tag))) {
      candidates.push(el);
    }
  }

  const toRemove = [];
  for (const el of candidates) {
    if (isAncestorOrSelf(el, mainContent)) continue;

    const score = scoreNonContentBlock(el);
    if (score < 0) {
      toRemove.push(el);
      if (debug) {
        removed.push({ step: 'scoreAndRemove', score, tag: el.tagName, class: el.className });
      }
    }
  }

  for (const el of toRemove) {
    // An element may already be detached along with a removed ancestor.
    if (el.parentNode) el.parentNode.removeChild(el);
  }

  return removed;
}
|
|
108
|
+
|
|
109
|
+
/**
 * Pick the element with the highest scoreElement value.
 * Ties keep the earliest element; an empty input yields null.
 *
 * @param {Element[]} elements
 * @returns {Element|null}
 */
export function findBestElement(elements) {
  const start = { element: null, score: -Infinity };

  const leader = [...elements].reduce((current, element) => {
    const score = scoreElement(element);
    // Strictly greater only, so the first of several equal scores is kept.
    return score > current.score ? { element, score } : current;
  }, start);

  return leader.element;
}
|
|
126
|
+
|
|
127
|
+
/**
 * Ratio of anchor text to all text inside an element.
 *
 * Lengths are measured on trimmed textContent. An element without any text
 * has density 0, and the ratio is capped at 1.
 *
 * @param {Element} el
 * @returns {number} 0.0 to 1.0
 */
export function getLinkDensity(el) {
  const trimmedLength = (node) => (node.textContent || '').trim().length;

  const overall = trimmedLength(el);
  if (overall === 0) return 0;

  const linked = [...el.getElementsByTagName('a')].reduce(
    (sum, anchor) => sum + trimmedLength(anchor),
    0
  );

  return Math.min(1, linked / overall);
}
|
|
143
|
+
|
|
144
|
+
/**
 * Heuristic for a card grid (listing page): at least three h2/h3/h4
 * headings, at least two images, fewer than 500 words overall and fewer
 * than 20 words per heading on average.
 *
 * @param {Element} el
 * @returns {boolean}
 */
export function isCardGrid(el) {
  const headingCount = el.querySelectorAll('h2,h3,h4').length;
  if (headingCount < 3) return false;

  if (el.getElementsByTagName('img').length < 2) return false;

  const wordCount = countWords(el.textContent || '');
  return wordCount < 500 && headingCount > 0 && wordCount / headingCount < 20;
}
|
|
162
|
+
|
|
163
|
+
/**
 * Detect a short byline-like element: fewer than 15 words whose trimmed text
 * either starts with "by " or contains something date-like (month name plus
 * day, any four-digit run, or a d/m style pair).
 *
 * Either signal alone is sufficient; author and date need not both appear.
 *
 * @param {Element} el
 * @returns {boolean}
 */
export function isAuthorDateByline(el) {
  const snippet = (el.textContent || '').trim();
  if (countWords(snippet) >= 15) return false;

  const bylinePattern = /^by\s+/i;
  if (bylinePattern.test(snippet)) return true;

  const datePattern = /\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\.?\s+\d{1,2}|\d{4}|\b\d{1,2}\/\d{1,2}/i;
  return datePattern.test(snippet);
}
|
|
178
|
+
|
|
179
|
+
/**
 * Tell whether an element's links are dominated by social-network URLs:
 * true when more than half of its anchors have an href (compared in
 * lowercase) containing a SOCIAL_MEDIA_PATTERNS entry.
 *
 * @param {Element} el
 * @returns {boolean}
 */
export function hasSocialMediaLinks(el) {
  const anchors = Array.from(el.getElementsByTagName('a'));
  if (anchors.length === 0) return false;

  let socialCount = 0;
  for (const anchor of anchors) {
    const target = (anchor.getAttribute('href') || '').toLowerCase();
    if (SOCIAL_MEDIA_PATTERNS.some((domain) => target.includes(domain))) {
      socialCount += 1;
    }
  }

  return socialCount > 0 && socialCount / anchors.length > 0.5;
}
|