defuddle-js 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,252 @@
1
+ /**
2
+ * Defuddle - extract main content and metadata from HTML pages.
3
+ * Works in browser (via <script> tag) and Node.js.
4
+ *
5
+ * Browser usage:
6
+ * const result = new Defuddle(document).parse();
7
+ * // or with an HTML string:
8
+ * const doc = new DOMParser().parseFromString(html, 'text/html');
9
+ * const result = new Defuddle(doc, { url: 'https://example.com' }).parse();
10
+ *
11
+ * Node.js usage (with linkedom or jsdom):
12
+ * import { parseHTML } from 'linkedom';
13
+ * const { document } = parseHTML(html);
14
+ * const result = new Defuddle(document, { url }).parse();
15
+ */
16
+ import { extractSchemaOrg } from './schema-org.js';
17
+ import { collectMetaTags, extractMetadata } from './metadata.js';
18
+ import { findMainContent } from './content-finder.js';
19
+ import { scoreAndRemove } from './content-scorer.js';
20
+ import { removeHiddenElements } from './removals/hidden.js';
21
+ import { removeExact, removePartial } from './removals/selector-remover.js';
22
+ import { removeSmallImages } from './removals/small-images.js';
23
+ import { removeByContentPattern } from './removals/content-patterns.js';
24
+ import { resolveUrls } from './url-resolver.js';
25
+ import { standardize } from './standardizer.js';
26
+ import { countHtmlWords } from './utils.js';
27
+
28
export class Defuddle {
  /**
   * @param {Document} doc A parsed DOM Document
   * @param {object} [options] Defaults merged into every parse() call
   * @param {string} [options.url] Page URL for relative URL resolution and domain extraction
   * @param {boolean} [options.debug] Forwarded to the removal steps
   */
  constructor(doc, options = {}) {
    this.doc = doc;
    this.url = options.url || null;
    this.options = options;
    // Filled in on the first _parseInternal() call and reused by every retry.
    this._schemaOrgData = undefined;
    this._metaTags = undefined;
    this._metadata = undefined;
  }

  /**
   * Parse the document and return a DefuddleResult.
   *
   * Runs one normal pass, then up to three progressively more lenient
   * passes when the extracted word count stays low.
   *
   * @param {object} [opts] Override default options for this parse call
   * @returns {DefuddleResult}
   */
  parse(opts = {}) {
    const merged = { ...this.options, ...opts };
    let best = this._parseInternal(merged);

    // Re-run with the given overrides and keep the candidate if it is accepted.
    const attempt = (overrides, isBetter) => {
      const candidate = this._parseInternal({ ...merged, ...overrides });
      if (isBetter(candidate.wordCount, best.wordCount)) best = candidate;
    };
    const moreThanDouble = (next, current) => next > current * 2;
    const anyGain = (next, current) => next > current;

    // Retry 1: too little content → disable partial selectors
    if (best.wordCount < 200) {
      attempt({ removePartialSelectors: false }, moreThanDouble);
    }

    // Retry 2: still too little → disable hidden removal
    if (best.wordCount < 50) {
      attempt({ removeHidden: false }, moreThanDouble);
    }

    // Retry 3: index/listing page → disable scoring and patterns
    if (best.wordCount < 50) {
      attempt(
        {
          scoreContent: false,
          removePartialSelectors: false,
          removeContentPatterns: false,
        },
        anyGain,
      );
    }

    return best;
  }

  /**
   * Single extraction pass. Works on a deep clone of the document so the
   * original stays untouched between attempts.
   *
   * @private
   * @param {object} [opts] Pipeline switches (all default to enabled) plus
   *   optional `contentSelector` and `debug`
   * @returns {DefuddleResult}
   */
  _parseInternal(opts = {}) {
    const startTime = Date.now();

    const {
      removeExactSelectors = true,
      removePartialSelectors = true,
      removeHidden = true,
      removeSmallImages: doSmallImages = true,
      scoreContent = true,
      removeContentPatterns = true,
      standardizeContent = true,
      debug = false,
    } = opts;

    const source = this.doc;
    if (!source || !source.documentElement) {
      return this._emptyResult(startTime);
    }

    // Schema.org data, meta tags and metadata are read from the untouched
    // document once and shared across retries.
    if (this._schemaOrgData === undefined) {
      this._schemaOrgData = extractSchemaOrg(source);
    }
    if (this._metaTags === undefined) {
      this._metaTags = collectMetaTags(source);
    }
    if (this._metadata === undefined) {
      this._metadata = extractMetadata(source, this.url, this._schemaOrgData, this._metaTags);
    }

    // Everything below mutates the DOM, so operate on a copy.
    const clone = source.cloneNode(true);

    // A caller-supplied selector wins; otherwise fall back to detection.
    let mainContent = null;
    if (opts.contentSelector) {
      try {
        mainContent = clone.querySelector(opts.contentSelector);
      } catch (e) {
        // A failing selector lookup is ignored; detection below takes over.
      }
    }
    mainContent = mainContent || findMainContent(clone);

    if (!mainContent) {
      const bodyHtml = clone.body ? clone.body.innerHTML : '';
      return this._buildResult(bodyHtml, startTime);
    }

    // Removal pipeline, then standardization, then URL resolution — in this order.
    const steps = [
      [doSmallImages, () => removeSmallImages(clone, debug)],
      [removeHidden, () => removeHiddenElements(clone, debug)],
      [removeExactSelectors, () => removeExact(clone, mainContent, debug)],
      [removePartialSelectors, () => removePartial(clone, mainContent, debug)],
      [scoreContent, () => scoreAndRemove(clone, mainContent, debug)],
      [removeContentPatterns, () => removeByContentPattern(mainContent, debug, this.url || '')],
      [standardizeContent, () => standardize(mainContent)],
      [this.url, () => resolveUrls(mainContent, clone, this.url)],
    ];
    for (const [enabled, run] of steps) {
      if (enabled) run();
    }

    return this._buildResult(mainContent.outerHTML, startTime);
  }

  /**
   * Wrap extracted HTML together with the cached metadata in a DefuddleResult.
   *
   * @private
   * @param {string} content Extracted HTML
   * @param {number} startTime `Date.now()` value taken when the pass started
   * @returns {DefuddleResult}
   */
  _buildResult(content, startTime) {
    const metadata = this._metadata || {};
    const wordCount = countHtmlWords(content);
    return new DefuddleResult({
      content,
      ...metadata,
      schemaOrgData: this._schemaOrgData || null,
      metaTags: this._metaTags || [],
      wordCount,
      parseTime: Date.now() - startTime,
    });
  }

  /**
   * Result used when there is no usable document. Only `domain` (derived
   * from the configured URL, without a leading "www.") and `parseTime`
   * carry information.
   *
   * @private
   * @param {number} startTime `Date.now()` value taken when the pass started
   * @returns {DefuddleResult}
   */
  _emptyResult(startTime) {
    let domain = '';
    if (this.url) {
      try {
        const { hostname } = new URL(this.url);
        domain = hostname.startsWith('www.') ? hostname.slice(4) : hostname;
      } catch (e) {
        // Unparseable URL: leave the domain empty.
      }
    }

    return new DefuddleResult({
      content: '',
      title: '',
      description: '',
      author: '',
      published: '',
      site: '',
      domain,
      favicon: '',
      image: '',
      language: '',
      schemaOrgData: null,
      metaTags: [],
      wordCount: 0,
      parseTime: Date.now() - startTime,
    });
  }

  /**
   * Convenience static method: parse an HTML string.
   * In browser: uses DOMParser automatically.
   * In Node.js: requires passing a `parseHtml` function that returns a Document.
   *
   * @param {string|Document} input HTML string or existing Document
   * @param {object} [options]
   * @param {string} [options.url]
   * @param {Function} [options.parseHtml] Custom HTML parser: (html) => Document
   * @returns {DefuddleResult}
   * @throws {Error} When given a string and neither `parseHtml` nor a global
   *   DOMParser is available
   */
  static parse(input, options = {}) {
    // Anything that is not a string is passed through as the document.
    if (typeof input !== 'string') {
      return new Defuddle(input, options).parse();
    }

    let doc;
    if (options.parseHtml) {
      doc = options.parseHtml(input);
    } else if (typeof DOMParser !== 'undefined') {
      // Browser environment
      doc = new DOMParser().parseFromString(input, 'text/html');
    } else {
      throw new Error(
        'Defuddle.parse() requires a DOM environment. ' +
        'In Node.js, pass a parseHtml function: ' +
        'Defuddle.parse(html, { parseHtml: html => require("linkedom").parseHTML(html).document })'
      );
    }

    return new Defuddle(doc, options).parse();
  }
}
212
+
213
/**
 * Result object returned by Defuddle.parse().
 *
 * Every field is normalized on construction: missing or falsy text fields
 * become '', counters become 0, `schemaOrgData` becomes null and `metaTags`
 * becomes [].
 */
export class DefuddleResult {
  /**
   * @param {object} data Raw field values; any of them may be absent
   */
  constructor(data) {
    const text = (value) => value || '';
    const count = (value) => value || 0;

    this.content = text(data.content);
    this.title = text(data.title);
    this.description = text(data.description);
    this.author = text(data.author);
    this.published = text(data.published);
    this.site = text(data.site);
    this.domain = text(data.domain);
    this.favicon = text(data.favicon);
    this.image = text(data.image);
    this.language = text(data.language);
    this.wordCount = count(data.wordCount);
    this.parseTime = count(data.parseTime);
    this.schemaOrgData = data.schemaOrgData || null;
    this.metaTags = data.metaTags || [];
  }

  /**
   * Plain-object snapshot of the result, used by JSON.stringify.
   * @returns {object}
   */
  toJSON() {
    const {
      content,
      title,
      description,
      author,
      published,
      site,
      domain,
      favicon,
      image,
      language,
      wordCount,
      parseTime,
      schemaOrgData,
      metaTags,
    } = this;

    return {
      content,
      title,
      description,
      author,
      published,
      site,
      domain,
      favicon,
      image,
      language,
      wordCount,
      parseTime,
      schemaOrgData,
      metaTags,
    };
  }
}
package/src/index.js ADDED
@@ -0,0 +1 @@
1
+ export { Defuddle, DefuddleResult } from './defuddle.js';
@@ -0,0 +1,371 @@
1
+ /**
2
+ * Metadata extraction for defuddle-js.
3
+ * Cascading: OG → Twitter Card → Schema.org → DOM heuristics
4
+ */
5
+ import { extractSchemaOrg, getSchemaProperty } from './schema-org.js';
6
+ import { resolveUrl } from './url-resolver.js';
7
+
8
+ const ARTICLE_TYPES = ['Article', 'NewsArticle', 'BlogPosting', 'WebPage'];
9
+ const DATE_RE = /\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\b/i;
10
+
11
/**
 * Collect all <meta> tags that carry a non-empty `content` attribute.
 *
 * `name` and `property` are null when the corresponding attribute is
 * missing or empty; `content` is passed through decodeHtmlEntities.
 *
 * @param {Document} doc
 * @returns {Array<{name:string|null,property:string|null,content:string}>}
 */
export function collectMetaTags(doc) {
  return Array.from(doc.querySelectorAll('meta')).flatMap((el) => {
    const rawContent = el.getAttribute('content');
    // Tags without usable content are skipped entirely.
    if (!rawContent) return [];
    return [
      {
        name: el.getAttribute('name') || null,
        property: el.getAttribute('property') || null,
        content: decodeHtmlEntities(rawContent),
      },
    ];
  });
}
29
+
30
/**
 * Extract all metadata from a document.
 *
 * Builds two lookup helpers (meta tags and schema.org properties) and
 * hands them to the per-field extractors. The site name is resolved first
 * because the title extractor uses it.
 *
 * @param {Document} doc
 * @param {string|null} url
 * @param {Array|null} schemaOrgData
 * @param {Array} metaTags
 * @returns {object} title, description, author, published, site, domain,
 *   favicon, image and language
 */
export function extractMetadata(doc, url, schemaOrgData, metaTags) {
  // First meta tag whose `attr` equals `value` (case-insensitive); null when
  // none matches or the match has no content.
  const getMeta = (value, attr) => {
    for (const tag of metaTags) {
      const candidate = tag[attr];
      if (!candidate) continue;
      if (candidate.toLowerCase() === value.toLowerCase()) {
        return tag.content || null;
      }
    }
    return null;
  };

  const getSchema = (property) => getSchemaProperty(schemaOrgData, ARTICLE_TYPES, property);

  const site = extractSiteName(doc, metaTags, schemaOrgData, getMeta);
  const title = extractTitle(doc, metaTags, schemaOrgData, getMeta, getSchema, site);
  const description = extractDescription(metaTags, getSchema, getMeta);
  const author = extractAuthor(doc, metaTags, schemaOrgData, getMeta);
  const published = extractPublished(doc, metaTags, schemaOrgData, getMeta, getSchema);
  const domain = extractDomain(doc, url, getMeta);
  const favicon = extractFavicon(doc, url);
  const image = extractImage(doc, metaTags, getSchema, getMeta);
  const language = extractLanguage(doc, metaTags, getMeta);

  return { title, description, author, published, site, domain, favicon, image, language };
}
66
+
67
+ // ── Title ──────────────────────────────────────────────────────────────────
68
+
69
/**
 * Pick the page title from, in order: og:title, twitter:title, the schema
 * `headline`, a `title` meta tag, then the <title> element. The winner is
 * passed through cleanTitle together with the site name.
 */
function extractTitle(doc, metaTags, schemaOrgData, getMeta, getSchema, siteName) {
  const fromTitleElement = () => {
    const el = doc.querySelector('title');
    return el ? el.textContent.trim() : '';
  };

  const candidate =
    getMeta('og:title', 'property') ||
    getMeta('twitter:title', 'name') ||
    getSchema('headline') ||
    getMeta('title', 'name') ||
    fromTitleElement() ||
    '';

  return cleanTitle(candidate, siteName);
}
78
+
79
/**
 * Strip the site name from a title such as "Article | Site" or
 * "Site - Article".
 *
 * For each known separator present in the title, the title is split at the
 * separator's first occurrence; if either half fuzzy-matches the site name
 * the other half is returned. Without a match the title is returned as is.
 */
function cleanTitle(title, siteName) {
  if (!title || !siteName) return title;

  for (const sep of ['|', ' / ', ' · ', ' – ', ' — ', ' - ', ': ']) {
    const at = title.indexOf(sep);
    if (at === -1) continue;

    const before = title.slice(0, at).trim();
    const after = title.slice(at + sep.length).trim();

    if (fuzzyMatch(after, siteName)) return before;
    if (fuzzyMatch(before, siteName)) return after;
  }

  return title;
}
94
+
95
/**
 * Loose comparison of two labels (e.g. a title segment and a site name).
 *
 * Both inputs are lowercased and stripped of everything except letters and
 * digits (Unicode-aware, so non-Latin text survives normalization). They
 * match when the normalized strings are equal, or when `b` normalizes to
 * more than two characters and one string contains the other.
 *
 * An input that normalizes to the empty string never matches: every string
 * contains '', so without this guard a punctuation-only or empty segment
 * would match any site name.
 *
 * @param {string} a
 * @param {string} b
 * @returns {boolean}
 */
function fuzzyMatch(a, b) {
  const normalize = (s) => String(s ?? '').toLowerCase().replace(/[^\p{L}\p{N}]/gu, '');
  const na = normalize(a);
  const nb = normalize(b);

  if (!na || !nb) return false;
  if (na === nb) return true;
  return nb.length > 2 && (na.includes(nb) || nb.includes(na));
}
102
+
103
+ // ── Description ────────────────────────────────────────────────────────────
104
+
105
/**
 * First non-empty description among og:description, twitter:description,
 * the `description` meta tag and the schema `description`; '' when none.
 */
function extractDescription(metaTags, getSchema, getMeta) {
  const sources = [
    () => getMeta('og:description', 'property'),
    () => getMeta('twitter:description', 'name'),
    () => getMeta('description', 'name'),
    () => getSchema('description'),
  ];

  for (const read of sources) {
    const value = read();
    if (value) return value;
  }
  return '';
}
112
+
113
+ // ── Author ─────────────────────────────────────────────────────────────────
114
+
115
/**
 * Resolve the author name. Sources are tried in order: author-style meta
 * tags, the schema.org author, short text (at most six words) of the first
 * element matching each author/byline selector, and finally a "By X" line
 * next to the first <h1>.
 */
function extractAuthor(doc, metaTags, schemaOrgData, getMeta) {
  const metaNames = ['author', 'sailthru.author', 'citation_author', 'dc.creator', 'byl'];
  for (const metaName of metaNames) {
    const fromMeta = getMeta(metaName, 'name');
    if (fromMeta) return fromMeta;
  }

  // Schema.org author
  const fromSchema = getSchemaAuthor(schemaOrgData);
  if (fromSchema) return fromSchema;

  // DOM selectors
  const domSelectors = [
    '[class*="author"]:not([class*="author-bio"])',
    '[rel="author"]',
    '[class*="byline"]',
    '[itemprop="author"]',
  ];
  for (const selector of domSelectors) {
    const node = doc.querySelector(selector);
    if (!node) continue;

    const text = (node.textContent || '').trim();
    if (!text) continue;

    // Longer text is skipped rather than returned as a name.
    const wordCount = text.split(/\s+/).filter(Boolean).length;
    if (wordCount <= 6) return stripByPrefix(text);
  }

  // "By X" near h1
  return extractBylineNearH1(doc);
}
144
+
145
/**
 * Find the first usable author name in a list of schema.org items.
 *
 * `author` may be a plain string, an object with a `name`, or an array
 * mixing both forms. Arrays are scanned in order, so plain-string entries
 * and arrays whose first entry has no name are handled too.
 *
 * @param {Array|null} schemaOrgData
 * @returns {string} The author name, or '' when none is found
 */
function getSchemaAuthor(schemaOrgData) {
  if (!schemaOrgData) return '';

  // Name carried by a single author entry, or '' if it has none.
  const nameOf = (entry) => {
    if (typeof entry === 'string') return entry;
    if (entry && typeof entry === 'object' && entry.name) return String(entry.name);
    return '';
  };

  for (const item of schemaOrgData) {
    if (!item || typeof item !== 'object') continue;

    const { author } = item;
    if (!author) continue;

    const entries = Array.isArray(author) ? author : [author];
    for (const entry of entries) {
      const name = nameOf(entry);
      if (name) return name;
    }
  }

  return '';
}
159
+
160
/**
 * Look for a "By X" line among the children of the first <h1>'s parent.
 *
 * Only the first five children other than the h1 itself are inspected. A
 * child qualifies when its trimmed text starts with "by" followed by
 * whitespace and holds at most eight words in total.
 *
 * @param {Document} doc
 * @returns {string} The text after "by", or '' when nothing qualifies
 */
function extractBylineNearH1(doc) {
  const heading = doc.querySelector('h1');
  const container = heading ? heading.parentElement : null;
  if (!container) return '';

  const neighbours = Array.from(container.children)
    .filter((el) => el !== heading)
    .slice(0, 5);

  for (const el of neighbours) {
    const text = (el.textContent || '').trim();
    const byline = text.match(/^by\s+(.+)/i);
    if (!byline) continue;

    const wordCount = text.split(/\s+/).filter(Boolean).length;
    if (wordCount <= 8) return byline[1].trim();
  }

  return '';
}
177
+
178
/**
 * Drop a leading "by " (any case) from a byline; other text is returned
 * unchanged.
 */
function stripByPrefix(text) {
  const byline = text.match(/^by\s+(.+)/i);
  if (!byline) return text;
  return byline[1].trim();
}
182
+
183
+ // ── Published ──────────────────────────────────────────────────────────────
184
+
185
/**
 * Resolve the publication date. Sources are tried in order: date-style
 * meta tags, the schema `datePublished`, the first <time datetime>
 * element, and finally any meta tag whose content contains a month name
 * and parses as a date. Returns '' when nothing is found.
 */
function extractPublished(doc, metaTags, schemaOrgData, getMeta, getSchema) {
  const metaSources = [
    ['article:published_time', 'property'],
    ['article:published', 'property'],
    ['date', 'name'],
    ['citation_date', 'name'],
    ['DC.date', 'name'],
    ['pubdate', 'name'],
  ];
  for (const [key, attr] of metaSources) {
    const value = getMeta(key, attr);
    if (value) return normalizeDate(value);
  }

  const schemaDate = getSchema('datePublished');
  if (schemaDate) return normalizeDate(String(schemaDate));

  // <time datetime>
  const timeEl = doc.querySelector('time[datetime]');
  const datetime = timeEl ? timeEl.getAttribute('datetime') : null;
  if (datetime) return normalizeDate(datetime);

  // Natural language dates in meta content
  for (const tag of metaTags) {
    if (!tag.content || !DATE_RE.test(tag.content)) continue;
    const parsed = parseNaturalDate(tag.content);
    if (parsed) return parsed;
  }

  return '';
}
216
+
217
/**
 * Turn a raw date string into an ISO 8601 timestamp where possible.
 *
 * Strings starting with YYYY-MM-DD are parsed with `Date`; other strings
 * go through parseNaturalDate. When parsing fails the trimmed input is
 * returned unchanged; blank input yields ''.
 */
function normalizeDate(raw) {
  const value = raw.trim();
  if (!value) return '';

  // Already ISO-ish
  if (/^\d{4}-\d{2}-\d{2}/.test(value)) {
    const parsed = new Date(value);
    // toISOString() throws on an invalid date, so check validity first.
    return Number.isNaN(parsed.getTime()) ? value : parsed.toISOString();
  }

  return parseNaturalDate(value) || value;
}
232
+
233
/**
 * Parse a free-form date string with the built-in `Date` parser.
 *
 * @param {string} input
 * @returns {string|null} ISO 8601 timestamp, or null when the string does
 *   not parse or the (local) year is 1900 or earlier
 */
function parseNaturalDate(input) {
  const parsed = new Date(input.trim());
  const isValid = !Number.isNaN(parsed.getTime());
  if (isValid && parsed.getFullYear() > 1900) {
    return parsed.toISOString();
  }
  return null;
}
241
+
242
+ // ── Image ──────────────────────────────────────────────────────────────────
243
+
244
/**
 * Resolve the page's representative image URL.
 *
 * Sources are tried in order:
 *   1. og:image / og:image:url / twitter:image / twitter:image:src meta tags
 *   2. the schema `image`, which may be a URL string, an object with a
 *      `url`, or an array of either (the first usable entry wins)
 *   3. the first <img src> in the body that is at least 200x100 according
 *      to its width/height attributes, or that declares no usable size
 *
 * @param {Document} doc
 * @param {Array} metaTags Unused; kept for a uniform extractor signature
 * @param {Function} getSchema (property) => schema value
 * @param {Function} getMeta (value, attr) => meta content or null
 * @returns {string} Image URL as found in the page, or ''
 */
function extractImage(doc, metaTags, getSchema, getMeta) {
  const metaImage =
    getMeta('og:image', 'property') ||
    getMeta('og:image:url', 'property') ||
    getMeta('twitter:image', 'name') ||
    getMeta('twitter:image:src', 'name') ||
    null;
  if (metaImage) return metaImage;

  // A single value and a list of values are handled the same way.
  const schemaImage = getSchema('image');
  const schemaCandidates = Array.isArray(schemaImage) ? schemaImage : [schemaImage];
  for (const candidate of schemaCandidates) {
    if (!candidate) continue;
    if (typeof candidate === 'string') return candidate;
    if (candidate.url) return String(candidate.url);
  }

  // First large image in body
  for (const img of doc.querySelectorAll('body img[src]')) {
    const width = Number.parseInt(img.getAttribute('width') || '0', 10);
    const height = Number.parseInt(img.getAttribute('height') || '0', 10);
    const isLargeEnough = width >= 200 && height >= 100;
    // Missing or non-numeric dimensions (0 / NaN) count as "size unknown".
    const hasNoSize = !width && !height;
    if (isLargeEnough || hasNoSize) {
      return img.getAttribute('src') || '';
    }
  }

  return '';
}
270
+
271
+ // ── Site name ──────────────────────────────────────────────────────────────
272
+
273
/**
 * Resolve the site name from og:site_name, then the schema `publisher`
 * (a string or an object with a `name`), then the trailing segment of the
 * <title> when a separator splits it and that segment is at most four
 * words long. Returns '' when nothing qualifies.
 */
function extractSiteName(doc, metaTags, schemaOrgData, getMeta) {
  const ogSiteName = getMeta('og:site_name', 'property');
  if (ogSiteName) return ogSiteName;

  const publisher = getSchemaProperty(schemaOrgData, ARTICLE_TYPES, 'publisher');
  if (publisher) {
    if (typeof publisher === 'string') return publisher;
    if (publisher.name) return String(publisher.name);
  }

  // Fallback from <title> using separators
  const titleEl = doc.querySelector('title');
  if (!titleEl) return '';

  const fullTitle = titleEl.textContent || '';
  for (const sep of ['|', ' / ', ' · ', ' – ', ' — ']) {
    if (!fullTitle.includes(sep)) continue;

    const tail = fullTitle.split(sep).pop().trim();
    const isShort = tail.split(/\s+/).length <= 4;
    if (tail && isShort) return tail;
  }

  return '';
}
298
+
299
+ // ── Domain ─────────────────────────────────────────────────────────────────
300
+
301
/**
 * Derive the page's host name (without a leading "www.") from the explicit
 * URL, else og:url, else the canonical <link>. Returns '' when no source is
 * available or the chosen value is not an absolute URL.
 */
function extractDomain(doc, url, getMeta) {
  const canonicalHref = () => {
    const link = doc.querySelector('link[rel="canonical"][href]');
    return link ? link.getAttribute('href') : null;
  };

  const pageUrl = url || getMeta('og:url', 'property') || canonicalHref();
  if (!pageUrl) return '';

  try {
    const { hostname } = new URL(pageUrl);
    return hostname.startsWith('www.') ? hostname.slice(4) : hostname;
  } catch (e) {
    return '';
  }
}
322
+
323
+ // ── Favicon ────────────────────────────────────────────────────────────────
324
+
325
/**
 * Resolve the favicon URL: the href of a `shortcut icon` or `icon` <link>
 * (resolved against the page URL when one is known), otherwise
 * `<origin>/favicon.ico` for the page URL, otherwise ''.
 */
function extractFavicon(doc, url) {
  const relValues = ['shortcut icon', 'icon'];
  for (const rel of relValues) {
    const link = doc.querySelector(`link[rel="${rel}"][href]`);
    const href = link ? link.getAttribute('href') : null;
    if (!href) continue;
    return url ? resolveUrl(href, url) : href;
  }

  if (!url) return '';

  try {
    const { origin } = new URL(url);
    return `${origin}/favicon.ico`;
  } catch (e) {
    // Page URL is not absolute; there is nothing to derive a default from.
    return '';
  }
}
343
+
344
+ // ── Language ───────────────────────────────────────────────────────────────
345
+
346
/**
 * Resolve the document language from <html lang>, then og:locale (with its
 * first underscore turned into a hyphen), then a Content-Language
 * http-equiv meta tag. Returns '' when none is present.
 */
function extractLanguage(doc, metaTags, getMeta) {
  const root = doc.documentElement;
  const htmlLang = root ? root.getAttribute('lang') : null;
  if (htmlLang) return htmlLang;

  const ogLocale = getMeta('og:locale', 'property');
  if (ogLocale) return ogLocale.replace('_', '-');

  const httpEquiv = doc.querySelector('meta[http-equiv="Content-Language"]');
  if (!httpEquiv) return '';
  return httpEquiv.getAttribute('content') || '';
}
360
+
361
+ // ── Helpers ────────────────────────────────────────────────────────────────
362
+
363
/**
 * Decode the basic HTML entities in a string.
 *
 * Handles the named entities &amp; &lt; &gt; &quot; &apos; plus decimal
 * (&#39;) and hexadecimal (&#x27;) character references. Decoding is done
 * in a single pass so already-decoded output is never decoded again:
 * "&amp;lt;" becomes "&lt;", not "<". Unknown names and numeric references
 * outside the valid code point range are left untouched.
 *
 * @param {string} text
 * @returns {string}
 */
function decodeHtmlEntities(text) {
  const named = { amp: '&', lt: '<', gt: '>', quot: '"', apos: "'" };
  const entityPattern = /&(?:(amp|lt|gt|quot|apos)|#(\d+)|#[xX]([0-9a-fA-F]+));/g;

  return text.replace(entityPattern, (entity, name, decimal, hex) => {
    if (name) return named[name];

    const codePoint = decimal ? Number.parseInt(decimal, 10) : Number.parseInt(hex, 16);
    const isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff;
    // String.fromCodePoint throws above U+10FFFF; NUL and lone surrogates
    // are not meaningful characters, so keep the reference as written.
    if (codePoint === 0 || codePoint > 0x10ffff || isSurrogate) return entity;
    return String.fromCodePoint(codePoint);
  });
}