defuddle-js 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +102 -0
- package/dist/defuddle.cjs.js +1950 -0
- package/dist/defuddle.umd.js +1968 -0
- package/package.json +41 -0
- package/src/constants.js +297 -0
- package/src/content-finder.js +116 -0
- package/src/content-scorer.js +194 -0
- package/src/defuddle.js +252 -0
- package/src/index.js +1 -0
- package/src/metadata.js +371 -0
- package/src/removals/content-patterns.js +174 -0
- package/src/removals/hidden.js +51 -0
- package/src/removals/selector-remover.js +137 -0
- package/src/removals/small-images.js +45 -0
- package/src/schema-org.js +102 -0
- package/src/standardizer.js +116 -0
- package/src/url-resolver.js +101 -0
- package/src/utils.js +95 -0
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* URL resolution utilities for defuddle-js.
|
|
3
|
+
* Resolves relative URLs to absolute within DOM elements.
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
/**
 * Rewrite the URL-bearing attributes found beneath `root` as absolute URLs.
 *
 * The attributes are processed in a fixed order: `href`, `src`, `srcset`,
 * then `poster`. The base for resolution is the page URL, unless the document
 * contains a `<base href>` whose value resolves to a non-empty string, in
 * which case that resolved value is used instead. When no page URL is given
 * the function returns without touching anything.
 *
 * @param {Element} root Subtree whose matching descendants are updated in place.
 * @param {Document} doc Document that is searched for a `<base href>` element.
 * @param {string|null} url Page URL
 */
export function resolveUrls(root, doc, url) {
  if (!url) return;

  // A <base href> wins over the page URL, but only when it resolves to something usable.
  const baseEl = doc.querySelector('base[href]');
  const declaredBase = baseEl ? baseEl.getAttribute('href') : null;
  const baseUrl = (declaredBase && resolveUrl(declaredBase, url)) || url;

  // One entry per attribute; array order is the processing order.
  const rules = [
    // Fragment-only links are left exactly as authored.
    { selector: '[href]', attr: 'href', resolve: resolveUrl, skip: (value) => value.startsWith('#') },
    { selector: '[src]', attr: 'src', resolve: resolveUrl },
    { selector: '[srcset]', attr: 'srcset', resolve: resolveSrcset },
    { selector: '[poster]', attr: 'poster', resolve: resolveUrl },
  ];

  for (const { selector, attr, resolve, skip } of rules) {
    for (const el of root.querySelectorAll(selector)) {
      const value = el.getAttribute(attr);
      if (!value) continue;
      if (skip && skip(value)) continue;
      el.setAttribute(attr, resolve(value, baseUrl));
    }
  }
}
|
|
49
|
+
|
|
50
|
+
/**
 * Turn a possibly relative URL into an absolute one.
 *
 * The input is trimmed first. It is then returned (trimmed, otherwise
 * untouched) when it uses a `javascript:`, `data:` or `vbscript:` scheme,
 * when it already starts with `scheme://`, or when it is a bare `#fragment`.
 * Anything else is resolved against `base` with the `URL` constructor; if
 * that throws, the trimmed input is returned unchanged.
 *
 * @param {string} url URL to resolve.
 * @param {string} base Absolute URL used as the resolution base.
 * @returns {string} The absolute URL, or the trimmed input when it is left alone.
 */
export function resolveUrl(url, base) {
  const candidate = url.trim();

  const usesScriptLikeScheme = /^(javascript|data|vbscript):/i.test(candidate);
  const hasSchemeAndAuthority = /^[a-z][a-z0-9+\-.]*:\/\//i.test(candidate);
  const isFragmentOnly = candidate.startsWith('#');

  // None of these are resolved: they are either self-contained or page-local.
  if (usesScriptLikeScheme || hasSchemeAndAuthority || isFragmentOnly) {
    return candidate;
  }

  let absolute = candidate;
  try {
    absolute = new URL(candidate, base).href;
  } catch (err) {
    // Deliberate best effort: an unparsable URL or base keeps the trimmed input.
  }
  return absolute;
}
|
|
74
|
+
|
|
75
|
+
/**
 * Resolve every image-candidate URL in a `srcset` attribute value.
 *
 * The value is tokenised along the lines of the HTML "parse a srcset
 * attribute" algorithm instead of being split on commas or on descriptors:
 *
 * - a candidate URL is a run of non-whitespace characters, so commas inside
 *   a URL (for example `/w_100,h_100/a.jpg`) are preserved;
 * - a URL that ends in one or more commas is a candidate without
 *   descriptors, and the trailing commas are the separator;
 * - otherwise the descriptors run up to the next comma that is not inside
 *   parentheses.
 *
 * Candidates with and without descriptors can therefore be mixed freely and
 * none of them is dropped or merged into a neighbour. Descriptor text is
 * kept as written, apart from collapsing runs of whitespace to one space.
 *
 * @param {string} srcset Raw `srcset` attribute value.
 * @param {string} base Absolute URL that each candidate is resolved against.
 * @returns {string} The candidates, resolved and joined with `", "`; an empty
 *   string when the input contains no candidate.
 */
export function resolveSrcset(srcset, base) {
  const isSpace = (ch) => /\s/.test(ch);
  const end = srcset.length;
  const candidates = [];
  let pos = 0;

  while (pos < end) {
    // Skip the whitespace and commas that separate candidates.
    while (pos < end && (srcset[pos] === ',' || isSpace(srcset[pos]))) pos += 1;
    if (pos >= end) break;

    // The URL is everything up to the next whitespace character.
    const urlStart = pos;
    while (pos < end && !isSpace(srcset[pos])) pos += 1;
    let url = srcset.slice(urlStart, pos);
    let descriptors = '';

    if (url.endsWith(',')) {
      // "a.jpg, b.jpg": the trailing comma ends a descriptor-less candidate.
      url = url.replace(/,+$/, '');
    } else {
      // Descriptors extend to the next top-level comma (or the end of input).
      const descriptorStart = pos;
      let depth = 0;
      while (pos < end) {
        const ch = srcset[pos];
        if (ch === ',' && depth === 0) break;
        if (ch === '(') depth += 1;
        else if (ch === ')' && depth > 0) depth -= 1;
        pos += 1;
      }
      descriptors = srcset.slice(descriptorStart, pos).trim().replace(/\s+/g, ' ');
    }

    const resolved = resolveUrl(url, base);
    candidates.push(descriptors ? `${resolved} ${descriptors}` : resolved);
  }

  return candidates.join(', ');
}
|
package/src/utils.js
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Utility functions for defuddle-js
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
/**
 * Count the words in a piece of text, treating CJK characters specially.
 *
 * Every character in the Hiragana, Katakana, Hangul (U+AC00-U+D7FF) and CJK
 * Unified Ideographs (U+4E00-U+9FFF) ranges counts as one word on its own.
 * Whatever remains is split on whitespace and each non-empty token counts as
 * one word.
 *
 * @param {string} text Text to measure; empty, blank or missing text gives 0.
 * @returns {number} CJK character count plus whitespace-delimited token count.
 */
export function countWords(text) {
  if (!text || !text.trim()) return 0;

  const cjkChar = /[\u3040-\u309F\u30A0-\u30FF\uAC00-\uD7FF\u4E00-\u9FFF]/g;

  // One pass: tally each CJK character and blank it out so that it also
  // separates any neighbouring non-CJK tokens.
  let cjkTotal = 0;
  const remainder = text.replace(cjkChar, () => {
    cjkTotal += 1;
    return ' ';
  });

  let otherTotal = 0;
  for (const token of remainder.trim().split(/\s+/)) {
    // A remainder that is entirely blank yields a single empty token.
    if (token.length > 0) otherTotal += 1;
  }

  return cjkTotal + otherTotal;
}
|
|
26
|
+
|
|
27
|
+
// Replacement text for the named entities that are decoded rather than
// treated as a word break. Keys are lower-case entity names without the
// surrounding "&" and ";".
const HTML_ENTITY_TEXT = new Map([
  ['nbsp', ' '],
  ['amp', '&'],
  ['lt', '<'],
  ['gt', '>'],
  ['quot', '"'],
]);

/**
 * Count the words in an HTML string.
 *
 * Tags are replaced by spaces first, then character references are handled
 * in a single pass so that nothing is decoded twice: the `nbsp`, `amp`,
 * `lt`, `gt` and `quot` entities (matched case-insensitively) become the
 * characters they stand for, while every other named entity and every
 * decimal numeric entity becomes a space. The resulting text is handed to
 * `countWords`.
 *
 * Decoding the ampersand entity, rather than blanking it, keeps a token such
 * as "AT&T" as one word instead of two.
 *
 * @param {string} html HTML markup; empty or missing input gives 0.
 * @returns {number} Word count of the text content.
 */
export function countHtmlWords(html) {
  if (!html) return 0;

  const text = html
    .replace(/<[^>]*>/g, ' ')
    .replace(/&(#\d+|\w+);/g, (entity, name) => {
      const key = name.toLowerCase();
      return HTML_ENTITY_TEXT.has(key) ? HTML_ENTITY_TEXT.get(key) : ' ';
    });

  return countWords(text);
}
|
|
44
|
+
|
|
45
|
+
/**
 * Build the string used for class/id pattern matching on an element.
 *
 * The result is the `class` attribute, one space, then the `id` attribute.
 * A missing or empty attribute contributes an empty string, so the separator
 * is always present.
 *
 * @param {Element} el Element whose attributes are read.
 * @returns {string} `"<class> <id>"`.
 */
export function getClassId(el) {
  const parts = ['class', 'id'].map((name) => el.getAttribute(name) || '');
  return parts.join(' ');
}
|
|
53
|
+
|
|
54
|
+
/**
 * Tell whether `el` is `target` itself or one of its ancestors.
 *
 * Walks upward from `target` through `parentElement` links until `el` is
 * found or the chain ends.
 *
 * @param {Element} el Candidate ancestor.
 * @param {Element} target Element the walk starts from.
 * @returns {boolean} `true` when `el` is reached during the walk.
 */
export function isAncestorOrSelf(el, target) {
  for (let current = target; current; current = current.parentElement) {
    if (current === el) return true;
  }
  return false;
}
|
|
68
|
+
|
|
69
|
+
/**
 * Read the serialized markup of an element's children.
 *
 * A thin accessor over the element's `innerHTML` property.
 *
 * @param {Element} el Element to read from.
 * @returns {string} The value of `el.innerHTML`.
 */
export function getInnerHtml(el) {
  const { innerHTML } = el;
  return innerHTML;
}
|
|
78
|
+
|
|
79
|
+
/**
 * Detach a node from its parent, if it has one.
 *
 * Does nothing when `el` is missing or is not attached to a parent.
 *
 * @param {Element} el Node to detach.
 */
export function removeNode(el) {
  const parent = el ? el.parentNode : null;
  if (!parent) return;
  parent.removeChild(el);
}
|
|
88
|
+
|
|
89
|
+
/**
 * Remove every node in a collection.
 *
 * The collection is copied into an array before anything is removed, so the
 * removals cannot disturb the iteration.
 *
 * @param {NodeList|Element[]} nodes Nodes to remove.
 */
export function removeAll(nodes) {
  const snapshot = Array.from(nodes);
  for (const node of snapshot) {
    removeNode(node);
  }
}
|