defuddle-js 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +102 -0
- package/dist/defuddle.cjs.js +1950 -0
- package/dist/defuddle.umd.js +1968 -0
- package/package.json +41 -0
- package/src/constants.js +297 -0
- package/src/content-finder.js +116 -0
- package/src/content-scorer.js +194 -0
- package/src/defuddle.js +252 -0
- package/src/index.js +1 -0
- package/src/metadata.js +371 -0
- package/src/removals/content-patterns.js +174 -0
- package/src/removals/hidden.js +51 -0
- package/src/removals/selector-remover.js +137 -0
- package/src/removals/small-images.js +45 -0
- package/src/schema-org.js +102 -0
- package/src/standardizer.js +116 -0
- package/src/url-resolver.js +101 -0
- package/src/utils.js +95 -0
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Remove content patterns: breadcrumbs, bylines, hero headers, boilerplate.
|
|
3
|
+
* Ported from defuddle-php/src/Removals/ContentPatternRemover.php
|
|
4
|
+
*/
|
|
5
|
+
import { countWords } from '../utils.js';
|
|
6
|
+
|
|
7
|
+
// Abbreviated or full English month name followed by a 1-2 digit day, e.g. "Jan 5" or "Sept. 12".
const DATE_PATTERN = /\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\.?\s+\d{1,2}/i;

// Reading-time labels such as "5 min read" or "12 minutes read".
const READ_TIME_PATTERN = /\d+\s*min(?:ute)?s?\s+read\b/i;

// Text that opens with "by" followed by at least one non-space character.
const STARTS_WITH_BY = /^by\s+\S/i;

// Opening phrases of syndication / republication notices. Each pattern is
// anchored at the start of the text it is tested against.
const BOILERPLATE_PATTERNS = [
  /^This (?:article|story|piece) (?:appeared|was published|originally appeared) in\b/i,
  /^A version of this (?:article|story) (?:appeared|was published) in\b/i,
  /^Originally (?:published|appeared) (?:in|on|at)\b/i,
  /^Any re-?use permitted\b/i,
];
|
|
16
|
+
|
|
17
|
+
/**
 * Run every content-pattern removal pass over the isolated content element.
 *
 * Passes run in a fixed order: hero header, breadcrumbs, author bylines,
 * read-time blocks, boilerplate sentences, trailing thin sections. All of them
 * share one removal log.
 *
 * @param {Element} root  Content element; mutated in place.
 * @param {boolean} debug When true, each removal is recorded in the returned array.
 * @param {string} url    Accepted for interface compatibility; not read here.
 * @returns {Array} Removal records (stays empty when debug is false).
 */
export function removeByContentPattern(root, debug = false, url = '') {
  const log = [];
  const passes = [
    removeHeroHeader,
    removeBreadcrumbs,
    removeAuthorBylines,
    removeReadTimeBlocks,
    removeBoilerplateSentences,
    removeTrailingThinSections,
  ];
  for (const pass of passes) {
    pass(root, debug, log);
  }
  return log;
}
|
|
34
|
+
|
|
35
|
+
/**
 * Remove a "hero" header wrapper: a container near the top of the content that
 * holds a <time> element and a heading but fewer than 30 prose words.
 *
 * For each <time> preceded by at most 400 characters of text, up to four
 * ancestors (never root itself) are inspected, innermost first; the first one
 * that qualifies is removed.
 *
 * @param {Element} root
 * @param {boolean} debug
 * @param {Array} removed Removal log shared between passes.
 */
function removeHeroHeader(root, debug, removed) {
  for (const timeEl of Array.from(root.querySelectorAll('time'))) {
    if (getTextBefore(root, timeEl).length > 400) continue;

    let ancestor = timeEl.parentElement;
    for (let level = 0; ancestor && ancestor !== root && level < 4; level++) {
      const hasHeading = ancestor.querySelectorAll('h1,h2,h3').length > 0;
      if (hasHeading && countProseWords(ancestor) < 30) {
        safeRemove(ancestor, root, debug, removed, 'removeHeroHeader');
        break;
      }
      ancestor = ancestor.parentElement;
    }
  }
}
|
|
57
|
+
|
|
58
|
+
/**
 * Remove breadcrumb navigation from the content element.
 *
 * Two strategies are applied:
 *  1. <ul>, <ol> and <nav> elements preceded by at most 600 characters of text
 *     are removed when isBreadcrumbList() accepts them.
 *  2. Any element whose aria-label contains "breadcrumb" (case-insensitive) or
 *     whose class attribute contains "breadcrumb" is removed unconditionally.
 *
 * @param {Element} root
 * @param {boolean} debug
 * @param {Array} removed Removal log shared between passes.
 */
function removeBreadcrumbs(root, debug, removed) {
  for (const tag of ['ul', 'ol', 'nav']) {
    for (const list of Array.from(root.querySelectorAll(tag))) {
      if (getTextBefore(root, list).length > 600) continue;
      if (isBreadcrumbList(list)) {
        safeRemove(list, root, debug, removed, 'removeBreadcrumbs');
      }
    }
  }

  // The aria-label is matched in JS rather than with an attribute selector so
  // that the common capitalised form aria-label="Breadcrumb" is caught too.
  // The class match stays case-sensitive.
  const labelled = Array.from(root.querySelectorAll('[aria-label],[class*="breadcrumb"]'));
  for (const el of labelled) {
    const label = (el.getAttribute('aria-label') || '').toLowerCase();
    const cls = el.getAttribute('class') || '';
    if (label.includes('breadcrumb') || cls.includes('breadcrumb')) {
      safeRemove(el, root, debug, removed, 'removeBreadcrumbs');
    }
  }
}
|
|
73
|
+
|
|
74
|
+
/**
 * Decide whether a list-like element looks like a breadcrumb trail.
 *
 * Only direct <li>/<a> children are considered. There must be between 2 and 8
 * of them, and at least half must be short (no more than 4 words) and either
 * be a link or contain one.
 *
 * @param {Element} list
 * @returns {boolean}
 */
function isBreadcrumbList(list) {
  const items = [];
  for (const child of Array.from(list.children)) {
    const name = child.tagName.toLowerCase();
    if (name === 'li' || name === 'a') items.push(child);
  }
  if (items.length < 2 || items.length > 8) return false;

  let shortLinks = 0;
  for (const item of items) {
    const label = (item.textContent || '').trim();
    if (countWords(label) > 4) continue;
    if (item.querySelector('a') !== null || item.tagName.toLowerCase() === 'a') {
      shortLinks += 1;
    }
  }

  return shortLinks >= items.length * 0.5;
}
|
|
89
|
+
|
|
90
|
+
/**
 * Remove byline-like elements near the top of the content.
 *
 * Candidates are all elements that are not nested inside a <p>. One is removed
 * when at most 600 characters of text precede it, its trimmed text is 2-15
 * words long, and that text matches STARTS_WITH_BY.
 *
 * @param {Element} root
 * @param {boolean} debug
 * @param {Array} removed Removal log shared between passes.
 */
function removeAuthorBylines(root, debug, removed) {
  Array.from(root.querySelectorAll('*:not(p *)')).forEach((node) => {
    if (getTextBefore(root, node).length > 600) return;

    const content = (node.textContent || '').trim();
    const wordCount = countWords(content);
    const outOfRange = wordCount < 2 || wordCount > 15;

    if (!outOfRange && STARTS_WITH_BY.test(content)) {
      safeRemove(node, root, debug, removed, 'removeAuthorBylines');
    }
  });
}
|
|
105
|
+
|
|
106
|
+
/**
 * Remove short "N min read" labels near the top of the content.
 *
 * Only leaf elements (no element children) are considered. One is removed when
 * at most 400 characters of text precede it, its trimmed text has no more than
 * 10 words, and that text matches READ_TIME_PATTERN.
 *
 * @param {Element} root
 * @param {boolean} debug
 * @param {Array} removed Removal log shared between passes.
 */
function removeReadTimeBlocks(root, debug, removed) {
  Array.from(root.querySelectorAll('*')).forEach((node) => {
    if (node.children.length > 0) return; // not a leaf
    if (getTextBefore(root, node).length > 400) return;

    const label = (node.textContent || '').trim();
    if (countWords(label) > 10) return;
    if (!READ_TIME_PATTERN.test(label)) return;

    safeRemove(node, root, debug, removed, 'removeReadTimeBlocks');
  });
}
|
|
121
|
+
|
|
122
|
+
/**
 * Remove every <p> whose trimmed text matches one of BOILERPLATE_PATTERNS.
 *
 * @param {Element} root
 * @param {boolean} debug
 * @param {Array} removed Removal log shared between passes.
 */
function removeBoilerplateSentences(root, debug, removed) {
  for (const para of Array.from(root.querySelectorAll('p'))) {
    const content = (para.textContent || '').trim();
    const isBoilerplate = BOILERPLATE_PATTERNS.some((re) => re.test(content));
    if (isBoilerplate) {
      safeRemove(para, root, debug, removed, 'removeBoilerplateSentences');
    }
  }
}
|
|
133
|
+
|
|
134
|
+
/**
 * Remove thin, headed sections at the very end of the content.
 *
 * Does nothing when root holds fewer than 100 words. Otherwise the last three
 * direct children are examined, last first. A <div>, <section> or <aside> is
 * removed when it holds fewer than 15% of root's words (word total measured
 * once, up front) and contains an h2, h3 or h4.
 *
 * @param {Element} root
 * @param {boolean} debug
 * @param {Array} removed Removal log shared between passes.
 */
function removeTrailingThinSections(root, debug, removed) {
  const totalWords = countWords(root.textContent || '');
  if (totalWords < 100) return;

  const limit = totalWords * 0.15;
  const tail = Array.from(root.children).slice(-3);

  for (let i = tail.length - 1; i >= 0; i--) {
    const section = tail[i];
    const name = section.tagName.toLowerCase();
    if (name !== 'div' && name !== 'section' && name !== 'aside') continue;

    const isThin = countWords(section.textContent || '') < limit;
    if (isThin && section.querySelectorAll('h2,h3,h4').length > 0) {
      safeRemove(section, root, debug, removed, 'removeTrailingThinSections');
    }
  }
}
|
|
152
|
+
|
|
153
|
+
/**
 * Detach an element from the content tree, optionally logging the removal.
 *
 * Nothing happens when el is root itself, when el has no parent, or when el is
 * no longer inside root. The last case arises when an ancestor of el was
 * removed earlier in the same pass: el still has a parentNode inside the
 * detached subtree, and without this guard it would be "removed" and logged a
 * second time.
 *
 * @param {Element} el      Element to remove.
 * @param {Element} root    Content root; never removed.
 * @param {boolean} debug   When true, a record is appended to removed.
 * @param {Array}   removed Removal log; receives { step, tag, text }.
 * @param {string}  step    Name of the pass requesting the removal.
 */
function safeRemove(el, root, debug, removed, step) {
  if (el === root || !el.parentNode) return;
  // The typeof check keeps the function usable with DOM stand-ins that lack contains().
  if (typeof root.contains === 'function' && !root.contains(el)) return;
  if (debug) {
    // Only the first 80 characters of text are logged.
    removed.push({ step, tag: el.tagName, text: (el.textContent || '').slice(0, 80) });
  }
  el.parentNode.removeChild(el);
}
|
|
158
|
+
|
|
159
|
+
/**
 * Return the text that precedes target inside root, in document order.
 *
 * The tree is walked node by node until target is reached, so the result
 * reflects the element's real position. Searching root's text for the
 * element's own text would instead find the first place that text occurs, and
 * would report position 0 for any element with no text at all.
 *
 * @param {Node} root   Subtree to search.
 * @param {Node} target Node whose preceding text is wanted.
 * @returns {string} Concatenated text/CDATA node values that come before
 *   target; '' when target is not a descendant of root.
 */
function getTextBefore(root, target) {
  const parts = [];

  // Depth-first walk; returns true as soon as target is met so callers stop.
  const walk = (node) => {
    for (const child of Array.from(node.childNodes || [])) {
      if (child === target) return true;
      if (child.nodeType === 3 /* TEXT_NODE */ || child.nodeType === 4 /* CDATA_SECTION_NODE */) {
        parts.push(child.nodeValue || '');
      } else if (walk(child)) {
        return true;
      }
    }
    return false;
  };

  return walk(root) ? parts.join('') : '';
}
|
|
165
|
+
|
|
166
|
+
/**
 * Count the words found inside <p>, <li> and <dd> descendants of an element.
 *
 * The text of every matching descendant is joined with single spaces (all <p>
 * first, then <li>, then <dd>) and handed to countWords.
 *
 * @param {Element} el
 * @returns {number}
 */
function countProseWords(el) {
  const chunks = [];
  ['p', 'li', 'dd'].forEach((name) => {
    for (const node of el.getElementsByTagName(name)) {
      chunks.push(' ' + (node.textContent || ''));
    }
  });
  return countWords(chunks.join(''));
}
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Remove hidden elements from a document.
|
|
3
|
+
*/
|
|
4
|
+
import { removeAll } from '../utils.js';
|
|
5
|
+
|
|
6
|
+
/**
 * Remove hidden elements from a document.
 *
 * Every element for which isHidden() is true and containsMath() is false is
 * collected and passed to removeAll(). Math content is therefore kept even
 * when it is marked hidden.
 *
 * @param {Document} doc
 * @param {boolean} debug When true, one record per collected element is returned.
 * @returns {Array} Records of the form { step, tag, class }; empty when debug is false.
 */
export function removeHiddenElements(doc, debug = false) {
  const hidden = Array.from(doc.querySelectorAll('*')).filter(
    (node) => isHidden(node) && !containsMath(node)
  );

  const removed = debug
    ? hidden.map((node) => ({ step: 'removeHidden', tag: node.tagName, class: node.className }))
    : [];

  removeAll(hidden);
  return removed;
}
|
|
28
|
+
|
|
29
|
+
/**
 * Tell whether an element is marked as hidden.
 *
 * An element counts as hidden when any of these holds:
 *  - it has a `hidden` attribute;
 *  - its `aria-hidden` attribute is exactly "true";
 *  - its inline style declares `display: none`, `visibility: hidden` or
 *    `opacity: 0`, with or without a trailing `!important`;
 *  - its class list contains the token `hidden` or `invisible`.
 *
 * @param {Element} el
 * @returns {boolean}
 */
function isHidden(el) {
  if (el.hasAttribute('hidden')) return true;
  if (el.getAttribute('aria-hidden') === 'true') return true;

  const style = el.getAttribute('style') || '';
  // Anchored to a declaration boundary on both sides so that "opacity: 0.5"
  // does not match. "!important" is allowed before the terminator; without
  // that, "display:none !important" would go undetected.
  const hidingDeclaration =
    /(?:^|;)\s*(?:display\s*:\s*none|visibility\s*:\s*hidden|opacity\s*:\s*0)\s*(?:!\s*important\s*)?(?:;|$)/i;
  if (style && hidingDeclaration.test(style)) {
    return true;
  }

  const cls = el.getAttribute('class') || '';
  if (cls) {
    // Whole-token comparison: "hidden-xs" must not count as "hidden".
    const tokens = cls.trim().split(/\s+/);
    if (tokens.includes('hidden') || tokens.includes('invisible')) return true;
  }

  return false;
}
|
|
46
|
+
|
|
47
|
+
/**
 * Tell whether an element is, or contains, math content.
 *
 * True when the lower-cased class attribute contains "katex-mathml" or "math"
 * as a substring, or when the element has a <math> descendant.
 *
 * @param {Element} el
 * @returns {boolean}
 */
function containsMath(el) {
  const classes = (el.getAttribute('class') || '').toLowerCase();
  const flaggedByClass = ['katex-mathml', 'math'].some((marker) => classes.includes(marker));
  if (flaggedByClass) return true;
  return el.querySelector('math') !== null;
}
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Remove elements by exact CSS selectors and partial class/id patterns.
|
|
3
|
+
*/
|
|
4
|
+
import {
|
|
5
|
+
EXACT_SELECTORS,
|
|
6
|
+
PARTIAL_SELECTORS_REGEX,
|
|
7
|
+
PARTIAL_MATCH_ATTRIBUTES,
|
|
8
|
+
FOOTNOTE_SELECTORS,
|
|
9
|
+
VIDEO_EMBED_PATTERNS,
|
|
10
|
+
} from '../constants.js';
|
|
11
|
+
import { isAncestorOrSelf, removeAll } from '../utils.js';
|
|
12
|
+
|
|
13
|
+
/**
 * Remove elements matching exact CSS selectors, plus scripts, iframes and asides.
 *
 * Collected for removal:
 *  - matches of each EXACT_SELECTORS entry, unless inside a <code>/<pre> or
 *    matched by a footnote selector (a selector that throws is skipped);
 *  - every <script> whose type does not start with "math/";
 *  - every <iframe> whose src contains none of VIDEO_EMBED_PATTERNS;
 *  - every <aside> whose class does not contain "callout".
 * In all four cases an element is spared when mainContent is given and
 * isAncestorOrSelf(el, mainContent) is true.
 *
 * @param {Document} doc
 * @param {Element|null} mainContent
 * @param {boolean} debug When true, one record per removed element is returned.
 * @returns {Array} Records of the form { step, tag, class }.
 */
export function removeExact(doc, mainContent = null, debug = false) {
  const doomed = new Set();
  const guardsMain = (node) => Boolean(mainContent && isAncestorOrSelf(node, mainContent));

  for (const selector of EXACT_SELECTORS) {
    try {
      for (const node of doc.querySelectorAll(selector)) {
        const keep =
          guardsMain(node) || isInsideCodeBlock(node) || isFootnoteContainer(doc, node);
        if (!keep) doomed.add(node);
      }
    } catch (e) {
      // Skip unsupported selectors
    }
  }

  // Special: scripts (not math)
  for (const script of doc.querySelectorAll('script')) {
    const scriptType = (script.getAttribute('type') || '').toLowerCase();
    if (!scriptType.startsWith('math/') && !guardsMain(script)) doomed.add(script);
  }

  // Special: iframes — keep video embeds
  for (const frame of doc.querySelectorAll('iframe')) {
    const source = (frame.getAttribute('src') || '').toLowerCase();
    const isVideo = VIDEO_EMBED_PATTERNS.some((p) => source.includes(p));
    if (!isVideo && !guardsMain(frame)) doomed.add(frame);
  }

  // Special: aside (keep callouts)
  for (const aside of doc.querySelectorAll('aside')) {
    const classes = (aside.getAttribute('class') || '').toLowerCase();
    if (!classes.includes('callout') && !guardsMain(aside)) doomed.add(aside);
  }

  const removed = [];
  doomed.forEach((node) => {
    const parent = node.parentNode;
    if (!parent) return;
    if (debug) removed.push({ step: 'removeExact', tag: node.tagName, class: node.className });
    parent.removeChild(node);
  });

  return removed;
}
|
|
70
|
+
|
|
71
|
+
/**
 * Remove elements whose attributes match the partial selector patterns.
 *
 * Candidates are all elements carrying at least one of
 * PARTIAL_MATCH_ATTRIBUTES. The values of those attributes are joined with
 * spaces and tested against PARTIAL_SELECTORS_REGEX. A match is removed unless
 * the element is a <code>/<pre>, sits inside one, is spared by the mainContent
 * check (isAncestorOrSelf), or is matched by a footnote selector.
 *
 * @param {Document} doc
 * @param {Element|null} mainContent
 * @param {boolean} debug When true, one record per removed element is returned.
 * @returns {Array} Records of the form { step, tag, class }; empty when the
 *   candidate query itself throws.
 */
export function removePartial(doc, mainContent = null, debug = false) {
  const removed = [];
  const attrSelector = PARTIAL_MATCH_ATTRIBUTES.map((a) => `[${a}]`).join(',');

  let candidates;
  try {
    candidates = Array.from(doc.querySelectorAll(attrSelector));
  } catch (e) {
    return removed;
  }

  const doomed = candidates.filter((node) => {
    const name = node.tagName.toLowerCase();
    if (name === 'code' || name === 'pre' || isInsideCodeBlock(node)) return false;

    const haystack = PARTIAL_MATCH_ATTRIBUTES.map((a) => node.getAttribute(a) || '').join(' ');
    if (!PARTIAL_SELECTORS_REGEX.test(haystack)) return false;
    if (mainContent && isAncestorOrSelf(node, mainContent)) return false;
    return !isFootnoteContainer(doc, node);
  });

  doomed.forEach((node) => {
    const parent = node.parentNode;
    if (!parent) return;
    if (debug) removed.push({ step: 'removePartial', tag: node.tagName, class: node.className });
    parent.removeChild(node);
  });

  return removed;
}
|
|
116
|
+
|
|
117
|
+
/**
 * Tell whether an element has a <code> or <pre> ancestor.
 * The element itself is not checked, only its ancestors.
 *
 * @param {Element} el
 * @returns {boolean}
 */
function isInsideCodeBlock(el) {
  for (let ancestor = el.parentElement; ancestor; ancestor = ancestor.parentElement) {
    const name = ancestor.tagName.toLowerCase();
    if (name === 'code' || name === 'pre') return true;
  }
  return false;
}
|
|
126
|
+
|
|
127
|
+
/**
 * Tell whether an element is matched by one of FOOTNOTE_SELECTORS.
 *
 * Each selector is tested directly against the element with Element.matches().
 * This gives the same answer as searching the whole document for the selector
 * and looking for el among the results, without re-querying the document for
 * every candidate element. The document scan remains as a fallback for DOM
 * implementations that do not provide matches().
 *
 * @param {Document} doc Document that el belongs to.
 * @param {Element} el
 * @returns {boolean}
 */
function isFootnoteContainer(doc, el) {
  const canMatch = typeof el.matches === 'function';
  for (const sel of FOOTNOTE_SELECTORS) {
    try {
      if (canMatch) {
        if (el.matches(sel)) return true;
      } else {
        for (const m of doc.querySelectorAll(sel)) {
          if (m === el) return true;
        }
      }
    } catch (e) {
      // Deliberate best-effort: a selector the engine cannot parse is skipped.
    }
  }
  return false;
}
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Remove images smaller than MIN_IMAGE_DIMENSION.
|
|
3
|
+
*/
|
|
4
|
+
import { MIN_IMAGE_DIMENSION } from '../constants.js';
|
|
5
|
+
|
|
6
|
+
/**
 * Remove <img> and <svg> elements whose declared width or height is below
 * MIN_IMAGE_DIMENSION. Dimensions that cannot be determined are ignored.
 *
 * @param {Document} doc
 * @param {boolean} debug Accepted for interface compatibility; not read here.
 * @returns {number} Number of elements selected for removal.
 */
export function removeSmallImages(doc, debug = false) {
  const tooSmall = (value) => value !== null && value < MIN_IMAGE_DIMENSION;

  const victims = Array.from(doc.querySelectorAll('img, svg')).filter((node) => {
    const [width, height] = getDimensions(node);
    return tooSmall(width) || tooSmall(height);
  });

  victims.forEach((node) => {
    if (node.parentNode) node.parentNode.removeChild(node);
  });

  return victims.length;
}
|
|
27
|
+
|
|
28
|
+
/**
 * Read the declared pixel dimensions of an element.
 *
 * The `width`/`height` attributes are used when they consist of digits only.
 * An inline-style `width`/`height` given in px then overrides them. Style
 * properties are matched only at the start of a declaration, so `max-width`,
 * `border-width`, `line-height` and similar are not mistaken for the element's
 * own size.
 *
 * @param {Element} el
 * @returns {Array<number|null>} [width, height]; null where no value was found.
 */
function getDimensions(el) {
  let w = null;
  let h = null;

  const wAttr = el.getAttribute('width');
  const hAttr = el.getAttribute('height');
  if (wAttr && /^\d+$/.test(wAttr)) w = Number.parseInt(wAttr, 10);
  if (hAttr && /^\d+$/.test(hAttr)) h = Number.parseInt(hAttr, 10);

  const style = el.getAttribute('style') || '';
  if (style) {
    // (?:^|;) pins the property name to the beginning of a declaration.
    const wm = style.match(/(?:^|;)\s*width\s*:\s*(\d+)px/i);
    const hm = style.match(/(?:^|;)\s*height\s*:\s*(\d+)px/i);
    if (wm) w = Number.parseInt(wm[1], 10);
    if (hm) h = Number.parseInt(hm[1], 10);
  }

  return [w, h];
}
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Schema.org JSON-LD extractor for defuddle-js.
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
/**
 * Extract Schema.org JSON-LD data from a document.
 *
 * Every `<script type="application/ld+json">` is parsed. A top-level `@graph`
 * array or a top-level array contributes each of its members; anything else
 * contributes the parsed value itself. Each collected item is passed through
 * decodeStrings(). Scripts that are empty or fail to parse are skipped.
 *
 * @param {Document} doc
 * @returns {Array|null} Collected items, or null when there are none.
 */
export function extractSchemaOrg(doc) {
  const scripts = doc.querySelectorAll('script[type="application/ld+json"]');
  if (!scripts.length) return null;

  const collected = [];

  for (const script of scripts) {
    const raw = script.textContent || '';
    if (!raw.trim()) continue;

    const data = parseJsonLd(raw);
    if (!data) continue;

    let entries;
    if (data['@graph'] && Array.isArray(data['@graph'])) {
      entries = data['@graph'];
    } else if (Array.isArray(data)) {
      entries = data;
    } else {
      entries = [data];
    }

    for (const entry of entries) {
      collected.push(decodeStrings(entry));
    }
  }

  return collected.length > 0 ? collected : null;
}
|
|
34
|
+
|
|
35
|
+
/**
 * Parse a JSON-LD string, tolerating comments and CDATA wrappers.
 *
 * The text is first parsed as-is. Only when that fails are block comments,
 * whole-line `//` comments and CDATA wrappers stripped and the parse retried.
 * Stripping unconditionally would corrupt valid JSON whose string values
 * happen to contain comment-like text such as a slash-star sequence.
 *
 * @param {string} content Raw text of a JSON-LD script element.
 * @returns {object|null} The parsed value, or null when the text is empty or
 *   cannot be parsed even after cleanup.
 */
function parseJsonLd(content) {
  const tryParse = (text) => {
    try {
      return { ok: true, value: JSON.parse(text) };
    } catch (err) {
      return { ok: false, value: null };
    }
  };

  const raw = content.trim();
  if (!raw) return null;

  // Fast path: already valid JSON, so leave it untouched.
  const direct = tryParse(raw);
  if (direct.ok) return direct.value;

  // Fallback: clean up what publishers commonly wrap around JSON-LD.
  const cleaned = raw
    // Strip block comments /* ... */
    .replace(/\/\*[\s\S]*?\*\//g, '')
    // Strip line comments // ... (this also drops "//<![CDATA[" and "//]]>" lines)
    .replace(/^\s*\/\/.*$/gm, '')
    // Strip CDATA wrappers
    .replace(/^\s*\/\/<!\[CDATA\[|\]\]>\/\/\s*$/gm, '')
    .replace(/^\s*<!\[CDATA\[([\s\S]*?)\]\]>\s*$/, '$1')
    .trim();

  if (!cleaned) return null;
  return tryParse(cleaned).value;
}
|
|
58
|
+
|
|
59
|
+
// Named and numeric HTML entities that decodeStrings() translates.
const DECODABLE_ENTITIES = {
  '&amp;': '&',
  '&lt;': '<',
  '&gt;': '>',
  '&quot;': '"',
  '&#39;': "'",
  '&#039;': "'",
  '&apos;': "'",
};
const DECODABLE_ENTITY_PATTERN = /&(?:amp|lt|gt|quot|apos|#0?39);/g;

/**
 * Recursively decode HTML entities in all string values.
 *
 * Strings are decoded in a single pass, so already-escaped text is decoded
 * exactly once ("&amp;lt;" becomes "&lt;", not "<"). Arrays and plain objects
 * are copied with their members decoded; every other value is returned as is.
 *
 * @param {any} item
 * @returns {any}
 */
function decodeStrings(item) {
  if (typeof item === 'string') {
    return item.replace(DECODABLE_ENTITY_PATTERN, (entity) => DECODABLE_ENTITIES[entity]);
  }
  if (Array.isArray(item)) return item.map(decodeStrings);
  if (item && typeof item === 'object') {
    const result = {};
    for (const [key, val] of Object.entries(item)) {
      // defineProperty instead of assignment: a "__proto__" key coming from
      // parsed JSON must become ordinary data, not replace the prototype.
      Object.defineProperty(result, key, {
        value: decodeStrings(val),
        writable: true,
        enumerable: true,
        configurable: true,
      });
    }
    return result;
  }
  return item;
}
|
|
84
|
+
|
|
85
|
+
/**
 * Get a property value from schema.org data.
 *
 * Items are scanned in order. An item is considered when `types` is empty,
 * when the item has no `@type`, or when its `@type` (a string or an array of
 * strings) shares at least one value with `types`. The first considered item
 * that yields a usable value (not undefined, null or '') wins.
 *
 * `property` is looked up as a literal key first. If that gives nothing and
 * the name contains dots, it is treated as a dot-notation path
 * (e.g. "author.name") and resolved through nested objects.
 *
 * @param {Array|null} items
 * @param {string[]} types @type values to match
 * @param {string} property Key or dot-notation path.
 * @returns {any} The value found, or null.
 */
export function getSchemaProperty(items, types, property) {
  if (!items) return null;

  const isUsable = (val) => val !== undefined && val !== null && val !== '';

  // Follow a dot-separated path; undefined as soon as a step cannot be taken.
  const resolvePath = (start, path) => {
    let current = start;
    for (const key of path.split('.')) {
      if (current === null || typeof current !== 'object') return undefined;
      current = current[key];
    }
    return current;
  };

  for (const item of items) {
    if (!item || typeof item !== 'object') continue;

    // JSON-LD allows @type to be a single string or an array of strings.
    const rawType = item['@type'] || '';
    const itemTypes = Array.isArray(rawType) ? rawType : [rawType];
    const untyped = itemTypes.every((t) => t === '');
    const typeMatches = itemTypes.some((t) => types.includes(t));
    if (types.length > 0 && !untyped && !typeMatches) continue;

    const direct = item[property];
    if (isUsable(direct)) return direct;

    if (typeof property === 'string' && property.includes('.')) {
      const nested = resolvePath(item, property);
      if (isUsable(nested)) return nested;
    }
  }
  return null;
}
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* HTML standardization for defuddle-js.
|
|
3
|
+
*/
|
|
4
|
+
import { ALLOWED_ATTRIBUTES, ALLOWED_EMPTY_TAGS } from './constants.js';
|
|
5
|
+
|
|
6
|
+
/**
 * Standardize HTML in a content element.
 *
 * Applies, in order: comment removal, non-breaking-space normalization,
 * h1-to-h2 conversion, attribute stripping, and empty-element removal.
 * The element is modified in place.
 *
 * @param {Element} root
 */
export function standardize(root) {
  const steps = [
    removeComments,
    normalizeNbsp,
    convertH1ToH2,
    stripAttributes,
    removeEmptyElements,
  ];
  for (const step of steps) {
    step(root);
  }
}
|
|
17
|
+
|
|
18
|
+
/**
 * Remove every comment node beneath root.
 *
 * The tree is walked manually through childNodes, so no global `document` and
 * no TreeWalker support is needed; the function therefore also works with DOM
 * implementations used outside a browser.
 *
 * @param {Node} root
 */
function removeComments(root) {
  const comments = [];

  // Collect first and remove afterwards, so the walk never runs over a tree
  // that is being modified.
  const collect = (node) => {
    for (const child of Array.from(node.childNodes)) {
      if (child.nodeType === 8 /* COMMENT_NODE */) {
        comments.push(child);
      } else if (child.childNodes && child.childNodes.length) {
        collect(child);
      }
    }
  };
  collect(root);

  for (const comment of comments) {
    if (comment.parentNode) comment.parentNode.removeChild(comment);
  }
}
|
|
39
|
+
|
|
40
|
+
/**
 * Replace every non-breaking space (U+00A0) in the text nodes beneath root
 * with an ordinary space. Text nodes are edited in place.
 *
 * @param {Node} root
 */
function normalizeNbsp(root) {
  // An explicit work list instead of recursion; visiting order is irrelevant
  // because each text node is rewritten independently.
  const pending = [root];
  while (pending.length > 0) {
    const node = pending.pop();
    for (const child of Array.from(node.childNodes)) {
      if (child.nodeType === 3 /* TEXT_NODE */) {
        const value = child.nodeValue;
        if (value && value.includes('\u00A0')) {
          child.nodeValue = value.replace(/\u00A0/g, ' ');
        }
      } else if (child.childNodes && child.childNodes.length) {
        pending.push(child);
      }
    }
  }
}
|
|
54
|
+
|
|
55
|
+
/**
 * Replace every <h1> beneath root with an <h2> carrying the same inner HTML
 * and the same attributes.
 *
 * The replacement element is created through root.ownerDocument; when root has
 * no owner document, a shallow clone of the heading is used instead.
 *
 * @param {Element} root
 */
function convertH1ToH2(root) {
  Array.from(root.querySelectorAll('h1')).forEach((heading) => {
    const ownerDoc = root.ownerDocument;
    const replacement = ownerDoc ? ownerDoc.createElement('h2') : heading.cloneNode(false);

    replacement.innerHTML = heading.innerHTML;

    // Copy attributes
    Array.from(heading.attributes).forEach((attr) => {
      replacement.setAttribute(attr.name, attr.value);
    });

    const parent = heading.parentNode;
    if (parent) parent.replaceChild(replacement, heading);
  });
}
|
|
69
|
+
|
|
70
|
+
/**
 * Strip attributes from every element beneath root.
 *
 * An attribute survives when its lower-cased name starts with "aria-", is
 * listed in ALLOWED_ATTRIBUTES, or is an `id` whose lower-cased value starts
 * with "fn", "ref" or "footnote". Everything else is removed.
 *
 * @param {Element} root
 */
function stripAttributes(root) {
  const FOOTNOTE_ID_PREFIXES = ['fn', 'ref', 'footnote'];
  const isFootnoteId = (value) => {
    const id = (value || '').toLowerCase();
    return FOOTNOTE_ID_PREFIXES.some((prefix) => id.startsWith(prefix));
  };

  for (const node of root.querySelectorAll('*')) {
    // Names are gathered before any removal so the attribute list is not
    // modified while it is being read.
    const doomed = Array.from(node.attributes)
      .filter((attr) => {
        const key = attr.name.toLowerCase();
        if (key.startsWith('aria-') || ALLOWED_ATTRIBUTES.has(key)) return false;
        // Keep footnote-related id attributes
        return !(key === 'id' && isFootnoteId(attr.value));
      })
      .map((attr) => attr.name);

    doomed.forEach((name) => node.removeAttribute(name));
  }
}
|
|
87
|
+
|
|
88
|
+
/**
 * Remove empty <p>, <div>, <span>, <section>, <aside> and <li> elements
 * beneath root.
 *
 * Tags listed in ALLOWED_EMPTY_TAGS are left alone. The sweep repeats until a
 * full pass removes nothing, because removing an element can leave its parent
 * empty in turn.
 *
 * @param {Element} root
 */
function removeEmptyElements(root) {
  const candidateTags = ['p', 'div', 'span', 'section', 'aside', 'li'];
  let removedAny;

  do {
    removedAny = false;
    for (const name of candidateTags) {
      if (ALLOWED_EMPTY_TAGS.has(name)) continue;

      for (const node of Array.from(root.getElementsByTagName(name))) {
        if (node === root || !isEffectivelyEmpty(node)) continue;
        const parent = node.parentNode;
        if (parent) {
          parent.removeChild(node);
          removedAny = true;
        }
      }
    }
  } while (removedAny);
}
|
|
107
|
+
|
|
108
|
+
/**
 * Tell whether an element carries no content worth keeping: no
 * non-whitespace text and no media descendant (img, video, audio, iframe,
 * svg, picture, canvas).
 *
 * @param {Element} el
 * @returns {boolean}
 */
function isEffectivelyEmpty(el) {
  const hasText = Boolean((el.textContent || '').trim());
  if (hasText) return false;

  for (const name of ['img', 'video', 'audio', 'iframe', 'svg', 'picture', 'canvas']) {
    if (el.getElementsByTagName(name).length > 0) return false;
  }
  return true;
}
|
|
113
|
+
|
|
114
|
+
/**
 * Placeholder that never builds a walker; this module walks childNodes by
 * hand instead.
 *
 * @param {Node} root Ignored.
 * @returns {null} Always null.
 */
function createTreeWalker(root) {
  const walker = null; // placeholder — we use our own traversal above
  return walker;
}
|