@j0hanz/fetch-url-mcp 1.12.7 → 1.12.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/http/auth.d.ts +2 -2
- package/dist/http/auth.d.ts.map +1 -1
- package/dist/http/auth.js +4 -5
- package/dist/http/index.d.ts +6 -0
- package/dist/http/index.d.ts.map +1 -0
- package/dist/http/index.js +5 -0
- package/dist/http/native.d.ts +73 -0
- package/dist/http/native.d.ts.map +1 -1
- package/dist/http/native.js +554 -10
- package/dist/http/rate-limit.d.ts +1 -1
- package/dist/http/rate-limit.d.ts.map +1 -1
- package/dist/http/rate-limit.js +3 -4
- package/dist/index.d.ts +17 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +67 -6
- package/dist/lib/config.js +2 -2
- package/dist/lib/core.d.ts +56 -4
- package/dist/lib/core.d.ts.map +1 -1
- package/dist/lib/core.js +155 -4
- package/dist/lib/error/classes.d.ts +19 -0
- package/dist/lib/error/classes.d.ts.map +1 -0
- package/dist/lib/error/classes.js +107 -0
- package/dist/lib/error/classify.d.ts +4 -0
- package/dist/lib/error/classify.d.ts.map +1 -0
- package/dist/lib/error/classify.js +154 -0
- package/dist/lib/error/codes.d.ts +23 -0
- package/dist/lib/error/codes.d.ts.map +1 -0
- package/dist/lib/error/codes.js +22 -0
- package/dist/lib/error/index.d.ts +6 -0
- package/dist/lib/error/index.d.ts.map +1 -0
- package/dist/lib/error/index.js +5 -0
- package/dist/lib/{error-messages.d.ts → error/messages.d.ts} +2 -2
- package/dist/lib/error/messages.d.ts.map +1 -0
- package/dist/lib/{error-messages.js → error/messages.js} +2 -2
- package/dist/lib/{tool-errors.d.ts → error/payload.d.ts} +7 -13
- package/dist/lib/error/payload.d.ts.map +1 -0
- package/dist/lib/error/payload.js +108 -0
- package/dist/lib/mcp-interop.d.ts.map +1 -1
- package/dist/lib/mcp-interop.js +4 -6
- package/dist/lib/net/http.d.ts.map +1 -0
- package/dist/lib/{http.js → net/http.js} +4 -7
- package/dist/lib/net/index.d.ts +4 -0
- package/dist/lib/net/index.d.ts.map +1 -0
- package/dist/lib/net/index.js +3 -0
- package/dist/lib/{fetch-pipeline.d.ts → net/pipeline.d.ts} +3 -3
- package/dist/lib/net/pipeline.d.ts.map +1 -0
- package/dist/lib/{fetch-pipeline.js → net/pipeline.js} +3 -5
- package/dist/lib/{url.d.ts → net/url.d.ts} +1 -1
- package/dist/lib/net/url.d.ts.map +1 -0
- package/dist/lib/{url.js → net/url.js} +3 -5
- package/dist/lib/utils.d.ts +2 -18
- package/dist/lib/utils.d.ts.map +1 -1
- package/dist/lib/utils.js +29 -104
- package/dist/resources/index.d.ts.map +1 -1
- package/dist/resources/index.js +8 -5
- package/dist/schemas.d.ts +1 -1
- package/dist/server.d.ts.map +1 -1
- package/dist/server.js +7 -9
- package/dist/tasks/index.d.ts +2 -0
- package/dist/tasks/index.d.ts.map +1 -0
- package/dist/tasks/index.js +1 -0
- package/dist/tasks/manager.d.ts +123 -1
- package/dist/tasks/manager.d.ts.map +1 -1
- package/dist/tasks/manager.js +745 -10
- package/dist/tools/{fetch-url.d.ts → index.d.ts} +4 -5
- package/dist/tools/index.d.ts.map +1 -0
- package/dist/tools/{fetch-url.js → index.js} +6 -8
- package/dist/transform/index.d.ts +279 -0
- package/dist/transform/index.d.ts.map +1 -0
- package/dist/transform/index.js +5234 -0
- package/package.json +2 -2
- package/dist/cli.d.ts +0 -19
- package/dist/cli.d.ts.map +0 -1
- package/dist/cli.js +0 -65
- package/dist/http/health.d.ts +0 -8
- package/dist/http/health.d.ts.map +0 -1
- package/dist/http/health.js +0 -152
- package/dist/http/helpers.d.ts +0 -68
- package/dist/http/helpers.d.ts.map +0 -1
- package/dist/http/helpers.js +0 -402
- package/dist/lib/error-codes.d.ts +0 -13
- package/dist/lib/error-codes.d.ts.map +0 -1
- package/dist/lib/error-codes.js +0 -12
- package/dist/lib/error-messages.d.ts.map +0 -1
- package/dist/lib/fetch-pipeline.d.ts.map +0 -1
- package/dist/lib/http.d.ts.map +0 -1
- package/dist/lib/logger-names.d.ts +0 -16
- package/dist/lib/logger-names.d.ts.map +0 -1
- package/dist/lib/logger-names.js +0 -15
- package/dist/lib/session.d.ts +0 -44
- package/dist/lib/session.d.ts.map +0 -1
- package/dist/lib/session.js +0 -137
- package/dist/lib/tool-errors.d.ts.map +0 -1
- package/dist/lib/tool-errors.js +0 -253
- package/dist/lib/url.d.ts.map +0 -1
- package/dist/lib/zod.d.ts +0 -3
- package/dist/lib/zod.d.ts.map +0 -1
- package/dist/lib/zod.js +0 -27
- package/dist/tasks/call-contract.d.ts +0 -25
- package/dist/tasks/call-contract.d.ts.map +0 -1
- package/dist/tasks/call-contract.js +0 -59
- package/dist/tasks/execution.d.ts +0 -16
- package/dist/tasks/execution.d.ts.map +0 -1
- package/dist/tasks/execution.js +0 -241
- package/dist/tasks/handlers.d.ts +0 -11
- package/dist/tasks/handlers.d.ts.map +0 -1
- package/dist/tasks/handlers.js +0 -157
- package/dist/tasks/owner.d.ts +0 -43
- package/dist/tasks/owner.d.ts.map +0 -1
- package/dist/tasks/owner.js +0 -144
- package/dist/tasks/registry.d.ts +0 -20
- package/dist/tasks/registry.d.ts.map +0 -1
- package/dist/tasks/registry.js +0 -40
- package/dist/tasks/waiters.d.ts +0 -27
- package/dist/tasks/waiters.d.ts.map +0 -1
- package/dist/tasks/waiters.js +0 -114
- package/dist/tools/fetch-url.d.ts.map +0 -1
- package/dist/transform/dom-prep.d.ts +0 -16
- package/dist/transform/dom-prep.d.ts.map +0 -1
- package/dist/transform/dom-prep.js +0 -1287
- package/dist/transform/html-translators.d.ts +0 -5
- package/dist/transform/html-translators.d.ts.map +0 -1
- package/dist/transform/html-translators.js +0 -697
- package/dist/transform/markdown-cleanup.d.ts +0 -10
- package/dist/transform/markdown-cleanup.d.ts.map +0 -1
- package/dist/transform/markdown-cleanup.js +0 -542
- package/dist/transform/metadata.d.ts +0 -18
- package/dist/transform/metadata.d.ts.map +0 -1
- package/dist/transform/metadata.js +0 -462
- package/dist/transform/next-flight.d.ts +0 -2
- package/dist/transform/next-flight.d.ts.map +0 -1
- package/dist/transform/next-flight.js +0 -374
- package/dist/transform/shared.d.ts +0 -8
- package/dist/transform/shared.d.ts.map +0 -1
- package/dist/transform/shared.js +0 -137
- package/dist/transform/transform.d.ts +0 -38
- package/dist/transform/transform.d.ts.map +0 -1
- package/dist/transform/transform.js +0 -1042
- package/dist/transform/types.d.ts +0 -124
- package/dist/transform/types.d.ts.map +0 -1
- package/dist/transform/types.js +0 -5
- package/dist/transform/worker-pool.d.ts +0 -76
- package/dist/transform/worker-pool.d.ts.map +0 -1
- package/dist/transform/worker-pool.js +0 -725
- /package/dist/lib/{http.d.ts → net/http.d.ts} +0 -0
|
@@ -1,1287 +0,0 @@
|
|
|
1
|
-
import { parseHTML } from 'linkedom';
|
|
2
|
-
import { config, logDebug } from '../lib/core.js';
|
|
3
|
-
import { Loggers } from '../lib/logger-names.js';
|
|
4
|
-
import { CharCode, isWhitespaceChar } from '../lib/utils.js';
|
|
5
|
-
// ── Thresholds ──────────────────────────────────────────────────────
|
|
6
|
-
/** Scan limit for noise detection (consumer not visible in this chunk — TODO confirm unit). */
const NOISE_SCAN_LIMIT = 50000;
/** Trimmed innerHTML must be longer than this to count as valid content. */
const MIN_BODY_CONTENT_LENGTH = 100;
/** Dialogs whose text is longer than this are preserved. */
const DIALOG_MIN_CHARS_FOR_PRESERVATION = 500;
/** nav/footer elements with less trimmed text than this are never preserved. */
const NAV_FOOTER_MIN_CHARS_FOR_PRESERVATION = 500;
/** The abort signal is polled whenever the candidate index is a multiple of this. */
const ABORT_CHECK_INTERVAL = 500;
/** Numeric value of the DOM constant NodeFilter.SHOW_TEXT. */
const NODE_FILTER_SHOW_TEXT = 4;
/** Minimum links per DENSITY_BASE_CHARS of text for a block to be link-dense. */
const ASIDE_NAV_LINK_DENSITY_THRESHOLD = 0.5;
/** Blocks with fewer links than this are never treated as link-dense. */
const ASIDE_NAV_MIN_LINKS = 10;
/** Presumably a cap on inline demo instruction text — usage not visible here. */
const INLINE_DEMO_INSTRUCTION_MAX_CHARS = 160;
/** Presumably a per-segment cap for redundant previews — usage not visible here. */
const REDUNDANT_PREVIEW_SEGMENT_MAX_CHARS = 60;
/** Presumably a segment-count cap for redundant previews — usage not visible here. */
const REDUNDANT_PREVIEW_MAX_SEGMENTS = 12;
/** Text length unit used to normalise link density. */
const DENSITY_BASE_CHARS = 100;
/** Longest collapsed anchor text still considered a heading permalink glyph. */
const MAX_PERMALINK_TEXT_LENGTH = 2;
/** Presumably the minimum line count for a truncation check — usage not visible here. */
const MIN_LINES_FOR_TRUNCATION_CHECK = 3;
|
|
20
|
-
// ── Regex patterns ──────────────────────────────────────────────────
|
|
21
|
-
const HTML_DOCUMENT_MARKERS = /<\s*(?:!doctype|html|head|body)\b/i;
|
|
22
|
-
const HTML_FRAGMENT_MARKERS = /<\s*(?:article|main|section|div|nav|footer|header|aside|table|ul|ol)\b/i;
|
|
23
|
-
/**
 * Cheap textual probes for markup that is likely to contain removable noise.
 * Each pattern is applied to raw HTML text, so attribute-ish tokens are
 * anchored on a preceding whitespace or quote character.
 */
const NOISE_PATTERNS = [
    // Tags that are structural or navigational noise.
    /<\s*(?:script|style|noscript|iframe|nav|footer|header|form|button|input|select|textarea|svg|canvas)\b/i,
    // ARIA landmark / menu roles.
    /[\s"']role\s*=\s*['"]?(?:navigation|banner|complementary|contentinfo|tree|menubar|menu)['"]?/i,
    // Explicitly hidden elements.
    /[\s"'](?:aria-hidden\s*=\s*['"]?true['"]?|hidden)/i,
    // Promotional / overlay class or id tokens.
    /[\s"'](?:banner|promo|announcement|cta|advert|newsletter|subscribe|cookie|consent|popup|modal|overlay|toast)\b/i,
    // Positional utility classes. `z-4\d?` keeps the historical `z-4` match
    // and additionally covers z-40..z-49, which FIXED_OR_HIGH_Z_PATTERN
    // treats as noise but the previous `z-4\b` alternative could never match.
    /[\s"'](?:fixed|sticky|z-(?:4\d?|50)|breadcrumbs?|pagination)\b/i,
];
|
|
30
|
-
const HEADER_NOISE_PATTERN = /\b(site-header|masthead|topbar|navbar|nav(?:bar)?|menu|header-nav)\b/i;
|
|
31
|
-
const FIXED_OR_HIGH_Z_PATTERN = /\b(?:fixed|sticky|z-(?:4\d|50))\b/;
|
|
32
|
-
const HEADING_PERMALINK_TEXT_PATTERN = /^(?:#|¶|§|¤|🔗)$/u;
|
|
33
|
-
const HEADING_PERMALINK_CLASS_PATTERN = /\b(?:mark|permalink|hash-link|anchor(?:js)?-?link|header-?link|heading-anchor|deep-link)\b/i;
|
|
34
|
-
const HIDDEN_STYLE_REGEX = /\b(?:display\s*:\s*none|visibility\s*:\s*hidden)\b/i;
|
|
35
|
-
const DISPLAY_NONE_REGEX = /display\s*:\s*none/i;
|
|
36
|
-
const DISPLAY_NONE_STRIP_REGEX = /display\s*:\s*none\s*;?/gi;
|
|
37
|
-
const UTM_PARAM_REGEX = /[?&]utm_(?:source|medium|campaign)=/i;
|
|
38
|
-
/** Sentinel regex that intentionally never matches; used for empty token sets. */
|
|
39
|
-
const NO_MATCH_REGEX = /a^/i;
|
|
40
|
-
// ── URL prefixes to skip during resolution ──────────────────────────
|
|
41
|
-
const SKIP_URL_PREFIXES = [
|
|
42
|
-
'#',
|
|
43
|
-
'javascript:',
|
|
44
|
-
'mailto:',
|
|
45
|
-
'tel:',
|
|
46
|
-
'data:',
|
|
47
|
-
'blob:',
|
|
48
|
-
];
|
|
49
|
-
// ── Tag / role sets ─────────────────────────────────────────────────
|
|
50
|
-
const BASE_STRUCTURAL_TAGS = new Set([
|
|
51
|
-
'script',
|
|
52
|
-
'style',
|
|
53
|
-
'noscript',
|
|
54
|
-
'iframe',
|
|
55
|
-
'template',
|
|
56
|
-
'form',
|
|
57
|
-
'button',
|
|
58
|
-
'input',
|
|
59
|
-
'select',
|
|
60
|
-
'textarea',
|
|
61
|
-
]);
|
|
62
|
-
const ALWAYS_NOISE_TAGS = new Set(['nav', 'footer']);
|
|
63
|
-
const NAVIGATION_ROLES = new Set([
|
|
64
|
-
'navigation',
|
|
65
|
-
'banner',
|
|
66
|
-
'complementary',
|
|
67
|
-
'contentinfo',
|
|
68
|
-
'tree',
|
|
69
|
-
'menubar',
|
|
70
|
-
'menu',
|
|
71
|
-
'dialog',
|
|
72
|
-
'alertdialog',
|
|
73
|
-
'search',
|
|
74
|
-
]);
|
|
75
|
-
const INTERACTIVE_CONTENT_ROLES = new Set([
|
|
76
|
-
'tabpanel',
|
|
77
|
-
'tab',
|
|
78
|
-
'tablist',
|
|
79
|
-
'dialog',
|
|
80
|
-
'alertdialog',
|
|
81
|
-
'menu',
|
|
82
|
-
'menuitem',
|
|
83
|
-
'option',
|
|
84
|
-
'listbox',
|
|
85
|
-
'combobox',
|
|
86
|
-
'tooltip',
|
|
87
|
-
'alert',
|
|
88
|
-
]);
|
|
89
|
-
// ── Promo tokens ────────────────────────────────────────────────────
|
|
90
|
-
const PROMO_TOKENS_ALWAYS = [
|
|
91
|
-
'banner',
|
|
92
|
-
'promo',
|
|
93
|
-
'announcement',
|
|
94
|
-
'cta',
|
|
95
|
-
'advert',
|
|
96
|
-
'ads',
|
|
97
|
-
'sponsor',
|
|
98
|
-
'recommend',
|
|
99
|
-
'breadcrumb',
|
|
100
|
-
'breadcrumbs',
|
|
101
|
-
'taglist',
|
|
102
|
-
'twitter-tweet',
|
|
103
|
-
'fb-post',
|
|
104
|
-
'instagram-media',
|
|
105
|
-
'social-embed',
|
|
106
|
-
'author-bio',
|
|
107
|
-
'byline',
|
|
108
|
-
'sharedaddy',
|
|
109
|
-
'sharing',
|
|
110
|
-
];
|
|
111
|
-
const PROMO_TOKENS_AGGRESSIVE = ['ad', 'related', 'comment'];
|
|
112
|
-
const PROMO_TOKENS_BY_CATEGORY = {
|
|
113
|
-
'cookie-banners': ['cookie', 'consent', 'popup', 'modal', 'overlay', 'toast'],
|
|
114
|
-
newsletters: ['newsletter', 'subscribe'],
|
|
115
|
-
'social-share': ['share', 'social', 'share-button'],
|
|
116
|
-
'author-blocks': ['author-bio', 'byline', 'author-info', 'writer-profile'],
|
|
117
|
-
'related-content': [
|
|
118
|
-
'related-post',
|
|
119
|
-
'related-article',
|
|
120
|
-
'more-stories',
|
|
121
|
-
'recommended-posts',
|
|
122
|
-
],
|
|
123
|
-
};
|
|
124
|
-
// ── Noise selector configurations ───────────────────────────────────
|
|
125
|
-
const BASE_NOISE_SELECTORS = {
|
|
126
|
-
navFooter: 'nav,footer,header[class*="site"],header[class*="nav"],header[class*="menu"],[role="banner"],[role="navigation"],[class*="breadcrumb"]',
|
|
127
|
-
cookieBanners: '[role="dialog"]',
|
|
128
|
-
hidden: '[style*="display: none"],[style*="display:none"],[style*="visibility: hidden"],[style*="visibility:hidden"],[hidden],[aria-hidden="true"]',
|
|
129
|
-
};
|
|
130
|
-
/**
 * Selectors for documentation-site chrome (toggles, drawers, edit links,
 * feedback widgets). The previously duplicated '.baseline-indicator' entry is
 * listed once; the bare 'baseline-indicator' entry is a distinct element
 * (tag-name) selector and is kept.
 */
const DOCS_CONTROL_SELECTORS = [
    '.content-icon-container',
    '.edit-this-page',
    '.toc-overlay-icon',
    '.theme-toggle-container',
    '.sidebar-toggle',
    '.sidebar-drawer',
    '.toc-drawer',
    '.mobile-header',
    '.overlay.sidebar-overlay',
    '.overlay.toc-overlay',
    '.baseline-indicator',
    '.back-to-top',
    '.backtotop',
    '.headerlink',
    '[title="Edit this page"]',
    '.article-footer',
    'baseline-indicator',
    'mdn-content-feedback',
    'interactive-example',
];
|
|
152
|
-
let cachedContext;
|
|
153
|
-
let lastContextKey;
|
|
154
|
-
/**
 * Backslash-escapes every regex metacharacter in `value` so the text can be
 * embedded in a RegExp source and match literally.
 */
function escapeRegexLiteral(value) {
    const metaCharacters = /[.*+?^${}()|[\]\\]/g;
    return value.replace(metaCharacters, (ch) => `\\${ch}`);
}
|
|
157
|
-
/**
 * Builds a case-insensitive matcher that finds any token delimited by
 * string edges or non-alphanumeric characters.
 * An empty token set yields the shared never-matching sentinel regex.
 */
function buildTokenRegex(tokens) {
    if (tokens.size === 0) {
        return NO_MATCH_REGEX;
    }
    const alternation = Array.from(tokens, (token) => escapeRegexLiteral(token)).join('|');
    const source = `(?:^|[^a-z0-9])(?:${alternation})(?:$|[^a-z0-9])`;
    return new RegExp(source, 'i');
}
|
|
163
|
-
/**
 * Compiles the promo token matchers for the current noise-removal config.
 *
 * @param currentConfig Noise-removal config; reads `aggressiveMode` and `extraTokens`.
 * @param enabledCategories Set of enabled category names.
 * @returns `{ base, aggressive }` regexes. `base` covers the always-on tokens,
 *   tokens of every enabled category and the normalised extra tokens;
 *   `aggressive` covers the aggressive tokens only when aggressive mode is on.
 */
function getPromoMatchers(currentConfig, enabledCategories) {
    const aggressiveTokens = new Set(currentConfig.aggressiveMode ? PROMO_TOKENS_AGGRESSIVE : []);
    const categoryTokens = Object.entries(PROMO_TOKENS_BY_CATEGORY)
        .filter(([category]) => enabledCategories.has(category))
        .flatMap(([, tokens]) => tokens);
    const baseTokens = new Set([...PROMO_TOKENS_ALWAYS, ...categoryTokens]);
    for (const rawToken of currentConfig.extraTokens) {
        // Extra tokens are user supplied: lower-case, trim, and drop blanks.
        const normalized = rawToken.toLowerCase().trim();
        if (normalized) {
            baseTokens.add(normalized);
        }
    }
    return {
        base: buildTokenRegex(baseTokens),
        aggressive: buildTokenRegex(aggressiveTokens),
    };
}
|
|
186
|
-
/**
 * Joins the always-on "hidden" selector with the optional nav/footer and
 * cookie-banner selectors, depending on which flags are set.
 */
function buildNoiseSelector(flags) {
    const { hidden, navFooter, cookieBanners } = BASE_NOISE_SELECTORS;
    const parts = [
        hidden,
        flags.navFooter ? navFooter : null,
        flags.cookieBanners ? cookieBanners : null,
    ];
    return parts.filter((part) => part !== null).join(',');
}
|
|
194
|
-
/**
 * Builds the selector for elements that are examined individually:
 * structural tags, always-noise tags, aside/header, and anything carrying a
 * class, id, role or style attribute.
 */
function buildCandidateSelector(structuralTags) {
    const attributeProbes = ['[class]', '[id]', '[role]', '[style]'];
    const parts = Array.from(structuralTags).concat(Array.from(ALWAYS_NOISE_TAGS), 'aside', 'header', attributeProbes);
    return parts.join(',');
}
|
|
206
|
-
/**
 * Returns the derived noise-removal context (flags, tag set, matchers and
 * selectors) for the current config.
 * The result is memoised in module state and rebuilt only when the JSON
 * serialisation of the relevant config fields changes.
 */
function getContext() {
    const noiseConfig = config.noiseRemoval;
    const { locale } = config.i18n;
    const { enabledCategories, extraTokens, extraSelectors, aggressiveMode, preserveSvgCanvas } = noiseConfig;
    const contextKey = JSON.stringify({
        locale,
        enabledCategories,
        extraTokens,
        extraSelectors,
        aggressiveMode,
        preserveSvgCanvas,
    });
    const cacheIsFresh = cachedContext !== undefined && lastContextKey === contextKey;
    if (cacheIsFresh) {
        return cachedContext;
    }
    // Normalise category names: lower-case, trim, optional locale-aware lowering.
    const normalizedCategories = enabledCategories.map((category) => {
        const lowered = category.toLowerCase().trim();
        return locale ? lowered.toLocaleLowerCase(locale) : lowered;
    });
    const enabled = new Set(normalizedCategories.filter(Boolean));
    const flags = {
        navFooter: enabled.has('nav-footer'),
        cookieBanners: enabled.has('cookie-banners'),
    };
    const structuralTags = new Set(BASE_STRUCTURAL_TAGS);
    if (!preserveSvgCanvas) {
        for (const graphicTag of ['svg', 'canvas']) {
            structuralTags.add(graphicTag);
        }
    }
    const promoMatchers = getPromoMatchers(noiseConfig, enabled);
    const trimmedExtras = [];
    for (const selector of extraSelectors) {
        const trimmed = selector.trim();
        if (trimmed.length > 0) {
            trimmedExtras.push(trimmed);
        }
    }
    const noiseSelector = buildNoiseSelector(flags);
    const extraSelector = trimmedExtras.length > 0 ? trimmedExtras.join(',') : null;
    const candidateSelector = buildCandidateSelector(structuralTags);
    const promoEnabled = Object.keys(PROMO_TOKENS_BY_CATEGORY).some((category) => enabled.has(category));
    cachedContext = {
        flags,
        structuralTags,
        promoMatchers,
        promoEnabled,
        noiseSelector,
        extraSelector,
        candidateSelector,
    };
    lastContextKey = contextKey;
    return cachedContext;
}
|
|
254
|
-
/**
 * Reports whether an element looks like part of an interactive widget
 * (tabs, accordions, menus, ...), judged by its role and data-* attributes.
 */
function isInteractive(element, role) {
    if (role && INTERACTIVE_CONTENT_ROLES.has(role)) {
        return true;
    }
    const tag = element.tagName.toLowerCase();
    const state = element.getAttribute('data-state');
    const isCollapsedState = state === 'inactive' || state === 'closed';
    // A collapsed data-state only counts for tags that are not structural.
    if (isCollapsedState && !BASE_STRUCTURAL_TAGS.has(tag)) {
        return true;
    }
    const orientation = element.getAttribute('data-orientation');
    if (orientation === 'horizontal' || orientation === 'vertical') {
        return true;
    }
    const widgetMarkers = ['data-accordion-item', 'data-radix-collection-item'];
    return widgetMarkers.some((marker) => element.hasAttribute(marker));
}
|
|
267
|
-
/**
 * True when the element is inside (or is) an article/main region.
 * With `checkDescendants`, an element that contains such a region also counts.
 */
function isPrimaryContent(element, checkDescendants = false) {
    const primarySelector = 'article,main,[role="main"]';
    if (element.closest(primarySelector)) {
        return true;
    }
    return Boolean(checkDescendants && element.querySelector(primarySelector));
}
|
|
274
|
-
/**
 * Heuristic for "this block is mostly a list of links".
 * Requires at least ASIDE_NAV_MIN_LINKS links, then compares links per
 * DENSITY_BASE_CHARS characters of trimmed text against the density threshold.
 * With `checkContainedNav`, any nested <nav> short-circuits to true.
 */
function isLinkDenseNavigation(element, checkContainedNav = false) {
    if (checkContainedNav && element.querySelector('nav')) {
        return true;
    }
    const linkCount = element.querySelectorAll('a[href]').length;
    if (linkCount < ASIDE_NAV_MIN_LINKS) {
        return false;
    }
    const textLength = (element.textContent || '').trim().length;
    if (textLength === 0) {
        // Links with no text at all: treat as pure navigation.
        return true;
    }
    const linksPerUnit = linkCount / (textLength / DENSITY_BASE_CHARS);
    return linksPerUnit >= ASIDE_NAV_LINK_DENSITY_THRESHOLD;
}
|
|
286
|
-
/**
 * Decides whether an element that would otherwise be removed as noise
 * carries enough real content to keep.
 *
 * - dialog/alertdialog roles: kept when in primary content, when the text is
 *   long, or when a heading is present.
 * - nav/footer: kept when wrapping article/main/section content, or when long
 *   and not link-dense.
 * - aside: kept only inside primary content and when not navigation-like.
 * - anything else: never preserved.
 */
function shouldPreserve(element, tagName) {
    const role = element.getAttribute('role');
    const isDialogRole = role === 'dialog' || role === 'alertdialog';
    if (isDialogRole) {
        if (isPrimaryContent(element)) {
            return true;
        }
        const dialogTextLength = (element.textContent || '').length;
        return (dialogTextLength > DIALOG_MIN_CHARS_FOR_PRESERVATION ||
            element.querySelector('h1,h2,h3,h4,h5,h6') !== null);
    }
    switch (tagName) {
        case 'nav':
        case 'footer': {
            if (element.querySelector('article,main,section,[role="main"]')) {
                return true;
            }
            const trimmedLength = (element.textContent || '').trim().length;
            if (trimmedLength < NAV_FOOTER_MIN_CHARS_FOR_PRESERVATION) {
                return false;
            }
            return !isLinkDenseNavigation(element);
        }
        case 'aside':
            // Preserve only article-like asides, not navigation sidebars.
            return isPrimaryContent(element) ? !isLinkDenseNavigation(element, true) : false;
        default:
            return false;
    }
}
|
|
315
|
-
/**
 * Removes every still-attached node in the list (walking from last to first)
 * unless shouldPreserve() vetoes the removal.
 */
function removeNodes(nodes) {
    let index = nodes.length;
    while (index-- > 0) {
        const candidate = nodes[index];
        if (!candidate?.parentNode) {
            continue;
        }
        if (shouldPreserve(candidate, candidate.tagName.toLowerCase())) {
            continue;
        }
        candidate.remove();
    }
}
|
|
323
|
-
/** A structural tag is noise unless the element is interactive. */
function isStructuralNoise(tagName, interactive, context) {
    const isStructuralTag = context.structuralTags.has(tagName);
    return isStructuralTag ? !interactive : false;
}
|
|
326
|
-
/**
 * Classifies navigation chrome when the nav-footer category is enabled.
 *
 * Noise is: any always-noise tag, any <aside>, any element carrying a
 * navigation-type role, and any <header> whose class/id matches the header
 * noise pattern.
 *
 * The previous trailing condition `(tagName !== 'aside' || role !==
 * 'complementary')` could never be false because <aside> had already
 * returned true; it has been dropped without changing any result.
 *
 * @param tagName Lower-cased tag name.
 * @param role Value of the role attribute, or null.
 * @param className Value of the class attribute ('' when absent).
 * @param id Value of the id attribute ('' when absent).
 * @param context Noise context; only `context.flags.navFooter` is read.
 * @returns true when the element is navigation noise.
 */
function isNavigationNoise(tagName, role, className, id, context) {
    if (!context.flags.navFooter) {
        return false;
    }
    if (ALWAYS_NOISE_TAGS.has(tagName) || tagName === 'aside') {
        return true;
    }
    if (role !== null && NAVIGATION_ROLES.has(role)) {
        return true;
    }
    return tagName === 'header' && HEADER_NOISE_PATTERN.test(`${className} ${id}`);
}
|
|
341
|
-
/** Hidden elements are noise unless they belong to an interactive widget. */
function isHiddenNoise(hidden, interactive) {
    return hidden ? !interactive : hidden;
}
|
|
344
|
-
/**
 * Fixed/sticky/high z-index elements are noise when they hold less trimmed
 * text than the nav/footer preservation threshold.
 */
function isPositionalNoise(className, element) {
    if (!FIXED_OR_HIGH_Z_PATTERN.test(className)) {
        return false;
    }
    const visibleTextLength = (element.textContent || '').trim().length;
    return visibleTextLength < NAV_FOOTER_MIN_CHARS_FOR_PRESERVATION;
}
|
|
349
|
-
/**
 * Matches class/id against the promo token regexes.
 * Aggressive tokens only need the element to sit outside primary content;
 * base tokens additionally require that it does not contain primary content.
 */
function isPromoNoise(className, id, element, context) {
    if (!context.promoEnabled) {
        return false;
    }
    const { aggressive, base } = context.promoMatchers;
    const matchesClassOrId = (matcher) => matcher.test(className) || matcher.test(id);
    if (matchesClassOrId(aggressive) && !isPrimaryContent(element)) {
        return true;
    }
    return matchesClassOrId(base) && !isPrimaryContent(element, true);
}
|
|
363
|
-
/**
 * Runs every noise classifier (structural, navigation, hidden, positional,
 * promo) against the element and reports whether any of them matches.
 */
function isNoiseElement(element, context) {
    const readAttr = (name) => element.getAttribute(name);
    const tagName = element.tagName.toLowerCase();
    const role = readAttr('role');
    const className = readAttr('class') ?? '';
    const id = readAttr('id') ?? '';
    const interactive = isInteractive(element, role);
    const inlineStyle = readAttr('style');
    const hidden = element.hasAttribute('hidden') ||
        readAttr('aria-hidden') === 'true' ||
        (inlineStyle !== null && HIDDEN_STYLE_REGEX.test(inlineStyle));
    // Evaluated lazily in this order; the first hit wins.
    const classifiers = [
        () => isStructuralNoise(tagName, interactive, context),
        () => isNavigationNoise(tagName, role, className, id, context),
        () => isHiddenNoise(hidden, interactive),
        () => isPositionalNoise(className, element),
        () => isPromoNoise(className, id, element, context),
    ];
    return classifiers.some((classify) => classify());
}
|
|
379
|
-
/**
 * Removes decorative <div>s nested in a heading: those with "absolute" in
 * the class, "position" in the inline style, or tabindex="-1".
 * Divs are visited from last to first.
 */
function stripHeadingWrapperDivs(h) {
    const wrappers = Array.from(h.querySelectorAll('div')).reverse();
    for (const wrapper of wrappers) {
        if (!wrapper?.parentNode) {
            continue;
        }
        const classValue = wrapper.getAttribute('class') ?? '';
        const styleValue = wrapper.getAttribute('style') ?? '';
        const isDecorative = classValue.includes('absolute') ||
            styleValue.includes('position') ||
            wrapper.getAttribute('tabindex') === '-1';
        if (isDecorative) {
            wrapper.remove();
        }
    }
}
|
|
394
|
-
/**
 * Removes every anchor inside the heading that isHeadingPermalinkAnchor()
 * recognises as a permalink, visiting anchors from last to first.
 */
function stripPermalinkAnchors(h) {
    const anchors = h.querySelectorAll('a');
    let index = anchors.length;
    while (index-- > 0) {
        const anchor = anchors[index];
        if (anchor?.parentNode && isHeadingPermalinkAnchor(anchor)) {
            anchor.remove();
        }
    }
}
|
|
404
|
-
/**
 * Walks all text nodes under `h` and deletes U+200B (zero-width space)
 * characters, writing back only to nodes that actually contain one.
 */
function stripZeroWidthSpaces(h, document) {
    const textWalker = document.createTreeWalker(h, NODE_FILTER_SHOW_TEXT);
    for (let textNode = textWalker.nextNode(); textNode; textNode = textWalker.nextNode()) {
        if (!textNode.textContent?.includes('\u200B')) {
            continue;
        }
        textNode.textContent = textNode.textContent.replace(/\u200B/g, '');
    }
}
|
|
413
|
-
/**
 * Normalises every attached h1–h6: strips decorative wrapper divs,
 * permalink anchors and zero-width spaces, in that order.
 */
function cleanHeadings(document) {
    const headings = Array.from(document.querySelectorAll('h1,h2,h3,h4,h5,h6'));
    for (let index = 0; index < headings.length; index += 1) {
        const heading = headings[index];
        if (heading.parentNode) {
            stripHeadingWrapperDivs(heading);
            stripPermalinkAnchors(heading);
            stripZeroWidthSpaces(heading, document);
        }
    }
}
|
|
423
|
-
/** Returns the anchor's text with all whitespace and zero-width spaces removed. */
function getCollapsedHeadingAnchorText(anchor) {
    const rawText = anchor.textContent || '';
    return rawText.replace(/[\u200B\s]/g, '');
}
|
|
426
|
-
/**
 * Recognises "#"-style permalink anchors inside headings.
 * The href must be a fragment link. The anchor then qualifies when its
 * collapsed text is empty or a known permalink glyph, or when the text is at
 * most MAX_PERMALINK_TEXT_LENGTH long and the anchor has a permalink-style
 * class, aria-hidden="true", or tabindex="-1".
 */
function isHeadingPermalinkAnchor(anchor) {
    const href = anchor.getAttribute('href') ?? '';
    if (!href.startsWith('#')) {
        return false;
    }
    const collapsed = getCollapsedHeadingAnchorText(anchor);
    if (collapsed.length === 0 || HEADING_PERMALINK_TEXT_PATTERN.test(collapsed)) {
        return true;
    }
    // Every remaining rule only applies to very short anchor text.
    if (collapsed.length > MAX_PERMALINK_TEXT_LENGTH) {
        return false;
    }
    if (HEADING_PERMALINK_CLASS_PATTERN.test(anchor.getAttribute('class') ?? '')) {
        return true;
    }
    return (anchor.getAttribute('aria-hidden') === 'true' ||
        anchor.getAttribute('tabindex') === '-1');
}
|
|
444
|
-
/**
 * Moves rows nested inside a cell of `table` so that they directly follow
 * the row that contained them. Rows belonging to an inner table are skipped.
 */
function hoistNestedRows(table) {
    const nestedRows = Array.from(table.querySelectorAll('td tr, th tr'));
    // Walk last-to-first: each row is inserted right after its parent row, so
    // reverse traversal keeps the hoisted rows in document order.
    for (const row of nestedRows.reverse()) {
        if (row?.closest('table') !== table) {
            continue;
        }
        const owningRow = row.parentElement?.closest('tr');
        if (!owningRow || owningRow === row) {
            continue;
        }
        owningRow.after(row);
    }
}
|
|
457
|
-
/**
 * Removes noise elements from the document in three passes:
 *   1. everything matching the base noise selector,
 *   2. everything matching the user-supplied extra selector (if any),
 *   3. every candidate element that isNoiseElement() classifies as noise.
 * Elements vetoed by shouldPreserve() are kept in all passes.
 *
 * When debug is enabled, the audit log lists every selector-level category
 * that is active (previously only 'nav-footer' was reported even when
 * 'cookie-banners' was enabled too).
 *
 * @param document DOM document to clean in place.
 * @param signal Optional abort signal, polled every ABORT_CHECK_INTERVAL candidates.
 * @throws {Error} 'Noise removal aborted' when the signal is aborted during pass 3.
 */
function stripNoise(document, signal) {
    const context = getContext();
    if (config.noiseRemoval.debug) {
        const categories = [];
        if (context.flags.navFooter) {
            categories.push('nav-footer');
        }
        if (context.flags.cookieBanners) {
            categories.push('cookie-banners');
        }
        logDebug('Noise removal audit enabled', { categories }, Loggers.LOG_TRANSFORM);
    }
    // Structural removal
    removeNodes(document.querySelectorAll(context.noiseSelector));
    // Extra selectors are evaluated after base removal so the DOM state is current.
    if (context.extraSelector) {
        removeNodes(document.querySelectorAll(context.extraSelector));
    }
    // Candidates (conditional removal), walked from last to first.
    const candidates = document.querySelectorAll(context.candidateSelector);
    for (let i = candidates.length - 1; i >= 0; i--) {
        if (i % ABORT_CHECK_INTERVAL === 0 && signal?.aborted) {
            throw new Error('Noise removal aborted');
        }
        const node = candidates[i];
        // Skip holes and nodes already detached with a removed ancestor.
        if (!node?.parentNode) {
            continue;
        }
        if (shouldPreserve(node, node.tagName.toLowerCase())) {
            continue;
        }
        if (isNoiseElement(node, context)) {
            node.remove();
        }
    }
}
|
|
488
|
-
/**
 * Splits a srcset string on commas into `{ url, descriptor }` records.
 * The first whitespace-separated token of each candidate is the URL; the
 * remaining tokens (joined by single spaces) form the descriptor.
 */
function parseSrcsetEntries(srcset) {
    const entries = [];
    for (const candidate of srcset.split(',')) {
        const [url = '', ...descriptorParts] = candidate.trim().split(/\s+/);
        entries.push({ url, descriptor: descriptorParts.join(' ') });
    }
    return entries;
}
|
|
494
|
-
function processUrlElement(el, attr, base, isSrcset) {
|
|
495
|
-
if (!el.parentNode)
|
|
496
|
-
return;
|
|
497
|
-
if (isSrcset) {
|
|
498
|
-
const val = el.getAttribute(attr);
|
|
499
|
-
if (val) {
|
|
500
|
-
const newVal = parseSrcsetEntries(val)
|
|
501
|
-
.map((entry) => {
|
|
502
|
-
if (!entry.url)
|
|
503
|
-
return entry.descriptor;
|
|
504
|
-
const resolved = URL.parse(entry.url, base)?.href ?? entry.url;
|
|
505
|
-
return entry.descriptor
|
|
506
|
-
? `${resolved} ${entry.descriptor}`
|
|
507
|
-
: resolved;
|
|
508
|
-
})
|
|
509
|
-
.join(', ');
|
|
510
|
-
el.setAttribute(attr, newVal);
|
|
511
|
-
}
|
|
512
|
-
return;
|
|
513
|
-
}
|
|
514
|
-
const val = el.getAttribute(attr);
|
|
515
|
-
if (val &&
|
|
516
|
-
!SKIP_URL_PREFIXES.some((p) => val.trim().toLowerCase().startsWith(p))) {
|
|
517
|
-
const resolved = URL.parse(val, base);
|
|
518
|
-
if (resolved)
|
|
519
|
-
el.setAttribute(attr, resolved.href);
|
|
520
|
-
}
|
|
521
|
-
}
|
|
522
|
-
// Rewrite WordPress Photon CDN image URLs to point to the original host, since srcset URLs are often preserved with the updated domain while src is not.
|
|
523
|
-
// This ensures images are correctly resolved when the page is migrated to a new domain but still references the old domain in img src attributes.
|
|
524
|
-
export const WP_PHOTON_HOST_PATTERN = /^i\d\.wp\.com$/;
|
|
525
|
-
function rewritePhotonSrc(document, pageHost) {
|
|
526
|
-
for (const img of document.querySelectorAll('img[src]')) {
|
|
527
|
-
const src = img.getAttribute('src');
|
|
528
|
-
if (!src)
|
|
529
|
-
continue;
|
|
530
|
-
const parsed = URL.parse(src);
|
|
531
|
-
if (!parsed || !WP_PHOTON_HOST_PATTERN.test(parsed.hostname))
|
|
532
|
-
continue;
|
|
533
|
-
if (img.getAttribute('srcset'))
|
|
534
|
-
continue;
|
|
535
|
-
const segments = parsed.pathname.split('/').filter(Boolean);
|
|
536
|
-
if (segments.length < 2)
|
|
537
|
-
continue;
|
|
538
|
-
const originHost = segments[0];
|
|
539
|
-
if (!originHost?.includes('.'))
|
|
540
|
-
continue;
|
|
541
|
-
const resourcePath = `/${segments.slice(1).join('/')}`;
|
|
542
|
-
const rewritten = `https://${pageHost}${resourcePath}`;
|
|
543
|
-
img.setAttribute('src', rewritten);
|
|
544
|
-
}
|
|
545
|
-
}
|
|
546
|
-
// For images with src URLs pointing to a different domain than the page, check if their srcset contains a same-domain URL and prefer that for the src attribute.
|
|
547
|
-
// This can help preserve image loading when migrating content that references an old domain, as srcset entries are often left unchanged while src attributes are updated or removed.
|
|
548
|
-
function preferSameDomainSrc(document, base) {
|
|
549
|
-
const pageHost = base.hostname;
|
|
550
|
-
for (const img of document.querySelectorAll('img[src][srcset]')) {
|
|
551
|
-
const src = img.getAttribute('src');
|
|
552
|
-
if (!src)
|
|
553
|
-
continue;
|
|
554
|
-
const srcParsed = URL.parse(src);
|
|
555
|
-
if (!srcParsed || srcParsed.hostname === pageHost)
|
|
556
|
-
continue;
|
|
557
|
-
const srcset = img.getAttribute('srcset') ?? '';
|
|
558
|
-
for (const entry of parseSrcsetEntries(srcset)) {
|
|
559
|
-
if (!entry.url)
|
|
560
|
-
continue;
|
|
561
|
-
const parsed = URL.parse(entry.url);
|
|
562
|
-
if (parsed?.hostname === pageHost) {
|
|
563
|
-
img.setAttribute('src', entry.url);
|
|
564
|
-
break;
|
|
565
|
-
}
|
|
566
|
-
}
|
|
567
|
-
}
|
|
568
|
-
}
|
|
569
|
-
/**
 * Copies <img> elements out of <noscript> fallbacks, inserting clones
 * immediately before the <noscript>.
 *
 * Skipped cases:
 * - the noscript holds no image (neither as DOM children nor as raw markup);
 * - the previous sibling is, or contains, an <img> (the lazy-load placeholder
 *   is already present and handled elsewhere);
 * - images whose width or height attribute is "1" (tracking pixels).
 */
export function extractNoscriptImages(document) {
    const isTrackingPixel = (img) => img.getAttribute('width') === '1' || img.getAttribute('height') === '1';
    for (const noscript of document.querySelectorAll('noscript')) {
        // linkedom may expose noscript content as DOM children or as raw text.
        let images = Array.from(noscript.querySelectorAll('img'));
        if (images.length === 0) {
            const rawMarkup = noscript.innerHTML || noscript.textContent || '';
            if (!/<img\b/i.test(rawMarkup)) {
                continue;
            }
            const fragment = parseHTML(`<body>${rawMarkup}</body>`).document;
            images = Array.from(fragment.querySelectorAll('img'));
            if (images.length === 0) {
                continue;
            }
        }
        const previous = noscript.previousElementSibling;
        if (previous?.tagName === 'IMG' || previous?.querySelector('img')) {
            continue;
        }
        for (const image of images) {
            if (!isTrackingPixel(image)) {
                noscript.before(image.cloneNode(true));
            }
        }
    }
}
|
|
598
|
-
/**
 * Runs the image-source rewrites and then hands every link, image and
 * <source> element to `processUrlElement` with the parsed base URL.
 *
 * Does nothing when `baseUrlStr` cannot be parsed as an absolute URL.
 *
 * @param document DOM document to mutate in place.
 * @param baseUrlStr Page URL as a string.
 */
function resolveUrls(document, baseUrlStr) {
    const base = URL.parse(baseUrlStr);
    if (!base)
        return;
    rewritePhotonSrc(document, base.hostname);
    preferSameDomainSrc(document, base);
    // tag name -> [attribute to process, flag passed as the 4th argument]
    const attributeByTag = new Map([
        ['a', ['href', false]],
        ['img', ['src', false]],
        ['source', ['srcset', true]],
    ]);
    for (const el of document.querySelectorAll('a[href],img[src],source[srcset]')) {
        const target = attributeByTag.get(el.tagName.toLowerCase());
        if (!target)
            continue;
        const [attribute, flag] = target;
        processUrlElement(el, attribute, base, flag);
    }
}
|
|
615
|
-
/**
 * Returns the element's trimmed innerHTML when it is longer than
 * MIN_BODY_CONTENT_LENGTH characters.
 *
 * @param element Element to inspect; may be null/undefined.
 * @returns The trimmed markup, or null when the element is missing or too short.
 */
function getValidContentHtml(element) {
    if (!element)
        return null;
    const markup = element.innerHTML.trim();
    if (markup.length > MIN_BODY_CONTENT_LENGTH)
        return markup;
    return null;
}
|
|
621
|
-
/**
 * Picks the <body> element to use as the content root.
 *
 * `document.body` wins when it holds enough markup; otherwise the first
 * BODY child of the root element that does is returned. When nothing
 * qualifies, `document.body` is returned unchanged.
 *
 * @param document DOM document to inspect.
 * @returns The selected body element.
 */
export function resolveDocumentBody(document) {
    const primaryBody = document.body;
    if (getValidContentHtml(primaryBody))
        return primaryBody;
    for (const candidate of document.documentElement.children) {
        if (candidate.tagName !== 'BODY')
            continue;
        if (getValidContentHtml(candidate))
            return candidate;
    }
    return primaryBody;
}
|
|
633
|
-
/**
 * Serializes a prepared document to an HTML string.
 *
 * Preference order: the resolved body's markup, then the full root
 * element's outerHTML (when longer than MIN_BODY_CONTENT_LENGTH), then
 * `fallback`.
 *
 * @param document DOM document to serialize.
 * @param fallback Value returned when the document has too little markup.
 */
export function serializeDocumentForMarkdown(document, fallback) {
    const bodyMarkup = getValidContentHtml(resolveDocumentBody(document));
    if (bodyMarkup)
        return bodyMarkup;
    const documentMarkup = document.documentElement.outerHTML.trim();
    return documentMarkup.length > MIN_BODY_CONTENT_LENGTH
        ? documentMarkup
        : fallback;
}
|
|
643
|
-
/**
 * Tests the markup against HTML_DOCUMENT_MARKERS.
 *
 * @param html Markup to test.
 * @returns True when the pattern matches.
 */
function isFullDocumentHtml(html) {
    const matchesDocumentMarker = HTML_DOCUMENT_MARKERS.test(html);
    return matchesDocumentMarker;
}
|
|
646
|
-
/**
 * Cheap pre-check: does any NOISE_PATTERNS entry match the markup?
 *
 * Inputs longer than NOISE_SCAN_LIMIT are only scanned at their first and
 * last NOISE_SCAN_LIMIT characters (joined by a newline).
 *
 * @param html Markup to scan.
 * @returns True when at least one pattern matches the scanned sample.
 */
function mayContainNoise(html) {
    let sample = html;
    if (html.length > NOISE_SCAN_LIMIT) {
        const head = html.substring(0, NOISE_SCAN_LIMIT);
        const tail = html.substring(html.length - NOISE_SCAN_LIMIT);
        sample = `${head}\n${tail}`;
    }
    for (const pattern of NOISE_PATTERNS) {
        if (pattern.test(sample))
            return true;
    }
    return false;
}
|
|
652
|
-
/**
 * Makes tab panels visible: strips the inline display:none declaration
 * (as matched by DISPLAY_NONE_REGEX / DISPLAY_NONE_STRIP_REGEX) and removes
 * the `hidden` attribute from every tab panel element.
 *
 * @param document DOM document to mutate in place.
 */
function surfaceHiddenTabPanels(document) {
    const panelSelector = '[data-slot="tabContent"], [role="tabpanel"]';
    for (const panel of document.querySelectorAll(panelSelector)) {
        const inlineStyle = panel.getAttribute('style') ?? '';
        const hiddenInline = DISPLAY_NONE_REGEX.test(inlineStyle);
        if (hiddenInline) {
            const visibleStyle = inlineStyle.replace(DISPLAY_NONE_STRIP_REGEX, '').trim();
            panel.setAttribute('style', visibleStyle);
        }
        panel.removeAttribute('hidden');
    }
}
|
|
662
|
-
/**
 * Removes every `[role="tab"]` element that is not marked as selected.
 *
 * A tab counts as selected when `aria-selected="true"`,
 * `data-state="active"`, or a `data-selected` attribute is present.
 *
 * @param document DOM document to mutate in place.
 */
function stripTabTriggers(document) {
    const isSelectedTab = (tab) => tab.getAttribute('aria-selected') === 'true' ||
        tab.getAttribute('data-state') === 'active' ||
        tab.hasAttribute('data-selected');
    const triggers = document.querySelectorAll('[role="tab"]');
    // Walk from the end of the list towards the start.
    let index = triggers.length;
    while (index > 0) {
        index -= 1;
        const trigger = triggers[index];
        if (trigger && !isSelectedTab(trigger))
            trigger.remove();
    }
}
|
|
676
|
-
/**
 * Tab normalization pass: first reveals hidden tab panels, then removes
 * the tab triggers that are not selected.
 *
 * @param document DOM document to mutate in place.
 */
export function normalizeTabContent(document) {
    for (const step of [surfaceHiddenTabPanels, stripTabTriggers]) {
        step(document);
    }
}
|
|
681
|
-
/**
 * Replaces a block element with a <span> that holds the same children
 * (padded with one space text node on each side) and the same attributes.
 *
 * Detached elements (no parent) are left alone.
 *
 * @param block Element to replace.
 * @param document Owner document used to create the new nodes.
 */
function convertBlockToSpan(block, document) {
    if (!block.parentNode)
        return;
    const inline = document.createElement('span');
    inline.appendChild(document.createTextNode(' '));
    // appendChild moves the node, so firstChild advances on every iteration.
    for (let child = block.firstChild; child; child = block.firstChild) {
        inline.appendChild(child);
    }
    inline.appendChild(document.createTextNode(' '));
    for (const { name, value } of Array.from(block.attributes)) {
        inline.setAttribute(name, value);
    }
    block.replaceWith(inline);
}
|
|
695
|
-
/**
 * Rewrites every non-empty text node under a table cell: line breaks
 * become single spaces and `|` characters are backslash-escaped.
 *
 * @param cell Table cell whose text nodes are rewritten in place.
 * @param document Owner document used to create the tree walker.
 */
function normalizeTableCellTextNodes(cell, document) {
    const textWalker = document.createTreeWalker(cell, NODE_FILTER_SHOW_TEXT);
    for (let textNode = textWalker.nextNode(); textNode; textNode = textWalker.nextNode()) {
        const original = textNode.nodeValue;
        if (!original)
            continue;
        const singleLine = original.replace(/\r?\n/g, ' ');
        textNode.nodeValue = singleLine.includes('|')
            ? singleLine.replace(/\|/g, '\\|')
            : singleLine;
    }
}
|
|
707
|
-
/**
 * Flattens the content of every td/th: <br> elements become spaces, the
 * listed block-level descendants are converted to spans, and the cell's
 * text nodes are normalized.
 *
 * @param document DOM document to mutate in place.
 */
function normalizeTableCells(document) {
    const blockSelector = 'div, p, ul, li, h1, h2, h3, h4, h5, h6';
    for (const cell of document.querySelectorAll('td, th')) {
        for (const lineBreak of cell.querySelectorAll('br')) {
            lineBreak.replaceWith(' ');
        }
        // Snapshot the matches first: conversion replaces nodes in the tree.
        const blockElements = Array.from(cell.querySelectorAll(blockSelector));
        for (const blockElement of blockElements) {
            convertBlockToSpan(blockElement, document);
        }
        normalizeTableCellTextNodes(cell, document);
    }
}
|
|
721
|
-
/**
 * Collapses every run of whitespace to a single space and removes leading
 * and trailing whitespace.
 *
 * @param value Text to normalize.
 * @returns The normalized text.
 */
function normalizeWhitespace(value) {
    const words = value.split(/\s+/).filter((word) => word.length > 0);
    return words.join(' ');
}
|
|
724
|
-
/**
 * Reports whether the element is a <pre>, or has a direct child that is a
 * <pre> or contains one.
 *
 * @param element Element to inspect.
 * @returns True when a <pre> is found.
 */
function hasDirectPreDescendant(element) {
    if (element.tagName === 'PRE')
        return true;
    for (const child of Array.from(element.children)) {
        if (child.tagName === 'PRE')
            return true;
        if (child.querySelector('pre') !== null)
            return true;
    }
    return false;
}
|
|
728
|
-
/**
 * Collects the distinct, whitespace-normalized texts of the childless
 * p/li/div/span descendants, in document order.
 *
 * Empty texts and texts longer than REDUNDANT_PREVIEW_SEGMENT_MAX_CHARS
 * are ignored. When no such segment exists, the element's own normalized
 * text is returned as the only segment (or an empty list if that is empty).
 *
 * @param element Root element to scan.
 * @returns Array of unique text segments.
 */
function collectLeafTextSegments(element) {
    // A Set keeps insertion order, so it doubles as the de-duplicated result list.
    const uniqueSegments = new Set();
    for (const candidate of element.querySelectorAll('p,li,div,span')) {
        const isLeaf = candidate.children.length === 0 &&
            candidate.querySelector('pre,code,table,ul,ol,blockquote,figure') === null;
        if (!isLeaf)
            continue;
        const text = normalizeWhitespace(candidate.textContent || '');
        const isUsable = text.length > 0 && text.length <= REDUNDANT_PREVIEW_SEGMENT_MAX_CHARS;
        if (isUsable)
            uniqueSegments.add(text);
    }
    if (uniqueSegments.size > 0)
        return [...uniqueSegments];
    const wholeText = normalizeWhitespace(element.textContent || '');
    return wholeText ? [wholeText] : [];
}
|
|
751
|
-
/**
 * Reports whether the whole string looks like a host name: letters,
 * digits, dots and hyphens followed by a dot and an alphabetic suffix of
 * at least two letters.
 *
 * @param value Text to test.
 * @returns True when the text matches the host name shape.
 */
function isHostnameLike(value) {
    const hostnameShape = /^[a-z0-9.-]+\.[a-z]{2,}$/i;
    return hostnameShape.test(value);
}
|
|
754
|
-
/**
 * Reports whether the element contains an <svg> or <canvas> descendant.
 *
 * @param element Element to inspect.
 * @returns True when such a descendant exists.
 */
function hasPreviewMedia(element) {
    const media = element.querySelector('svg,canvas');
    return media !== null;
}
|
|
757
|
-
/**
 * Reports whether a preview pane must be kept: it is a <figcaption>, or it
 * contains links, form controls, embedded media, tables, lists or
 * block quotes.
 *
 * @param preview Element to inspect.
 * @returns True when the pane holds interactive or structured content.
 */
function hasInteractiveOrComplexContent(preview) {
    if (preview.tagName === 'FIGCAPTION')
        return true;
    const complexSelector = 'a[href],button,input,select,textarea,form,video,audio,iframe,table,ul,ol,blockquote';
    const firstComplex = preview.querySelector(complexSelector);
    return firstComplex !== null;
}
|
|
762
|
-
/**
 * Validates a segment list: it must be non-empty, hold at most
 * REDUNDANT_PREVIEW_MAX_SEGMENTS entries, and each entry must be at most
 * REDUNDANT_PREVIEW_SEGMENT_MAX_CHARS characters long.
 *
 * @param segments Array of text segments.
 * @returns True when all three conditions hold.
 */
function hasValidTextSegments(segments) {
    if (segments.length === 0)
        return false;
    if (segments.length > REDUNDANT_PREVIEW_MAX_SEGMENTS)
        return false;
    for (const segment of segments) {
        if (segment.length > REDUNDANT_PREVIEW_SEGMENT_MAX_CHARS)
            return false;
    }
    return true;
}
|
|
767
|
-
/**
 * Decides whether a preview pane merely repeats text that already appears
 * in the accompanying code container.
 *
 * The pane is redundant when every text segment occurs in the code text,
 * or when at least one does and the pane also contains svg/canvas media or
 * a host-name-like segment. Panes with interactive or structured content,
 * panes without valid segments, and empty code containers are never
 * redundant.
 *
 * @param preview Candidate preview element.
 * @param codeContainer Element holding the code block.
 * @returns True when the preview can be removed.
 */
function isRedundantCodePreview(preview, codeContainer) {
    if (hasInteractiveOrComplexContent(preview))
        return false;
    const segments = collectLeafTextSegments(preview);
    if (!hasValidTextSegments(segments))
        return false;
    const codeText = normalizeWhitespace(codeContainer.textContent || '');
    if (!codeText)
        return false;
    let matchCount = 0;
    for (const segment of segments) {
        if (codeText.includes(segment))
            matchCount += 1;
    }
    if (matchCount === segments.length)
        return true;
    if (matchCount === 0)
        return false;
    return hasPreviewMedia(preview) || segments.some((segment) => isHostnameLike(segment));
}
|
|
782
|
-
/**
 * Within each <figure> that has a code-bearing direct child, removes the
 * other direct children that only repeat the code's text. <figcaption>
 * children are always kept.
 *
 * @param document DOM document to mutate in place.
 */
function pruneFigurePreviewPanes(document) {
    for (const figure of document.querySelectorAll('figure')) {
        const panes = Array.from(figure.children);
        const codePane = panes.find((pane) => hasDirectPreDescendant(pane));
        if (!codePane)
            continue;
        for (const pane of panes) {
            const isProtected = pane === codePane || pane.tagName === 'FIGCAPTION';
            if (!isProtected && isRedundantCodePreview(pane, codePane))
                pane.remove();
        }
    }
}
|
|
796
|
-
/**
 * Heuristic for short instruction snippets placed before a code demo.
 *
 * The element qualifies when it has no links, code, tables, lists, quotes,
 * figures or headings inside; its normalized text is non-empty, no longer
 * than INLINE_DEMO_INSTRUCTION_MAX_CHARS and does not end with `.`, `!`
 * or `?`; and it yields at most three leaf text segments.
 *
 * @param element Element to classify.
 * @returns True when the element looks like a demo instruction.
 */
function isDemoInstructionBlock(element) {
    const structuredSelector = 'a[href],pre,code,table,ul,ol,blockquote,figure,h1,h2,h3,h4,h5,h6';
    if (element.querySelector(structuredSelector) !== null)
        return false;
    const text = normalizeWhitespace(element.textContent || '');
    if (text.length === 0)
        return false;
    if (text.length > INLINE_DEMO_INSTRUCTION_MAX_CHARS)
        return false;
    if (/[.!?]$/.test(text))
        return false;
    const segmentCount = collectLeafTextSegments(element).length;
    return segmentCount <= 3;
}
|
|
808
|
-
/**
 * In each div/section/article, removes the instruction-like children that
 * precede the first direct <figure> child containing a <pre>.
 *
 * Containers without such a figure, or where it is the first child, are
 * left untouched.
 *
 * @param document DOM document to mutate in place.
 */
function pruneDemoInstructionBlocks(document) {
    const isCodeFigure = (node) => node.tagName === 'FIGURE' && node.querySelector('pre') !== null;
    for (const container of document.querySelectorAll('div,section,article')) {
        const siblings = Array.from(container.children);
        const figureIndex = siblings.findIndex(isCodeFigure);
        if (figureIndex <= 0)
            continue;
        for (const sibling of siblings.slice(0, figureIndex)) {
            if (sibling && isDemoInstructionBlock(sibling))
                sibling.remove();
        }
    }
}
|
|
821
|
-
/**
 * Restores line breaks in highlighted code whose lines are rendered as
 * sibling <span class="line"> elements with no newline text between them.
 *
 * Only `pre > code` elements are touched whose element children are all
 * spans (at least two), at least one of which carries the `line` class,
 * and which have no direct text child containing a line break. A "\n" text
 * node is inserted after every span except the last.
 *
 * @param document DOM document to mutate in place.
 */
function normalizeHighlightedCodeLines(document) {
    const hasLineClass = (el) => (el.getAttribute('class') ?? '').split(/\s+/).includes('line');
    const isNewlineText = (node) => node.nodeType === 3 && /[\r\n]/.test(node.textContent ?? '');
    for (const code of document.querySelectorAll('pre > code')) {
        const lineElements = Array.from(code.children);
        if (lineElements.length < 2)
            continue;
        if (!lineElements.every((el) => el.tagName === 'SPAN'))
            continue;
        const alreadyHasBreaks = Array.from(code.childNodes).some(isNewlineText);
        if (alreadyHasBreaks || !lineElements.some(hasLineClass))
            continue;
        for (const lineElement of lineElements.slice(0, -1)) {
            const following = lineElement.nextSibling;
            const followedByBreak = following?.nodeType === 3 &&
                (following.textContent ?? '').startsWith('\n');
            if (!followedByBreak)
                lineElement.after(document.createTextNode('\n'));
        }
    }
}
|
|
843
|
-
// Candidate elements inside a <pre> that may be "copy" controls.
const COPY_BUTTON_SELECTOR = 'button,a[href="#copy"],a[href="#"],span[class*="copy"]';
// Matches a label that is exactly "copy" or "copy code" (any letter case).
const COPY_BUTTON_TEXT_PATTERN = /^copy(?: code)?$/i;
/**
 * Removes copy-to-clipboard controls from inside <pre> elements.
 *
 * A candidate is removed when it is a <button>, when its trimmed text is
 * "copy" / "copy code", or when its href contains "#copy".
 *
 * @param document DOM document to mutate in place.
 */
function stripCodeBlockCopyButtons(document) {
    const isCopyControl = (el) => {
        const label = (el.textContent || '').trim();
        if (el.tagName === 'BUTTON')
            return true;
        if (COPY_BUTTON_TEXT_PATTERN.test(label))
            return true;
        return (el.getAttribute('href') ?? '').includes('#copy');
    };
    for (const pre of document.querySelectorAll('pre')) {
        const matches = pre.querySelectorAll(COPY_BUTTON_SELECTOR);
        // Walk from the end of the list towards the start.
        for (let index = matches.length - 1; index >= 0; index -= 1) {
            const candidate = matches[index];
            if (candidate && isCopyControl(candidate))
                candidate.remove();
        }
    }
}
|
|
861
|
-
/**
 * Runs the code-example cleanup steps in a fixed order: copy buttons,
 * redundant figure previews, demo instruction blocks, then line-break
 * restoration for highlighted code.
 *
 * @param document DOM document to mutate in place.
 */
function cleanCodeExamples(document) {
    const steps = [
        stripCodeBlockCopyButtons,
        pruneFigurePreviewPanes,
        pruneDemoInstructionBlocks,
        normalizeHighlightedCodeLines,
    ];
    for (const step of steps) {
        step(document);
    }
}
|
|
867
|
-
/**
 * Removes every link whose href matches UTM_PARAM_REGEX.
 *
 * @param document DOM document to mutate in place.
 */
function stripPromoLinks(document) {
    const anchors = document.querySelectorAll('a[href]');
    // Walk from the end of the list towards the start.
    let index = anchors.length;
    while (index > 0) {
        index -= 1;
        const anchor = anchors[index];
        if (!anchor)
            continue;
        const target = anchor.getAttribute('href');
        if (target && UTM_PARAM_REGEX.test(target))
            anchor.remove();
    }
}
|
|
879
|
-
/**
 * Inserts a single space text node after every badge/label element that
 * is immediately followed by another element, so their texts do not run
 * together.
 *
 * @param document DOM document to mutate in place.
 */
function separateAdjacentInlineElements(document) {
    const badgeSelector = 'span.chakra-badge, [data-scope="badge"], [class*="badge"], [data-slot="label"], [slot="label"]';
    for (const badge of document.querySelectorAll(badgeSelector)) {
        const followedByElement = badge.nextSibling?.nodeType === 1;
        if (!followedByElement)
            continue;
        badge.after(document.createTextNode(' '));
    }
}
|
|
888
|
-
// Captures the language name from a `language-xyz` class token.
const CODE_EDITOR_LANG_REGEX = /\blanguage-(\S+)/;
/**
 * Replaces editor-style code widgets with a plain <pre><code> block.
 *
 * Targets <pre aria-hidden="true"> elements that contain a <code> and
 * whose parent also contains a <textarea>. The new block takes its text
 * from the textarea and its `language-*` class from the highlighted
 * <code>; the original <pre> and the <textarea> are removed.
 *
 * @param document DOM document to mutate in place.
 */
export function surfaceCodeEditorContent(document) {
    for (const highlightedPre of document.querySelectorAll('pre[aria-hidden="true"]')) {
        const highlightedCode = highlightedPre.querySelector('code');
        if (!highlightedCode)
            continue;
        const container = highlightedPre.parentElement;
        if (!container)
            continue;
        const textarea = container.querySelector('textarea');
        if (!textarea)
            continue;
        // The language is read from the highlighted <code>'s class attribute.
        const classAttr = highlightedCode.getAttribute('class') ?? '';
        const language = CODE_EDITOR_LANG_REGEX.exec(classAttr)?.[1] ?? '';
        // Build the replacement block from the textarea's plain text.
        const plainPre = document.createElement('pre');
        const plainCode = document.createElement('code');
        if (language)
            plainCode.setAttribute('class', `language-${language}`);
        plainCode.textContent = textarea.textContent || '';
        plainPre.appendChild(plainCode);
        container.insertBefore(plainPre, highlightedPre);
        highlightedPre.remove();
        textarea.remove();
    }
}
|
|
917
|
-
/**
 * Removes every element matched by any of the DOCS_CONTROL_SELECTORS.
 *
 * @param document DOM document to mutate in place.
 */
export function stripDocsControls(document) {
    const combinedSelector = DOCS_CONTROL_SELECTORS.join(',');
    const controls = document.querySelectorAll(combinedSelector);
    removeNodes(controls);
}
|
|
920
|
-
/**
 * Removes elements carrying one of the listed screen-reader-only /
 * visually-hidden class names.
 *
 * @param document DOM document to mutate in place.
 */
export function stripScreenReaderText(document) {
    const exactClassSelectors = [
        '.sr-only',
        '.screen-reader-text',
        '.visually-hidden',
    ];
    // Substring matches on the class attribute.
    const partialClassSelectors = ['[class*="sr-only"]', '[class*="visually-hidden"]'];
    const otherClassSelectors = ['.cdk-visually-hidden', '.vh', '.hidden-visually'];
    const combinedSelector = [
        ...exactClassSelectors,
        ...partialClassSelectors,
        ...otherClassSelectors,
    ].join(',');
    removeNodes(document.querySelectorAll(combinedSelector));
}
|
|
933
|
-
/**
 * Removes `[aria-live]` elements whose trimmed text is non-empty and at
 * most INLINE_DEMO_INSTRUCTION_MAX_CHARS characters long.
 *
 * @param document DOM document to mutate in place.
 */
function stripAriaLiveInstructions(document) {
    for (const region of document.querySelectorAll('[aria-live]')) {
        const message = (region.textContent || '').trim();
        if (message.length === 0)
            continue;
        if (message.length <= INLINE_DEMO_INSTRUCTION_MAX_CHARS)
            region.remove();
    }
}
|
|
941
|
-
/**
 * Documentation-site cleanup pass. Runs, in order: tab normalization,
 * code-editor surfacing, heading cleanup, docs-control removal, aria-live
 * instruction removal, promo-link removal and badge spacing.
 *
 * @param document DOM document to mutate in place.
 */
export function runDocsControlPass(document) {
    const steps = [
        normalizeTabContent,
        surfaceCodeEditorContent,
        cleanHeadings,
        stripDocsControls,
        stripAriaLiveInstructions,
        stripPromoLinks,
        separateAdjacentInlineElements,
    ];
    for (const step of steps) {
        step(document);
    }
}
|
|
950
|
-
// Parent tag names under which a <button> is unwrapped (replaced by its children).
const PHRASING_PARENTS = new Set([
    'P',
    'LI',
    'TD',
    'TH',
    'DD',
    'SPAN',
    'LABEL',
    'FIGCAPTION',
    'BLOCKQUOTE',
]);
/**
 * Replaces each <button> whose parent is one of PHRASING_PARENTS with the
 * button's own child nodes.
 *
 * @param document DOM document to mutate in place.
 */
function unwrapInlineButtons(document) {
    for (const button of document.querySelectorAll('button')) {
        const host = button.parentElement;
        const isInline = Boolean(host) && PHRASING_PARENTS.has(host.tagName);
        if (isInline) {
            const contents = Array.from(button.childNodes);
            button.replaceWith(...contents);
        }
    }
}
|
|
969
|
-
/**
 * Structural cleanup pass: unwraps inline buttons, then runs `stripNoise`.
 *
 * @param document DOM document to mutate in place.
 * @param signal Forwarded unchanged to `stripNoise`.
 */
function runStructuralNoisePass(document, signal) {
    // Buttons are unwrapped before stripNoise runs.
    unwrapInlineButtons(document);
    stripNoise(document, signal);
}
|
|
973
|
-
/**
 * Code-example pass; delegates to `cleanCodeExamples`.
 *
 * @param document DOM document to mutate in place.
 */
function runCodeExamplePass(document) {
    cleanCodeExamples(document);
}
|
|
976
|
-
/**
 * Replaces every td/th that has no <table> ancestor with its own child
 * nodes.
 *
 * @param document DOM document to mutate in place.
 */
function unwrapOrphanedTableCells(document) {
    for (const cell of document.querySelectorAll('td, th')) {
        const owningTable = cell.closest('table');
        if (owningTable)
            continue;
        const contents = Array.from(cell.childNodes);
        cell.replaceWith(...contents);
    }
}
|
|
983
|
-
/**
 * Table pass. Runs, in order: orphaned-cell unwrapping, cell content
 * flattening, and table structure repair.
 *
 * @param document DOM document to mutate in place.
 */
function runTableNormalizationPass(document) {
    const steps = [
        unwrapOrphanedTableCells,
        normalizeTableCells,
        normalizeTableStructure,
    ];
    for (const step of steps) {
        step(document);
    }
}
|
|
988
|
-
/**
 * URL pass: resolves URLs against `baseUrl`, or does nothing when no base
 * URL was supplied.
 *
 * @param document DOM document to mutate in place.
 * @param baseUrl Page URL string; falsy values skip the pass.
 */
function runUrlResolutionPass(document, baseUrl) {
    if (!baseUrl)
        return;
    resolveUrls(document, baseUrl);
}
|
|
992
|
-
/**
 * Runs the full preparation pipeline in a fixed order: noscript images,
 * docs controls, structural noise, code examples, tables, URL resolution.
 *
 * Called on both raw documents (pre-article path) and article fragments
 * (post-Readability). Some passes (stripTabTriggers, etc.) are no-ops on
 * Readability output since tabs are already stripped or absent.
 *
 * @param document DOM document to mutate in place.
 * @param baseUrl Page URL string used by the URL pass; may be falsy.
 * @param signal Forwarded to the structural noise pass.
 */
export function prepareDocumentForMarkdown(document, baseUrl, signal) {
    const passes = [
        () => extractNoscriptImages(document),
        () => runDocsControlPass(document),
        () => runStructuralNoisePass(document, signal),
        () => runCodeExamplePass(document),
        () => runTableNormalizationPass(document),
        () => runUrlResolutionPass(document, baseUrl),
    ];
    for (const runPass of passes) {
        runPass();
    }
}
|
|
1003
|
-
/**
 * Repairs table markup that would break markdown table output.
 *
 * For each table: <td> cells inside <thead> become <th> (same markup and
 * attributes); any tbody/thead/tfoot found inside a cell (some sites nest
 * them there) is moved to the end of the table; finally `hoistNestedRows`
 * is applied.
 *
 * @param document DOM document to mutate in place.
 */
function normalizeTableStructure(document) {
    const sectionTags = ['tbody', 'thead', 'tfoot'];
    for (const table of document.querySelectorAll('table')) {
        for (const headerCell of table.querySelectorAll('thead td')) {
            const replacement = document.createElement('th');
            replacement.innerHTML = headerCell.innerHTML;
            for (const { name, value } of Array.from(headerCell.attributes)) {
                replacement.setAttribute(name, value);
            }
            headerCell.replaceWith(replacement);
        }
        for (const cell of table.querySelectorAll('th, td')) {
            for (const sectionTag of sectionTags) {
                // appendChild moves the section out of the cell, so re-query until none remain.
                for (let section = cell.querySelector(sectionTag); section; section = cell.querySelector(sectionTag)) {
                    table.appendChild(section);
                }
            }
        }
        hoistNestedRows(table);
    }
}
|
|
1027
|
-
/**
 * Cleans an HTML string for markdown conversion.
 *
 * The input is returned unchanged when it neither looks like a full
 * document, nor may contain noise, nor matches HTML_FRAGMENT_MARKERS.
 * Otherwise the supplied document (or a freshly parsed one) is prepared
 * and serialized. Any error thrown during that work is caught and the
 * original string is returned.
 *
 * @param html Source markup.
 * @param document Optional pre-parsed document for `html`.
 * @param baseUrl Page URL string; may be falsy.
 * @param signal Forwarded to the preparation pipeline.
 * @returns Cleaned markup, or `html` as a fallback.
 */
export function removeNoiseFromHtml(html, document, baseUrl, signal) {
    const needsDomPass = isFullDocumentHtml(html) ||
        mayContainNoise(html) ||
        HTML_FRAGMENT_MARKERS.test(html);
    if (!needsDomPass)
        return html;
    let cleaned = html;
    try {
        const workingDocument = document ?? parseHTML(html).document;
        prepareDocumentForMarkdown(workingDocument, baseUrl, signal);
        cleaned = serializeDocumentForMarkdown(workingDocument, html);
    }
    catch {
        // Any failure falls back to the untouched input.
        cleaned = html;
    }
    return cleaned;
}
|
|
1042
|
-
// ── Content evaluation heuristics ───────────────────────────────────
// Minimum article-text / original-text ratio accepted by the ratio gate.
const MIN_CONTENT_RATIO = 0.15;
// Originals with less visible text than this always pass the ratio gate.
const MIN_HTML_LENGTH_FOR_GATE = 100;
// Per element kind: `selector` counts in the original document, `pattern`
// counts in the article HTML, `minThreshold` is the original count below
// which the rule is skipped, and `ratio` is the minimum article/original
// share.
const RETENTION_RULES = [
    {
        selector: 'h1,h2,h3,h4,h5,h6',
        pattern: /<h[1-6]\b/gi,
        minThreshold: 1,
        ratio: 0.3,
    },
    {
        selector: 'pre',
        pattern: /<pre\b/gi,
        minThreshold: 1,
        ratio: 0.15,
    },
    {
        selector: 'table',
        pattern: /<table\b/gi,
        minThreshold: 1,
        ratio: 0.5,
    },
    {
        selector: 'img',
        pattern: /<img\b/gi,
        minThreshold: 4,
        ratio: 0.2,
    },
];
// The empty-section gate applies only from this many headings upwards.
const MIN_HEADINGS_FOR_EMPTY_SECTION_GATE = 5;
// Maximum accepted share of headings without section content.
const MAX_EMPTY_SECTION_RATIO = 0.15;
// Lines this short or shorter are ignored by the truncation check.
const MIN_LINE_LENGTH_FOR_TRUNCATION_CHECK = 20;
// Above this share of incomplete lines the text is treated as truncated.
const MAX_TRUNCATED_LINE_RATIO = 0.95;
|
|
1060
|
-
/**
 * Normalizes the input to a DOM document.
 *
 * Non-string inputs are returned as-is. Strings are parsed; fragments that
 * do not start with a doctype, <html> or <body> tag are wrapped in a
 * minimal document first. If parsing throws, an empty document is
 * returned instead.
 *
 * @param htmlOrDocument HTML string or an existing document.
 * @returns A DOM document.
 */
function resolveHtmlDocument(htmlOrDocument) {
    if (typeof htmlOrDocument !== 'string')
        return htmlOrDocument;
    const startsWithDocumentRoot = /^\s*<(?:!doctype|html|body)\b/i.test(htmlOrDocument);
    const markup = startsWithDocumentRoot
        ? htmlOrDocument
        : `<!DOCTYPE html><html><body>${htmlOrDocument}</body></html>`;
    try {
        const { document: parsed } = parseHTML(markup);
        return parsed;
    }
    catch {
        // A parse failure must not crash the caller; fall back to an empty page.
        return parseHTML('<!DOCTYPE html><html><body></body></html>').document;
    }
}
|
|
1075
|
-
/**
 * Depth-first text collector that ignores hidden content.
 *
 * Text nodes contribute their non-empty text. Elements are skipped
 * entirely when they carry `hidden`, have `aria-hidden="true"`, or are
 * SCRIPT / STYLE / NOSCRIPT. Other node types are ignored.
 *
 * @param node Node to walk.
 * @param parts Output array; collected text chunks are appended in order.
 */
function getTextContentSkippingHidden(node, parts) {
    const TEXT_NODE = 3;
    const ELEMENT_NODE = 1;
    if (node.nodeType === TEXT_NODE) {
        const value = node.textContent;
        if (value)
            parts.push(value);
        return;
    }
    if (node.nodeType !== ELEMENT_NODE)
        return;
    const isHidden = node.hasAttribute('hidden') ||
        node.getAttribute('aria-hidden') === 'true';
    if (isHidden)
        return;
    const isNonContentTag = ['SCRIPT', 'STYLE', 'NOSCRIPT'].includes(node.tagName);
    if (isNonContentTag)
        return;
    for (const child of node.childNodes) {
        getTextContentSkippingHidden(child, parts);
    }
}
|
|
1097
|
-
/**
 * Measures the length of the body text after collapsing whitespace.
 *
 * For an HTML string, the markup is parsed, script/style/noscript elements
 * are removed from the parsed body, and the body's textContent is
 * measured. For a document, the text is gathered without mutating it,
 * additionally skipping hidden and aria-hidden elements.
 *
 * @param htmlOrDocument HTML string or DOM document.
 * @returns Character count of the collapsed, trimmed text.
 */
export function getVisibleTextLength(htmlOrDocument) {
    const collapsedLength = (text) => text.replace(/\s+/g, ' ').trim().length;
    if (typeof htmlOrDocument !== 'string') {
        const chunks = [];
        getTextContentSkippingHidden(resolveDocumentBody(htmlOrDocument), chunks);
        return collapsedLength(chunks.join(''));
    }
    const parsedBody = resolveDocumentBody(resolveHtmlDocument(htmlOrDocument));
    for (const nonContent of parsedBody.querySelectorAll('script,style,noscript')) {
        nonContent.remove();
    }
    return collapsedLength(parsedBody.textContent || '');
}
|
|
1111
|
-
/**
 * Counts the descendants of `root` that match a CSS selector.
 *
 * @param root Document or element to query.
 * @param selector CSS selector.
 * @returns Number of matches.
 */
function countMatchingElements(root, selector) {
    const { length } = root.querySelectorAll(selector);
    return length;
}
|
|
1114
|
-
/**
 * Returns the numeric level (1-6) of a heading element.
 *
 * @param heading Element to inspect; only `tagName` is read.
 * @returns The heading level, or null when the element is not H1-H6.
 */
function getHeadingLevel(heading) {
    const match = /^H([1-6])$/.exec(heading.tagName);
    if (!match)
        return null;
    return Number.parseInt(match[1] ?? '', 10);
}
/**
 * Reports whether a heading is followed by any content before the next
 * heading of the same or a higher level.
 *
 * Only following element siblings are examined. A sibling counts as
 * content when it has non-whitespace text, when it IS an image, table,
 * code block, list, figure or block quote, or when it contains one.
 *
 * The self-match matters: `querySelector` only searches descendants, so
 * without it a text-less sibling such as a bare <img> directly after the
 * heading would be overlooked and the section reported as empty.
 *
 * @param heading Heading element to check.
 * @returns True when the section has content; false for non-headings.
 */
function hasSectionContent(heading) {
    const level = getHeadingLevel(heading);
    if (level === null)
        return false;
    const contentSelector = 'img,table,pre,code,ul,ol,figure,blockquote';
    for (let current = heading.nextElementSibling; current; current = current.nextElementSibling) {
        const currentLevel = getHeadingLevel(current);
        // A heading of the same or a higher rank closes the section.
        if (currentLevel !== null && currentLevel <= level)
            return false;
        if (current.textContent.trim().length > 0)
            return true;
        if (current.matches(contentSelector) ||
            current.querySelector(contentSelector)) {
            return true;
        }
    }
    return false;
}
|
|
1139
|
-
/**
 * Counts the headings under `root` that have no section content.
 *
 * Headings whose class attribute contains `screen-reader-text`, `sr-only`
 * or `visually-hidden` are not considered.
 *
 * @param root Document or element to scan.
 * @returns Number of empty heading sections.
 */
function countEmptyHeadingSections(root) {
    const hiddenClassMarkers = ['screen-reader-text', 'sr-only', 'visually-hidden'];
    let emptySections = 0;
    for (const heading of root.querySelectorAll('h1,h2,h3,h4,h5,h6')) {
        const classAttr = heading.getAttribute('class') ?? '';
        const isScreenReaderOnly = hiddenClassMarkers.some((marker) => classAttr.includes(marker));
        if (isScreenReaderOnly)
            continue;
        if (hasSectionContent(heading))
            continue;
        emptySections += 1;
    }
    return emptySections;
}
|
|
1155
|
-
// Character codes accepted as the last character of a complete line by the
// truncation heuristic, which looks for lines that end mid-sentence.
const SENTENCE_ENDING_CODES = new Set([
    // Sentence and clause punctuation.
    CharCode.PERIOD,
    CharCode.EXCLAMATION,
    CharCode.QUESTION,
    CharCode.COLON,
    CharCode.SEMICOLON,
    // Closing quotes, brackets and code ticks.
    CharCode.DOUBLE_QUOTE,
    CharCode.SINGLE_QUOTE,
    CharCode.RIGHT_PAREN,
    CharCode.RIGHT_BRACKET,
    CharCode.BACKTICK,
]);
|
|
1168
|
-
/**
 * Finds the offsets of the first and last non-whitespace characters of a
 * line without allocating a substring.
 *
 * @param text Full text.
 * @param lineStart Offset of the line's first character.
 * @param lineEnd Offset just past the line's last character.
 * @returns `{ start, end }` (both inclusive) when the trimmed line is
 *   longer than MIN_LINE_LENGTH_FOR_TRUNCATION_CHECK; otherwise null.
 */
function trimLineOffsets(text, lineStart, lineEnd) {
    let first = lineStart;
    for (; first < lineEnd; first++) {
        if (!isWhitespaceChar(text.charCodeAt(first)))
            break;
    }
    let last = lineEnd - 1;
    for (; last >= first; last--) {
        if (!isWhitespaceChar(text.charCodeAt(last)))
            break;
    }
    if (last < first)
        return null;
    const trimmedLength = last - first + 1;
    if (trimmedLength <= MIN_LINE_LENGTH_FOR_TRUNCATION_CHECK)
        return null;
    return { start: first, end: last };
}
|
|
1182
|
-
/**
 * Classifies one line for the truncation heuristic.
 *
 * @param text Full text.
 * @param lineStart Offset of the line's first character.
 * @param lineEnd Offset just past the line's last character.
 * @returns `counted` is true when the line is long enough to be judged;
 *   `incomplete` is true when a counted line does not end with one of the
 *   SENTENCE_ENDING_CODES characters.
 */
function classifyLine(text, lineStart, lineEnd) {
    const rawLength = lineEnd - lineStart;
    // Cheap length check first; the trimmed length is checked by trimLineOffsets.
    const bounds = rawLength > MIN_LINE_LENGTH_FOR_TRUNCATION_CHECK
        ? trimLineOffsets(text, lineStart, lineEnd)
        : null;
    if (!bounds)
        return { counted: false, incomplete: false };
    const endsSentence = SENTENCE_ENDING_CODES.has(text.charCodeAt(bounds.end));
    return { counted: true, incomplete: !endsSentence };
}
|
|
1192
|
-
/**
 * Heuristic: does the text look like it was cut off mid-sentence?
 *
 * Splits the text on line feeds, classifies every line, and returns true
 * when at least MIN_LINES_FOR_TRUNCATION_CHECK lines were judged and more
 * than MAX_TRUNCATED_LINE_RATIO of them are incomplete.
 *
 * @param text Article text.
 * @returns True when the share of incomplete lines exceeds the limit.
 */
function hasTruncatedSentences(text) {
    const tally = { judged: 0, incomplete: 0 };
    const total = text.length;
    let lineStart = 0;
    // `cursor === total` acts as a virtual terminator for the last line.
    for (let cursor = 0; cursor <= total; cursor += 1) {
        const atLineBoundary = cursor === total || text.charCodeAt(cursor) === CharCode.LF;
        if (!atLineBoundary)
            continue;
        const verdict = classifyLine(text, lineStart, cursor);
        if (verdict.counted) {
            tally.judged += 1;
            if (verdict.incomplete)
                tally.incomplete += 1;
        }
        lineStart = cursor + 1;
    }
    if (tally.judged < MIN_LINES_FOR_TRUNCATION_CHECK)
        return false;
    return tally.incomplete / tally.judged > MAX_TRUNCATED_LINE_RATIO;
}
|
|
1214
|
-
/**
 * Ratio gate: the article text must be at least MIN_CONTENT_RATIO of the
 * original document's visible text. Originals with fewer than
 * MIN_HTML_LENGTH_FOR_GATE visible characters always pass.
 *
 * @param articleTextLength Length of the extracted article text.
 * @param document Original DOM document.
 * @returns True when the gate is passed.
 */
function passesContentRatioGate(articleTextLength, document) {
    const originalLength = getVisibleTextLength(document);
    if (originalLength < MIN_HTML_LENGTH_FOR_GATE)
        return true;
    const retainedShare = articleTextLength / originalLength;
    return retainedShare >= MIN_CONTENT_RATIO;
}
|
|
1219
|
-
// Matches an <img> tag whose src attribute value starts with "data:".
const DATA_IMG_PATTERN = /<img\b[^>]*\bsrc\s*=\s*["']?data:/gi;
/**
 * Counts images whose src is not a `data:` URI.
 *
 * For a string, <img> tags are counted by pattern and the tags matched by
 * DATA_IMG_PATTERN are subtracted. For a document or element, each <img>
 * is inspected through its `src` attribute.
 *
 * @param htmlOrDoc HTML string, or a node supporting `querySelectorAll`.
 * @returns Number of non-data images.
 */
function countRealImages(htmlOrDoc) {
    if (typeof htmlOrDoc !== 'string') {
        let realImages = 0;
        for (const img of htmlOrDoc.querySelectorAll('img')) {
            const source = img.getAttribute('src') ?? '';
            realImages += source.startsWith('data:') ? 0 : 1;
        }
        return realImages;
    }
    const occurrences = (pattern) => htmlOrDoc.match(pattern)?.length ?? 0;
    const allImages = occurrences(/<img\b/gi);
    const inlineDataImages = occurrences(DATA_IMG_PATTERN);
    return allImages - inlineDataImages;
}
|
|
1234
|
-
/**
 * Checks every RETENTION_RULES entry: the article must keep at least
 * `ratio` of the original's elements of that kind. A rule is skipped when
 * the original has fewer than `minThreshold` such elements.
 *
 * Images with a data: URI src are excluded on both sides, so lazy-load
 * placeholders neither inflate the original count nor count as retained.
 *
 * @param originalDoc Original DOM document.
 * @param articleHtml Extracted article HTML string.
 * @returns True when every rule is satisfied.
 */
function passesRetentionRulesFromHtml(originalDoc, articleHtml) {
    for (const { selector, pattern, minThreshold, ratio } of RETENTION_RULES) {
        const isImageRule = selector === 'img';
        const originalCount = isImageRule
            ? countRealImages(originalDoc)
            : countMatchingElements(originalDoc, selector);
        if (originalCount < minThreshold)
            continue;
        const retainedCount = isImageRule
            ? countRealImages(articleHtml)
            : (articleHtml.match(pattern)?.length ?? 0);
        if (!(retainedCount / originalCount >= ratio))
            return false;
    }
    return true;
}
|
|
1252
|
-
/**
 * Empty-section gate: with MIN_HEADINGS_FOR_EMPTY_SECTION_GATE or more
 * headings, at most MAX_EMPTY_SECTION_RATIO of them may lack section
 * content. Fewer headings always pass.
 *
 * Headings whose class attribute contains `screen-reader-text`, `sr-only`
 * or `visually-hidden` are left out of the heading count.
 *
 * @param articleDoc Parsed article document.
 * @returns True when the gate is passed.
 */
function passesEmptySectionRatio(articleDoc) {
    const hiddenClassMarkers = ['screen-reader-text', 'sr-only', 'visually-hidden'];
    let headingCount = 0;
    for (const heading of Array.from(articleDoc.querySelectorAll('h1,h2,h3,h4,h5,h6'))) {
        const classAttr = heading.getAttribute('class') ?? '';
        if (hiddenClassMarkers.every((marker) => !classAttr.includes(marker)))
            headingCount += 1;
    }
    if (headingCount < MIN_HEADINGS_FOR_EMPTY_SECTION_GATE)
        return true;
    const emptyShare = countEmptyHeadingSections(articleDoc) / headingCount;
    return emptyShare <= MAX_EMPTY_SECTION_RATIO;
}
|
|
1264
|
-
/**
 * Runs the quality gates over an extracted article.
 *
 * Gates, in order: content ratio, element retention, truncated sentences,
 * empty heading sections. The first failing gate is logged at debug level
 * and ends the evaluation.
 *
 * @param article Extraction result; `textContent` and `content` are read.
 * @param document Original DOM document the article was extracted from.
 * @returns The parsed article document when all gates pass, else null.
 */
export function evaluateArticleContent(article, document) {
    // Logs the failed gate and yields the rejection value.
    const reject = (message) => {
        logDebug(message, undefined, Loggers.LOG_TRANSFORM);
        return null;
    };
    if (!passesContentRatioGate(article.textContent.length, document))
        return reject('FAILED passesContentRatioGate');
    if (!passesRetentionRulesFromHtml(document, article.content))
        return reject('FAILED passesRetentionRulesFromHtml');
    if (hasTruncatedSentences(article.textContent))
        return reject('FAILED hasTruncatedSentences');
    const articleDoc = parseHTML(`<!DOCTYPE html><html><body>${article.content}</body></html>`).document;
    if (passesEmptySectionRatio(articleDoc))
        return articleDoc;
    // Extra diagnostics: list every heading with its section-content verdict.
    const headings = articleDoc.querySelectorAll('h1,h2,h3,h4,h5,h6');
    logDebug(`FAILED passesEmptySectionRatio: ${headings.length} headings`, undefined, Loggers.LOG_TRANSFORM);
    for (const heading of headings) {
        logDebug(`H: ${heading.textContent} ${String(hasSectionContent(heading))}`, undefined, Loggers.LOG_TRANSFORM);
    }
    return null;
}
|