@j0hanz/fetch-url-mcp 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +570 -0
- package/dist/AGENTS.md +115 -0
- package/dist/assets/logo.svg +24837 -0
- package/dist/cache.d.ts +47 -0
- package/dist/cache.js +316 -0
- package/dist/cli.d.ts +17 -0
- package/dist/cli.js +48 -0
- package/dist/config.d.ts +142 -0
- package/dist/config.js +480 -0
- package/dist/crypto.d.ts +3 -0
- package/dist/crypto.js +49 -0
- package/dist/dom-noise-removal.d.ts +1 -0
- package/dist/dom-noise-removal.js +488 -0
- package/dist/errors.d.ts +10 -0
- package/dist/errors.js +61 -0
- package/dist/fetch.d.ts +42 -0
- package/dist/fetch.js +1544 -0
- package/dist/host-normalization.d.ts +1 -0
- package/dist/host-normalization.js +77 -0
- package/dist/http-native.d.ts +5 -0
- package/dist/http-native.js +1313 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +91 -0
- package/dist/instructions.md +57 -0
- package/dist/ip-blocklist.d.ts +8 -0
- package/dist/ip-blocklist.js +74 -0
- package/dist/json.d.ts +1 -0
- package/dist/json.js +34 -0
- package/dist/language-detection.d.ts +2 -0
- package/dist/language-detection.js +364 -0
- package/dist/markdown-cleanup.d.ts +6 -0
- package/dist/markdown-cleanup.js +474 -0
- package/dist/mcp-validator.d.ts +15 -0
- package/dist/mcp-validator.js +44 -0
- package/dist/mcp.d.ts +4 -0
- package/dist/mcp.js +421 -0
- package/dist/observability.d.ts +21 -0
- package/dist/observability.js +211 -0
- package/dist/prompts.d.ts +7 -0
- package/dist/prompts.js +28 -0
- package/dist/resources.d.ts +8 -0
- package/dist/resources.js +216 -0
- package/dist/server-tuning.d.ts +13 -0
- package/dist/server-tuning.js +47 -0
- package/dist/server.d.ts +4 -0
- package/dist/server.js +174 -0
- package/dist/session.d.ts +39 -0
- package/dist/session.js +218 -0
- package/dist/tasks.d.ts +63 -0
- package/dist/tasks.js +327 -0
- package/dist/timer-utils.d.ts +5 -0
- package/dist/timer-utils.js +20 -0
- package/dist/tools.d.ts +135 -0
- package/dist/tools.js +812 -0
- package/dist/transform-types.d.ts +126 -0
- package/dist/transform-types.js +5 -0
- package/dist/transform.d.ts +36 -0
- package/dist/transform.js +2341 -0
- package/dist/type-guards.d.ts +14 -0
- package/dist/type-guards.js +13 -0
- package/dist/workers/transform-child.d.ts +1 -0
- package/dist/workers/transform-child.js +136 -0
- package/dist/workers/transform-worker.d.ts +1 -0
- package/dist/workers/transform-worker.js +128 -0
- package/package.json +91 -0
|
@@ -0,0 +1,488 @@
|
|
|
1
|
+
import { parseHTML } from 'linkedom';
|
|
2
|
+
import { config } from './config.js';
|
|
3
|
+
import { logDebug } from './observability.js';
|
|
4
|
+
// --- Constants & Pre-compiled Regex ---
|
|
5
|
+
/** Number of characters sampled from each end of oversized markup by the noise pre-scan. */
const NOISE_SCAN_LIMIT = 50_000;

/** Serialized markup must be longer than this (after trimming) to be returned by `serialize`. */
const MIN_BODY_CONTENT_LENGTH = 100;

/** Dialogs whose text is longer than this are kept instead of being removed. */
const DIALOG_MIN_CHARS_FOR_PRESERVATION = 500;

/** nav/footer elements with at least this much trimmed text are kept. */
const NAV_FOOTER_MIN_CHARS_FOR_PRESERVATION = 500;

// Merged markers for fast rejection
/** Matches an opening doctype/html/head/body tag, i.e. markup that looks like a whole document. */
const HTML_DOCUMENT_MARKERS = /<\s*(?:!doctype|html|head|body)\b/i;

/** Matches an opening tag of a common structural container found in HTML fragments. */
const HTML_FRAGMENT_MARKERS = /<\s*(?:article|main|section|div|nav|footer|header|aside|table|ul|ol)\b/i;
|
|
12
|
+
// Split into smaller regexes to stay within sonarjs/regex-complexity limit
|
|
13
|
+
const NOISE_PATTERNS = [
|
|
14
|
+
/<\s*(?:script|style|noscript|iframe|nav|footer|header|form|button|input|select|textarea|svg|canvas)\b/i,
|
|
15
|
+
/[\s"']role\s*=\s*['"]?(?:navigation|banner|complementary|contentinfo|tree|menubar|menu)['"]?/i,
|
|
16
|
+
/[\s"'](?:aria-hidden\s*=\s*['"]?true['"]?|hidden)/i,
|
|
17
|
+
/[\s"'](?:banner|promo|announcement|cta|advert|newsletter|subscribe|cookie|consent|popup|modal|overlay|toast)\b/i,
|
|
18
|
+
/[\s"'](?:fixed|sticky|z-50|z-4|isolate|breadcrumb|pagination)\b/i,
|
|
19
|
+
];
|
|
20
|
+
const HEADER_NOISE_PATTERN = /\b(site-header|masthead|topbar|navbar|nav(?:bar)?|menu|header-nav)\b/i;
|
|
21
|
+
const FIXED_OR_HIGH_Z_PATTERN = /\b(?:fixed|sticky|z-(?:4\d|50)|isolate)\b/;
|
|
22
|
+
/** href/src values starting with one of these (case-insensitive) are left unresolved. */
const SKIP_URL_PREFIXES = ['#', 'java' + 'script:', 'mailto:', 'tel:', 'data:', 'blob:'];

/** Tags that receive the structural noise weight (svg/canvas are added per config elsewhere). */
const BASE_STRUCTURAL_TAGS = new Set([
    'script', 'style', 'noscript', 'iframe',
    'form', 'button', 'input', 'select', 'textarea',
]);

/** Tags scored as structural noise whenever the nav-footer category is enabled. */
const ALWAYS_NOISE_TAGS = new Set(['nav', 'footer']);

/** ARIA roles scored as navigation/chrome noise. */
const NAVIGATION_ROLES = new Set([
    'navigation', 'banner', 'complementary', 'contentinfo',
    'tree', 'menubar', 'menu',
    'dialog', 'alertdialog', 'search',
]);

/** ARIA roles that mark an element as an interactive widget. */
const INTERACTIVE_CONTENT_ROLES = new Set([
    'tabpanel', 'tab', 'tablist',
    'dialog', 'alertdialog',
    'menu', 'menuitem',
    'option', 'listbox', 'combobox',
    'tooltip', 'alert',
]);
|
|
68
|
+
/** Class/id tokens that always go into the base promo matcher. */
const PROMO_TOKENS_ALWAYS = [
    'banner', 'promo', 'announcement', 'cta',
    'advert', 'ads', 'sponsor', 'recommend',
    'breadcrumb', 'pagination', 'pager', 'taglist',
];

/** Tokens used only when aggressive mode is configured. */
const PROMO_TOKENS_AGGRESSIVE = ['ad', 'related', 'comment'];

/** Extra base tokens contributed by each optional noise category. */
const PROMO_TOKENS_BY_CATEGORY = {
    'cookie-banners': ['cookie', 'consent', 'popup', 'modal', 'overlay', 'toast'],
    newsletters: ['newsletter', 'subscribe'],
    'social-share': ['share', 'social'],
};
|
|
88
|
+
/** CSS selector groups for elements that are removed outright (subject to preservation rules). */
const BASE_NOISE_SELECTORS = {
    navFooter: [
        'nav',
        'footer',
        'header[class*="site"]',
        'header[class*="nav"]',
        'header[class*="menu"]',
        '[role="banner"]',
        '[role="navigation"]',
    ].join(','),
    cookieBanners: '[role="dialog"]',
    hidden: [
        '[style*="display: none"]',
        '[style*="display:none"]',
        '[hidden]',
        '[aria-hidden="true"]',
    ].join(','),
};

/** A regex that can never match: `^` after a consumed character is unsatisfiable. */
const NO_MATCH_REGEX = /a^/i;

// --- State Cache ---
/** Last context built by getContext(), reused while the config object is unchanged. */
let cachedContext;
/** The config.noiseRemoval reference that cachedContext was derived from. */
let lastConfigRef;
|
|
97
|
+
// --- Helpers Inlined/Optimized ---
|
|
98
|
+
/**
 * Escapes every regular-expression metacharacter in `value` so the result
 * can be embedded in a pattern and match the text literally.
 */
function escapeRegexLiteral(value) {
    const metacharacters = /[.*+?^${}()|[\]\\]/g;
    return value.replace(metacharacters, (ch) => `\\${ch}`);
}
|
|
101
|
+
/**
 * Builds a case-insensitive regex matching any of `tokens` when it is
 * delimited on both sides by the string edge or a character outside a-z/0-9.
 * An empty set yields NO_MATCH_REGEX.
 */
function buildTokenRegex(tokens) {
    if (tokens.size === 0) {
        return NO_MATCH_REGEX;
    }
    const alternatives = Array.from(tokens, (token) => escapeRegexLiteral(token)).join('|');
    const source = '(?:^|[^a-z0-9])(?:' + alternatives + ')(?:$|[^a-z0-9])';
    return new RegExp(source, 'i');
}
|
|
106
|
+
/**
 * Compiles the two promo regexes used during scoring.
 *
 * `base` covers PROMO_TOKENS_ALWAYS, the tokens of every enabled category in
 * `flags`, and the normalized (lower-cased, trimmed, non-empty) entries of
 * `currentConfig.extraTokens`. `aggressive` covers PROMO_TOKENS_AGGRESSIVE
 * only when `currentConfig.aggressiveMode` is truthy; otherwise it never matches.
 */
function getPromoMatchers(currentConfig, flags) {
    const aggressive = new Set(currentConfig.aggressiveMode ? PROMO_TOKENS_AGGRESSIVE : []);
    const base = new Set(PROMO_TOKENS_ALWAYS);
    const categoryTokens = [
        [flags.cookieBanners, 'cookie-banners'],
        [flags.newsletters, 'newsletters'],
        [flags.socialShare, 'social-share'],
    ];
    for (const [isOn, category] of categoryTokens) {
        if (!isOn)
            continue;
        for (const token of PROMO_TOKENS_BY_CATEGORY[category]) {
            base.add(token);
        }
    }
    for (const rawToken of currentConfig.extraTokens) {
        const normalized = rawToken.toLowerCase().trim();
        if (normalized) {
            base.add(normalized);
        }
    }
    return {
        base: buildTokenRegex(base),
        aggressive: buildTokenRegex(aggressive),
    };
}
|
|
132
|
+
/**
 * Returns the derived noise-removal context (flags, tag set, weights, promo
 * regexes and pre-joined selectors) for the current `config.noiseRemoval`.
 *
 * The result is memoized by object identity: it is rebuilt only when
 * `config.noiseRemoval` is a different object than on the previous call.
 */
function getContext() {
    const noiseConfig = config.noiseRemoval;
    if (cachedContext && lastConfigRef === noiseConfig) {
        return cachedContext;
    }
    // Normalize category names; empty results are dropped.
    const enabledCategories = new Set();
    for (const rawCategory of noiseConfig.enabledCategories) {
        const lowered = rawCategory.toLowerCase().trim();
        const { locale } = config.i18n;
        const normalized = locale ? lowered.toLocaleLowerCase(locale) : lowered;
        if (normalized) {
            enabledCategories.add(normalized);
        }
    }
    const flags = {
        navFooter: enabledCategories.has('nav-footer'),
        cookieBanners: enabledCategories.has('cookie-banners'),
        newsletters: enabledCategories.has('newsletters'),
        socialShare: enabledCategories.has('social-share'),
    };
    // svg/canvas count as structural noise unless the config asks to keep them.
    const structuralTags = new Set(BASE_STRUCTURAL_TAGS);
    if (!noiseConfig.preserveSvgCanvas) {
        for (const tag of ['svg', 'canvas']) {
            structuralTags.add(tag);
        }
    }
    const promoMatchers = getPromoMatchers(noiseConfig, flags);
    const extraSelectors = [];
    for (const rawSelector of noiseConfig.extraSelectors) {
        const selector = rawSelector.trim();
        if (selector.length > 0) {
            extraSelectors.push(selector);
        }
    }
    // Pre-build selectors
    const selectorGroups = [BASE_NOISE_SELECTORS.hidden];
    if (flags.navFooter) {
        selectorGroups.push(BASE_NOISE_SELECTORS.navFooter);
    }
    if (flags.cookieBanners) {
        selectorGroups.push(BASE_NOISE_SELECTORS.cookieBanners);
    }
    const candidateParts = [
        ...structuralTags,
        ...ALWAYS_NOISE_TAGS,
        'aside',
        'header',
        '[class]',
        '[id]',
        '[role]',
        '[style]',
    ];
    const context = {
        flags,
        structuralTags,
        weights: noiseConfig.weights,
        promoMatchers,
        promoEnabled: flags.cookieBanners || flags.newsletters || flags.socialShare,
        extraSelectors,
        baseSelector: selectorGroups.join(','),
        candidateSelector: candidateParts.join(','),
    };
    cachedContext = context;
    lastConfigRef = noiseConfig;
    return context;
}
|
|
190
|
+
// --- Hot Path Logic ---
|
|
191
|
+
/**
 * Reports whether `element` looks like part of an interactive widget:
 * its role is in INTERACTIVE_CONTENT_ROLES, or it carries one of the
 * data-state / data-orientation / data-accordion-item /
 * data-radix-collection-item markers checked below.
 */
function isInteractive(element, role) {
    if (role && INTERACTIVE_CONTENT_ROLES.has(role)) {
        return true;
    }
    const state = element.getAttribute('data-state');
    const isCollapsedState = state === 'inactive' || state === 'closed';
    if (isCollapsedState) {
        return true;
    }
    const orientation = element.getAttribute('data-orientation');
    const hasOrientation = orientation === 'horizontal' || orientation === 'vertical';
    if (hasOrientation) {
        return true;
    }
    if (element.hasAttribute('data-accordion-item')) {
        return true;
    }
    return element.hasAttribute('data-radix-collection-item');
}
|
|
203
|
+
function isWithinPrimaryContent(element) {
|
|
204
|
+
let current = element;
|
|
205
|
+
while (current) {
|
|
206
|
+
const tagName = current.tagName.toLowerCase();
|
|
207
|
+
if (tagName === 'article' || tagName === 'main')
|
|
208
|
+
return true;
|
|
209
|
+
if (current.getAttribute('role') === 'main')
|
|
210
|
+
return true;
|
|
211
|
+
current = current.parentElement;
|
|
212
|
+
}
|
|
213
|
+
return false;
|
|
214
|
+
}
|
|
215
|
+
/**
 * Decides whether an element that would otherwise be removed must be kept.
 *
 * - role="dialog"/"alertdialog": kept when inside primary content, when its
 *   text is longer than DIALOG_MIN_CHARS_FOR_PRESERVATION, or when it
 *   contains a heading.
 * - <nav>/<footer>: kept when it contains article/main/section/[role="main"],
 *   or when its trimmed text reaches NAV_FOOTER_MIN_CHARS_FOR_PRESERVATION.
 * - Anything else: not preserved.
 */
function shouldPreserve(element, tagName) {
    const role = element.getAttribute('role');
    const isDialog = role === 'dialog' || role === 'alertdialog';
    if (isDialog) {
        return (isWithinPrimaryContent(element) ||
            (element.textContent || '').length > DIALOG_MIN_CHARS_FOR_PRESERVATION ||
            element.querySelector('h1,h2,h3,h4,h5,h6') !== null);
    }
    const isNavOrFooter = tagName === 'nav' || tagName === 'footer';
    if (!isNavOrFooter) {
        return false;
    }
    if (element.querySelector('article,main,section,[role="main"]')) {
        return true;
    }
    const trimmedLength = (element.textContent || '').trim().length;
    return trimmedLength >= NAV_FOOTER_MIN_CHARS_FOR_PRESERVATION;
}
|
|
235
|
+
/**
 * Detaches every still-attached node in `nodes` that shouldPreserve() does
 * not protect. Iterates from the last node to the first.
 */
function removeNodes(nodes) {
    let index = nodes.length;
    while (index-- > 0) {
        const candidate = nodes[index];
        if (!candidate?.parentNode) {
            continue; // missing entry or already detached
        }
        if (shouldPreserve(candidate, candidate.tagName.toLowerCase())) {
            continue;
        }
        candidate.remove();
    }
}
|
|
243
|
+
/**
 * Computes the nav/footer contribution to an element's noise score.
 * Adds `weights.structural` once for each rule that applies:
 *   1. the tag is in ALWAYS_NOISE_TAGS;
 *   2. the tag is <header> and it has a navigation role or a class/id
 *      matching HEADER_NOISE_PATTERN;
 *   3. the role is in NAVIGATION_ROLES, except <aside role="complementary">.
 * A <header> with a navigation role therefore scores under rules 2 and 3.
 */
function scoreNavFooter(tagName, role, className, id, weights) {
    let score = 0;
    const addStructural = () => {
        score += weights.structural;
    };
    const hasNavigationRole = Boolean(role) && NAVIGATION_ROLES.has(role);
    if (ALWAYS_NOISE_TAGS.has(tagName)) {
        addStructural();
    }
    // Header Boilerplate
    if (tagName === 'header' &&
        (hasNavigationRole || HEADER_NOISE_PATTERN.test(`${className} ${id}`))) {
        addStructural();
    }
    // Role Noise
    const isComplementaryAside = tagName === 'aside' && role === 'complementary';
    if (hasNavigationRole && !isComplementaryAside) {
        addStructural();
    }
    return score;
}
|
|
262
|
+
/**
 * Collects the attributes the noise scorer needs from `element`:
 * lower-cased tag name, class and id (empty string when absent), role and
 * style (null when absent), plus the derived `isInteractive` and `isHidden`
 * flags. `isHidden` is true for the `hidden` attribute, aria-hidden="true",
 * or an inline style declaring display:none / visibility:hidden.
 */
function extractElementMetadata(element) {
    const tagName = element.tagName.toLowerCase();
    const className = element.getAttribute('class') ?? '';
    const id = element.getAttribute('id') ?? '';
    const role = element.getAttribute('role');
    const style = element.getAttribute('style');
    const interactive = isInteractive(element, role);
    let isHidden = element.hasAttribute('hidden');
    if (!isHidden) {
        isHidden = element.getAttribute('aria-hidden') === 'true';
    }
    if (!isHidden && style !== null) {
        isHidden = /\b(?:display\s*:\s*none|visibility\s*:\s*hidden)\b/i.test(style);
    }
    return { tagName, className, id, role, style, isInteractive: interactive, isHidden };
}
|
|
283
|
+
/**
 * Scores `element` against the configured weights and reports whether the
 * total reaches `weights.threshold`.
 *
 * Contributions: structural tag (non-interactive only), nav/footer rules
 * (when that category is enabled), hidden (non-interactive only),
 * fixed/sticky/high-z class, and promo tokens (only when `promoEnabled`).
 * For promo, an aggressive-token hit counts only outside primary content and
 * suppresses the base-token check for that element.
 */
function isNoiseElement(element, context) {
    const { tagName, className, id, role, isInteractive: interactive, isHidden } = extractElementMetadata(element);
    const { weights, promoMatchers } = context;
    let total = 0;
    // Structural
    if (!interactive && context.structuralTags.has(tagName)) {
        total += weights.structural;
    }
    // Nav/Footer Scoring
    if (context.flags.navFooter) {
        total += scoreNavFooter(tagName, role, className, id, weights);
    }
    // Hidden
    if (isHidden && !interactive) {
        total += weights.hidden;
    }
    // Sticky/Fixed
    if (FIXED_OR_HIGH_Z_PATTERN.test(className)) {
        total += weights.stickyFixed;
    }
    // Promo
    if (context.promoEnabled) {
        const matchesClassOrId = (matcher) => matcher.test(className) || matcher.test(id);
        const aggressiveHit = matchesClassOrId(promoMatchers.aggressive);
        const countsAsPromo = aggressiveHit
            ? !isWithinPrimaryContent(element)
            : matchesClassOrId(promoMatchers.base);
        if (countsAsPromo) {
            total += weights.promo;
        }
    }
    return total >= weights.threshold;
}
|
|
317
|
+
function cleanHeadingWrapperDivs(h) {
|
|
318
|
+
const divs = h.querySelectorAll('div');
|
|
319
|
+
for (let j = divs.length - 1; j >= 0; j--) {
|
|
320
|
+
const d = divs[j];
|
|
321
|
+
if (!d?.parentNode)
|
|
322
|
+
continue;
|
|
323
|
+
const cls = d.getAttribute('class') ?? '';
|
|
324
|
+
const stl = d.getAttribute('style') ?? '';
|
|
325
|
+
if (cls.includes('absolute') ||
|
|
326
|
+
stl.includes('position') ||
|
|
327
|
+
d.getAttribute('tabindex') === '-1') {
|
|
328
|
+
d.remove();
|
|
329
|
+
}
|
|
330
|
+
}
|
|
331
|
+
}
|
|
332
|
+
function cleanHeadingAnchors(h) {
|
|
333
|
+
const anchors = h.querySelectorAll('a');
|
|
334
|
+
for (let j = anchors.length - 1; j >= 0; j--) {
|
|
335
|
+
const a = anchors[j];
|
|
336
|
+
if (!a?.parentNode)
|
|
337
|
+
continue;
|
|
338
|
+
const href = a.getAttribute('href') ?? '';
|
|
339
|
+
const txt = (a.textContent || '').replace(/[\u200B\s]/g, '');
|
|
340
|
+
if (href.startsWith('#') && txt.length === 0) {
|
|
341
|
+
a.remove();
|
|
342
|
+
}
|
|
343
|
+
}
|
|
344
|
+
}
|
|
345
|
+
function cleanHeadingZeroWidth(h, document) {
|
|
346
|
+
const walker = document.createTreeWalker(h, 4); // SHOW_TEXT
|
|
347
|
+
let node;
|
|
348
|
+
while ((node = walker.nextNode())) {
|
|
349
|
+
if (node.textContent?.includes('\u200B')) {
|
|
350
|
+
node.textContent = node.textContent.replace(/\u200B/g, '');
|
|
351
|
+
}
|
|
352
|
+
}
|
|
353
|
+
}
|
|
354
|
+
/**
 * Tidies every attached h1–h6 in `document`: drops overlay wrapper divs,
 * removes empty fragment anchors, and strips zero-width spaces, in that order.
 */
function cleanHeadings(document) {
    for (const heading of document.querySelectorAll('h1,h2,h3,h4,h5,h6')) {
        if (heading.parentNode) {
            cleanHeadingWrapperDivs(heading);
            cleanHeadingAnchors(heading);
            cleanHeadingZeroWidth(heading, document);
        }
    }
}
|
|
365
|
+
/**
 * Removes noise from `document` in place, in four passes:
 *   1. heading cleanup;
 *   2. elements matching `context.baseSelector`;
 *   3. elements matching any of `context.extraSelectors` (when non-empty);
 *   4. elements matching `context.candidateSelector` that score as noise.
 * Passes 2–4 skip anything shouldPreserve() protects.
 */
function stripNoise(document, context) {
    cleanHeadings(document);
    // Remove Base & Extra
    const { baseSelector, extraSelectors } = context;
    removeNodes(document.querySelectorAll(baseSelector));
    if (extraSelectors.length > 0) {
        removeNodes(document.querySelectorAll(extraSelectors.join(',')));
    }
    // Candidates, visited last to first
    const candidates = document.querySelectorAll(context.candidateSelector);
    let index = candidates.length;
    while (index-- > 0) {
        const candidate = candidates[index];
        if (!candidate || !candidate.parentNode) {
            continue;
        }
        const keep = shouldPreserve(candidate, candidate.tagName.toLowerCase());
        if (!keep && isNoiseElement(candidate, context)) {
            candidate.remove();
        }
    }
}
|
|
393
|
+
function processUrlElement(el, attr, base, isSrcset) {
|
|
394
|
+
if (!el.parentNode)
|
|
395
|
+
return;
|
|
396
|
+
if (isSrcset) {
|
|
397
|
+
const val = el.getAttribute(attr);
|
|
398
|
+
if (val) {
|
|
399
|
+
const newVal = val
|
|
400
|
+
.split(',')
|
|
401
|
+
.map((entry) => {
|
|
402
|
+
const parts = entry.trim().split(/\s+/);
|
|
403
|
+
if (!parts[0])
|
|
404
|
+
return entry;
|
|
405
|
+
try {
|
|
406
|
+
parts[0] = new URL(parts[0], base).href;
|
|
407
|
+
}
|
|
408
|
+
catch {
|
|
409
|
+
/* ignore */
|
|
410
|
+
}
|
|
411
|
+
return parts.join(' ');
|
|
412
|
+
})
|
|
413
|
+
.join(', ');
|
|
414
|
+
el.setAttribute(attr, newVal);
|
|
415
|
+
}
|
|
416
|
+
return;
|
|
417
|
+
}
|
|
418
|
+
const val = el.getAttribute(attr);
|
|
419
|
+
if (val &&
|
|
420
|
+
!SKIP_URL_PREFIXES.some((p) => val.trim().toLowerCase().startsWith(p))) {
|
|
421
|
+
try {
|
|
422
|
+
el.setAttribute(attr, new URL(val, base).href);
|
|
423
|
+
}
|
|
424
|
+
catch {
|
|
425
|
+
/* ignore */
|
|
426
|
+
}
|
|
427
|
+
}
|
|
428
|
+
}
|
|
429
|
+
function resolveUrls(document, baseUrlStr) {
|
|
430
|
+
let base;
|
|
431
|
+
try {
|
|
432
|
+
base = new URL(baseUrlStr);
|
|
433
|
+
}
|
|
434
|
+
catch {
|
|
435
|
+
return;
|
|
436
|
+
}
|
|
437
|
+
const elements = document.querySelectorAll('a[href],img[src],source[srcset]');
|
|
438
|
+
for (const el of Array.from(elements)) {
|
|
439
|
+
const tag = el.tagName.toLowerCase();
|
|
440
|
+
if (tag === 'a')
|
|
441
|
+
processUrlElement(el, 'href', base, false);
|
|
442
|
+
else if (tag === 'img')
|
|
443
|
+
processUrlElement(el, 'src', base, false);
|
|
444
|
+
else if (tag === 'source')
|
|
445
|
+
processUrlElement(el, 'srcset', base, true);
|
|
446
|
+
}
|
|
447
|
+
}
|
|
448
|
+
/**
 * Chooses the markup to return after cleaning: the body's inner HTML if its
 * trimmed length exceeds MIN_BODY_CONTENT_LENGTH, else the root element's
 * outer HTML under the same test, else `fallback`.
 */
function serialize(document, fallback) {
    const isSubstantial = (markup) => markup.trim().length > MIN_BODY_CONTENT_LENGTH;
    const inner = document.body.innerHTML;
    if (isSubstantial(inner)) {
        return inner;
    }
    const outer = document.documentElement.outerHTML;
    return isSubstantial(outer) ? outer : fallback;
}
|
|
457
|
+
/** True when `html` contains a doctype, <html>, <head> or <body> opening tag. */
function isFullDocumentHtml(html) {
    const looksLikeDocument = HTML_DOCUMENT_MARKERS.test(html);
    return looksLikeDocument;
}
|
|
460
|
+
/**
 * Cheap textual check for noise markers. Markup longer than NOISE_SCAN_LIMIT
 * is sampled: only its first and last NOISE_SCAN_LIMIT characters (joined by
 * a newline) are tested against NOISE_PATTERNS.
 */
function mayContainNoise(html) {
    let sample = html;
    if (html.length > NOISE_SCAN_LIMIT) {
        const head = html.substring(0, NOISE_SCAN_LIMIT);
        const tail = html.substring(html.length - NOISE_SCAN_LIMIT);
        sample = `${head}\n${tail}`;
    }
    for (const pattern of NOISE_PATTERNS) {
        if (pattern.test(sample)) {
            return true;
        }
    }
    return false;
}
|
|
466
|
+
/**
 * Strips boilerplate/noise elements from HTML and returns the cleaned markup.
 *
 * @param html     Raw markup. Returned unchanged when it contains no document,
 *                 fragment or noise markers, or when processing fails.
 * @param document Optional already-parsed document for `html`. When given it
 *                 is cleaned in place instead of parsing `html` again.
 * @param baseUrl  Optional base URL; when set, link/image URLs are made absolute.
 * @returns The serialized cleaned markup, or `html` as a fallback.
 */
export function removeNoiseFromHtml(html, document, baseUrl) {
    const shouldParse = isFullDocumentHtml(html) ||
        mayContainNoise(html) ||
        HTML_FRAGMENT_MARKERS.test(html);
    if (!shouldParse)
        return html;
    try {
        const context = getContext();
        if (config.noiseRemoval.debug) {
            // Report every enabled category, not just nav-footer.
            const { flags } = context;
            const categories = [
                flags.navFooter ? 'nav-footer' : '',
                flags.cookieBanners ? 'cookie-banners' : '',
                flags.newsletters ? 'newsletters' : '',
                flags.socialShare ? 'social-share' : '',
            ].filter(Boolean);
            logDebug('Noise removal audit enabled', { categories });
        }
        const doc = document ?? parseHTML(html).document;
        stripNoise(doc, context);
        if (baseUrl)
            resolveUrls(doc, baseUrl);
        return serialize(doc, html);
    }
    catch (error) {
        // Best effort: fall back to the input, but leave a trace for diagnosis.
        logDebug('Noise removal failed; returning original HTML', {
            error: error instanceof Error ? error.message : String(error),
        });
        return html;
    }
}
|
package/dist/errors.d.ts
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/**
 * Error describing a failed fetch of a URL.
 */
export declare class FetchError extends Error {
    /** The URL passed to the constructor. */
    readonly url: string;
    /** The `httpStatus` constructor argument, or 502 when it was omitted. */
    readonly statusCode: number;
    /** `HTTP_<status>` when a truthy `httpStatus` was given, otherwise `FETCH_ERROR`. */
    readonly code: string;
    /** Frozen object combining `url`, `httpStatus` and the caller-supplied `details`. */
    readonly details: Readonly<Record<string, unknown>>;
    constructor(
        message: string,
        url: string,
        httpStatus?: number,
        details?: Record<string, unknown>,
        options?: ErrorOptions,
    );
}
|
|
8
|
+
/**
 * Returns a printable message for any thrown value: an Error's message, a
 * non-empty string as-is, the `message` of an object that has a non-empty
 * string one, or an inspected rendering otherwise.
 */
export declare function getErrorMessage(error: unknown): string;

/** Creates an Error with the given message and attaches `code` to it. */
export declare function createErrorWithCode(
    message: string,
    code: string,
    options?: ErrorOptions,
): NodeJS.ErrnoException;

/** Type guard: true for Error values that carry a string `code` property. */
export declare function isSystemError(error: unknown): error is NodeJS.ErrnoException;
|
package/dist/errors.js
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
import { inspect } from 'node:util';
|
|
2
|
+
import { isError, isObject } from './type-guards.js';
|
|
3
|
+
const DEFAULT_HTTP_STATUS = 502;
|
|
4
|
+
export class FetchError extends Error {
|
|
5
|
+
url;
|
|
6
|
+
statusCode;
|
|
7
|
+
code;
|
|
8
|
+
details;
|
|
9
|
+
constructor(message, url, httpStatus, details = {}, options) {
|
|
10
|
+
super(message, options);
|
|
11
|
+
this.url = url;
|
|
12
|
+
this.name = 'FetchError';
|
|
13
|
+
this.statusCode = httpStatus ?? DEFAULT_HTTP_STATUS;
|
|
14
|
+
this.code = httpStatus ? `HTTP_${httpStatus}` : 'FETCH_ERROR';
|
|
15
|
+
this.details = Object.freeze({ url, httpStatus, ...details });
|
|
16
|
+
Error.captureStackTrace(this, this.constructor);
|
|
17
|
+
}
|
|
18
|
+
}
|
|
19
|
+
/**
 * Produces a printable message for any thrown value. Checks, in order:
 * Error instance → its message; non-empty string → the string itself;
 * object with a non-empty string `message` → that message; anything else →
 * formatUnknownError().
 */
export function getErrorMessage(error) {
    if (isError(error)) {
        return error.message;
    }
    const isNonEmptyString = typeof error === 'string' && error.length > 0;
    if (isNonEmptyString) {
        return error;
    }
    return isErrorWithMessage(error) ? error.message : formatUnknownError(error);
}
|
|
28
|
+
/** True when `error` passes isObject() and has a non-empty string `message`. */
function isErrorWithMessage(error) {
    if (isObject(error)) {
        const candidate = error.message;
        return typeof candidate === 'string' && candidate.length > 0;
    }
    return false;
}
|
|
34
|
+
/**
 * Renders an arbitrary thrown value as a single-line string via util.inspect
 * (depth 2, strings capped at 200 characters, no colors).
 * Returns 'Unknown error' for null/undefined or if inspection itself throws.
 */
function formatUnknownError(error) {
    const FALLBACK = 'Unknown error';
    if (error == null) {
        return FALLBACK;
    }
    const inspectOptions = {
        depth: 2,
        maxStringLength: 200,
        breakLength: Infinity,
        compact: true,
        colors: false,
    };
    try {
        return inspect(error, inspectOptions);
    }
    catch {
        return FALLBACK;
    }
}
|
|
50
|
+
export function createErrorWithCode(message, code, options) {
|
|
51
|
+
const error = new Error(message, options);
|
|
52
|
+
return Object.assign(error, { code });
|
|
53
|
+
}
|
|
54
|
+
/** Type guard: true when `error` passes isError() and has a string `code` property. */
export function isSystemError(error) {
    return isError(error) && 'code' in error && typeof error.code === 'string';
}
|
package/dist/fetch.d.ts
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
/** Options accepted by the fetch helpers declared in this file. */
export interface FetchOptions {
    /** Optional abort signal (presumably cancels the request — implementation not visible here). */
    signal?: AbortSignal;
}

/** Result returned by `transformToRawUrl`. */
export interface TransformResult {
    readonly url: string;
    readonly transformed: boolean;
    readonly platform?: string;
}

/** Per-request state handed between the fetch telemetry functions. */
export interface FetchTelemetryContext {
    requestId: string;
    // NOTE(review): unit/clock of startTime is not visible from this declaration — confirm.
    startTime: number;
    url: string;
    method: string;
    contextRequestId?: string;
    operationId?: string;
}
|
|
17
|
+
// NOTE(review): the implementations of these declarations are not visible here;
// comments below restate the signatures only.

export declare function isBlockedIp(ip: string): boolean;

export declare function normalizeUrl(urlString: string): {
    normalizedUrl: string;
    hostname: string;
};

export declare function validateAndNormalizeUrl(urlString: string): string;

export declare function transformToRawUrl(url: string): TransformResult;

export declare function isRawTextContentUrl(url: string): boolean;

/** Creates the telemetry context that the two `recordFetch*` functions accept. */
export declare function startFetchTelemetry(url: string, method: string): FetchTelemetryContext;

export declare function recordFetchResponse(
    context: FetchTelemetryContext,
    response: Response,
    contentSize?: number,
): void;

export declare function recordFetchError(
    context: FetchTelemetryContext,
    error: unknown,
    status?: number,
): void;

/** Resolves to a response together with a URL string (presumably the final URL — confirm). */
export declare function fetchWithRedirects(
    url: string,
    init: RequestInit,
    maxRedirects: number,
): Promise<{
    response: Response;
    url: string;
}>;

/** Resolves to the response text together with a numeric size. */
export declare function readResponseText(
    response: Response,
    url: string,
    maxBytes: number,
    signal?: AbortSignal,
    encoding?: string,
): Promise<{
    text: string;
    size: number;
}>;

export declare function fetchNormalizedUrl(normalizedUrl: string, options?: FetchOptions): Promise<string>;

/** Buffer-returning variant of `fetchNormalizedUrl`; also reports encoding, truncation and final URL. */
export declare function fetchNormalizedUrlBuffer(
    normalizedUrl: string,
    options?: FetchOptions,
): Promise<{
    buffer: Uint8Array;
    encoding: string;
    truncated: boolean;
    finalUrl: string;
}>;
|