@j0hanz/fetch-url-mcp 1.3.1 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +24 -21
- package/dist/cli.d.ts +3 -3
- package/dist/cli.js +15 -8
- package/dist/http/auth.d.ts +6 -6
- package/dist/http/auth.js +78 -23
- package/dist/http/health.d.ts +1 -2
- package/dist/http/health.js +7 -18
- package/dist/http/helpers.d.ts +3 -11
- package/dist/http/helpers.js +28 -26
- package/dist/http/native.d.ts +0 -1
- package/dist/http/native.js +63 -41
- package/dist/http/rate-limit.d.ts +2 -2
- package/dist/http/rate-limit.js +11 -16
- package/dist/index.d.ts +0 -1
- package/dist/index.js +17 -20
- package/dist/{markdown-cleanup.d.ts → lib/content.d.ts} +4 -2
- package/dist/lib/content.js +1356 -0
- package/dist/lib/core.d.ts +253 -0
- package/dist/lib/core.js +1228 -0
- package/dist/{tool-pipeline.d.ts → lib/fetch-pipeline.d.ts} +1 -3
- package/dist/{tool-pipeline.js → lib/fetch-pipeline.js} +18 -44
- package/dist/{fetch.d.ts → lib/http.d.ts} +7 -9
- package/dist/{fetch.js → lib/http.js} +721 -1004
- package/dist/lib/mcp-tools.d.ts +28 -0
- package/dist/lib/mcp-tools.js +107 -0
- package/dist/{tool-progress.d.ts → lib/progress.d.ts} +0 -2
- package/dist/{tool-progress.js → lib/progress.js} +9 -14
- package/dist/lib/task-handlers.d.ts +5 -0
- package/dist/{mcp.js → lib/task-handlers.js} +95 -31
- package/dist/lib/url.d.ts +70 -0
- package/dist/lib/url.js +686 -0
- package/dist/lib/utils.d.ts +58 -0
- package/dist/lib/utils.js +304 -0
- package/dist/{prompts.d.ts → prompts/index.d.ts} +0 -1
- package/dist/{prompts.js → prompts/index.js} +1 -2
- package/dist/{resources.d.ts → resources/index.d.ts} +0 -1
- package/dist/{resources.js → resources/index.js} +87 -64
- package/dist/{instructions.d.ts → resources/instructions.d.ts} +0 -1
- package/dist/{instructions.js → resources/instructions.js} +5 -3
- package/dist/schemas/inputs.d.ts +7 -0
- package/dist/schemas/inputs.js +24 -0
- package/dist/schemas/outputs.d.ts +23 -0
- package/dist/schemas/outputs.js +77 -0
- package/dist/server.d.ts +0 -1
- package/dist/server.js +26 -25
- package/dist/tasks/execution.d.ts +0 -1
- package/dist/tasks/execution.js +106 -70
- package/dist/tasks/manager.d.ts +11 -3
- package/dist/tasks/manager.js +97 -73
- package/dist/tasks/owner.d.ts +3 -3
- package/dist/tasks/owner.js +2 -2
- package/dist/tasks/tool-registry.d.ts +11 -0
- package/dist/tasks/tool-registry.js +13 -0
- package/dist/tools/fetch-url.d.ts +28 -0
- package/dist/{tools.js → tools/fetch-url.js} +95 -147
- package/dist/tools/index.d.ts +2 -0
- package/dist/tools/index.js +4 -0
- package/dist/transform/html-translators.d.ts +1 -0
- package/dist/transform/html-translators.js +454 -0
- package/dist/transform/metadata.d.ts +4 -0
- package/dist/transform/metadata.js +183 -0
- package/dist/transform/transform.d.ts +0 -1
- package/dist/transform/transform.js +44 -679
- package/dist/transform/types.d.ts +9 -12
- package/dist/transform/types.js +0 -1
- package/dist/transform/worker-pool.d.ts +0 -1
- package/dist/transform/worker-pool.js +7 -16
- package/dist/transform/workers/shared.d.ts +7 -0
- package/dist/transform/workers/shared.js +130 -0
- package/dist/transform/workers/transform-child.d.ts +0 -1
- package/dist/transform/workers/transform-child.js +5 -135
- package/dist/transform/workers/transform-worker.d.ts +0 -1
- package/dist/transform/workers/transform-worker.js +7 -128
- package/package.json +11 -7
- package/dist/cache.d.ts +0 -54
- package/dist/cache.d.ts.map +0 -1
- package/dist/cache.js +0 -261
- package/dist/cache.js.map +0 -1
- package/dist/cli.d.ts.map +0 -1
- package/dist/cli.js.map +0 -1
- package/dist/config.d.ts +0 -141
- package/dist/config.d.ts.map +0 -1
- package/dist/config.js +0 -473
- package/dist/config.js.map +0 -1
- package/dist/crypto.d.ts +0 -4
- package/dist/crypto.d.ts.map +0 -1
- package/dist/crypto.js +0 -56
- package/dist/crypto.js.map +0 -1
- package/dist/dom-noise-removal.d.ts +0 -2
- package/dist/dom-noise-removal.d.ts.map +0 -1
- package/dist/dom-noise-removal.js +0 -494
- package/dist/dom-noise-removal.js.map +0 -1
- package/dist/download.d.ts +0 -4
- package/dist/download.d.ts.map +0 -1
- package/dist/download.js +0 -106
- package/dist/download.js.map +0 -1
- package/dist/errors.d.ts +0 -11
- package/dist/errors.d.ts.map +0 -1
- package/dist/errors.js +0 -65
- package/dist/errors.js.map +0 -1
- package/dist/examples/mcp-fetch-url-client.js +0 -329
- package/dist/examples/mcp-fetch-url-client.js.map +0 -1
- package/dist/fetch-content.d.ts +0 -5
- package/dist/fetch-content.d.ts.map +0 -1
- package/dist/fetch-content.js +0 -164
- package/dist/fetch-content.js.map +0 -1
- package/dist/fetch-stream.d.ts +0 -5
- package/dist/fetch-stream.d.ts.map +0 -1
- package/dist/fetch-stream.js +0 -29
- package/dist/fetch-stream.js.map +0 -1
- package/dist/fetch.d.ts.map +0 -1
- package/dist/fetch.js.map +0 -1
- package/dist/host-normalization.d.ts +0 -2
- package/dist/host-normalization.d.ts.map +0 -1
- package/dist/host-normalization.js +0 -91
- package/dist/host-normalization.js.map +0 -1
- package/dist/http/auth.d.ts.map +0 -1
- package/dist/http/auth.js.map +0 -1
- package/dist/http/health.d.ts.map +0 -1
- package/dist/http/health.js.map +0 -1
- package/dist/http/helpers.d.ts.map +0 -1
- package/dist/http/helpers.js.map +0 -1
- package/dist/http/native.d.ts.map +0 -1
- package/dist/http/native.js.map +0 -1
- package/dist/http/rate-limit.d.ts.map +0 -1
- package/dist/http/rate-limit.js.map +0 -1
- package/dist/index.d.ts.map +0 -1
- package/dist/index.js.map +0 -1
- package/dist/instructions.d.ts.map +0 -1
- package/dist/instructions.js.map +0 -1
- package/dist/ip-blocklist.d.ts +0 -9
- package/dist/ip-blocklist.d.ts.map +0 -1
- package/dist/ip-blocklist.js +0 -79
- package/dist/ip-blocklist.js.map +0 -1
- package/dist/json.d.ts +0 -2
- package/dist/json.d.ts.map +0 -1
- package/dist/json.js +0 -45
- package/dist/json.js.map +0 -1
- package/dist/language-detection.d.ts +0 -3
- package/dist/language-detection.d.ts.map +0 -1
- package/dist/language-detection.js +0 -355
- package/dist/language-detection.js.map +0 -1
- package/dist/markdown-cleanup.d.ts.map +0 -1
- package/dist/markdown-cleanup.js +0 -534
- package/dist/markdown-cleanup.js.map +0 -1
- package/dist/mcp-validator.d.ts +0 -17
- package/dist/mcp-validator.d.ts.map +0 -1
- package/dist/mcp-validator.js +0 -45
- package/dist/mcp-validator.js.map +0 -1
- package/dist/mcp.d.ts +0 -4
- package/dist/mcp.d.ts.map +0 -1
- package/dist/mcp.js.map +0 -1
- package/dist/observability.d.ts +0 -23
- package/dist/observability.d.ts.map +0 -1
- package/dist/observability.js +0 -238
- package/dist/observability.js.map +0 -1
- package/dist/prompts.d.ts.map +0 -1
- package/dist/prompts.js.map +0 -1
- package/dist/resources.d.ts.map +0 -1
- package/dist/resources.js.map +0 -1
- package/dist/server-tuning.d.ts +0 -15
- package/dist/server-tuning.d.ts.map +0 -1
- package/dist/server-tuning.js +0 -49
- package/dist/server-tuning.js.map +0 -1
- package/dist/server.d.ts.map +0 -1
- package/dist/server.js.map +0 -1
- package/dist/session.d.ts +0 -42
- package/dist/session.d.ts.map +0 -1
- package/dist/session.js +0 -255
- package/dist/session.js.map +0 -1
- package/dist/tasks/execution.d.ts.map +0 -1
- package/dist/tasks/execution.js.map +0 -1
- package/dist/tasks/manager.d.ts.map +0 -1
- package/dist/tasks/manager.js.map +0 -1
- package/dist/tasks/owner.d.ts.map +0 -1
- package/dist/tasks/owner.js.map +0 -1
- package/dist/timer-utils.d.ts +0 -6
- package/dist/timer-utils.d.ts.map +0 -1
- package/dist/timer-utils.js +0 -27
- package/dist/timer-utils.js.map +0 -1
- package/dist/tool-errors.d.ts +0 -12
- package/dist/tool-errors.d.ts.map +0 -1
- package/dist/tool-errors.js +0 -55
- package/dist/tool-errors.js.map +0 -1
- package/dist/tool-pipeline.d.ts.map +0 -1
- package/dist/tool-pipeline.js.map +0 -1
- package/dist/tool-progress.d.ts.map +0 -1
- package/dist/tool-progress.js.map +0 -1
- package/dist/tools.d.ts +0 -54
- package/dist/tools.d.ts.map +0 -1
- package/dist/tools.js.map +0 -1
- package/dist/transform/transform.d.ts.map +0 -1
- package/dist/transform/transform.js.map +0 -1
- package/dist/transform/types.d.ts.map +0 -1
- package/dist/transform/types.js.map +0 -1
- package/dist/transform/worker-pool.d.ts.map +0 -1
- package/dist/transform/worker-pool.js.map +0 -1
- package/dist/transform/workers/transform-child.d.ts.map +0 -1
- package/dist/transform/workers/transform-child.js.map +0 -1
- package/dist/transform/workers/transform-worker.d.ts.map +0 -1
- package/dist/transform/workers/transform-worker.js.map +0 -1
- package/dist/type-guards.d.ts +0 -16
- package/dist/type-guards.d.ts.map +0 -1
- package/dist/type-guards.js +0 -13
- package/dist/type-guards.js.map +0 -1
|
@@ -3,15 +3,16 @@ import diagnosticsChannel from 'node:diagnostics_channel';
|
|
|
3
3
|
import { performance } from 'node:perf_hooks';
|
|
4
4
|
import { isProbablyReaderable, Readability } from '@mozilla/readability';
|
|
5
5
|
import { parseHTML } from 'linkedom';
|
|
6
|
-
import {
|
|
7
|
-
import {
|
|
8
|
-
import {
|
|
9
|
-
import {
|
|
10
|
-
import { isRawTextContentUrl } from '../
|
|
11
|
-
import {
|
|
12
|
-
import {
|
|
13
|
-
import {
|
|
14
|
-
import {
|
|
6
|
+
import { removeNoiseFromHtml } from '../lib/content.js';
|
|
7
|
+
import { addSourceToMarkdown, buildMetadataFooter, cleanupMarkdownArtifacts, extractTitleFromRawMarkdown, isRawTextContent, } from '../lib/content.js';
|
|
8
|
+
import { config } from '../lib/core.js';
|
|
9
|
+
import { getOperationId, getRequestId, logDebug, logError, logInfo, logWarn, redactUrl, } from '../lib/core.js';
|
|
10
|
+
import { isRawTextContentUrl } from '../lib/http.js';
|
|
11
|
+
import { createAbortError, throwIfAborted } from '../lib/utils.js';
|
|
12
|
+
import { FetchError, getErrorMessage } from '../lib/utils.js';
|
|
13
|
+
import { isObject } from '../lib/utils.js';
|
|
14
|
+
import { translateHtmlFragmentToMarkdown } from './html-translators.js';
|
|
15
|
+
import { extractMetadata, extractMetadataFromHead, mergeMetadata, } from './metadata.js';
|
|
15
16
|
import { getOrCreateWorkerPool, getWorkerPoolStats, shutdownWorkerPool, } from './worker-pool.js';
|
|
16
17
|
const utf8Decoder = new TextDecoder('utf-8');
|
|
17
18
|
function decodeInput(input, encoding) {
|
|
@@ -30,43 +31,9 @@ function decodeInput(input, encoding) {
|
|
|
30
31
|
return utf8Decoder.decode(input);
|
|
31
32
|
}
|
|
32
33
|
}
|
|
33
|
-
function getTagName(node) {
|
|
34
|
-
if (!isLikeNode(node))
|
|
35
|
-
return '';
|
|
36
|
-
const raw = node.tagName;
|
|
37
|
-
return typeof raw === 'string' ? raw.toUpperCase() : '';
|
|
38
|
-
}
|
|
39
34
|
function asError(value) {
|
|
40
35
|
return value instanceof Error ? value : undefined;
|
|
41
36
|
}
|
|
42
|
-
function getAbortReason(signal) {
|
|
43
|
-
const record = isObject(signal) ? signal : null;
|
|
44
|
-
return record && 'reason' in record ? record['reason'] : undefined;
|
|
45
|
-
}
|
|
46
|
-
function isTimeoutAbortReason(reason) {
|
|
47
|
-
return reason instanceof Error && reason.name === 'TimeoutError';
|
|
48
|
-
}
|
|
49
|
-
function throwIfAborted(signal, url, stage) {
|
|
50
|
-
if (!signal?.aborted)
|
|
51
|
-
return;
|
|
52
|
-
const reason = getAbortReason(signal);
|
|
53
|
-
if (isTimeoutAbortReason(reason)) {
|
|
54
|
-
throw new FetchError('Request timeout', url, 504, {
|
|
55
|
-
reason: 'timeout',
|
|
56
|
-
stage,
|
|
57
|
-
});
|
|
58
|
-
}
|
|
59
|
-
throw new FetchError('Request was canceled', url, 499, {
|
|
60
|
-
reason: 'aborted',
|
|
61
|
-
stage,
|
|
62
|
-
});
|
|
63
|
-
}
|
|
64
|
-
function createAbortError(url, stage) {
|
|
65
|
-
return new FetchError('Request was canceled', url, 499, {
|
|
66
|
-
reason: 'aborted',
|
|
67
|
-
stage,
|
|
68
|
-
});
|
|
69
|
-
}
|
|
70
37
|
const abortPolicy = { throwIfAborted, createAbortError };
|
|
71
38
|
function buildTransformSignal(signal) {
|
|
72
39
|
const { timeoutMs } = config.transform;
|
|
@@ -213,7 +180,22 @@ function trimUtf8Buffer(buffer, maxBytes) {
|
|
|
213
180
|
function trimDanglingTagFragment(content) {
|
|
214
181
|
const lastOpen = content.lastIndexOf('<');
|
|
215
182
|
const lastClose = content.lastIndexOf('>');
|
|
216
|
-
|
|
183
|
+
if (lastOpen > lastClose) {
|
|
184
|
+
if (lastOpen === content.length - 1) {
|
|
185
|
+
return content.substring(0, lastOpen);
|
|
186
|
+
}
|
|
187
|
+
const code = content.codePointAt(lastOpen + 1);
|
|
188
|
+
if (code !== undefined &&
|
|
189
|
+
(code === 47 || // '/'
|
|
190
|
+
code === 33 || // '!'
|
|
191
|
+
code === 63 || // '?'
|
|
192
|
+
(code >= 65 && code <= 90) || // A-Z
|
|
193
|
+
(code >= 97 && code <= 122)) // a-z
|
|
194
|
+
) {
|
|
195
|
+
return content.substring(0, lastOpen);
|
|
196
|
+
}
|
|
197
|
+
}
|
|
198
|
+
return content;
|
|
217
199
|
}
|
|
218
200
|
function truncateHtml(html, inputTruncated = false) {
|
|
219
201
|
const maxSize = config.constants.maxHtmlSize;
|
|
@@ -241,184 +223,6 @@ function willTruncate(html) {
|
|
|
241
223
|
const maxSize = config.constants.maxHtmlSize;
|
|
242
224
|
return (maxSize > 0 && (html.length > maxSize || getUtf8ByteLength(html) > maxSize));
|
|
243
225
|
}
|
|
244
|
-
const HEAD_END_PATTERN = /<\/head\s*>|<body\b/i;
|
|
245
|
-
const MAX_HEAD_SCAN_LENGTH = 50_000;
|
|
246
|
-
function extractHeadSection(html) {
|
|
247
|
-
if (html.length <= MAX_HEAD_SCAN_LENGTH) {
|
|
248
|
-
const match = HEAD_END_PATTERN.exec(html);
|
|
249
|
-
return match ? html.substring(0, match.index) : null;
|
|
250
|
-
}
|
|
251
|
-
const searchText = html.substring(0, MAX_HEAD_SCAN_LENGTH);
|
|
252
|
-
const match = HEAD_END_PATTERN.exec(searchText);
|
|
253
|
-
if (!match)
|
|
254
|
-
return null;
|
|
255
|
-
return html.substring(0, match.index);
|
|
256
|
-
}
|
|
257
|
-
function extractMetadataFromHead(html, baseUrl) {
|
|
258
|
-
const headSection = extractHeadSection(html);
|
|
259
|
-
if (!headSection)
|
|
260
|
-
return null;
|
|
261
|
-
try {
|
|
262
|
-
const { document } = parseHTML(`<!DOCTYPE html><html>${headSection}</head><body></body></html>`);
|
|
263
|
-
return extractMetadata(document, baseUrl);
|
|
264
|
-
}
|
|
265
|
-
catch {
|
|
266
|
-
return null;
|
|
267
|
-
}
|
|
268
|
-
}
|
|
269
|
-
function mergeMetadata(early, late) {
|
|
270
|
-
if (!early)
|
|
271
|
-
return late;
|
|
272
|
-
const merged = {};
|
|
273
|
-
const title = late.title ?? early.title;
|
|
274
|
-
const description = late.description ?? early.description;
|
|
275
|
-
const author = late.author ?? early.author;
|
|
276
|
-
const image = late.image ?? early.image;
|
|
277
|
-
const publishedAt = late.publishedAt ?? early.publishedAt;
|
|
278
|
-
const modifiedAt = late.modifiedAt ?? early.modifiedAt;
|
|
279
|
-
if (title !== undefined)
|
|
280
|
-
merged.title = title;
|
|
281
|
-
if (description !== undefined)
|
|
282
|
-
merged.description = description;
|
|
283
|
-
if (author !== undefined)
|
|
284
|
-
merged.author = author;
|
|
285
|
-
if (image !== undefined)
|
|
286
|
-
merged.image = image;
|
|
287
|
-
if (publishedAt !== undefined)
|
|
288
|
-
merged.publishedAt = publishedAt;
|
|
289
|
-
if (modifiedAt !== undefined)
|
|
290
|
-
merged.modifiedAt = modifiedAt;
|
|
291
|
-
return merged;
|
|
292
|
-
}
|
|
293
|
-
const META_PROPERTY_HANDLERS = new Map([
|
|
294
|
-
[
|
|
295
|
-
'og:title',
|
|
296
|
-
(ctx, c) => {
|
|
297
|
-
ctx.title.og = c;
|
|
298
|
-
},
|
|
299
|
-
],
|
|
300
|
-
[
|
|
301
|
-
'og:description',
|
|
302
|
-
(ctx, c) => {
|
|
303
|
-
ctx.description.og = c;
|
|
304
|
-
},
|
|
305
|
-
],
|
|
306
|
-
[
|
|
307
|
-
'og:image',
|
|
308
|
-
(ctx, c) => {
|
|
309
|
-
ctx.image = c;
|
|
310
|
-
},
|
|
311
|
-
],
|
|
312
|
-
[
|
|
313
|
-
'article:published_time',
|
|
314
|
-
(ctx, c) => {
|
|
315
|
-
ctx.publishedAt = c;
|
|
316
|
-
},
|
|
317
|
-
],
|
|
318
|
-
[
|
|
319
|
-
'article:modified_time',
|
|
320
|
-
(ctx, c) => {
|
|
321
|
-
ctx.modifiedAt = c;
|
|
322
|
-
},
|
|
323
|
-
],
|
|
324
|
-
]);
|
|
325
|
-
const META_NAME_HANDLERS = new Map([
|
|
326
|
-
[
|
|
327
|
-
'twitter:title',
|
|
328
|
-
(ctx, c) => {
|
|
329
|
-
ctx.title.twitter = c;
|
|
330
|
-
},
|
|
331
|
-
],
|
|
332
|
-
[
|
|
333
|
-
'twitter:description',
|
|
334
|
-
(ctx, c) => {
|
|
335
|
-
ctx.description.twitter = c;
|
|
336
|
-
},
|
|
337
|
-
],
|
|
338
|
-
[
|
|
339
|
-
'description',
|
|
340
|
-
(ctx, c) => {
|
|
341
|
-
ctx.description.standard = c;
|
|
342
|
-
},
|
|
343
|
-
],
|
|
344
|
-
[
|
|
345
|
-
'author',
|
|
346
|
-
(ctx, c) => {
|
|
347
|
-
ctx.author = c;
|
|
348
|
-
},
|
|
349
|
-
],
|
|
350
|
-
]);
|
|
351
|
-
function processMetaTag(ctx, tag) {
|
|
352
|
-
const content = tag.getAttribute('content')?.trim();
|
|
353
|
-
if (!content)
|
|
354
|
-
return;
|
|
355
|
-
const property = tag.getAttribute('property');
|
|
356
|
-
if (property)
|
|
357
|
-
META_PROPERTY_HANDLERS.get(property)?.(ctx, content);
|
|
358
|
-
const name = tag.getAttribute('name');
|
|
359
|
-
if (name)
|
|
360
|
-
META_NAME_HANDLERS.get(name)?.(ctx, content);
|
|
361
|
-
}
|
|
362
|
-
function buildMetaContext(document) {
|
|
363
|
-
const ctx = { title: {}, description: {} };
|
|
364
|
-
for (const tag of document.querySelectorAll('meta')) {
|
|
365
|
-
processMetaTag(ctx, tag);
|
|
366
|
-
}
|
|
367
|
-
const titleEl = document.querySelector('title');
|
|
368
|
-
if (!ctx.title.standard && titleEl?.textContent) {
|
|
369
|
-
ctx.title.standard = titleEl.textContent.trim();
|
|
370
|
-
}
|
|
371
|
-
return ctx;
|
|
372
|
-
}
|
|
373
|
-
function resolveMetadataFromContext(ctx) {
|
|
374
|
-
const metadata = {};
|
|
375
|
-
const resolvedTitle = ctx.title.og ?? ctx.title.twitter ?? ctx.title.standard;
|
|
376
|
-
const resolvedDesc = ctx.description.og ?? ctx.description.twitter ?? ctx.description.standard;
|
|
377
|
-
if (resolvedTitle)
|
|
378
|
-
metadata.title = resolvedTitle;
|
|
379
|
-
if (resolvedDesc)
|
|
380
|
-
metadata.description = resolvedDesc;
|
|
381
|
-
if (ctx.author)
|
|
382
|
-
metadata.author = ctx.author;
|
|
383
|
-
if (ctx.image)
|
|
384
|
-
metadata.image = ctx.image;
|
|
385
|
-
if (ctx.publishedAt)
|
|
386
|
-
metadata.publishedAt = ctx.publishedAt;
|
|
387
|
-
if (ctx.modifiedAt)
|
|
388
|
-
metadata.modifiedAt = ctx.modifiedAt;
|
|
389
|
-
return metadata;
|
|
390
|
-
}
|
|
391
|
-
function extractMetadata(document, baseUrl) {
|
|
392
|
-
const ctx = buildMetaContext(document);
|
|
393
|
-
const metadata = resolveMetadataFromContext(ctx);
|
|
394
|
-
if (baseUrl) {
|
|
395
|
-
const icon32 = document.querySelector('link[rel="icon"][sizes="32x32"]');
|
|
396
|
-
const href = icon32?.getAttribute('href');
|
|
397
|
-
if (href) {
|
|
398
|
-
const resolved = resolveFaviconUrl(href, baseUrl);
|
|
399
|
-
if (resolved)
|
|
400
|
-
metadata.favicon = resolved;
|
|
401
|
-
}
|
|
402
|
-
}
|
|
403
|
-
return metadata;
|
|
404
|
-
}
|
|
405
|
-
function resolveFaviconUrl(href, baseUrl) {
|
|
406
|
-
const trimmed = href.trim();
|
|
407
|
-
if (!trimmed)
|
|
408
|
-
return undefined;
|
|
409
|
-
if (trimmed.toLowerCase().startsWith('data:'))
|
|
410
|
-
return undefined;
|
|
411
|
-
try {
|
|
412
|
-
const resolved = new URL(trimmed, baseUrl);
|
|
413
|
-
if (resolved.protocol !== 'http:' && resolved.protocol !== 'https:') {
|
|
414
|
-
return undefined;
|
|
415
|
-
}
|
|
416
|
-
return resolved.toString();
|
|
417
|
-
}
|
|
418
|
-
catch {
|
|
419
|
-
return undefined;
|
|
420
|
-
}
|
|
421
|
-
}
|
|
422
226
|
function isReadabilityCompatible(doc) {
|
|
423
227
|
if (!isObject(doc))
|
|
424
228
|
return false;
|
|
@@ -588,433 +392,6 @@ export function extractContent(html, url, options = {
|
|
|
588
392
|
const result = extractContentContext(html, url, options);
|
|
589
393
|
return { article: result.article, metadata: result.metadata };
|
|
590
394
|
}
|
|
591
|
-
const CODE_BLOCK = {
|
|
592
|
-
fence: '```',
|
|
593
|
-
format: (code, language = '') => `\`\`\`${language}\n${code}\n\`\`\``,
|
|
594
|
-
};
|
|
595
|
-
function buildInlineCode(content) {
|
|
596
|
-
const trimmed = content.trim();
|
|
597
|
-
if (!trimmed)
|
|
598
|
-
return '``';
|
|
599
|
-
let maxBackticks = 0;
|
|
600
|
-
let currentRun = 0;
|
|
601
|
-
for (const char of trimmed) {
|
|
602
|
-
if (char === '`')
|
|
603
|
-
currentRun += 1;
|
|
604
|
-
else {
|
|
605
|
-
if (currentRun > maxBackticks)
|
|
606
|
-
maxBackticks = currentRun;
|
|
607
|
-
currentRun = 0;
|
|
608
|
-
}
|
|
609
|
-
}
|
|
610
|
-
if (currentRun > maxBackticks)
|
|
611
|
-
maxBackticks = currentRun;
|
|
612
|
-
const delimiter = '`'.repeat(maxBackticks + 1);
|
|
613
|
-
const padding = trimmed.startsWith('`') || trimmed.endsWith('`') ? ' ' : '';
|
|
614
|
-
return `${delimiter}${padding}${trimmed}${padding}${delimiter}`;
|
|
615
|
-
}
|
|
616
|
-
function deriveAltFromImageUrl(src) {
|
|
617
|
-
if (!src)
|
|
618
|
-
return '';
|
|
619
|
-
try {
|
|
620
|
-
const isAbsolute = URL.canParse(src);
|
|
621
|
-
let parsed = null;
|
|
622
|
-
if (isAbsolute) {
|
|
623
|
-
parsed = new URL(src);
|
|
624
|
-
}
|
|
625
|
-
else if (URL.canParse(src, 'http://localhost')) {
|
|
626
|
-
parsed = new URL(src, 'http://localhost');
|
|
627
|
-
}
|
|
628
|
-
if (!parsed)
|
|
629
|
-
return '';
|
|
630
|
-
if (isAbsolute &&
|
|
631
|
-
parsed.protocol !== 'http:' &&
|
|
632
|
-
parsed.protocol !== 'https:') {
|
|
633
|
-
return '';
|
|
634
|
-
}
|
|
635
|
-
const { pathname } = parsed;
|
|
636
|
-
const segments = pathname.split('/');
|
|
637
|
-
const filename = segments.pop() ?? '';
|
|
638
|
-
if (!filename)
|
|
639
|
-
return '';
|
|
640
|
-
const dotIndex = filename.lastIndexOf('.');
|
|
641
|
-
const name = dotIndex > 0 ? filename.slice(0, dotIndex) : filename;
|
|
642
|
-
return name.replace(/[_-]+/g, ' ').trim();
|
|
643
|
-
}
|
|
644
|
-
catch {
|
|
645
|
-
return '';
|
|
646
|
-
}
|
|
647
|
-
}
|
|
648
|
-
function hasGetAttribute(value) {
|
|
649
|
-
return (isObject(value) &&
|
|
650
|
-
typeof value.getAttribute === 'function');
|
|
651
|
-
}
|
|
652
|
-
function isCodeBlock(parent) {
|
|
653
|
-
const tagName = getTagName(parent);
|
|
654
|
-
return tagName === 'PRE' || tagName === 'WRAPPED-PRE';
|
|
655
|
-
}
|
|
656
|
-
function isAnchor(node) {
|
|
657
|
-
return getTagName(node) === 'A';
|
|
658
|
-
}
|
|
659
|
-
function resolveAttributeLanguage(node) {
|
|
660
|
-
const getAttribute = hasGetAttribute(node)
|
|
661
|
-
? node.getAttribute.bind(node)
|
|
662
|
-
: undefined;
|
|
663
|
-
const className = getAttribute?.('class') ?? '';
|
|
664
|
-
const dataLanguage = getAttribute?.('data-language') ?? '';
|
|
665
|
-
return resolveLanguageFromAttributes(className, dataLanguage);
|
|
666
|
-
}
|
|
667
|
-
function findLanguageFromCodeChild(node) {
|
|
668
|
-
if (!isLikeNode(node))
|
|
669
|
-
return undefined;
|
|
670
|
-
const childNodes = Array.from(node.childNodes ?? []);
|
|
671
|
-
for (const child of childNodes) {
|
|
672
|
-
if (!isLikeNode(child))
|
|
673
|
-
continue;
|
|
674
|
-
const raw = child.rawTagName;
|
|
675
|
-
const tagName = typeof raw === 'string' ? raw.toUpperCase() : '';
|
|
676
|
-
if (tagName === 'CODE')
|
|
677
|
-
return resolveAttributeLanguage(child);
|
|
678
|
-
}
|
|
679
|
-
return undefined;
|
|
680
|
-
}
|
|
681
|
-
function createCodeBlockPostprocessor(language) {
|
|
682
|
-
return ({ content }) => {
|
|
683
|
-
const trimmed = content.trim();
|
|
684
|
-
if (!trimmed)
|
|
685
|
-
return '';
|
|
686
|
-
const resolvedLanguage = language ?? detectLanguageFromCode(trimmed) ?? '';
|
|
687
|
-
return CODE_BLOCK.format(trimmed, resolvedLanguage);
|
|
688
|
-
};
|
|
689
|
-
}
|
|
690
|
-
function buildInlineCodeTranslator() {
|
|
691
|
-
return {
|
|
692
|
-
spaceIfRepeatingChar: true,
|
|
693
|
-
noEscape: true,
|
|
694
|
-
postprocess: ({ content }) => buildInlineCode(content),
|
|
695
|
-
};
|
|
696
|
-
}
|
|
697
|
-
function buildCodeTranslator(ctx) {
|
|
698
|
-
const inlineCodeTranslator = buildInlineCodeTranslator();
|
|
699
|
-
if (!isObject(ctx))
|
|
700
|
-
return inlineCodeTranslator;
|
|
701
|
-
const { parent } = ctx;
|
|
702
|
-
if (!isCodeBlock(parent))
|
|
703
|
-
return inlineCodeTranslator;
|
|
704
|
-
return { noEscape: true, preserveWhitespace: true };
|
|
705
|
-
}
|
|
706
|
-
function extractFirstSrcsetUrl(srcset) {
|
|
707
|
-
const first = srcset.split(',')[0];
|
|
708
|
-
if (!first)
|
|
709
|
-
return '';
|
|
710
|
-
return first.trim().split(/\s+/)[0] ?? '';
|
|
711
|
-
}
|
|
712
|
-
const LAZY_SRC_ATTRIBUTES = [
|
|
713
|
-
'data-src',
|
|
714
|
-
'data-lazy-src',
|
|
715
|
-
'data-original',
|
|
716
|
-
'data-srcset',
|
|
717
|
-
];
|
|
718
|
-
function isDataUri(value) {
|
|
719
|
-
return value.startsWith('data:');
|
|
720
|
-
}
|
|
721
|
-
function extractNonDataSrcsetUrl(value) {
|
|
722
|
-
const url = extractFirstSrcsetUrl(value);
|
|
723
|
-
return url && !isDataUri(url) ? url : undefined;
|
|
724
|
-
}
|
|
725
|
-
function resolveLazySrc(getAttribute) {
|
|
726
|
-
for (const attr of LAZY_SRC_ATTRIBUTES) {
|
|
727
|
-
const lazy = getAttribute(attr);
|
|
728
|
-
if (!lazy || isDataUri(lazy))
|
|
729
|
-
continue;
|
|
730
|
-
if (attr === 'data-srcset') {
|
|
731
|
-
const url = extractNonDataSrcsetUrl(lazy);
|
|
732
|
-
if (url)
|
|
733
|
-
return url;
|
|
734
|
-
continue;
|
|
735
|
-
}
|
|
736
|
-
return lazy;
|
|
737
|
-
}
|
|
738
|
-
return undefined;
|
|
739
|
-
}
|
|
740
|
-
function resolveImageSrc(getAttribute) {
|
|
741
|
-
if (!getAttribute)
|
|
742
|
-
return '';
|
|
743
|
-
const srcRaw = getAttribute('src') ?? '';
|
|
744
|
-
if (srcRaw && !isDataUri(srcRaw))
|
|
745
|
-
return srcRaw;
|
|
746
|
-
// First check common lazy-loading attributes that may contain non-data URLs before falling back to the native srcset, as some sites use data URIs in lazy attributes while still providing valid URLs in srcset.
|
|
747
|
-
const lazySrc = resolveLazySrc(getAttribute);
|
|
748
|
-
if (lazySrc)
|
|
749
|
-
return lazySrc;
|
|
750
|
-
// If the src is a data URI or missing, check srcset for a valid URL. Some sites use srcset with data URIs in src and actual URLs in srcset for responsive images.
|
|
751
|
-
const srcset = getAttribute('srcset');
|
|
752
|
-
if (srcset) {
|
|
753
|
-
const url = extractNonDataSrcsetUrl(srcset);
|
|
754
|
-
if (url)
|
|
755
|
-
return url;
|
|
756
|
-
}
|
|
757
|
-
// If the only available src is a data URI, we choose to omit it rather than include the raw data in the alt text or URL, as data URIs can be very long and are not useful in Markdown output.
|
|
758
|
-
if (isDataUri(srcRaw))
|
|
759
|
-
return '[data URI removed]';
|
|
760
|
-
return '';
|
|
761
|
-
}
|
|
762
|
-
function buildImageTranslator(ctx) {
|
|
763
|
-
if (!isObject(ctx))
|
|
764
|
-
return { content: '' };
|
|
765
|
-
const { node, parent } = ctx;
|
|
766
|
-
const getAttribute = hasGetAttribute(node)
|
|
767
|
-
? node.getAttribute.bind(node)
|
|
768
|
-
: undefined;
|
|
769
|
-
const src = resolveImageSrc(getAttribute);
|
|
770
|
-
const existingAlt = getAttribute?.('alt') ?? '';
|
|
771
|
-
const alt = existingAlt.trim() || deriveAltFromImageUrl(src);
|
|
772
|
-
const markdown = ``;
|
|
773
|
-
if (isAnchor(parent)) {
|
|
774
|
-
return { content: markdown };
|
|
775
|
-
}
|
|
776
|
-
return { content: `\n\n${markdown}\n\n` };
|
|
777
|
-
}
|
|
778
|
-
const GFM_ALERT_MAP = new Map([
|
|
779
|
-
['note', 'NOTE'],
|
|
780
|
-
['info', 'NOTE'],
|
|
781
|
-
['tip', 'TIP'],
|
|
782
|
-
['hint', 'TIP'],
|
|
783
|
-
['warning', 'WARNING'],
|
|
784
|
-
['warn', 'WARNING'],
|
|
785
|
-
['caution', 'CAUTION'],
|
|
786
|
-
['danger', 'CAUTION'],
|
|
787
|
-
['important', 'IMPORTANT'],
|
|
788
|
-
]);
|
|
789
|
-
function resolveGfmAlertType(className) {
|
|
790
|
-
const lower = className.toLowerCase();
|
|
791
|
-
for (const [key, type] of GFM_ALERT_MAP) {
|
|
792
|
-
if (lower.includes(key))
|
|
793
|
-
return type;
|
|
794
|
-
}
|
|
795
|
-
return undefined;
|
|
796
|
-
}
|
|
797
|
-
function resolveDlNodeName(child) {
|
|
798
|
-
if (!isLikeNode(child))
|
|
799
|
-
return '';
|
|
800
|
-
const raw = child.nodeName;
|
|
801
|
-
return typeof raw === 'string' ? raw.toUpperCase() : '';
|
|
802
|
-
}
|
|
803
|
-
function resolveDlTextContent(child) {
|
|
804
|
-
if (!isLikeNode(child))
|
|
805
|
-
return '';
|
|
806
|
-
const raw = child.textContent;
|
|
807
|
-
return typeof raw === 'string' ? raw.trim() : '';
|
|
808
|
-
}
|
|
809
|
-
function buildDlChildFragment(child) {
|
|
810
|
-
const nodeName = resolveDlNodeName(child);
|
|
811
|
-
if (nodeName === 'DT')
|
|
812
|
-
return `**${resolveDlTextContent(child)}**\n`;
|
|
813
|
-
if (nodeName === 'DD')
|
|
814
|
-
return `: ${resolveDlTextContent(child)}\n`;
|
|
815
|
-
return null;
|
|
816
|
-
}
|
|
817
|
-
function hasComplexTableLayout(node) {
|
|
818
|
-
if (!isLikeNode(node))
|
|
819
|
-
return false;
|
|
820
|
-
const innerHTML = typeof node.innerHTML === 'string' ? node.innerHTML : '';
|
|
821
|
-
return /(?:colspan|rowspan)=["']?[2-9]/i.test(innerHTML);
|
|
822
|
-
}
|
|
823
|
-
function buildPreTranslator(ctx) {
|
|
824
|
-
if (!isObject(ctx))
|
|
825
|
-
return {};
|
|
826
|
-
const { node } = ctx;
|
|
827
|
-
const attributeLanguage = resolveAttributeLanguage(node) ?? findLanguageFromCodeChild(node);
|
|
828
|
-
return {
|
|
829
|
-
noEscape: true,
|
|
830
|
-
preserveWhitespace: true,
|
|
831
|
-
postprocess: createCodeBlockPostprocessor(attributeLanguage),
|
|
832
|
-
};
|
|
833
|
-
}
|
|
834
|
-
function createCustomTranslators() {
|
|
835
|
-
return {
|
|
836
|
-
code: (ctx) => buildCodeTranslator(ctx),
|
|
837
|
-
img: (ctx) => buildImageTranslator(ctx),
|
|
838
|
-
table: (ctx) => {
|
|
839
|
-
if (!isObject(ctx))
|
|
840
|
-
return {};
|
|
841
|
-
const { node } = ctx;
|
|
842
|
-
if (hasComplexTableLayout(node)) {
|
|
843
|
-
return {
|
|
844
|
-
postprocess: ({ content }) => {
|
|
845
|
-
const trimmed = content.trim();
|
|
846
|
-
if (!trimmed)
|
|
847
|
-
return '';
|
|
848
|
-
return `\n\n${trimmed}\n\n`;
|
|
849
|
-
},
|
|
850
|
-
};
|
|
851
|
-
}
|
|
852
|
-
return {};
|
|
853
|
-
},
|
|
854
|
-
dl: (ctx) => {
|
|
855
|
-
if (!isObject(ctx))
|
|
856
|
-
return { content: '' };
|
|
857
|
-
const { node } = ctx;
|
|
858
|
-
if (!isLikeNode(node))
|
|
859
|
-
return { content: '' };
|
|
860
|
-
const childNodes = Array.from(node.childNodes ?? []);
|
|
861
|
-
let items = '';
|
|
862
|
-
for (const child of childNodes) {
|
|
863
|
-
const fragment = buildDlChildFragment(child);
|
|
864
|
-
if (fragment !== null)
|
|
865
|
-
items += fragment;
|
|
866
|
-
}
|
|
867
|
-
return { content: items ? `\n${items}\n` : '' };
|
|
868
|
-
},
|
|
869
|
-
div: (ctx) => {
|
|
870
|
-
if (!isObject(ctx))
|
|
871
|
-
return {};
|
|
872
|
-
const { node } = ctx;
|
|
873
|
-
if (!isLikeNode(node))
|
|
874
|
-
return {};
|
|
875
|
-
const getAttribute = typeof node.getAttribute === 'function'
|
|
876
|
-
? node.getAttribute.bind(node)
|
|
877
|
-
: undefined;
|
|
878
|
-
const className = getAttribute?.('class') ?? '';
|
|
879
|
-
if (className.includes('mermaid')) {
|
|
880
|
-
return {
|
|
881
|
-
noEscape: true,
|
|
882
|
-
preserveWhitespace: true,
|
|
883
|
-
postprocess: ({ content }) => `\n\n\`\`\`mermaid\n${content.trim()}\n\`\`\`\n\n`,
|
|
884
|
-
};
|
|
885
|
-
}
|
|
886
|
-
const isAdmonition = className.includes('admonition') ||
|
|
887
|
-
className.includes('callout') ||
|
|
888
|
-
className.includes('custom-block') ||
|
|
889
|
-
getAttribute?.('role') === 'alert' ||
|
|
890
|
-
/\b(note|tip|info|warning|danger|caution|important)\b/i.test(className);
|
|
891
|
-
if (isAdmonition) {
|
|
892
|
-
return {
|
|
893
|
-
postprocess: ({ content }) => {
|
|
894
|
-
const alertType = resolveGfmAlertType(className);
|
|
895
|
-
const lines = content.trim().split('\n');
|
|
896
|
-
const header = alertType ? `> [!${alertType}]\n` : '';
|
|
897
|
-
return `\n\n${header}> ${lines.join('\n> ')}\n\n`;
|
|
898
|
-
},
|
|
899
|
-
};
|
|
900
|
-
}
|
|
901
|
-
if (!className.includes('type'))
|
|
902
|
-
return {};
|
|
903
|
-
return {
|
|
904
|
-
postprocess: ({ content }) => {
|
|
905
|
-
const lines = content.split('\n');
|
|
906
|
-
const separated = [];
|
|
907
|
-
for (let i = 0; i < lines.length; i++) {
|
|
908
|
-
const line = lines[i] ?? '';
|
|
909
|
-
const nextLine = i < lines.length - 1 ? (lines[i + 1] ?? '') : '';
|
|
910
|
-
separated.push(line);
|
|
911
|
-
if (line.trim() &&
|
|
912
|
-
nextLine.trim() &&
|
|
913
|
-
line.includes(':') &&
|
|
914
|
-
nextLine.includes(':') &&
|
|
915
|
-
!line.startsWith(' ') &&
|
|
916
|
-
!nextLine.startsWith(' ')) {
|
|
917
|
-
separated.push('');
|
|
918
|
-
}
|
|
919
|
-
}
|
|
920
|
-
return separated.join('\n');
|
|
921
|
-
},
|
|
922
|
-
};
|
|
923
|
-
},
|
|
924
|
-
kbd: () => ({
|
|
925
|
-
postprocess: ({ content }) => `\`${content}\``,
|
|
926
|
-
}),
|
|
927
|
-
mark: () => ({
|
|
928
|
-
postprocess: ({ content }) => `==${content}==`,
|
|
929
|
-
}),
|
|
930
|
-
sub: () => ({
|
|
931
|
-
postprocess: ({ content }) => `~${content}~`,
|
|
932
|
-
}),
|
|
933
|
-
sup: () => ({
|
|
934
|
-
postprocess: ({ content }) => `^${content}^`,
|
|
935
|
-
}),
|
|
936
|
-
section: (ctx) => {
|
|
937
|
-
if (isObject(ctx)) {
|
|
938
|
-
const { node } = ctx;
|
|
939
|
-
if (isLikeNode(node)) {
|
|
940
|
-
const getAttribute = typeof node.getAttribute === 'function'
|
|
941
|
-
? node.getAttribute.bind(node)
|
|
942
|
-
: undefined;
|
|
943
|
-
if (getAttribute?.('class')?.includes('tsd-member')) {
|
|
944
|
-
return {
|
|
945
|
-
postprocess: ({ content }) => `\n\n \n\n${content}\n\n`,
|
|
946
|
-
};
|
|
947
|
-
}
|
|
948
|
-
}
|
|
949
|
-
}
|
|
950
|
-
return {
|
|
951
|
-
postprocess: ({ content }) => `\n\n${content}\n\n`,
|
|
952
|
-
};
|
|
953
|
-
},
|
|
954
|
-
details: () => ({
|
|
955
|
-
postprocess: ({ content }) => {
|
|
956
|
-
const trimmed = content.trim();
|
|
957
|
-
if (!trimmed)
|
|
958
|
-
return '';
|
|
959
|
-
return `\n\n${trimmed}\n\n`;
|
|
960
|
-
},
|
|
961
|
-
}),
|
|
962
|
-
summary: () => ({
|
|
963
|
-
postprocess: ({ content }) => `${content.trim()}\n\n`,
|
|
964
|
-
}),
|
|
965
|
-
span: (ctx) => {
|
|
966
|
-
if (!isObject(ctx))
|
|
967
|
-
return {};
|
|
968
|
-
const { node } = ctx;
|
|
969
|
-
if (!isLikeNode(node))
|
|
970
|
-
return {};
|
|
971
|
-
const getAttribute = typeof node.getAttribute === 'function'
|
|
972
|
-
? node.getAttribute.bind(node)
|
|
973
|
-
: undefined;
|
|
974
|
-
const dataAs = getAttribute?.('data-as') ?? '';
|
|
975
|
-
if (dataAs === 'p') {
|
|
976
|
-
return {
|
|
977
|
-
postprocess: ({ content }) => `\n\n${content.trim()}\n\n`,
|
|
978
|
-
};
|
|
979
|
-
}
|
|
980
|
-
return {};
|
|
981
|
-
},
|
|
982
|
-
pre: (ctx) => {
|
|
983
|
-
if (!isObject(ctx))
|
|
984
|
-
return buildPreTranslator(ctx);
|
|
985
|
-
const { node } = ctx;
|
|
986
|
-
if (!isLikeNode(node)) {
|
|
987
|
-
return buildPreTranslator(ctx);
|
|
988
|
-
}
|
|
989
|
-
const getAttribute = typeof node.getAttribute === 'function'
|
|
990
|
-
? node.getAttribute.bind(node)
|
|
991
|
-
: undefined;
|
|
992
|
-
const className = getAttribute?.('class') ?? '';
|
|
993
|
-
if (className.includes('mermaid')) {
|
|
994
|
-
return {
|
|
995
|
-
noEscape: true,
|
|
996
|
-
preserveWhitespace: true,
|
|
997
|
-
postprocess: ({ content }) => `\n\n\`\`\`mermaid\n${content.trim()}\n\`\`\`\n\n`,
|
|
998
|
-
};
|
|
999
|
-
}
|
|
1000
|
-
return buildPreTranslator(ctx);
|
|
1001
|
-
},
|
|
1002
|
-
};
|
|
1003
|
-
}
|
|
1004
|
-
let markdownConverter = null;
|
|
1005
|
-
function getMarkdownConverter() {
|
|
1006
|
-
markdownConverter ??= new NodeHtmlMarkdown({
|
|
1007
|
-
codeFence: CODE_BLOCK.fence,
|
|
1008
|
-
codeBlockStyle: 'fenced',
|
|
1009
|
-
emDelimiter: '_',
|
|
1010
|
-
bulletMarker: '-',
|
|
1011
|
-
globalEscape: [/[\\`*_~]/gm, '\\$&'],
|
|
1012
|
-
}, createCustomTranslators());
|
|
1013
|
-
return markdownConverter;
|
|
1014
|
-
}
|
|
1015
|
-
function translateHtmlFragmentToMarkdown(html) {
|
|
1016
|
-
return getMarkdownConverter().translate(html).trim();
|
|
1017
|
-
}
|
|
1018
395
|
function isWhitespaceChar(code) {
|
|
1019
396
|
return code === 9 || code === 10 || code === 12 || code === 13 || code === 32;
|
|
1020
397
|
}
|
|
@@ -1127,8 +504,8 @@ function resolveRelativeUrls(markdown, baseUrl, signal) {
|
|
|
1127
504
|
let lastIndex = 0;
|
|
1128
505
|
let lineCount = 0;
|
|
1129
506
|
while (lastIndex < len) {
|
|
1130
|
-
if (++lineCount % 500 === 0
|
|
1131
|
-
|
|
507
|
+
if (++lineCount % 500 === 0) {
|
|
508
|
+
abortPolicy.throwIfAborted(signal, baseUrl, 'markdown:resolve-urls');
|
|
1132
509
|
}
|
|
1133
510
|
let nextIndex = markdown.indexOf('\n', lastIndex);
|
|
1134
511
|
let line;
|
|
@@ -1315,7 +692,12 @@ function getTextContentSkippingHidden(node, parts) {
|
|
|
1315
692
|
}
|
|
1316
693
|
if (nodeType !== 1)
|
|
1317
694
|
return;
|
|
1318
|
-
const
|
|
695
|
+
const element = node;
|
|
696
|
+
if (element.hasAttribute('hidden') ||
|
|
697
|
+
element.getAttribute('aria-hidden') === 'true') {
|
|
698
|
+
return;
|
|
699
|
+
}
|
|
700
|
+
const { tagName } = element;
|
|
1319
701
|
if (tagName === 'SCRIPT' || tagName === 'STYLE' || tagName === 'NOSCRIPT')
|
|
1320
702
|
return;
|
|
1321
703
|
const { childNodes } = node;
|
|
@@ -1479,19 +861,21 @@ function shouldUseArticleContent(article, originalHtmlOrDocument) {
|
|
|
1479
861
|
function buildContentSource(params) {
|
|
1480
862
|
const { html, url, article, extractedMeta, includeMetadata, useArticleContent, document, truncated, skipNoiseRemoval, signal, } = params;
|
|
1481
863
|
const metadata = createContentMetadataBlock(url, article, extractedMeta, useArticleContent, includeMetadata);
|
|
864
|
+
const base = {
|
|
865
|
+
favicon: extractedMeta.favicon,
|
|
866
|
+
metadata,
|
|
867
|
+
extractedMetadata: extractedMeta,
|
|
868
|
+
truncated,
|
|
869
|
+
};
|
|
1482
870
|
if (useArticleContent && article) {
|
|
1483
|
-
// Readability output can still be noisy (unless user requested skip).
|
|
1484
871
|
const cleanedArticleHtml = skipNoiseRemoval
|
|
1485
872
|
? article.content
|
|
1486
873
|
: removeNoiseFromHtml(article.content, undefined, url, signal);
|
|
1487
874
|
return {
|
|
875
|
+
...base,
|
|
1488
876
|
sourceHtml: cleanedArticleHtml,
|
|
1489
877
|
title: article.title,
|
|
1490
|
-
favicon: extractedMeta.favicon,
|
|
1491
|
-
metadata,
|
|
1492
|
-
extractedMetadata: extractedMeta,
|
|
1493
878
|
skipNoiseRemoval: true,
|
|
1494
|
-
truncated,
|
|
1495
879
|
};
|
|
1496
880
|
}
|
|
1497
881
|
if (document) {
|
|
@@ -1499,36 +883,18 @@ function buildContentSource(params) {
|
|
|
1499
883
|
? html
|
|
1500
884
|
: removeNoiseFromHtml(html, document, url, signal);
|
|
1501
885
|
const contentRoot = findContentRoot(document);
|
|
1502
|
-
if (contentRoot) {
|
|
1503
|
-
return {
|
|
1504
|
-
sourceHtml: contentRoot,
|
|
1505
|
-
title: extractedMeta.title,
|
|
1506
|
-
favicon: extractedMeta.favicon,
|
|
1507
|
-
metadata,
|
|
1508
|
-
extractedMetadata: extractedMeta,
|
|
1509
|
-
skipNoiseRemoval: true,
|
|
1510
|
-
document,
|
|
1511
|
-
truncated,
|
|
1512
|
-
};
|
|
1513
|
-
}
|
|
1514
886
|
return {
|
|
1515
|
-
|
|
887
|
+
...base,
|
|
888
|
+
sourceHtml: contentRoot ?? cleanedHtml,
|
|
1516
889
|
title: extractedMeta.title,
|
|
1517
|
-
favicon: extractedMeta.favicon,
|
|
1518
|
-
metadata,
|
|
1519
|
-
extractedMetadata: extractedMeta,
|
|
1520
890
|
skipNoiseRemoval: true,
|
|
1521
891
|
document,
|
|
1522
|
-
truncated,
|
|
1523
892
|
};
|
|
1524
893
|
}
|
|
1525
894
|
return {
|
|
895
|
+
...base,
|
|
1526
896
|
sourceHtml: html,
|
|
1527
897
|
title: extractedMeta.title,
|
|
1528
|
-
favicon: extractedMeta.favicon,
|
|
1529
|
-
metadata,
|
|
1530
|
-
extractedMetadata: extractedMeta,
|
|
1531
|
-
truncated,
|
|
1532
898
|
};
|
|
1533
899
|
}
|
|
1534
900
|
function resolveContentSource(params) {
|
|
@@ -1719,4 +1085,3 @@ export async function transformHtmlToMarkdown(html, url, options) {
|
|
|
1719
1085
|
export async function transformBufferToMarkdown(htmlBuffer, url, options) {
|
|
1720
1086
|
return transformInputToMarkdown(htmlBuffer, url, options);
|
|
1721
1087
|
}
|
|
1722
|
-
//# sourceMappingURL=transform.js.map
|