@j0hanz/superfetch 2.2.1 → 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +243 -494
- package/dist/cache.d.ts +2 -3
- package/dist/cache.js +51 -241
- package/dist/config.d.ts +6 -1
- package/dist/config.js +29 -34
- package/dist/crypto.d.ts +0 -1
- package/dist/crypto.js +0 -1
- package/dist/dom-noise-removal.d.ts +5 -0
- package/dist/dom-noise-removal.js +485 -0
- package/dist/errors.d.ts +0 -1
- package/dist/errors.js +8 -6
- package/dist/fetch.d.ts +0 -1
- package/dist/fetch.js +71 -61
- package/dist/host-normalization.d.ts +1 -0
- package/dist/host-normalization.js +47 -0
- package/dist/http-native.d.ts +5 -0
- package/dist/http-native.js +693 -0
- package/dist/index.d.ts +0 -1
- package/dist/index.js +1 -2
- package/dist/instructions.md +22 -20
- package/dist/json.d.ts +1 -0
- package/dist/json.js +29 -0
- package/dist/language-detection.d.ts +12 -0
- package/dist/language-detection.js +291 -0
- package/dist/markdown-cleanup.d.ts +18 -0
- package/dist/markdown-cleanup.js +283 -0
- package/dist/mcp-validator.d.ts +14 -0
- package/dist/mcp-validator.js +22 -0
- package/dist/mcp.d.ts +0 -1
- package/dist/mcp.js +0 -1
- package/dist/observability.d.ts +1 -1
- package/dist/observability.js +15 -3
- package/dist/server-tuning.d.ts +9 -0
- package/dist/server-tuning.js +30 -0
- package/dist/session.d.ts +36 -0
- package/dist/session.js +159 -0
- package/dist/tools.d.ts +0 -1
- package/dist/tools.js +23 -33
- package/dist/transform-types.d.ts +80 -0
- package/dist/transform-types.js +5 -0
- package/dist/transform.d.ts +7 -53
- package/dist/transform.js +434 -856
- package/dist/type-guards.d.ts +1 -2
- package/dist/type-guards.js +1 -2
- package/dist/workers/transform-worker.d.ts +0 -1
- package/dist/workers/transform-worker.js +52 -43
- package/package.json +11 -12
- package/dist/cache.d.ts.map +0 -1
- package/dist/cache.js.map +0 -1
- package/dist/config.d.ts.map +0 -1
- package/dist/config.js.map +0 -1
- package/dist/crypto.d.ts.map +0 -1
- package/dist/crypto.js.map +0 -1
- package/dist/errors.d.ts.map +0 -1
- package/dist/errors.js.map +0 -1
- package/dist/fetch.d.ts.map +0 -1
- package/dist/fetch.js.map +0 -1
- package/dist/http.d.ts +0 -90
- package/dist/http.d.ts.map +0 -1
- package/dist/http.js +0 -1576
- package/dist/http.js.map +0 -1
- package/dist/index.d.ts.map +0 -1
- package/dist/index.js.map +0 -1
- package/dist/mcp.d.ts.map +0 -1
- package/dist/mcp.js.map +0 -1
- package/dist/observability.d.ts.map +0 -1
- package/dist/observability.js.map +0 -1
- package/dist/tools.d.ts.map +0 -1
- package/dist/tools.js.map +0 -1
- package/dist/transform.d.ts.map +0 -1
- package/dist/transform.js.map +0 -1
- package/dist/type-guards.d.ts.map +0 -1
- package/dist/type-guards.js.map +0 -1
- package/dist/workers/transform-worker.d.ts.map +0 -1
- package/dist/workers/transform-worker.js.map +0 -1
package/dist/transform.js
CHANGED
|
@@ -8,44 +8,25 @@ import { NodeHtmlMarkdown, } from 'node-html-markdown';
|
|
|
8
8
|
import { z } from 'zod';
|
|
9
9
|
import { isProbablyReaderable, Readability } from '@mozilla/readability';
|
|
10
10
|
import { config } from './config.js';
|
|
11
|
+
import { removeNoiseFromHtml } from './dom-noise-removal.js';
|
|
11
12
|
import { FetchError, getErrorMessage } from './errors.js';
|
|
12
13
|
import { isRawTextContentUrl } from './fetch.js';
|
|
14
|
+
import { detectLanguageFromCode, resolveLanguageFromAttributes, } from './language-detection.js';
|
|
15
|
+
import { cleanupMarkdownArtifacts, promoteOrphanHeadings, } from './markdown-cleanup.js';
|
|
13
16
|
import { getOperationId, getRequestId, logDebug, logError, logInfo, logWarn, redactUrl, } from './observability.js';
|
|
14
|
-
import {
|
|
17
|
+
import { isObject } from './type-guards.js';
|
|
18
|
+
// Re-export language detection for backward compatibility
|
|
19
|
+
export { detectLanguageFromCode, resolveLanguageFromAttributes, } from './language-detection.js';
|
|
20
|
+
// Re-export markdown cleanup for backward compatibility
|
|
21
|
+
export { cleanupMarkdownArtifacts, promoteOrphanHeadings, } from './markdown-cleanup.js';
|
|
22
|
+
// Re-export DOM noise removal for backward compatibility
|
|
23
|
+
export { removeNoiseFromHtml } from './dom-noise-removal.js';
|
|
15
24
|
function getAbortReason(signal) {
|
|
16
|
-
if (!
|
|
25
|
+
if (!isObject(signal))
|
|
17
26
|
return undefined;
|
|
18
27
|
return 'reason' in signal ? signal.reason : undefined;
|
|
19
28
|
}
|
|
20
|
-
|
|
21
|
-
if (!isRecord(document))
|
|
22
|
-
return undefined;
|
|
23
|
-
const { body } = document;
|
|
24
|
-
if (!isRecord(body))
|
|
25
|
-
return undefined;
|
|
26
|
-
const { innerHTML } = body;
|
|
27
|
-
return typeof innerHTML === 'string' && innerHTML.length > 0
|
|
28
|
-
? innerHTML
|
|
29
|
-
: undefined;
|
|
30
|
-
}
|
|
31
|
-
function getDocumentToString(document) {
|
|
32
|
-
if (!isRecord(document))
|
|
33
|
-
return undefined;
|
|
34
|
-
if (typeof document.toString !== 'function')
|
|
35
|
-
return undefined;
|
|
36
|
-
return document.toString.bind(document);
|
|
37
|
-
}
|
|
38
|
-
function getDocumentElementOuterHtml(document) {
|
|
39
|
-
if (!isRecord(document))
|
|
40
|
-
return undefined;
|
|
41
|
-
const { documentElement } = document;
|
|
42
|
-
if (!isRecord(documentElement))
|
|
43
|
-
return undefined;
|
|
44
|
-
const { outerHTML } = documentElement;
|
|
45
|
-
return typeof outerHTML === 'string' && outerHTML.length > 0
|
|
46
|
-
? outerHTML
|
|
47
|
-
: undefined;
|
|
48
|
-
}
|
|
29
|
+
// DOM accessor helpers moved to ./dom-noise-removal.ts
|
|
49
30
|
const CODE_BLOCK = {
|
|
50
31
|
fence: '```',
|
|
51
32
|
format: (code, language = '') => {
|
|
@@ -53,6 +34,10 @@ const CODE_BLOCK = {
|
|
|
53
34
|
},
|
|
54
35
|
};
|
|
55
36
|
const transformChannel = diagnosticsChannel.channel('superfetch.transform');
|
|
37
|
+
const LOG_URL_MAX = 80;
|
|
38
|
+
function truncateUrlForLog(url) {
|
|
39
|
+
return url.substring(0, LOG_URL_MAX);
|
|
40
|
+
}
|
|
56
41
|
function publishTransformEvent(event) {
|
|
57
42
|
if (!transformChannel.hasSubscribers)
|
|
58
43
|
return;
|
|
@@ -93,9 +78,13 @@ export function endTransformStage(context, options) {
|
|
|
93
78
|
}
|
|
94
79
|
function runTransformStage(url, stage, fn) {
|
|
95
80
|
const context = startTransformStage(url, stage);
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
81
|
+
try {
|
|
82
|
+
return fn();
|
|
83
|
+
}
|
|
84
|
+
finally {
|
|
85
|
+
// Emit duration even if the stage throws; callers decide how to handle the error.
|
|
86
|
+
endTransformStage(context);
|
|
87
|
+
}
|
|
99
88
|
}
|
|
100
89
|
function isTimeoutReason(reason) {
|
|
101
90
|
return reason instanceof Error && reason.name === 'TimeoutError';
|
|
@@ -129,46 +118,105 @@ function truncateHtml(html) {
|
|
|
129
118
|
});
|
|
130
119
|
return html.substring(0, maxSize);
|
|
131
120
|
}
|
|
121
|
+
const META_PROPERTY_HANDLERS = new Map([
|
|
122
|
+
[
|
|
123
|
+
'og:title',
|
|
124
|
+
(ctx, c) => {
|
|
125
|
+
ctx.title.og = c;
|
|
126
|
+
},
|
|
127
|
+
],
|
|
128
|
+
[
|
|
129
|
+
'og:description',
|
|
130
|
+
(ctx, c) => {
|
|
131
|
+
ctx.description.og = c;
|
|
132
|
+
},
|
|
133
|
+
],
|
|
134
|
+
[
|
|
135
|
+
'og:image',
|
|
136
|
+
(ctx, c) => {
|
|
137
|
+
ctx.image = c;
|
|
138
|
+
},
|
|
139
|
+
],
|
|
140
|
+
[
|
|
141
|
+
'article:published_time',
|
|
142
|
+
(ctx, c) => {
|
|
143
|
+
ctx.publishedAt = c;
|
|
144
|
+
},
|
|
145
|
+
],
|
|
146
|
+
[
|
|
147
|
+
'article:modified_time',
|
|
148
|
+
(ctx, c) => {
|
|
149
|
+
ctx.modifiedAt = c;
|
|
150
|
+
},
|
|
151
|
+
],
|
|
152
|
+
]);
|
|
153
|
+
const META_NAME_HANDLERS = new Map([
|
|
154
|
+
[
|
|
155
|
+
'twitter:title',
|
|
156
|
+
(ctx, c) => {
|
|
157
|
+
ctx.title.twitter = c;
|
|
158
|
+
},
|
|
159
|
+
],
|
|
160
|
+
[
|
|
161
|
+
'twitter:description',
|
|
162
|
+
(ctx, c) => {
|
|
163
|
+
ctx.description.twitter = c;
|
|
164
|
+
},
|
|
165
|
+
],
|
|
166
|
+
[
|
|
167
|
+
'description',
|
|
168
|
+
(ctx, c) => {
|
|
169
|
+
ctx.description.standard = c;
|
|
170
|
+
},
|
|
171
|
+
],
|
|
172
|
+
[
|
|
173
|
+
'author',
|
|
174
|
+
(ctx, c) => {
|
|
175
|
+
ctx.author = c;
|
|
176
|
+
},
|
|
177
|
+
],
|
|
178
|
+
]);
|
|
132
179
|
function extractMetadata(document) {
|
|
133
|
-
const
|
|
134
|
-
|
|
135
|
-
|
|
180
|
+
const ctx = {
|
|
181
|
+
title: {},
|
|
182
|
+
description: {},
|
|
183
|
+
};
|
|
136
184
|
for (const tag of document.querySelectorAll('meta')) {
|
|
137
185
|
const content = tag.getAttribute('content')?.trim();
|
|
138
186
|
if (!content)
|
|
139
187
|
continue;
|
|
140
188
|
const property = tag.getAttribute('property');
|
|
189
|
+
if (property) {
|
|
190
|
+
META_PROPERTY_HANDLERS.get(property)?.(ctx, content);
|
|
191
|
+
}
|
|
141
192
|
const name = tag.getAttribute('name');
|
|
142
|
-
if (
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
description.og = content;
|
|
146
|
-
else if (name === 'twitter:title')
|
|
147
|
-
title.twitter = content;
|
|
148
|
-
else if (name === 'twitter:description')
|
|
149
|
-
description.twitter = content;
|
|
150
|
-
else if (name === 'description')
|
|
151
|
-
description.standard = content;
|
|
152
|
-
else if (name === 'author')
|
|
153
|
-
author = content;
|
|
193
|
+
if (name) {
|
|
194
|
+
META_NAME_HANDLERS.get(name)?.(ctx, content);
|
|
195
|
+
}
|
|
154
196
|
}
|
|
155
197
|
const titleEl = document.querySelector('title');
|
|
156
|
-
if (!title.standard && titleEl?.textContent) {
|
|
157
|
-
title.standard = titleEl.textContent.trim();
|
|
198
|
+
if (!ctx.title.standard && titleEl?.textContent) {
|
|
199
|
+
ctx.title.standard = titleEl.textContent.trim();
|
|
158
200
|
}
|
|
159
|
-
const resolvedTitle = title.og ?? title.twitter ?? title.standard;
|
|
160
|
-
const resolvedDesc = description.og ?? description.twitter ?? description.standard;
|
|
201
|
+
const resolvedTitle = ctx.title.og ?? ctx.title.twitter ?? ctx.title.standard;
|
|
202
|
+
const resolvedDesc = ctx.description.og ?? ctx.description.twitter ?? ctx.description.standard;
|
|
161
203
|
const metadata = {};
|
|
162
204
|
if (resolvedTitle)
|
|
163
205
|
metadata.title = resolvedTitle;
|
|
164
206
|
if (resolvedDesc)
|
|
165
207
|
metadata.description = resolvedDesc;
|
|
166
|
-
if (author)
|
|
167
|
-
metadata.author = author;
|
|
208
|
+
if (ctx.author)
|
|
209
|
+
metadata.author = ctx.author;
|
|
210
|
+
if (ctx.image)
|
|
211
|
+
metadata.image = ctx.image;
|
|
212
|
+
if (ctx.publishedAt)
|
|
213
|
+
metadata.publishedAt = ctx.publishedAt;
|
|
214
|
+
if (ctx.modifiedAt)
|
|
215
|
+
metadata.modifiedAt = ctx.modifiedAt;
|
|
168
216
|
return metadata;
|
|
169
217
|
}
|
|
170
218
|
function isReadabilityCompatible(doc) {
|
|
171
|
-
if (!
|
|
219
|
+
if (!isObject(doc))
|
|
172
220
|
return false;
|
|
173
221
|
return hasDocumentElement(doc) && hasQuerySelectors(doc);
|
|
174
222
|
}
|
|
@@ -185,14 +233,18 @@ function extractArticle(document) {
|
|
|
185
233
|
return null;
|
|
186
234
|
}
|
|
187
235
|
try {
|
|
188
|
-
const
|
|
189
|
-
const rawText =
|
|
190
|
-
documentClone.documentElement.textContent;
|
|
236
|
+
const doc = document;
|
|
237
|
+
const rawText = doc.querySelector('body')?.textContent ?? doc.documentElement.textContent;
|
|
191
238
|
const textLength = rawText.replace(/\s+/g, ' ').trim().length;
|
|
192
|
-
if (textLength
|
|
239
|
+
if (textLength < 100) {
|
|
240
|
+
logWarn('Very minimal server-rendered content detected (< 100 chars). ' +
|
|
241
|
+
'This might be a client-side rendered (SPA) application. ' +
|
|
242
|
+
'Content extraction may be incomplete.', { textLength });
|
|
243
|
+
}
|
|
244
|
+
if (textLength >= 400 && !isProbablyReaderable(doc)) {
|
|
193
245
|
return null;
|
|
194
246
|
}
|
|
195
|
-
const reader = new Readability(
|
|
247
|
+
const reader = new Readability(doc, { maxElemsToParse: 20_000 });
|
|
196
248
|
const parsed = reader.parse();
|
|
197
249
|
if (!parsed)
|
|
198
250
|
return null;
|
|
@@ -218,7 +270,8 @@ export function extractContent(html, url, options = {
|
|
|
218
270
|
}
|
|
219
271
|
function extractContentWithDocument(html, url, options) {
|
|
220
272
|
if (!isValidInput(html, url)) {
|
|
221
|
-
|
|
273
|
+
const { document } = parseHTML('<html></html>');
|
|
274
|
+
return { article: null, metadata: {}, document };
|
|
222
275
|
}
|
|
223
276
|
return tryExtractContent(html, url, options);
|
|
224
277
|
}
|
|
@@ -233,7 +286,8 @@ function handleExtractionFailure(error, url, signal) {
|
|
|
233
286
|
}
|
|
234
287
|
throwIfAborted(signal, url, 'extract:error');
|
|
235
288
|
logError('Failed to extract content', error instanceof Error ? error : undefined);
|
|
236
|
-
|
|
289
|
+
const { document } = parseHTML('<html></html>');
|
|
290
|
+
return { article: null, metadata: {}, document };
|
|
237
291
|
}
|
|
238
292
|
function extractContentStages(html, url, options) {
|
|
239
293
|
throwIfAborted(options.signal, url, 'extract:begin');
|
|
@@ -248,7 +302,8 @@ function extractContentStages(html, url, options) {
|
|
|
248
302
|
return {
|
|
249
303
|
article,
|
|
250
304
|
metadata,
|
|
251
|
-
|
|
305
|
+
document,
|
|
306
|
+
...(truncatedHtml.length !== html.length ? { truncated: true } : {}),
|
|
252
307
|
};
|
|
253
308
|
}
|
|
254
309
|
function tryExtractContent(html, url, options) {
|
|
@@ -285,532 +340,7 @@ function applyBaseUri(document, url) {
|
|
|
285
340
|
});
|
|
286
341
|
}
|
|
287
342
|
}
|
|
288
|
-
|
|
289
|
-
for (let index = 0; index < code.length - 1; index += 1) {
|
|
290
|
-
if (code[index] !== '<')
|
|
291
|
-
continue;
|
|
292
|
-
const next = code[index + 1];
|
|
293
|
-
if (!next)
|
|
294
|
-
continue;
|
|
295
|
-
if (next >= 'A' && next <= 'Z')
|
|
296
|
-
return true;
|
|
297
|
-
}
|
|
298
|
-
return false;
|
|
299
|
-
}
|
|
300
|
-
function containsWord(source, word) {
|
|
301
|
-
let startIndex = source.indexOf(word);
|
|
302
|
-
while (startIndex !== -1) {
|
|
303
|
-
const before = startIndex === 0 ? '' : source[startIndex - 1];
|
|
304
|
-
const afterIndex = startIndex + word.length;
|
|
305
|
-
const after = afterIndex >= source.length ? '' : source[afterIndex];
|
|
306
|
-
if (!isWordChar(before) && !isWordChar(after))
|
|
307
|
-
return true;
|
|
308
|
-
startIndex = source.indexOf(word, startIndex + word.length);
|
|
309
|
-
}
|
|
310
|
-
return false;
|
|
311
|
-
}
|
|
312
|
-
function splitLines(content) {
|
|
313
|
-
return content.split('\n');
|
|
314
|
-
}
|
|
315
|
-
function extractLanguageFromClassName(className) {
|
|
316
|
-
const tokens = className.match(/\S+/g);
|
|
317
|
-
if (!tokens)
|
|
318
|
-
return undefined;
|
|
319
|
-
for (const token of tokens) {
|
|
320
|
-
const lower = token.toLowerCase();
|
|
321
|
-
if (lower.startsWith('language-'))
|
|
322
|
-
return token.slice('language-'.length);
|
|
323
|
-
if (lower.startsWith('lang-'))
|
|
324
|
-
return token.slice('lang-'.length);
|
|
325
|
-
if (lower.startsWith('highlight-')) {
|
|
326
|
-
return token.slice('highlight-'.length);
|
|
327
|
-
}
|
|
328
|
-
}
|
|
329
|
-
if (tokens.includes('hljs')) {
|
|
330
|
-
const langClass = tokens.find((t) => t !== 'hljs' && !t.startsWith('hljs-'));
|
|
331
|
-
if (langClass)
|
|
332
|
-
return langClass;
|
|
333
|
-
}
|
|
334
|
-
return undefined;
|
|
335
|
-
}
|
|
336
|
-
function resolveLanguageFromDataAttribute(dataLang) {
|
|
337
|
-
const trimmed = dataLang.trim();
|
|
338
|
-
if (!trimmed)
|
|
339
|
-
return undefined;
|
|
340
|
-
for (const char of trimmed) {
|
|
341
|
-
if (!isWordChar(char))
|
|
342
|
-
return undefined;
|
|
343
|
-
}
|
|
344
|
-
return trimmed;
|
|
345
|
-
}
|
|
346
|
-
function isWordChar(char) {
|
|
347
|
-
if (!char)
|
|
348
|
-
return false;
|
|
349
|
-
const code = char.charCodeAt(0);
|
|
350
|
-
return ((code >= 48 && code <= 57) ||
|
|
351
|
-
(code >= 65 && code <= 90) ||
|
|
352
|
-
(code >= 97 && code <= 122) ||
|
|
353
|
-
char === '_');
|
|
354
|
-
}
|
|
355
|
-
const LANGUAGE_PATTERNS = [
|
|
356
|
-
{
|
|
357
|
-
language: 'jsx',
|
|
358
|
-
pattern: {
|
|
359
|
-
keywords: ['classname=', 'jsx:', "from 'react'", 'from "react"'],
|
|
360
|
-
custom: (code) => containsJsxTag(code),
|
|
361
|
-
},
|
|
362
|
-
},
|
|
363
|
-
{
|
|
364
|
-
language: 'typescript',
|
|
365
|
-
pattern: {
|
|
366
|
-
wordBoundary: ['interface', 'type'],
|
|
367
|
-
custom: (_, lower) => [
|
|
368
|
-
': string',
|
|
369
|
-
':string',
|
|
370
|
-
': number',
|
|
371
|
-
':number',
|
|
372
|
-
': boolean',
|
|
373
|
-
':boolean',
|
|
374
|
-
': void',
|
|
375
|
-
':void',
|
|
376
|
-
': any',
|
|
377
|
-
':any',
|
|
378
|
-
': unknown',
|
|
379
|
-
':unknown',
|
|
380
|
-
': never',
|
|
381
|
-
':never',
|
|
382
|
-
].some((hint) => lower.includes(hint)),
|
|
383
|
-
},
|
|
384
|
-
},
|
|
385
|
-
{
|
|
386
|
-
language: 'rust',
|
|
387
|
-
pattern: {
|
|
388
|
-
regex: /\b(?:fn|impl|struct|enum)\b/,
|
|
389
|
-
keywords: ['let mut'],
|
|
390
|
-
custom: (_, lower) => lower.includes('use ') && lower.includes('::'),
|
|
391
|
-
},
|
|
392
|
-
},
|
|
393
|
-
{
|
|
394
|
-
language: 'javascript',
|
|
395
|
-
pattern: {
|
|
396
|
-
regex: /\b(?:const|let|var|function|class|async|await|export|import)\b/,
|
|
397
|
-
},
|
|
398
|
-
},
|
|
399
|
-
{
|
|
400
|
-
language: 'python',
|
|
401
|
-
pattern: {
|
|
402
|
-
regex: /\b(?:def|class|import|from)\b/,
|
|
403
|
-
keywords: ['print(', '__name__'],
|
|
404
|
-
},
|
|
405
|
-
},
|
|
406
|
-
{
|
|
407
|
-
language: 'bash',
|
|
408
|
-
pattern: {
|
|
409
|
-
custom: (code) => detectBashIndicators(code),
|
|
410
|
-
},
|
|
411
|
-
},
|
|
412
|
-
{
|
|
413
|
-
language: 'css',
|
|
414
|
-
pattern: {
|
|
415
|
-
regex: /@media|@import|@keyframes/,
|
|
416
|
-
custom: (code) => detectCssStructure(code),
|
|
417
|
-
},
|
|
418
|
-
},
|
|
419
|
-
{
|
|
420
|
-
language: 'html',
|
|
421
|
-
pattern: {
|
|
422
|
-
keywords: [
|
|
423
|
-
'<!doctype',
|
|
424
|
-
'<html',
|
|
425
|
-
'<head',
|
|
426
|
-
'<body',
|
|
427
|
-
'<div',
|
|
428
|
-
'<span',
|
|
429
|
-
'<p',
|
|
430
|
-
'<a',
|
|
431
|
-
'<script',
|
|
432
|
-
'<style',
|
|
433
|
-
],
|
|
434
|
-
},
|
|
435
|
-
},
|
|
436
|
-
{
|
|
437
|
-
language: 'json',
|
|
438
|
-
pattern: {
|
|
439
|
-
startsWith: ['{', '['],
|
|
440
|
-
},
|
|
441
|
-
},
|
|
442
|
-
{
|
|
443
|
-
language: 'yaml',
|
|
444
|
-
pattern: {
|
|
445
|
-
custom: (code) => detectYamlStructure(code),
|
|
446
|
-
},
|
|
447
|
-
},
|
|
448
|
-
{
|
|
449
|
-
language: 'sql',
|
|
450
|
-
pattern: {
|
|
451
|
-
wordBoundary: [
|
|
452
|
-
'select',
|
|
453
|
-
'insert',
|
|
454
|
-
'update',
|
|
455
|
-
'delete',
|
|
456
|
-
'create',
|
|
457
|
-
'alter',
|
|
458
|
-
'drop',
|
|
459
|
-
],
|
|
460
|
-
},
|
|
461
|
-
},
|
|
462
|
-
{
|
|
463
|
-
language: 'go',
|
|
464
|
-
pattern: {
|
|
465
|
-
wordBoundary: ['package', 'func'],
|
|
466
|
-
keywords: ['import "'],
|
|
467
|
-
},
|
|
468
|
-
},
|
|
469
|
-
];
|
|
470
|
-
// Bash detection constants
|
|
471
|
-
const BASH_COMMANDS = ['sudo', 'chmod', 'mkdir', 'cd', 'ls', 'cat', 'echo'];
|
|
472
|
-
const BASH_PKG_MANAGERS = [
|
|
473
|
-
'npm',
|
|
474
|
-
'yarn',
|
|
475
|
-
'pnpm',
|
|
476
|
-
'npx',
|
|
477
|
-
'brew',
|
|
478
|
-
'apt',
|
|
479
|
-
'pip',
|
|
480
|
-
'cargo',
|
|
481
|
-
'go',
|
|
482
|
-
];
|
|
483
|
-
const BASH_VERBS = ['install', 'add', 'run', 'build', 'start'];
|
|
484
|
-
function isShellPrefix(line) {
|
|
485
|
-
return (line.startsWith('#!') || line.startsWith('$ ') || line.startsWith('# '));
|
|
486
|
-
}
|
|
487
|
-
function matchesBashCommand(line) {
|
|
488
|
-
return BASH_COMMANDS.some((cmd) => line === cmd || line.startsWith(`${cmd} `));
|
|
489
|
-
}
|
|
490
|
-
function matchesPackageManagerVerb(line) {
|
|
491
|
-
for (const mgr of BASH_PKG_MANAGERS) {
|
|
492
|
-
if (!line.startsWith(`${mgr} `))
|
|
493
|
-
continue;
|
|
494
|
-
const rest = line.slice(mgr.length + 1);
|
|
495
|
-
if (BASH_VERBS.some((v) => rest === v || rest.startsWith(`${v} `))) {
|
|
496
|
-
return true;
|
|
497
|
-
}
|
|
498
|
-
}
|
|
499
|
-
return false;
|
|
500
|
-
}
|
|
501
|
-
function detectBashIndicators(code) {
|
|
502
|
-
for (const line of splitLines(code)) {
|
|
503
|
-
const trimmed = line.trimStart();
|
|
504
|
-
if (!trimmed)
|
|
505
|
-
continue;
|
|
506
|
-
if (isShellPrefix(trimmed) ||
|
|
507
|
-
matchesBashCommand(trimmed) ||
|
|
508
|
-
matchesPackageManagerVerb(trimmed)) {
|
|
509
|
-
return true;
|
|
510
|
-
}
|
|
511
|
-
}
|
|
512
|
-
return false;
|
|
513
|
-
}
|
|
514
|
-
function detectCssStructure(code) {
|
|
515
|
-
for (const line of splitLines(code)) {
|
|
516
|
-
const trimmed = line.trimStart();
|
|
517
|
-
if (!trimmed)
|
|
518
|
-
continue;
|
|
519
|
-
const isSelector = (trimmed.startsWith('.') || trimmed.startsWith('#')) &&
|
|
520
|
-
trimmed.includes('{');
|
|
521
|
-
const isProperty = trimmed.includes(':') && trimmed.includes(';');
|
|
522
|
-
if (isSelector || isProperty)
|
|
523
|
-
return true;
|
|
524
|
-
}
|
|
525
|
-
return false;
|
|
526
|
-
}
|
|
527
|
-
function detectYamlStructure(code) {
|
|
528
|
-
for (const line of splitLines(code)) {
|
|
529
|
-
const trimmed = line.trim();
|
|
530
|
-
if (!trimmed)
|
|
531
|
-
continue;
|
|
532
|
-
const colonIdx = trimmed.indexOf(':');
|
|
533
|
-
if (colonIdx <= 0)
|
|
534
|
-
continue;
|
|
535
|
-
const after = trimmed[colonIdx + 1];
|
|
536
|
-
if (after === ' ' || after === '\t')
|
|
537
|
-
return true;
|
|
538
|
-
}
|
|
539
|
-
return false;
|
|
540
|
-
}
|
|
541
|
-
function matchesLanguagePattern(code, lower, pattern) {
|
|
542
|
-
if (pattern.keywords?.some((kw) => lower.includes(kw)))
|
|
543
|
-
return true;
|
|
544
|
-
if (pattern.wordBoundary?.some((w) => containsWord(lower, w)))
|
|
545
|
-
return true;
|
|
546
|
-
if (pattern.regex?.test(lower))
|
|
547
|
-
return true;
|
|
548
|
-
if (pattern.startsWith) {
|
|
549
|
-
const trimmed = code.trimStart();
|
|
550
|
-
if (pattern.startsWith.some((prefix) => trimmed.startsWith(prefix)))
|
|
551
|
-
return true;
|
|
552
|
-
}
|
|
553
|
-
if (pattern.custom?.(code, lower))
|
|
554
|
-
return true;
|
|
555
|
-
return false;
|
|
556
|
-
}
|
|
557
|
-
export function detectLanguageFromCode(code) {
|
|
558
|
-
const lower = code.toLowerCase();
|
|
559
|
-
for (const { language, pattern } of LANGUAGE_PATTERNS) {
|
|
560
|
-
if (matchesLanguagePattern(code, lower, pattern))
|
|
561
|
-
return language;
|
|
562
|
-
}
|
|
563
|
-
return undefined;
|
|
564
|
-
}
|
|
565
|
-
export function resolveLanguageFromAttributes(className, dataLang) {
|
|
566
|
-
const classMatch = extractLanguageFromClassName(className);
|
|
567
|
-
return classMatch ?? resolveLanguageFromDataAttribute(dataLang);
|
|
568
|
-
}
|
|
569
|
-
function isElement(node) {
|
|
570
|
-
return (isRecord(node) &&
|
|
571
|
-
'getAttribute' in node &&
|
|
572
|
-
typeof node.getAttribute === 'function');
|
|
573
|
-
}
|
|
574
|
-
const STRUCTURAL_TAGS = new Set([
|
|
575
|
-
'script',
|
|
576
|
-
'style',
|
|
577
|
-
'noscript',
|
|
578
|
-
'iframe',
|
|
579
|
-
'form',
|
|
580
|
-
'button',
|
|
581
|
-
'input',
|
|
582
|
-
'select',
|
|
583
|
-
'textarea',
|
|
584
|
-
'svg',
|
|
585
|
-
]);
|
|
586
|
-
const ALWAYS_NOISE_TAGS = new Set(['nav', 'footer', 'aside']);
|
|
587
|
-
const NAVIGATION_ROLES = new Set([
|
|
588
|
-
'navigation',
|
|
589
|
-
'banner',
|
|
590
|
-
'complementary',
|
|
591
|
-
'contentinfo',
|
|
592
|
-
'tree',
|
|
593
|
-
'menubar',
|
|
594
|
-
'menu',
|
|
595
|
-
'dialog',
|
|
596
|
-
'alertdialog',
|
|
597
|
-
'search',
|
|
598
|
-
]);
|
|
599
|
-
const PROMO_TOKENS = new Set([
|
|
600
|
-
'banner',
|
|
601
|
-
'promo',
|
|
602
|
-
'announcement',
|
|
603
|
-
'cta',
|
|
604
|
-
'callout',
|
|
605
|
-
'advert',
|
|
606
|
-
'ad',
|
|
607
|
-
'ads',
|
|
608
|
-
'sponsor',
|
|
609
|
-
'newsletter',
|
|
610
|
-
'subscribe',
|
|
611
|
-
'cookie',
|
|
612
|
-
'consent',
|
|
613
|
-
'popup',
|
|
614
|
-
'modal',
|
|
615
|
-
'overlay',
|
|
616
|
-
'toast',
|
|
617
|
-
'share',
|
|
618
|
-
'social',
|
|
619
|
-
'related',
|
|
620
|
-
'recommend',
|
|
621
|
-
'comment',
|
|
622
|
-
'breadcrumb',
|
|
623
|
-
'pagination',
|
|
624
|
-
'pager',
|
|
625
|
-
'taglist',
|
|
626
|
-
]);
|
|
627
|
-
const HEADER_NOISE_PATTERN = /\b(site-header|masthead|topbar|navbar|nav(?:bar)?|menu|header-nav)\b/i;
|
|
628
|
-
const FIXED_PATTERN = /\b(fixed|sticky)\b/;
|
|
629
|
-
const HIGH_Z_PATTERN = /\bz-(?:4\d|50)\b/;
|
|
630
|
-
const ISOLATE_PATTERN = /\bisolate\b/;
|
|
631
|
-
const HTML_DOCUMENT_MARKERS = /<\s*(?:!doctype|html|head|body)\b/i;
|
|
632
|
-
const NOISE_MARKERS = [
|
|
633
|
-
'<script',
|
|
634
|
-
'<style',
|
|
635
|
-
'<noscript',
|
|
636
|
-
'<iframe',
|
|
637
|
-
'<nav',
|
|
638
|
-
'<footer',
|
|
639
|
-
'<aside',
|
|
640
|
-
'<header',
|
|
641
|
-
'<form',
|
|
642
|
-
'<button',
|
|
643
|
-
'<input',
|
|
644
|
-
'<select',
|
|
645
|
-
'<textarea',
|
|
646
|
-
'<svg',
|
|
647
|
-
'<canvas',
|
|
648
|
-
' aria-hidden="true"',
|
|
649
|
-
" aria-hidden='true'",
|
|
650
|
-
' hidden',
|
|
651
|
-
' role="navigation"',
|
|
652
|
-
" role='navigation'",
|
|
653
|
-
' role="banner"',
|
|
654
|
-
" role='banner'",
|
|
655
|
-
' role="complementary"',
|
|
656
|
-
" role='complementary'",
|
|
657
|
-
' role="contentinfo"',
|
|
658
|
-
" role='contentinfo'",
|
|
659
|
-
' role="tree"',
|
|
660
|
-
" role='tree'",
|
|
661
|
-
' role="menubar"',
|
|
662
|
-
" role='menubar'",
|
|
663
|
-
' role="menu"',
|
|
664
|
-
" role='menu'",
|
|
665
|
-
' banner',
|
|
666
|
-
' promo',
|
|
667
|
-
' announcement',
|
|
668
|
-
' cta',
|
|
669
|
-
' callout',
|
|
670
|
-
' advert',
|
|
671
|
-
' newsletter',
|
|
672
|
-
' subscribe',
|
|
673
|
-
' cookie',
|
|
674
|
-
' consent',
|
|
675
|
-
' popup',
|
|
676
|
-
' modal',
|
|
677
|
-
' overlay',
|
|
678
|
-
' toast',
|
|
679
|
-
' fixed',
|
|
680
|
-
' sticky',
|
|
681
|
-
' z-50',
|
|
682
|
-
' z-4',
|
|
683
|
-
' isolate',
|
|
684
|
-
' breadcrumb',
|
|
685
|
-
' pagination',
|
|
686
|
-
];
|
|
687
|
-
function mayContainNoise(html) {
|
|
688
|
-
const haystack = html.toLowerCase();
|
|
689
|
-
return NOISE_MARKERS.some((marker) => haystack.includes(marker));
|
|
690
|
-
}
|
|
691
|
-
function isFullDocumentHtml(html) {
|
|
692
|
-
return HTML_DOCUMENT_MARKERS.test(html);
|
|
693
|
-
}
|
|
694
|
-
function isStructuralNoiseTag(tagName) {
|
|
695
|
-
return (STRUCTURAL_TAGS.has(tagName) || tagName === 'svg' || tagName === 'canvas');
|
|
696
|
-
}
|
|
697
|
-
function isElementHidden(element) {
|
|
698
|
-
const style = element.getAttribute('style') ?? '';
|
|
699
|
-
return (element.getAttribute('hidden') !== null ||
|
|
700
|
-
element.getAttribute('aria-hidden') === 'true' ||
|
|
701
|
-
/\bdisplay\s*:\s*none\b/i.test(style) ||
|
|
702
|
-
/\bvisibility\s*:\s*hidden\b/i.test(style));
|
|
703
|
-
}
|
|
704
|
-
function hasNoiseRole(role) {
|
|
705
|
-
return role !== null && NAVIGATION_ROLES.has(role);
|
|
706
|
-
}
|
|
707
|
-
function tokenizeIdentifierLikeText(value) {
|
|
708
|
-
return value
|
|
709
|
-
.toLowerCase()
|
|
710
|
-
.replace(/[^a-z0-9]+/g, ' ')
|
|
711
|
-
.trim()
|
|
712
|
-
.split(' ')
|
|
713
|
-
.filter(Boolean);
|
|
714
|
-
}
|
|
715
|
-
function matchesPromoIdOrClass(className, id) {
|
|
716
|
-
const tokens = tokenizeIdentifierLikeText(`${className} ${id}`);
|
|
717
|
-
return tokens.some((token) => PROMO_TOKENS.has(token));
|
|
718
|
-
}
|
|
719
|
-
function matchesFixedOrHighZIsolate(className) {
|
|
720
|
-
return (FIXED_PATTERN.test(className) ||
|
|
721
|
-
(HIGH_Z_PATTERN.test(className) && ISOLATE_PATTERN.test(className)));
|
|
722
|
-
}
|
|
723
|
-
function readElementMetadata(element) {
|
|
724
|
-
return {
|
|
725
|
-
tagName: element.tagName.toLowerCase(),
|
|
726
|
-
className: element.getAttribute('class') ?? '',
|
|
727
|
-
id: element.getAttribute('id') ?? '',
|
|
728
|
-
role: element.getAttribute('role'),
|
|
729
|
-
isHidden: isElementHidden(element),
|
|
730
|
-
};
|
|
731
|
-
}
|
|
732
|
-
function isBoilerplateHeader({ className, id, role, }) {
|
|
733
|
-
if (hasNoiseRole(role))
|
|
734
|
-
return true;
|
|
735
|
-
const combined = `${className} ${id}`.toLowerCase();
|
|
736
|
-
return HEADER_NOISE_PATTERN.test(combined);
|
|
737
|
-
}
|
|
738
|
-
function isNoiseElement(node) {
|
|
739
|
-
const metadata = readElementMetadata(node);
|
|
740
|
-
return (isStructuralNoiseTag(metadata.tagName) ||
|
|
741
|
-
ALWAYS_NOISE_TAGS.has(metadata.tagName) ||
|
|
742
|
-
(metadata.tagName === 'header' && isBoilerplateHeader(metadata)) ||
|
|
743
|
-
metadata.isHidden ||
|
|
744
|
-
hasNoiseRole(metadata.role) ||
|
|
745
|
-
matchesFixedOrHighZIsolate(metadata.className) ||
|
|
746
|
-
matchesPromoIdOrClass(metadata.className, metadata.id));
|
|
747
|
-
}
|
|
748
|
-
function removeNoiseNodes(nodes) {
|
|
749
|
-
for (let index = nodes.length - 1; index >= 0; index -= 1) {
|
|
750
|
-
const node = typeof nodes.item === 'function' ? nodes.item(index) : nodes[index];
|
|
751
|
-
if (!node)
|
|
752
|
-
continue;
|
|
753
|
-
if (isElement(node) && isNoiseElement(node)) {
|
|
754
|
-
node.remove();
|
|
755
|
-
}
|
|
756
|
-
}
|
|
757
|
-
}
|
|
758
|
-
function stripNoiseNodes(document) {
|
|
759
|
-
// Use targeted selectors for common noise elements instead of querySelectorAll('*')
|
|
760
|
-
const targetSelectors = [
|
|
761
|
-
'nav',
|
|
762
|
-
'footer',
|
|
763
|
-
'aside',
|
|
764
|
-
'header[class*="site"]',
|
|
765
|
-
'header[class*="nav"]',
|
|
766
|
-
'header[class*="menu"]',
|
|
767
|
-
'[role="banner"]',
|
|
768
|
-
'[role="navigation"]',
|
|
769
|
-
'[role="dialog"]',
|
|
770
|
-
'[style*="display: none"]',
|
|
771
|
-
'[style*="display:none"]',
|
|
772
|
-
'[hidden]',
|
|
773
|
-
'[aria-hidden="true"]',
|
|
774
|
-
].join(',');
|
|
775
|
-
const potentialNoiseNodes = document.querySelectorAll(targetSelectors);
|
|
776
|
-
// Remove in reverse order to handle nested elements correctly
|
|
777
|
-
removeNoiseNodes(potentialNoiseNodes);
|
|
778
|
-
// Second pass: check remaining elements for noise patterns (promo, fixed positioning, etc.)
|
|
779
|
-
const candidateSelectors = [
|
|
780
|
-
...STRUCTURAL_TAGS,
|
|
781
|
-
...ALWAYS_NOISE_TAGS,
|
|
782
|
-
'header',
|
|
783
|
-
'canvas',
|
|
784
|
-
'[class]',
|
|
785
|
-
'[id]',
|
|
786
|
-
'[role]',
|
|
787
|
-
'[style]',
|
|
788
|
-
].join(',');
|
|
789
|
-
const allElements = document.querySelectorAll(candidateSelectors);
|
|
790
|
-
removeNoiseNodes(allElements);
|
|
791
|
-
}
|
|
792
|
-
function removeNoiseFromHtml(html, document) {
|
|
793
|
-
const shouldParse = isFullDocumentHtml(html) || mayContainNoise(html);
|
|
794
|
-
if (!shouldParse)
|
|
795
|
-
return html;
|
|
796
|
-
try {
|
|
797
|
-
const resolvedDocument = document ?? parseHTML(html).document;
|
|
798
|
-
stripNoiseNodes(resolvedDocument);
|
|
799
|
-
const bodyInnerHtml = getBodyInnerHtml(resolvedDocument);
|
|
800
|
-
if (bodyInnerHtml)
|
|
801
|
-
return bodyInnerHtml;
|
|
802
|
-
const docToString = getDocumentToString(resolvedDocument);
|
|
803
|
-
if (docToString)
|
|
804
|
-
return docToString();
|
|
805
|
-
const documentElementOuterHtml = getDocumentElementOuterHtml(resolvedDocument);
|
|
806
|
-
if (documentElementOuterHtml)
|
|
807
|
-
return documentElementOuterHtml;
|
|
808
|
-
return html;
|
|
809
|
-
}
|
|
810
|
-
catch {
|
|
811
|
-
return html;
|
|
812
|
-
}
|
|
813
|
-
}
|
|
343
|
+
// DOM noise removal functions moved to ./dom-noise-removal.ts
|
|
814
344
|
function buildInlineCode(content) {
|
|
815
345
|
const runs = content.match(/`+/g);
|
|
816
346
|
let longest = '';
|
|
@@ -821,8 +351,11 @@ function buildInlineCode(content) {
|
|
|
821
351
|
}
|
|
822
352
|
}
|
|
823
353
|
}
|
|
354
|
+
// Use a fence longer than any run of backticks in the content.
|
|
824
355
|
const delimiter = `\`${longest}`;
|
|
825
|
-
|
|
356
|
+
// Only pad when needed to avoid altering code spans unnecessarily.
|
|
357
|
+
// CommonMark recommends padding when the code starts/ends with a backtick.
|
|
358
|
+
const padding = content.startsWith('`') || content.endsWith('`') ? ' ' : '';
|
|
826
359
|
return `${delimiter}${padding}${content}${padding}${delimiter}`;
|
|
827
360
|
}
|
|
828
361
|
function deriveAltFromImageUrl(src) {
|
|
@@ -845,16 +378,13 @@ function deriveAltFromImageUrl(src) {
|
|
|
845
378
|
}
|
|
846
379
|
}
|
|
847
380
|
function isCodeBlock(parent) {
|
|
848
|
-
if (!
|
|
381
|
+
if (!isObject(parent))
|
|
849
382
|
return false;
|
|
850
383
|
const tagName = typeof parent.tagName === 'string' ? parent.tagName.toUpperCase() : '';
|
|
851
384
|
return ['PRE', 'WRAPPED-PRE'].includes(tagName);
|
|
852
385
|
}
|
|
853
386
|
function hasGetAttribute(value) {
|
|
854
|
-
return
|
|
855
|
-
}
|
|
856
|
-
function hasCodeBlockTranslators(value) {
|
|
857
|
-
return isRecord(value) && isRecord(value.codeBlockTranslators);
|
|
387
|
+
return isObject(value) && typeof value.getAttribute === 'function';
|
|
858
388
|
}
|
|
859
389
|
function buildInlineCodeTranslator() {
|
|
860
390
|
return {
|
|
@@ -871,37 +401,19 @@ function resolveAttributeLanguage(node) {
|
|
|
871
401
|
const dataLanguage = getAttribute?.('data-language') ?? '';
|
|
872
402
|
return resolveLanguageFromAttributes(className, dataLanguage);
|
|
873
403
|
}
|
|
874
|
-
function resolveCodeBlockTranslators(visitor) {
|
|
875
|
-
const childTranslators = isRecord(visitor) ? visitor.instance : null;
|
|
876
|
-
return hasCodeBlockTranslators(childTranslators)
|
|
877
|
-
? childTranslators.codeBlockTranslators
|
|
878
|
-
: null;
|
|
879
|
-
}
|
|
880
|
-
function buildCodeBlockTranslator(attributeLanguage, codeBlockTranslators) {
|
|
881
|
-
return {
|
|
882
|
-
noEscape: true,
|
|
883
|
-
preserveWhitespace: true,
|
|
884
|
-
...(codeBlockTranslators
|
|
885
|
-
? { childTranslators: codeBlockTranslators }
|
|
886
|
-
: null),
|
|
887
|
-
postprocess: ({ content }) => {
|
|
888
|
-
const language = attributeLanguage ?? detectLanguageFromCode(content) ?? '';
|
|
889
|
-
return CODE_BLOCK.format(content, language);
|
|
890
|
-
},
|
|
891
|
-
};
|
|
892
|
-
}
|
|
893
404
|
function buildCodeTranslator(ctx) {
|
|
894
|
-
if (!
|
|
405
|
+
if (!isObject(ctx))
|
|
895
406
|
return buildInlineCodeTranslator();
|
|
896
|
-
const {
|
|
407
|
+
const { parent } = ctx;
|
|
897
408
|
if (!isCodeBlock(parent))
|
|
898
409
|
return buildInlineCodeTranslator();
|
|
899
|
-
|
|
900
|
-
|
|
901
|
-
|
|
410
|
+
return {
|
|
411
|
+
noEscape: true,
|
|
412
|
+
preserveWhitespace: true,
|
|
413
|
+
};
|
|
902
414
|
}
|
|
903
415
|
function buildImageTranslator(ctx) {
|
|
904
|
-
if (!
|
|
416
|
+
if (!isObject(ctx))
|
|
905
417
|
return { content: '' };
|
|
906
418
|
const { node } = ctx;
|
|
907
419
|
const getAttribute = hasGetAttribute(node)
|
|
@@ -914,19 +426,57 @@ function buildImageTranslator(ctx) {
|
|
|
914
426
|
content: ``,
|
|
915
427
|
};
|
|
916
428
|
}
|
|
429
|
+
function findLanguageFromCodeChild(node) {
|
|
430
|
+
if (!isObject(node))
|
|
431
|
+
return undefined;
|
|
432
|
+
const { childNodes } = node;
|
|
433
|
+
if (!Array.isArray(childNodes))
|
|
434
|
+
return undefined;
|
|
435
|
+
for (const child of childNodes) {
|
|
436
|
+
if (!isObject(child))
|
|
437
|
+
continue;
|
|
438
|
+
const tagName = typeof child.rawTagName === 'string'
|
|
439
|
+
? child.rawTagName.toUpperCase()
|
|
440
|
+
: '';
|
|
441
|
+
if (tagName === 'CODE') {
|
|
442
|
+
return resolveAttributeLanguage(child);
|
|
443
|
+
}
|
|
444
|
+
}
|
|
445
|
+
return undefined;
|
|
446
|
+
}
|
|
447
|
+
function createCodeBlockPostprocessor(language) {
|
|
448
|
+
return ({ content }) => {
|
|
449
|
+
const trimmed = content.trim();
|
|
450
|
+
if (!trimmed)
|
|
451
|
+
return '';
|
|
452
|
+
const resolvedLanguage = language ?? detectLanguageFromCode(trimmed) ?? '';
|
|
453
|
+
return CODE_BLOCK.format(trimmed, resolvedLanguage);
|
|
454
|
+
};
|
|
455
|
+
}
|
|
456
|
+
function buildPreTranslator(ctx) {
|
|
457
|
+
if (!isObject(ctx))
|
|
458
|
+
return {};
|
|
459
|
+
const { node } = ctx;
|
|
460
|
+
const attributeLanguage = resolveAttributeLanguage(node) ?? findLanguageFromCodeChild(node);
|
|
461
|
+
return {
|
|
462
|
+
noEscape: true,
|
|
463
|
+
preserveWhitespace: true,
|
|
464
|
+
postprocess: createCodeBlockPostprocessor(attributeLanguage),
|
|
465
|
+
};
|
|
466
|
+
}
|
|
917
467
|
function createCustomTranslators() {
|
|
918
468
|
return {
|
|
919
469
|
code: (ctx) => buildCodeTranslator(ctx),
|
|
920
470
|
img: (ctx) => buildImageTranslator(ctx),
|
|
921
471
|
dl: (ctx) => {
|
|
922
|
-
if (!
|
|
472
|
+
if (!isObject(ctx) || !isObject(ctx.node)) {
|
|
923
473
|
return { content: '' };
|
|
924
474
|
}
|
|
925
475
|
const node = ctx.node;
|
|
926
476
|
const childNodes = Array.isArray(node.childNodes) ? node.childNodes : [];
|
|
927
477
|
const items = childNodes
|
|
928
478
|
.map((child) => {
|
|
929
|
-
if (!
|
|
479
|
+
if (!isObject(child))
|
|
930
480
|
return '';
|
|
931
481
|
const nodeName = typeof child.nodeName === 'string'
|
|
932
482
|
? child.nodeName.toUpperCase()
|
|
@@ -956,6 +506,8 @@ function createCustomTranslators() {
|
|
|
956
506
|
sup: () => ({
|
|
957
507
|
postprocess: ({ content }) => `^${content}^`,
|
|
958
508
|
}),
|
|
509
|
+
// Fix #6: Handle <pre> without <code> - wrap in fenced code block
|
|
510
|
+
pre: (ctx) => buildPreTranslator(ctx),
|
|
959
511
|
};
|
|
960
512
|
}
|
|
961
513
|
let markdownInstance = null;
|
|
@@ -971,9 +523,11 @@ function getMarkdownConverter() {
|
|
|
971
523
|
markdownInstance ??= createMarkdownInstance();
|
|
972
524
|
return markdownInstance;
|
|
973
525
|
}
|
|
974
|
-
function translateHtmlToMarkdown(html, url, signal, document) {
|
|
526
|
+
function translateHtmlToMarkdown(html, url, signal, document, skipNoiseRemoval) {
|
|
975
527
|
throwIfAborted(signal, url, 'markdown:begin');
|
|
976
|
-
const cleanedHtml =
|
|
528
|
+
const cleanedHtml = skipNoiseRemoval
|
|
529
|
+
? html
|
|
530
|
+
: runTransformStage(url, 'markdown:noise', () => removeNoiseFromHtml(html, document, url));
|
|
977
531
|
throwIfAborted(signal, url, 'markdown:cleaned');
|
|
978
532
|
const content = runTransformStage(url, 'markdown:translate', () => getMarkdownConverter().translate(cleanedHtml).trim());
|
|
979
533
|
throwIfAborted(signal, url, 'markdown:translated');
|
|
@@ -989,151 +543,18 @@ export function htmlToMarkdown(html, metadata, options) {
|
|
|
989
543
|
if (!html)
|
|
990
544
|
return buildMetadataFooter(metadata, url);
|
|
991
545
|
try {
|
|
992
|
-
const content = translateHtmlToMarkdown(html, url, options?.signal, options?.document);
|
|
546
|
+
const content = translateHtmlToMarkdown(html, url, options?.signal, options?.document, options?.skipNoiseRemoval);
|
|
993
547
|
return appendMetadataFooter(content, metadata, url);
|
|
994
548
|
}
|
|
995
549
|
catch (error) {
|
|
996
550
|
if (error instanceof FetchError) {
|
|
997
551
|
throw error;
|
|
998
552
|
}
|
|
553
|
+
logError('Failed to convert HTML to markdown', error instanceof Error ? error : undefined);
|
|
999
554
|
return buildMetadataFooter(metadata, url);
|
|
1000
555
|
}
|
|
1001
556
|
}
|
|
1002
|
-
|
|
1003
|
-
let result = content;
|
|
1004
|
-
const fixOrphanHeadings = (text) => {
|
|
1005
|
-
return text.replace(/^(.*?)(#{1,6})\s*(?:\r?\n){2}([A-Z][^\r\n]+?)(?:\r?\n)/gm, (match, prefix, hashes, heading) => {
|
|
1006
|
-
if (typeof prefix !== 'string' ||
|
|
1007
|
-
typeof hashes !== 'string' ||
|
|
1008
|
-
typeof heading !== 'string') {
|
|
1009
|
-
return match;
|
|
1010
|
-
}
|
|
1011
|
-
if (heading.length > 150) {
|
|
1012
|
-
return match;
|
|
1013
|
-
}
|
|
1014
|
-
const trimmedPrefix = prefix.trim();
|
|
1015
|
-
if (trimmedPrefix === '') {
|
|
1016
|
-
return `${hashes} ${heading}\n\n`;
|
|
1017
|
-
}
|
|
1018
|
-
return `${trimmedPrefix}\n\n${hashes} ${heading}\n\n`;
|
|
1019
|
-
});
|
|
1020
|
-
};
|
|
1021
|
-
result = fixOrphanHeadings(result);
|
|
1022
|
-
result = result.replace(/^#{1,6}[ \t\u00A0]*$\r?\n?/gm, '');
|
|
1023
|
-
const zeroWidthAnchorLink = /\[(?:\s|\u200B)*\]\(#[^)]*\)\s*/g;
|
|
1024
|
-
result = result.replace(zeroWidthAnchorLink, '');
|
|
1025
|
-
result = result.replace(/^\[Skip to (?:main )?content\]\(#[^)]*\)\s*$/gim, '');
|
|
1026
|
-
result = result.replace(/^\[Skip to (?:main )?navigation\]\(#[^)]*\)\s*$/gim, '');
|
|
1027
|
-
result = result.replace(/^\[Skip link\]\(#[^)]*\)\s*$/gim, '');
|
|
1028
|
-
result = result.replace(/(^#{1,6}\s+\w+)```/gm, '$1\n\n```');
|
|
1029
|
-
result = result.replace(/(^#{1,6}\s+\w*[A-Z])([A-Z][a-z])/gm, '$1\n\n$2');
|
|
1030
|
-
result = result.replace(/(^#{1,6}\s[^\n]*)\n([^\n])/gm, '$1\n\n$2');
|
|
1031
|
-
const tocLinkLine = /^- \[[^\]]+\]\(#[^)]+\)\s*$/;
|
|
1032
|
-
const lines = result.split('\n');
|
|
1033
|
-
const filtered = [];
|
|
1034
|
-
let skipTocBlock = false;
|
|
1035
|
-
for (let i = 0; i < lines.length; i += 1) {
|
|
1036
|
-
const line = lines[i] ?? '';
|
|
1037
|
-
const prevLine = i > 0 ? (lines[i - 1] ?? '') : '';
|
|
1038
|
-
const nextLine = i < lines.length - 1 ? (lines[i + 1] ?? '') : '';
|
|
1039
|
-
if (tocLinkLine.test(line)) {
|
|
1040
|
-
const prevIsToc = tocLinkLine.test(prevLine) || prevLine.trim() === '';
|
|
1041
|
-
const nextIsToc = tocLinkLine.test(nextLine) || nextLine.trim() === '';
|
|
1042
|
-
if (prevIsToc || nextIsToc) {
|
|
1043
|
-
skipTocBlock = true;
|
|
1044
|
-
continue;
|
|
1045
|
-
}
|
|
1046
|
-
}
|
|
1047
|
-
else if (line.trim() === '' && skipTocBlock) {
|
|
1048
|
-
skipTocBlock = false;
|
|
1049
|
-
continue;
|
|
1050
|
-
}
|
|
1051
|
-
else {
|
|
1052
|
-
skipTocBlock = false;
|
|
1053
|
-
}
|
|
1054
|
-
filtered.push(line);
|
|
1055
|
-
}
|
|
1056
|
-
result = filtered.join('\n');
|
|
1057
|
-
result = result.replace(/\]\(([^)]+)\)\[/g, ']($1)\n\n[');
|
|
1058
|
-
result = result.replace(/^Was this page helpful\??\s*$/gim, '');
|
|
1059
|
-
result = result.replace(/(`[^`]+`)\s*\\-\s*/g, '$1 - ');
|
|
1060
|
-
result = result.replace(/\\([[]])/g, '$1');
|
|
1061
|
-
result = result.replace(/([^\n])\n([-*+] )/g, '$1\n\n$2');
|
|
1062
|
-
result = result.replace(/(\S)\n(\d+\. )/g, '$1\n\n$2');
|
|
1063
|
-
result = result.replace(/\n{3,}/g, '\n\n');
|
|
1064
|
-
return result.trim();
|
|
1065
|
-
}
|
|
1066
|
-
const HEADING_KEYWORDS = new Set([
|
|
1067
|
-
'overview',
|
|
1068
|
-
'introduction',
|
|
1069
|
-
'summary',
|
|
1070
|
-
'conclusion',
|
|
1071
|
-
'prerequisites',
|
|
1072
|
-
'requirements',
|
|
1073
|
-
'installation',
|
|
1074
|
-
'configuration',
|
|
1075
|
-
'usage',
|
|
1076
|
-
'features',
|
|
1077
|
-
'limitations',
|
|
1078
|
-
'troubleshooting',
|
|
1079
|
-
'faq',
|
|
1080
|
-
'resources',
|
|
1081
|
-
'references',
|
|
1082
|
-
'changelog',
|
|
1083
|
-
'license',
|
|
1084
|
-
'acknowledgments',
|
|
1085
|
-
'appendix',
|
|
1086
|
-
]);
|
|
1087
|
-
function isLikelyHeadingLine(line) {
|
|
1088
|
-
const trimmed = line.trim();
|
|
1089
|
-
if (!trimmed || trimmed.length > 80)
|
|
1090
|
-
return false;
|
|
1091
|
-
if (/^#{1,6}\s/.test(trimmed))
|
|
1092
|
-
return false;
|
|
1093
|
-
if (/^[-*+•]\s/.test(trimmed) || /^\d+\.\s/.test(trimmed))
|
|
1094
|
-
return false;
|
|
1095
|
-
if (/[.!?]$/.test(trimmed))
|
|
1096
|
-
return false;
|
|
1097
|
-
if (/^\[.*\]\(.*\)$/.test(trimmed))
|
|
1098
|
-
return false;
|
|
1099
|
-
if (/^(?:example|note|tip|warning|important|caution):\s+\S/i.test(trimmed)) {
|
|
1100
|
-
return true;
|
|
1101
|
-
}
|
|
1102
|
-
const words = trimmed.split(/\s+/);
|
|
1103
|
-
if (words.length >= 2 && words.length <= 6) {
|
|
1104
|
-
const isTitleCase = words.every((w) => /^[A-Z][a-z]*$/.test(w) || /^(?:and|or|the|of|in|for|to|a)$/i.test(w));
|
|
1105
|
-
if (isTitleCase)
|
|
1106
|
-
return true;
|
|
1107
|
-
}
|
|
1108
|
-
if (words.length === 1) {
|
|
1109
|
-
const lower = trimmed.toLowerCase();
|
|
1110
|
-
if (HEADING_KEYWORDS.has(lower) && /^[A-Z]/.test(trimmed)) {
|
|
1111
|
-
return true;
|
|
1112
|
-
}
|
|
1113
|
-
}
|
|
1114
|
-
return false;
|
|
1115
|
-
}
|
|
1116
|
-
function promoteOrphanHeadings(markdown) {
|
|
1117
|
-
const lines = markdown.split('\n');
|
|
1118
|
-
const result = [];
|
|
1119
|
-
for (let i = 0; i < lines.length; i += 1) {
|
|
1120
|
-
const line = lines[i] ?? '';
|
|
1121
|
-
const prevLine = i > 0 ? lines[i - 1] : '';
|
|
1122
|
-
const nextLine = i < lines.length - 1 ? lines[i + 1] : '';
|
|
1123
|
-
const isStandalone = prevLine?.trim() === '' && nextLine?.trim() === '';
|
|
1124
|
-
const isPrecededByBlank = prevLine?.trim() === '';
|
|
1125
|
-
if ((isStandalone || isPrecededByBlank) && isLikelyHeadingLine(line)) {
|
|
1126
|
-
const trimmed = line.trim();
|
|
1127
|
-
const isExample = /^example:\s/i.test(trimmed);
|
|
1128
|
-
const prefix = isExample ? '### ' : '## ';
|
|
1129
|
-
result.push(prefix + trimmed);
|
|
1130
|
-
}
|
|
1131
|
-
else {
|
|
1132
|
-
result.push(line);
|
|
1133
|
-
}
|
|
1134
|
-
}
|
|
1135
|
-
return result.join('\n');
|
|
1136
|
-
}
|
|
557
|
+
// Markdown cleanup functions moved to ./markdown-cleanup.ts
|
|
1137
558
|
function formatFetchedDate(isoString) {
|
|
1138
559
|
try {
|
|
1139
560
|
const date = new Date(isoString);
|
|
@@ -1363,13 +784,9 @@ function buildRawMarkdownPayload({ rawContent, url, includeMetadata, }) {
|
|
|
1363
784
|
: rawContent;
|
|
1364
785
|
return { content, title };
|
|
1365
786
|
}
|
|
1366
|
-
function
|
|
1367
|
-
if (!shouldPreserveRawContent(url, html)) {
|
|
1368
|
-
return null;
|
|
1369
|
-
}
|
|
1370
|
-
logDebug('Preserving raw markdown content', { url: url.substring(0, 80) });
|
|
787
|
+
function buildRawMarkdownResult({ rawContent, url, includeMetadata, }) {
|
|
1371
788
|
const { content, title } = buildRawMarkdownPayload({
|
|
1372
|
-
rawContent
|
|
789
|
+
rawContent,
|
|
1373
790
|
url,
|
|
1374
791
|
includeMetadata,
|
|
1375
792
|
});
|
|
@@ -1379,57 +796,115 @@ function tryTransformRawContent({ html, url, includeMetadata, }) {
|
|
|
1379
796
|
truncated: false,
|
|
1380
797
|
};
|
|
1381
798
|
}
|
|
799
|
+
function tryTransformRawContent({ html, url, includeMetadata, }) {
|
|
800
|
+
if (!shouldPreserveRawContent(url, html)) {
|
|
801
|
+
return null;
|
|
802
|
+
}
|
|
803
|
+
logDebug('Preserving raw markdown content', { url: truncateUrlForLog(url) });
|
|
804
|
+
return buildRawMarkdownResult({
|
|
805
|
+
rawContent: html,
|
|
806
|
+
url,
|
|
807
|
+
includeMetadata,
|
|
808
|
+
});
|
|
809
|
+
}
|
|
1382
810
|
const MIN_CONTENT_RATIO = 0.3;
|
|
1383
811
|
const MIN_HTML_LENGTH_FOR_GATE = 100;
|
|
1384
812
|
const MIN_HEADING_RETENTION_RATIO = 0.7;
|
|
1385
|
-
|
|
1386
|
-
|
|
1387
|
-
|
|
1388
|
-
|
|
1389
|
-
|
|
1390
|
-
|
|
1391
|
-
|
|
1392
|
-
|
|
1393
|
-
|
|
1394
|
-
|
|
1395
|
-
|
|
1396
|
-
|
|
1397
|
-
|
|
1398
|
-
|
|
1399
|
-
|
|
1400
|
-
|
|
1401
|
-
|
|
1402
|
-
|
|
1403
|
-
|
|
1404
|
-
|
|
1405
|
-
}
|
|
1406
|
-
|
|
1407
|
-
|
|
1408
|
-
|
|
1409
|
-
|
|
1410
|
-
|
|
1411
|
-
|
|
1412
|
-
|
|
1413
|
-
|
|
1414
|
-
|
|
1415
|
-
|
|
1416
|
-
|
|
1417
|
-
|
|
1418
|
-
|
|
1419
|
-
|
|
1420
|
-
|
|
1421
|
-
|
|
1422
|
-
|
|
813
|
+
const MIN_CODE_BLOCK_RETENTION_RATIO = 0.5;
|
|
814
|
+
/**
|
|
815
|
+
* Check if HTML string needs document wrapper for proper parsing.
|
|
816
|
+
* Fragments without doctype/html/body tags need wrapping.
|
|
817
|
+
*/
|
|
818
|
+
function needsDocumentWrapper(html) {
|
|
819
|
+
const trimmed = html.trim().toLowerCase();
|
|
820
|
+
return (!trimmed.startsWith('<!doctype') &&
|
|
821
|
+
!trimmed.startsWith('<html') &&
|
|
822
|
+
!trimmed.startsWith('<body'));
|
|
823
|
+
}
|
|
824
|
+
/**
|
|
825
|
+
* Wrap HTML fragment in minimal document structure for proper parsing.
|
|
826
|
+
*/
|
|
827
|
+
function wrapHtmlFragment(html) {
|
|
828
|
+
return `<!DOCTYPE html><html><body>${html}</body></html>`;
|
|
829
|
+
}
|
|
830
|
+
function resolveHtmlDocument(htmlOrDocument) {
|
|
831
|
+
if (typeof htmlOrDocument !== 'string') {
|
|
832
|
+
return htmlOrDocument;
|
|
833
|
+
}
|
|
834
|
+
const htmlToParse = needsDocumentWrapper(htmlOrDocument)
|
|
835
|
+
? wrapHtmlFragment(htmlOrDocument)
|
|
836
|
+
: htmlOrDocument;
|
|
837
|
+
return parseHTML(htmlToParse).document;
|
|
838
|
+
}
|
|
839
|
+
function countDomSelector(htmlOrDocument, selector) {
|
|
840
|
+
return resolveHtmlDocument(htmlOrDocument).querySelectorAll(selector).length;
|
|
841
|
+
}
|
|
842
|
+
/**
|
|
843
|
+
* Count headings using DOM querySelectorAll.
|
|
844
|
+
* Handles nested content like <h2><span>Text</span></h2> correctly.
|
|
845
|
+
*/
|
|
846
|
+
function countHeadingsDom(htmlOrDocument) {
|
|
847
|
+
return countDomSelector(htmlOrDocument, 'h1,h2,h3,h4,h5,h6');
|
|
848
|
+
}
|
|
849
|
+
function countCodeBlocksDom(htmlOrDocument) {
|
|
850
|
+
return countDomSelector(htmlOrDocument, 'pre');
|
|
851
|
+
}
|
|
852
|
+
function cloneDocumentIfNeeded(htmlOrDocument, doc) {
|
|
853
|
+
return typeof htmlOrDocument === 'string'
|
|
854
|
+
? doc
|
|
855
|
+
: doc.cloneNode(true);
|
|
856
|
+
}
|
|
857
|
+
function stripNonVisibleNodes(doc) {
|
|
858
|
+
for (const el of doc.querySelectorAll('script,style,noscript')) {
|
|
859
|
+
el.remove();
|
|
860
|
+
}
|
|
861
|
+
}
|
|
862
|
+
function resolveDocumentText(doc) {
|
|
863
|
+
// Note: linkedom may return null for body on HTML fragments despite types
|
|
864
|
+
const body = doc.body;
|
|
865
|
+
const docElement = doc.documentElement;
|
|
866
|
+
return body?.textContent ?? docElement?.textContent ?? '';
|
|
867
|
+
}
|
|
868
|
+
/**
|
|
869
|
+
* Get visible text length from HTML, excluding script/style/noscript content.
|
|
870
|
+
* Fixes the bug where stripHtmlTagsForLength() counted JS/CSS as visible text.
|
|
871
|
+
*/
|
|
872
|
+
function getVisibleTextLength(htmlOrDocument) {
|
|
873
|
+
const doc = resolveHtmlDocument(htmlOrDocument);
|
|
874
|
+
const workDoc = cloneDocumentIfNeeded(htmlOrDocument, doc);
|
|
875
|
+
stripNonVisibleNodes(workDoc);
|
|
876
|
+
const text = resolveDocumentText(workDoc);
|
|
877
|
+
return text.replace(/\s+/g, ' ').trim().length;
|
|
878
|
+
}
|
|
879
|
+
export function isExtractionSufficient(article, originalHtmlOrDocument) {
|
|
1423
880
|
if (!article)
|
|
1424
881
|
return false;
|
|
1425
882
|
const articleLength = article.textContent.length;
|
|
1426
|
-
|
|
1427
|
-
|
|
1428
|
-
.trim().length;
|
|
883
|
+
// Use DOM-based visible text length to exclude script/style content
|
|
884
|
+
const originalLength = getVisibleTextLength(originalHtmlOrDocument);
|
|
1429
885
|
if (originalLength < MIN_HTML_LENGTH_FOR_GATE)
|
|
1430
886
|
return true;
|
|
1431
887
|
return articleLength / originalLength >= MIN_CONTENT_RATIO;
|
|
1432
888
|
}
|
|
889
|
+
const MIN_LINE_LENGTH_FOR_TRUNCATION_CHECK = 20;
|
|
890
|
+
const MAX_TRUNCATED_LINE_RATIO = 0.5;
|
|
891
|
+
/**
|
|
892
|
+
* Detect if extracted text has many truncated/incomplete sentences.
|
|
893
|
+
* Lines longer than 20 chars that don't end with sentence punctuation
|
|
894
|
+
* are considered potentially truncated.
|
|
895
|
+
*/
|
|
896
|
+
function hasTruncatedSentences(text) {
|
|
897
|
+
const lines = text
|
|
898
|
+
.split('\n')
|
|
899
|
+
.filter((line) => line.trim().length > MIN_LINE_LENGTH_FOR_TRUNCATION_CHECK);
|
|
900
|
+
if (lines.length < 3)
|
|
901
|
+
return false;
|
|
902
|
+
const incompleteLines = lines.filter((line) => {
|
|
903
|
+
const trimmed = line.trim();
|
|
904
|
+
return !/[.!?:;]$/.test(trimmed);
|
|
905
|
+
});
|
|
906
|
+
return incompleteLines.length / lines.length > MAX_TRUNCATED_LINE_RATIO;
|
|
907
|
+
}
|
|
1433
908
|
export function determineContentExtractionSource(article) {
|
|
1434
909
|
return article !== null;
|
|
1435
910
|
}
|
|
@@ -1459,40 +934,147 @@ export function createContentMetadataBlock(url, article, extractedMeta, shouldEx
|
|
|
1459
934
|
}
|
|
1460
935
|
return metadata;
|
|
1461
936
|
}
|
|
937
|
+
/**
|
|
938
|
+
* Content root selectors in priority order.
|
|
939
|
+
* These identify the main content area on a page.
|
|
940
|
+
*/
|
|
941
|
+
const CONTENT_ROOT_SELECTORS = [
|
|
942
|
+
'main',
|
|
943
|
+
'article',
|
|
944
|
+
'[role="main"]',
|
|
945
|
+
'#content',
|
|
946
|
+
'#main-content',
|
|
947
|
+
'.content',
|
|
948
|
+
'.main-content',
|
|
949
|
+
'.post-content',
|
|
950
|
+
'.article-content',
|
|
951
|
+
'.entry-content',
|
|
952
|
+
'[itemprop="articleBody"]',
|
|
953
|
+
'[data-content]',
|
|
954
|
+
'.post-body',
|
|
955
|
+
'.article-body',
|
|
956
|
+
];
|
|
957
|
+
/**
|
|
958
|
+
* Find the main content root element in a document.
|
|
959
|
+
* Returns the innerHTML if found, undefined otherwise.
|
|
960
|
+
*/
|
|
961
|
+
function findContentRoot(document) {
|
|
962
|
+
for (const selector of CONTENT_ROOT_SELECTORS) {
|
|
963
|
+
const element = document.querySelector(selector);
|
|
964
|
+
if (!element)
|
|
965
|
+
continue;
|
|
966
|
+
// Check if element has meaningful content
|
|
967
|
+
const innerHTML = typeof element.innerHTML === 'string'
|
|
968
|
+
? element.innerHTML
|
|
969
|
+
: undefined;
|
|
970
|
+
if (innerHTML && innerHTML.trim().length > 100) {
|
|
971
|
+
return innerHTML;
|
|
972
|
+
}
|
|
973
|
+
}
|
|
974
|
+
return undefined;
|
|
975
|
+
}
|
|
1462
976
|
function buildContentSource({ html, url, article, extractedMeta, includeMetadata, useArticleContent, document, }) {
|
|
1463
977
|
const metadata = createContentMetadataBlock(url, article, extractedMeta, useArticleContent, includeMetadata);
|
|
1464
|
-
|
|
1465
|
-
|
|
1466
|
-
|
|
978
|
+
// If using article content, return it directly
|
|
979
|
+
if (useArticleContent && article) {
|
|
980
|
+
return {
|
|
981
|
+
sourceHtml: article.content,
|
|
982
|
+
title: article.title,
|
|
983
|
+
metadata,
|
|
984
|
+
};
|
|
985
|
+
}
|
|
986
|
+
// Try content root fallback before using full HTML
|
|
987
|
+
if (document) {
|
|
988
|
+
// Apply noise removal to HTML first (without passing document) to get cleaned HTML,
|
|
989
|
+
// then parse and find content root. This prevents the aggressive DOM stripping that
|
|
990
|
+
// happens when noise removal is given the original parsed document.
|
|
991
|
+
const cleanedHtml = removeNoiseFromHtml(html, undefined, url);
|
|
992
|
+
const { document: cleanedDoc } = parseHTML(cleanedHtml);
|
|
993
|
+
const contentRoot = findContentRoot(cleanedDoc);
|
|
994
|
+
if (contentRoot) {
|
|
995
|
+
logDebug('Using content root fallback instead of full HTML', {
|
|
996
|
+
url: truncateUrlForLog(url),
|
|
997
|
+
contentLength: contentRoot.length,
|
|
998
|
+
});
|
|
999
|
+
return {
|
|
1000
|
+
sourceHtml: contentRoot,
|
|
1001
|
+
title: extractedMeta.title,
|
|
1002
|
+
metadata,
|
|
1003
|
+
// Skip noise removal - this HTML is already from a cleaned document
|
|
1004
|
+
skipNoiseRemoval: true,
|
|
1005
|
+
};
|
|
1006
|
+
}
|
|
1007
|
+
}
|
|
1008
|
+
// Fall back to full HTML
|
|
1009
|
+
return {
|
|
1010
|
+
sourceHtml: html,
|
|
1011
|
+
title: extractedMeta.title,
|
|
1467
1012
|
metadata,
|
|
1013
|
+
...(document ? { document } : {}),
|
|
1468
1014
|
};
|
|
1469
|
-
if (!useArticleContent && document) {
|
|
1470
|
-
return { ...source, document };
|
|
1471
|
-
}
|
|
1472
|
-
return source;
|
|
1473
1015
|
}
|
|
1474
|
-
function logQualityGateFallback({
|
|
1016
|
+
function logQualityGateFallback({ safeUrl, articleLength, }) {
|
|
1475
1017
|
logDebug('Quality gate: Readability extraction below threshold, using full HTML', {
|
|
1476
|
-
url:
|
|
1018
|
+
url: safeUrl,
|
|
1477
1019
|
articleLength,
|
|
1478
1020
|
});
|
|
1479
1021
|
}
|
|
1480
|
-
function shouldUseArticleContent(article,
|
|
1481
|
-
|
|
1482
|
-
|
|
1483
|
-
|
|
1484
|
-
|
|
1485
|
-
|
|
1486
|
-
|
|
1487
|
-
|
|
1022
|
+
function shouldUseArticleContent(article, originalHtmlOrDocument, url) {
|
|
1023
|
+
const articleLength = article.textContent.length;
|
|
1024
|
+
const originalLength = getVisibleTextLength(originalHtmlOrDocument);
|
|
1025
|
+
const safeUrl = truncateUrlForLog(url);
|
|
1026
|
+
let articleDocument = null;
|
|
1027
|
+
const getArticleDocument = () => {
|
|
1028
|
+
if (articleDocument)
|
|
1029
|
+
return articleDocument;
|
|
1030
|
+
articleDocument = resolveHtmlDocument(article.content);
|
|
1031
|
+
return articleDocument;
|
|
1032
|
+
};
|
|
1033
|
+
// If the document is tiny, don't gate too aggressively.
|
|
1034
|
+
if (originalLength >= MIN_HTML_LENGTH_FOR_GATE) {
|
|
1035
|
+
const ratio = articleLength / originalLength;
|
|
1036
|
+
if (ratio < MIN_CONTENT_RATIO) {
|
|
1037
|
+
logQualityGateFallback({ safeUrl, articleLength });
|
|
1038
|
+
return false;
|
|
1039
|
+
}
|
|
1488
1040
|
}
|
|
1489
|
-
//
|
|
1490
|
-
|
|
1491
|
-
|
|
1492
|
-
|
|
1493
|
-
|
|
1494
|
-
|
|
1041
|
+
// Heading structure retention (compute counts once to avoid repeated DOM queries/parses).
|
|
1042
|
+
const originalHeadings = countHeadingsDom(originalHtmlOrDocument);
|
|
1043
|
+
if (originalHeadings > 0) {
|
|
1044
|
+
const articleHeadings = countHeadingsDom(getArticleDocument());
|
|
1045
|
+
const retentionRatio = articleHeadings / originalHeadings;
|
|
1046
|
+
if (retentionRatio < MIN_HEADING_RETENTION_RATIO) {
|
|
1047
|
+
logDebug('Quality gate: Readability broke heading structure, using full HTML', {
|
|
1048
|
+
url: safeUrl,
|
|
1049
|
+
originalHeadings,
|
|
1050
|
+
articleHeadings,
|
|
1051
|
+
});
|
|
1052
|
+
return false;
|
|
1053
|
+
}
|
|
1054
|
+
}
|
|
1055
|
+
const originalCodeBlocks = countCodeBlocksDom(originalHtmlOrDocument);
|
|
1056
|
+
if (originalCodeBlocks > 0) {
|
|
1057
|
+
const articleCodeBlocks = countCodeBlocksDom(getArticleDocument());
|
|
1058
|
+
const codeRetentionRatio = articleCodeBlocks / originalCodeBlocks;
|
|
1059
|
+
// Always log code block counts for debugging
|
|
1060
|
+
logDebug('Code block retention check', {
|
|
1061
|
+
url: safeUrl,
|
|
1062
|
+
originalCodeBlocks,
|
|
1063
|
+
articleCodeBlocks,
|
|
1064
|
+
codeRetentionRatio,
|
|
1495
1065
|
});
|
|
1066
|
+
if (codeRetentionRatio < MIN_CODE_BLOCK_RETENTION_RATIO) {
|
|
1067
|
+
logDebug('Quality gate: Readability removed code blocks, using full HTML', {
|
|
1068
|
+
url: safeUrl,
|
|
1069
|
+
originalCodeBlocks,
|
|
1070
|
+
articleCodeBlocks,
|
|
1071
|
+
});
|
|
1072
|
+
return false;
|
|
1073
|
+
}
|
|
1074
|
+
}
|
|
1075
|
+
// Layout extraction issue: truncated/fragmented lines.
|
|
1076
|
+
if (hasTruncatedSentences(article.textContent)) {
|
|
1077
|
+
logDebug('Quality gate: Extracted text has many truncated sentences, using full HTML', { url: safeUrl });
|
|
1496
1078
|
return false;
|
|
1497
1079
|
}
|
|
1498
1080
|
return true;
|
|
@@ -1502,8 +1084,9 @@ function resolveContentSource({ html, url, includeMetadata, signal, }) {
|
|
|
1502
1084
|
extractArticle: true,
|
|
1503
1085
|
...(signal ? { signal } : {}),
|
|
1504
1086
|
});
|
|
1087
|
+
const originalDocument = document;
|
|
1505
1088
|
const useArticleContent = article
|
|
1506
|
-
? shouldUseArticleContent(article,
|
|
1089
|
+
? shouldUseArticleContent(article, originalDocument, url)
|
|
1507
1090
|
: false;
|
|
1508
1091
|
return buildContentSource({
|
|
1509
1092
|
html,
|
|
@@ -1512,7 +1095,7 @@ function resolveContentSource({ html, url, includeMetadata, signal, }) {
|
|
|
1512
1095
|
extractedMeta,
|
|
1513
1096
|
includeMetadata,
|
|
1514
1097
|
useArticleContent,
|
|
1515
|
-
|
|
1098
|
+
document,
|
|
1516
1099
|
});
|
|
1517
1100
|
}
|
|
1518
1101
|
function tryTransformRawStage(html, url, includeMetadata) {
|
|
@@ -1535,6 +1118,7 @@ function buildMarkdownFromContext(context, url, signal) {
|
|
|
1535
1118
|
url,
|
|
1536
1119
|
...(signal ? { signal } : {}),
|
|
1537
1120
|
...(context.document ? { document: context.document } : {}),
|
|
1121
|
+
...(context.skipNoiseRemoval ? { skipNoiseRemoval: true } : {}),
|
|
1538
1122
|
}));
|
|
1539
1123
|
return {
|
|
1540
1124
|
markdown: content,
|
|
@@ -1551,11 +1135,14 @@ function runTotalTransformStage(url, fn) {
|
|
|
1551
1135
|
return result;
|
|
1552
1136
|
}
|
|
1553
1137
|
finally {
|
|
1554
|
-
|
|
1555
|
-
endTransformStage(totalStage, { truncated: false });
|
|
1556
|
-
}
|
|
1138
|
+
finalizeTotalTransformStage(totalStage, success);
|
|
1557
1139
|
}
|
|
1558
1140
|
}
|
|
1141
|
+
function finalizeTotalTransformStage(stage, success) {
|
|
1142
|
+
if (!success)
|
|
1143
|
+
return;
|
|
1144
|
+
endTransformStage(stage, { truncated: false });
|
|
1145
|
+
}
|
|
1559
1146
|
async function runTotalTransformStageAsync(url, fn) {
|
|
1560
1147
|
const totalStage = startTransformStage(url, 'transform:total');
|
|
1561
1148
|
let success = false;
|
|
@@ -1565,9 +1152,7 @@ async function runTotalTransformStageAsync(url, fn) {
|
|
|
1565
1152
|
return result;
|
|
1566
1153
|
}
|
|
1567
1154
|
finally {
|
|
1568
|
-
|
|
1569
|
-
endTransformStage(totalStage, { truncated: false });
|
|
1570
|
-
}
|
|
1155
|
+
finalizeTotalTransformStage(totalStage, success);
|
|
1571
1156
|
}
|
|
1572
1157
|
}
|
|
1573
1158
|
export function transformHtmlToMarkdownInProcess(html, url, options) {
|
|
@@ -1628,6 +1213,12 @@ class WorkerPool {
|
|
|
1628
1213
|
timeoutMs;
|
|
1629
1214
|
queueMax;
|
|
1630
1215
|
closed = false;
|
|
1216
|
+
createAbortError(url, stage) {
|
|
1217
|
+
return new FetchError('Request was canceled', url, 499, {
|
|
1218
|
+
reason: 'aborted',
|
|
1219
|
+
stage,
|
|
1220
|
+
});
|
|
1221
|
+
}
|
|
1631
1222
|
ensureOpen() {
|
|
1632
1223
|
if (this.closed) {
|
|
1633
1224
|
throw new Error('Transform worker pool closed');
|
|
@@ -1636,10 +1227,7 @@ class WorkerPool {
|
|
|
1636
1227
|
ensureNotAborted(signal, url, stage) {
|
|
1637
1228
|
if (!signal?.aborted)
|
|
1638
1229
|
return;
|
|
1639
|
-
throw
|
|
1640
|
-
reason: 'aborted',
|
|
1641
|
-
stage,
|
|
1642
|
-
});
|
|
1230
|
+
throw this.createAbortError(url, stage);
|
|
1643
1231
|
}
|
|
1644
1232
|
ensureQueueCapacity(url) {
|
|
1645
1233
|
if (this.queue.length < this.queueMax)
|
|
@@ -1704,10 +1292,7 @@ class WorkerPool {
|
|
|
1704
1292
|
abortInflightTask(id, url, workerIndex) {
|
|
1705
1293
|
const slot = this.workers[workerIndex];
|
|
1706
1294
|
this.cancelWorkerTask(slot, id);
|
|
1707
|
-
this.failTask(id,
|
|
1708
|
-
reason: 'aborted',
|
|
1709
|
-
stage: 'transform:signal-abort',
|
|
1710
|
-
}));
|
|
1295
|
+
this.failTask(id, this.createAbortError(url, 'transform:signal-abort'));
|
|
1711
1296
|
if (slot) {
|
|
1712
1297
|
this.restartWorker(workerIndex, slot);
|
|
1713
1298
|
}
|
|
@@ -1717,10 +1302,7 @@ class WorkerPool {
|
|
|
1717
1302
|
if (queuedIndex === -1)
|
|
1718
1303
|
return;
|
|
1719
1304
|
this.queue.splice(queuedIndex, 1);
|
|
1720
|
-
reject(
|
|
1721
|
-
reason: 'aborted',
|
|
1722
|
-
stage: 'transform:queued-abort',
|
|
1723
|
-
}));
|
|
1305
|
+
reject(this.createAbortError(url, 'transform:queued-abort'));
|
|
1724
1306
|
}
|
|
1725
1307
|
createWorkerSlot(worker) {
|
|
1726
1308
|
return {
|
|
@@ -1876,10 +1458,7 @@ class WorkerPool {
|
|
|
1876
1458
|
if (!task.signal?.aborted)
|
|
1877
1459
|
return false;
|
|
1878
1460
|
this.clearAbortListener(task.signal, task.abortListener);
|
|
1879
|
-
task.reject(
|
|
1880
|
-
reason: 'aborted',
|
|
1881
|
-
stage: 'transform:dispatch',
|
|
1882
|
-
}));
|
|
1461
|
+
task.reject(this.createAbortError(task.url, 'transform:dispatch'));
|
|
1883
1462
|
return true;
|
|
1884
1463
|
}
|
|
1885
1464
|
markSlotBusy(slot, task) {
|
|
@@ -1984,4 +1563,3 @@ export async function transformHtmlToMarkdown(html, url, options) {
|
|
|
1984
1563
|
}
|
|
1985
1564
|
});
|
|
1986
1565
|
}
|
|
1987
|
-
//# sourceMappingURL=transform.js.map
|