@j0hanz/superfetch 2.2.1 → 2.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +363 -614
- package/dist/cache.d.ts +2 -2
- package/dist/cache.d.ts.map +1 -1
- package/dist/cache.js +47 -225
- package/dist/cache.js.map +1 -1
- package/dist/config.d.ts +6 -0
- package/dist/config.d.ts.map +1 -1
- package/dist/config.js +20 -27
- package/dist/config.js.map +1 -1
- package/dist/dom-noise-removal.d.ts +6 -0
- package/dist/dom-noise-removal.d.ts.map +1 -0
- package/dist/dom-noise-removal.js +482 -0
- package/dist/dom-noise-removal.js.map +1 -0
- package/dist/errors.d.ts.map +1 -1
- package/dist/errors.js +8 -5
- package/dist/errors.js.map +1 -1
- package/dist/fetch.d.ts.map +1 -1
- package/dist/fetch.js +26 -32
- package/dist/fetch.js.map +1 -1
- package/dist/http-native.d.ts +6 -0
- package/dist/http-native.d.ts.map +1 -0
- package/dist/http-native.js +645 -0
- package/dist/http-native.js.map +1 -0
- package/dist/http-utils.d.ts +61 -0
- package/dist/http-utils.d.ts.map +1 -0
- package/dist/http-utils.js +252 -0
- package/dist/http-utils.js.map +1 -0
- package/dist/index.js +1 -1
- package/dist/index.js.map +1 -1
- package/dist/instructions.md +41 -39
- package/dist/json.d.ts +2 -0
- package/dist/json.d.ts.map +1 -0
- package/dist/json.js +30 -0
- package/dist/json.js.map +1 -0
- package/dist/language-detection.d.ts +13 -0
- package/dist/language-detection.d.ts.map +1 -0
- package/dist/language-detection.js +283 -0
- package/dist/language-detection.js.map +1 -0
- package/dist/markdown-cleanup.d.ts +19 -0
- package/dist/markdown-cleanup.d.ts.map +1 -0
- package/dist/markdown-cleanup.js +283 -0
- package/dist/markdown-cleanup.js.map +1 -0
- package/dist/observability.d.ts +1 -0
- package/dist/observability.d.ts.map +1 -1
- package/dist/observability.js +10 -0
- package/dist/observability.js.map +1 -1
- package/dist/tools.js +4 -4
- package/dist/transform-types.d.ts +81 -0
- package/dist/transform-types.d.ts.map +1 -0
- package/dist/transform-types.js +6 -0
- package/dist/transform-types.js.map +1 -0
- package/dist/transform.d.ts +7 -52
- package/dist/transform.d.ts.map +1 -1
- package/dist/transform.js +411 -839
- package/dist/transform.js.map +1 -1
- package/dist/type-guards.d.ts +1 -1
- package/dist/type-guards.d.ts.map +1 -1
- package/dist/type-guards.js +1 -1
- package/dist/type-guards.js.map +1 -1
- package/dist/workers/transform-worker.js +23 -24
- package/dist/workers/transform-worker.js.map +1 -1
- package/package.json +85 -86
- package/dist/http.d.ts +0 -90
- package/dist/http.d.ts.map +0 -1
- package/dist/http.js +0 -1576
- package/dist/http.js.map +0 -1
package/dist/transform.js
CHANGED
|
@@ -8,44 +8,25 @@ import { NodeHtmlMarkdown, } from 'node-html-markdown';
|
|
|
8
8
|
import { z } from 'zod';
|
|
9
9
|
import { isProbablyReaderable, Readability } from '@mozilla/readability';
|
|
10
10
|
import { config } from './config.js';
|
|
11
|
+
import { removeNoiseFromHtml } from './dom-noise-removal.js';
|
|
11
12
|
import { FetchError, getErrorMessage } from './errors.js';
|
|
12
13
|
import { isRawTextContentUrl } from './fetch.js';
|
|
14
|
+
import { detectLanguageFromCode, resolveLanguageFromAttributes, } from './language-detection.js';
|
|
15
|
+
import { cleanupMarkdownArtifacts, promoteOrphanHeadings, } from './markdown-cleanup.js';
|
|
13
16
|
import { getOperationId, getRequestId, logDebug, logError, logInfo, logWarn, redactUrl, } from './observability.js';
|
|
14
|
-
import {
|
|
17
|
+
import { isObject } from './type-guards.js';
|
|
18
|
+
// Re-export language detection for backward compatibility
|
|
19
|
+
export { detectLanguageFromCode, resolveLanguageFromAttributes, } from './language-detection.js';
|
|
20
|
+
// Re-export markdown cleanup for backward compatibility
|
|
21
|
+
export { cleanupMarkdownArtifacts, promoteOrphanHeadings, } from './markdown-cleanup.js';
|
|
22
|
+
// Re-export DOM noise removal for backward compatibility
|
|
23
|
+
export { removeNoiseFromHtml } from './dom-noise-removal.js';
|
|
15
24
|
function getAbortReason(signal) {
|
|
16
|
-
if (!
|
|
25
|
+
if (!isObject(signal))
|
|
17
26
|
return undefined;
|
|
18
27
|
return 'reason' in signal ? signal.reason : undefined;
|
|
19
28
|
}
|
|
20
|
-
|
|
21
|
-
if (!isRecord(document))
|
|
22
|
-
return undefined;
|
|
23
|
-
const { body } = document;
|
|
24
|
-
if (!isRecord(body))
|
|
25
|
-
return undefined;
|
|
26
|
-
const { innerHTML } = body;
|
|
27
|
-
return typeof innerHTML === 'string' && innerHTML.length > 0
|
|
28
|
-
? innerHTML
|
|
29
|
-
: undefined;
|
|
30
|
-
}
|
|
31
|
-
function getDocumentToString(document) {
|
|
32
|
-
if (!isRecord(document))
|
|
33
|
-
return undefined;
|
|
34
|
-
if (typeof document.toString !== 'function')
|
|
35
|
-
return undefined;
|
|
36
|
-
return document.toString.bind(document);
|
|
37
|
-
}
|
|
38
|
-
function getDocumentElementOuterHtml(document) {
|
|
39
|
-
if (!isRecord(document))
|
|
40
|
-
return undefined;
|
|
41
|
-
const { documentElement } = document;
|
|
42
|
-
if (!isRecord(documentElement))
|
|
43
|
-
return undefined;
|
|
44
|
-
const { outerHTML } = documentElement;
|
|
45
|
-
return typeof outerHTML === 'string' && outerHTML.length > 0
|
|
46
|
-
? outerHTML
|
|
47
|
-
: undefined;
|
|
48
|
-
}
|
|
29
|
+
// DOM accessor helpers moved to ./dom-noise-removal.ts
|
|
49
30
|
const CODE_BLOCK = {
|
|
50
31
|
fence: '```',
|
|
51
32
|
format: (code, language = '') => {
|
|
@@ -93,9 +74,13 @@ export function endTransformStage(context, options) {
|
|
|
93
74
|
}
|
|
94
75
|
function runTransformStage(url, stage, fn) {
|
|
95
76
|
const context = startTransformStage(url, stage);
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
77
|
+
try {
|
|
78
|
+
return fn();
|
|
79
|
+
}
|
|
80
|
+
finally {
|
|
81
|
+
// Emit duration even if the stage throws; callers decide how to handle the error.
|
|
82
|
+
endTransformStage(context);
|
|
83
|
+
}
|
|
99
84
|
}
|
|
100
85
|
function isTimeoutReason(reason) {
|
|
101
86
|
return reason instanceof Error && reason.name === 'TimeoutError';
|
|
@@ -129,46 +114,105 @@ function truncateHtml(html) {
|
|
|
129
114
|
});
|
|
130
115
|
return html.substring(0, maxSize);
|
|
131
116
|
}
|
|
117
|
+
const META_PROPERTY_HANDLERS = new Map([
|
|
118
|
+
[
|
|
119
|
+
'og:title',
|
|
120
|
+
(ctx, c) => {
|
|
121
|
+
ctx.title.og = c;
|
|
122
|
+
},
|
|
123
|
+
],
|
|
124
|
+
[
|
|
125
|
+
'og:description',
|
|
126
|
+
(ctx, c) => {
|
|
127
|
+
ctx.description.og = c;
|
|
128
|
+
},
|
|
129
|
+
],
|
|
130
|
+
[
|
|
131
|
+
'og:image',
|
|
132
|
+
(ctx, c) => {
|
|
133
|
+
ctx.image = c;
|
|
134
|
+
},
|
|
135
|
+
],
|
|
136
|
+
[
|
|
137
|
+
'article:published_time',
|
|
138
|
+
(ctx, c) => {
|
|
139
|
+
ctx.publishedAt = c;
|
|
140
|
+
},
|
|
141
|
+
],
|
|
142
|
+
[
|
|
143
|
+
'article:modified_time',
|
|
144
|
+
(ctx, c) => {
|
|
145
|
+
ctx.modifiedAt = c;
|
|
146
|
+
},
|
|
147
|
+
],
|
|
148
|
+
]);
|
|
149
|
+
const META_NAME_HANDLERS = new Map([
|
|
150
|
+
[
|
|
151
|
+
'twitter:title',
|
|
152
|
+
(ctx, c) => {
|
|
153
|
+
ctx.title.twitter = c;
|
|
154
|
+
},
|
|
155
|
+
],
|
|
156
|
+
[
|
|
157
|
+
'twitter:description',
|
|
158
|
+
(ctx, c) => {
|
|
159
|
+
ctx.description.twitter = c;
|
|
160
|
+
},
|
|
161
|
+
],
|
|
162
|
+
[
|
|
163
|
+
'description',
|
|
164
|
+
(ctx, c) => {
|
|
165
|
+
ctx.description.standard = c;
|
|
166
|
+
},
|
|
167
|
+
],
|
|
168
|
+
[
|
|
169
|
+
'author',
|
|
170
|
+
(ctx, c) => {
|
|
171
|
+
ctx.author = c;
|
|
172
|
+
},
|
|
173
|
+
],
|
|
174
|
+
]);
|
|
132
175
|
function extractMetadata(document) {
|
|
133
|
-
const
|
|
134
|
-
|
|
135
|
-
|
|
176
|
+
const ctx = {
|
|
177
|
+
title: {},
|
|
178
|
+
description: {},
|
|
179
|
+
};
|
|
136
180
|
for (const tag of document.querySelectorAll('meta')) {
|
|
137
181
|
const content = tag.getAttribute('content')?.trim();
|
|
138
182
|
if (!content)
|
|
139
183
|
continue;
|
|
140
184
|
const property = tag.getAttribute('property');
|
|
185
|
+
if (property) {
|
|
186
|
+
META_PROPERTY_HANDLERS.get(property)?.(ctx, content);
|
|
187
|
+
}
|
|
141
188
|
const name = tag.getAttribute('name');
|
|
142
|
-
if (
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
description.og = content;
|
|
146
|
-
else if (name === 'twitter:title')
|
|
147
|
-
title.twitter = content;
|
|
148
|
-
else if (name === 'twitter:description')
|
|
149
|
-
description.twitter = content;
|
|
150
|
-
else if (name === 'description')
|
|
151
|
-
description.standard = content;
|
|
152
|
-
else if (name === 'author')
|
|
153
|
-
author = content;
|
|
189
|
+
if (name) {
|
|
190
|
+
META_NAME_HANDLERS.get(name)?.(ctx, content);
|
|
191
|
+
}
|
|
154
192
|
}
|
|
155
193
|
const titleEl = document.querySelector('title');
|
|
156
|
-
if (!title.standard && titleEl?.textContent) {
|
|
157
|
-
title.standard = titleEl.textContent.trim();
|
|
194
|
+
if (!ctx.title.standard && titleEl?.textContent) {
|
|
195
|
+
ctx.title.standard = titleEl.textContent.trim();
|
|
158
196
|
}
|
|
159
|
-
const resolvedTitle = title.og ?? title.twitter ?? title.standard;
|
|
160
|
-
const resolvedDesc = description.og ?? description.twitter ?? description.standard;
|
|
197
|
+
const resolvedTitle = ctx.title.og ?? ctx.title.twitter ?? ctx.title.standard;
|
|
198
|
+
const resolvedDesc = ctx.description.og ?? ctx.description.twitter ?? ctx.description.standard;
|
|
161
199
|
const metadata = {};
|
|
162
200
|
if (resolvedTitle)
|
|
163
201
|
metadata.title = resolvedTitle;
|
|
164
202
|
if (resolvedDesc)
|
|
165
203
|
metadata.description = resolvedDesc;
|
|
166
|
-
if (author)
|
|
167
|
-
metadata.author = author;
|
|
204
|
+
if (ctx.author)
|
|
205
|
+
metadata.author = ctx.author;
|
|
206
|
+
if (ctx.image)
|
|
207
|
+
metadata.image = ctx.image;
|
|
208
|
+
if (ctx.publishedAt)
|
|
209
|
+
metadata.publishedAt = ctx.publishedAt;
|
|
210
|
+
if (ctx.modifiedAt)
|
|
211
|
+
metadata.modifiedAt = ctx.modifiedAt;
|
|
168
212
|
return metadata;
|
|
169
213
|
}
|
|
170
214
|
function isReadabilityCompatible(doc) {
|
|
171
|
-
if (!
|
|
215
|
+
if (!isObject(doc))
|
|
172
216
|
return false;
|
|
173
217
|
return hasDocumentElement(doc) && hasQuerySelectors(doc);
|
|
174
218
|
}
|
|
@@ -185,14 +229,18 @@ function extractArticle(document) {
|
|
|
185
229
|
return null;
|
|
186
230
|
}
|
|
187
231
|
try {
|
|
188
|
-
const
|
|
189
|
-
const rawText =
|
|
190
|
-
documentClone.documentElement.textContent;
|
|
232
|
+
const doc = document;
|
|
233
|
+
const rawText = doc.querySelector('body')?.textContent ?? doc.documentElement.textContent;
|
|
191
234
|
const textLength = rawText.replace(/\s+/g, ' ').trim().length;
|
|
192
|
-
if (textLength
|
|
235
|
+
if (textLength < 100) {
|
|
236
|
+
logWarn('Very minimal server-rendered content detected (< 100 chars). ' +
|
|
237
|
+
'This might be a client-side rendered (SPA) application. ' +
|
|
238
|
+
'Content extraction may be incomplete.', { textLength });
|
|
239
|
+
}
|
|
240
|
+
if (textLength >= 400 && !isProbablyReaderable(doc)) {
|
|
193
241
|
return null;
|
|
194
242
|
}
|
|
195
|
-
const reader = new Readability(
|
|
243
|
+
const reader = new Readability(doc, { maxElemsToParse: 20_000 });
|
|
196
244
|
const parsed = reader.parse();
|
|
197
245
|
if (!parsed)
|
|
198
246
|
return null;
|
|
@@ -218,7 +266,8 @@ export function extractContent(html, url, options = {
|
|
|
218
266
|
}
|
|
219
267
|
function extractContentWithDocument(html, url, options) {
|
|
220
268
|
if (!isValidInput(html, url)) {
|
|
221
|
-
|
|
269
|
+
const { document } = parseHTML('<html></html>');
|
|
270
|
+
return { article: null, metadata: {}, document };
|
|
222
271
|
}
|
|
223
272
|
return tryExtractContent(html, url, options);
|
|
224
273
|
}
|
|
@@ -233,7 +282,8 @@ function handleExtractionFailure(error, url, signal) {
|
|
|
233
282
|
}
|
|
234
283
|
throwIfAborted(signal, url, 'extract:error');
|
|
235
284
|
logError('Failed to extract content', error instanceof Error ? error : undefined);
|
|
236
|
-
|
|
285
|
+
const { document } = parseHTML('<html></html>');
|
|
286
|
+
return { article: null, metadata: {}, document };
|
|
237
287
|
}
|
|
238
288
|
function extractContentStages(html, url, options) {
|
|
239
289
|
throwIfAborted(options.signal, url, 'extract:begin');
|
|
@@ -248,7 +298,8 @@ function extractContentStages(html, url, options) {
|
|
|
248
298
|
return {
|
|
249
299
|
article,
|
|
250
300
|
metadata,
|
|
251
|
-
|
|
301
|
+
document,
|
|
302
|
+
...(truncatedHtml.length !== html.length ? { truncated: true } : {}),
|
|
252
303
|
};
|
|
253
304
|
}
|
|
254
305
|
function tryExtractContent(html, url, options) {
|
|
@@ -285,532 +336,7 @@ function applyBaseUri(document, url) {
|
|
|
285
336
|
});
|
|
286
337
|
}
|
|
287
338
|
}
|
|
288
|
-
|
|
289
|
-
for (let index = 0; index < code.length - 1; index += 1) {
|
|
290
|
-
if (code[index] !== '<')
|
|
291
|
-
continue;
|
|
292
|
-
const next = code[index + 1];
|
|
293
|
-
if (!next)
|
|
294
|
-
continue;
|
|
295
|
-
if (next >= 'A' && next <= 'Z')
|
|
296
|
-
return true;
|
|
297
|
-
}
|
|
298
|
-
return false;
|
|
299
|
-
}
|
|
300
|
-
function containsWord(source, word) {
|
|
301
|
-
let startIndex = source.indexOf(word);
|
|
302
|
-
while (startIndex !== -1) {
|
|
303
|
-
const before = startIndex === 0 ? '' : source[startIndex - 1];
|
|
304
|
-
const afterIndex = startIndex + word.length;
|
|
305
|
-
const after = afterIndex >= source.length ? '' : source[afterIndex];
|
|
306
|
-
if (!isWordChar(before) && !isWordChar(after))
|
|
307
|
-
return true;
|
|
308
|
-
startIndex = source.indexOf(word, startIndex + word.length);
|
|
309
|
-
}
|
|
310
|
-
return false;
|
|
311
|
-
}
|
|
312
|
-
function splitLines(content) {
|
|
313
|
-
return content.split('\n');
|
|
314
|
-
}
|
|
315
|
-
function extractLanguageFromClassName(className) {
|
|
316
|
-
const tokens = className.match(/\S+/g);
|
|
317
|
-
if (!tokens)
|
|
318
|
-
return undefined;
|
|
319
|
-
for (const token of tokens) {
|
|
320
|
-
const lower = token.toLowerCase();
|
|
321
|
-
if (lower.startsWith('language-'))
|
|
322
|
-
return token.slice('language-'.length);
|
|
323
|
-
if (lower.startsWith('lang-'))
|
|
324
|
-
return token.slice('lang-'.length);
|
|
325
|
-
if (lower.startsWith('highlight-')) {
|
|
326
|
-
return token.slice('highlight-'.length);
|
|
327
|
-
}
|
|
328
|
-
}
|
|
329
|
-
if (tokens.includes('hljs')) {
|
|
330
|
-
const langClass = tokens.find((t) => t !== 'hljs' && !t.startsWith('hljs-'));
|
|
331
|
-
if (langClass)
|
|
332
|
-
return langClass;
|
|
333
|
-
}
|
|
334
|
-
return undefined;
|
|
335
|
-
}
|
|
336
|
-
function resolveLanguageFromDataAttribute(dataLang) {
|
|
337
|
-
const trimmed = dataLang.trim();
|
|
338
|
-
if (!trimmed)
|
|
339
|
-
return undefined;
|
|
340
|
-
for (const char of trimmed) {
|
|
341
|
-
if (!isWordChar(char))
|
|
342
|
-
return undefined;
|
|
343
|
-
}
|
|
344
|
-
return trimmed;
|
|
345
|
-
}
|
|
346
|
-
function isWordChar(char) {
|
|
347
|
-
if (!char)
|
|
348
|
-
return false;
|
|
349
|
-
const code = char.charCodeAt(0);
|
|
350
|
-
return ((code >= 48 && code <= 57) ||
|
|
351
|
-
(code >= 65 && code <= 90) ||
|
|
352
|
-
(code >= 97 && code <= 122) ||
|
|
353
|
-
char === '_');
|
|
354
|
-
}
|
|
355
|
-
const LANGUAGE_PATTERNS = [
|
|
356
|
-
{
|
|
357
|
-
language: 'jsx',
|
|
358
|
-
pattern: {
|
|
359
|
-
keywords: ['classname=', 'jsx:', "from 'react'", 'from "react"'],
|
|
360
|
-
custom: (code) => containsJsxTag(code),
|
|
361
|
-
},
|
|
362
|
-
},
|
|
363
|
-
{
|
|
364
|
-
language: 'typescript',
|
|
365
|
-
pattern: {
|
|
366
|
-
wordBoundary: ['interface', 'type'],
|
|
367
|
-
custom: (_, lower) => [
|
|
368
|
-
': string',
|
|
369
|
-
':string',
|
|
370
|
-
': number',
|
|
371
|
-
':number',
|
|
372
|
-
': boolean',
|
|
373
|
-
':boolean',
|
|
374
|
-
': void',
|
|
375
|
-
':void',
|
|
376
|
-
': any',
|
|
377
|
-
':any',
|
|
378
|
-
': unknown',
|
|
379
|
-
':unknown',
|
|
380
|
-
': never',
|
|
381
|
-
':never',
|
|
382
|
-
].some((hint) => lower.includes(hint)),
|
|
383
|
-
},
|
|
384
|
-
},
|
|
385
|
-
{
|
|
386
|
-
language: 'rust',
|
|
387
|
-
pattern: {
|
|
388
|
-
regex: /\b(?:fn|impl|struct|enum)\b/,
|
|
389
|
-
keywords: ['let mut'],
|
|
390
|
-
custom: (_, lower) => lower.includes('use ') && lower.includes('::'),
|
|
391
|
-
},
|
|
392
|
-
},
|
|
393
|
-
{
|
|
394
|
-
language: 'javascript',
|
|
395
|
-
pattern: {
|
|
396
|
-
regex: /\b(?:const|let|var|function|class|async|await|export|import)\b/,
|
|
397
|
-
},
|
|
398
|
-
},
|
|
399
|
-
{
|
|
400
|
-
language: 'python',
|
|
401
|
-
pattern: {
|
|
402
|
-
regex: /\b(?:def|class|import|from)\b/,
|
|
403
|
-
keywords: ['print(', '__name__'],
|
|
404
|
-
},
|
|
405
|
-
},
|
|
406
|
-
{
|
|
407
|
-
language: 'bash',
|
|
408
|
-
pattern: {
|
|
409
|
-
custom: (code) => detectBashIndicators(code),
|
|
410
|
-
},
|
|
411
|
-
},
|
|
412
|
-
{
|
|
413
|
-
language: 'css',
|
|
414
|
-
pattern: {
|
|
415
|
-
regex: /@media|@import|@keyframes/,
|
|
416
|
-
custom: (code) => detectCssStructure(code),
|
|
417
|
-
},
|
|
418
|
-
},
|
|
419
|
-
{
|
|
420
|
-
language: 'html',
|
|
421
|
-
pattern: {
|
|
422
|
-
keywords: [
|
|
423
|
-
'<!doctype',
|
|
424
|
-
'<html',
|
|
425
|
-
'<head',
|
|
426
|
-
'<body',
|
|
427
|
-
'<div',
|
|
428
|
-
'<span',
|
|
429
|
-
'<p',
|
|
430
|
-
'<a',
|
|
431
|
-
'<script',
|
|
432
|
-
'<style',
|
|
433
|
-
],
|
|
434
|
-
},
|
|
435
|
-
},
|
|
436
|
-
{
|
|
437
|
-
language: 'json',
|
|
438
|
-
pattern: {
|
|
439
|
-
startsWith: ['{', '['],
|
|
440
|
-
},
|
|
441
|
-
},
|
|
442
|
-
{
|
|
443
|
-
language: 'yaml',
|
|
444
|
-
pattern: {
|
|
445
|
-
custom: (code) => detectYamlStructure(code),
|
|
446
|
-
},
|
|
447
|
-
},
|
|
448
|
-
{
|
|
449
|
-
language: 'sql',
|
|
450
|
-
pattern: {
|
|
451
|
-
wordBoundary: [
|
|
452
|
-
'select',
|
|
453
|
-
'insert',
|
|
454
|
-
'update',
|
|
455
|
-
'delete',
|
|
456
|
-
'create',
|
|
457
|
-
'alter',
|
|
458
|
-
'drop',
|
|
459
|
-
],
|
|
460
|
-
},
|
|
461
|
-
},
|
|
462
|
-
{
|
|
463
|
-
language: 'go',
|
|
464
|
-
pattern: {
|
|
465
|
-
wordBoundary: ['package', 'func'],
|
|
466
|
-
keywords: ['import "'],
|
|
467
|
-
},
|
|
468
|
-
},
|
|
469
|
-
];
|
|
470
|
-
// Bash detection constants
|
|
471
|
-
const BASH_COMMANDS = ['sudo', 'chmod', 'mkdir', 'cd', 'ls', 'cat', 'echo'];
|
|
472
|
-
const BASH_PKG_MANAGERS = [
|
|
473
|
-
'npm',
|
|
474
|
-
'yarn',
|
|
475
|
-
'pnpm',
|
|
476
|
-
'npx',
|
|
477
|
-
'brew',
|
|
478
|
-
'apt',
|
|
479
|
-
'pip',
|
|
480
|
-
'cargo',
|
|
481
|
-
'go',
|
|
482
|
-
];
|
|
483
|
-
const BASH_VERBS = ['install', 'add', 'run', 'build', 'start'];
|
|
484
|
-
function isShellPrefix(line) {
|
|
485
|
-
return (line.startsWith('#!') || line.startsWith('$ ') || line.startsWith('# '));
|
|
486
|
-
}
|
|
487
|
-
function matchesBashCommand(line) {
|
|
488
|
-
return BASH_COMMANDS.some((cmd) => line === cmd || line.startsWith(`${cmd} `));
|
|
489
|
-
}
|
|
490
|
-
function matchesPackageManagerVerb(line) {
|
|
491
|
-
for (const mgr of BASH_PKG_MANAGERS) {
|
|
492
|
-
if (!line.startsWith(`${mgr} `))
|
|
493
|
-
continue;
|
|
494
|
-
const rest = line.slice(mgr.length + 1);
|
|
495
|
-
if (BASH_VERBS.some((v) => rest === v || rest.startsWith(`${v} `))) {
|
|
496
|
-
return true;
|
|
497
|
-
}
|
|
498
|
-
}
|
|
499
|
-
return false;
|
|
500
|
-
}
|
|
501
|
-
function detectBashIndicators(code) {
|
|
502
|
-
for (const line of splitLines(code)) {
|
|
503
|
-
const trimmed = line.trimStart();
|
|
504
|
-
if (!trimmed)
|
|
505
|
-
continue;
|
|
506
|
-
if (isShellPrefix(trimmed) ||
|
|
507
|
-
matchesBashCommand(trimmed) ||
|
|
508
|
-
matchesPackageManagerVerb(trimmed)) {
|
|
509
|
-
return true;
|
|
510
|
-
}
|
|
511
|
-
}
|
|
512
|
-
return false;
|
|
513
|
-
}
|
|
514
|
-
function detectCssStructure(code) {
|
|
515
|
-
for (const line of splitLines(code)) {
|
|
516
|
-
const trimmed = line.trimStart();
|
|
517
|
-
if (!trimmed)
|
|
518
|
-
continue;
|
|
519
|
-
const isSelector = (trimmed.startsWith('.') || trimmed.startsWith('#')) &&
|
|
520
|
-
trimmed.includes('{');
|
|
521
|
-
const isProperty = trimmed.includes(':') && trimmed.includes(';');
|
|
522
|
-
if (isSelector || isProperty)
|
|
523
|
-
return true;
|
|
524
|
-
}
|
|
525
|
-
return false;
|
|
526
|
-
}
|
|
527
|
-
function detectYamlStructure(code) {
|
|
528
|
-
for (const line of splitLines(code)) {
|
|
529
|
-
const trimmed = line.trim();
|
|
530
|
-
if (!trimmed)
|
|
531
|
-
continue;
|
|
532
|
-
const colonIdx = trimmed.indexOf(':');
|
|
533
|
-
if (colonIdx <= 0)
|
|
534
|
-
continue;
|
|
535
|
-
const after = trimmed[colonIdx + 1];
|
|
536
|
-
if (after === ' ' || after === '\t')
|
|
537
|
-
return true;
|
|
538
|
-
}
|
|
539
|
-
return false;
|
|
540
|
-
}
|
|
541
|
-
function matchesLanguagePattern(code, lower, pattern) {
|
|
542
|
-
if (pattern.keywords?.some((kw) => lower.includes(kw)))
|
|
543
|
-
return true;
|
|
544
|
-
if (pattern.wordBoundary?.some((w) => containsWord(lower, w)))
|
|
545
|
-
return true;
|
|
546
|
-
if (pattern.regex?.test(lower))
|
|
547
|
-
return true;
|
|
548
|
-
if (pattern.startsWith) {
|
|
549
|
-
const trimmed = code.trimStart();
|
|
550
|
-
if (pattern.startsWith.some((prefix) => trimmed.startsWith(prefix)))
|
|
551
|
-
return true;
|
|
552
|
-
}
|
|
553
|
-
if (pattern.custom?.(code, lower))
|
|
554
|
-
return true;
|
|
555
|
-
return false;
|
|
556
|
-
}
|
|
557
|
-
export function detectLanguageFromCode(code) {
|
|
558
|
-
const lower = code.toLowerCase();
|
|
559
|
-
for (const { language, pattern } of LANGUAGE_PATTERNS) {
|
|
560
|
-
if (matchesLanguagePattern(code, lower, pattern))
|
|
561
|
-
return language;
|
|
562
|
-
}
|
|
563
|
-
return undefined;
|
|
564
|
-
}
|
|
565
|
-
export function resolveLanguageFromAttributes(className, dataLang) {
|
|
566
|
-
const classMatch = extractLanguageFromClassName(className);
|
|
567
|
-
return classMatch ?? resolveLanguageFromDataAttribute(dataLang);
|
|
568
|
-
}
|
|
569
|
-
function isElement(node) {
|
|
570
|
-
return (isRecord(node) &&
|
|
571
|
-
'getAttribute' in node &&
|
|
572
|
-
typeof node.getAttribute === 'function');
|
|
573
|
-
}
|
|
574
|
-
const STRUCTURAL_TAGS = new Set([
|
|
575
|
-
'script',
|
|
576
|
-
'style',
|
|
577
|
-
'noscript',
|
|
578
|
-
'iframe',
|
|
579
|
-
'form',
|
|
580
|
-
'button',
|
|
581
|
-
'input',
|
|
582
|
-
'select',
|
|
583
|
-
'textarea',
|
|
584
|
-
'svg',
|
|
585
|
-
]);
|
|
586
|
-
const ALWAYS_NOISE_TAGS = new Set(['nav', 'footer', 'aside']);
|
|
587
|
-
const NAVIGATION_ROLES = new Set([
|
|
588
|
-
'navigation',
|
|
589
|
-
'banner',
|
|
590
|
-
'complementary',
|
|
591
|
-
'contentinfo',
|
|
592
|
-
'tree',
|
|
593
|
-
'menubar',
|
|
594
|
-
'menu',
|
|
595
|
-
'dialog',
|
|
596
|
-
'alertdialog',
|
|
597
|
-
'search',
|
|
598
|
-
]);
|
|
599
|
-
const PROMO_TOKENS = new Set([
|
|
600
|
-
'banner',
|
|
601
|
-
'promo',
|
|
602
|
-
'announcement',
|
|
603
|
-
'cta',
|
|
604
|
-
'callout',
|
|
605
|
-
'advert',
|
|
606
|
-
'ad',
|
|
607
|
-
'ads',
|
|
608
|
-
'sponsor',
|
|
609
|
-
'newsletter',
|
|
610
|
-
'subscribe',
|
|
611
|
-
'cookie',
|
|
612
|
-
'consent',
|
|
613
|
-
'popup',
|
|
614
|
-
'modal',
|
|
615
|
-
'overlay',
|
|
616
|
-
'toast',
|
|
617
|
-
'share',
|
|
618
|
-
'social',
|
|
619
|
-
'related',
|
|
620
|
-
'recommend',
|
|
621
|
-
'comment',
|
|
622
|
-
'breadcrumb',
|
|
623
|
-
'pagination',
|
|
624
|
-
'pager',
|
|
625
|
-
'taglist',
|
|
626
|
-
]);
|
|
627
|
-
const HEADER_NOISE_PATTERN = /\b(site-header|masthead|topbar|navbar|nav(?:bar)?|menu|header-nav)\b/i;
|
|
628
|
-
const FIXED_PATTERN = /\b(fixed|sticky)\b/;
|
|
629
|
-
const HIGH_Z_PATTERN = /\bz-(?:4\d|50)\b/;
|
|
630
|
-
const ISOLATE_PATTERN = /\bisolate\b/;
|
|
631
|
-
const HTML_DOCUMENT_MARKERS = /<\s*(?:!doctype|html|head|body)\b/i;
|
|
632
|
-
const NOISE_MARKERS = [
|
|
633
|
-
'<script',
|
|
634
|
-
'<style',
|
|
635
|
-
'<noscript',
|
|
636
|
-
'<iframe',
|
|
637
|
-
'<nav',
|
|
638
|
-
'<footer',
|
|
639
|
-
'<aside',
|
|
640
|
-
'<header',
|
|
641
|
-
'<form',
|
|
642
|
-
'<button',
|
|
643
|
-
'<input',
|
|
644
|
-
'<select',
|
|
645
|
-
'<textarea',
|
|
646
|
-
'<svg',
|
|
647
|
-
'<canvas',
|
|
648
|
-
' aria-hidden="true"',
|
|
649
|
-
" aria-hidden='true'",
|
|
650
|
-
' hidden',
|
|
651
|
-
' role="navigation"',
|
|
652
|
-
" role='navigation'",
|
|
653
|
-
' role="banner"',
|
|
654
|
-
" role='banner'",
|
|
655
|
-
' role="complementary"',
|
|
656
|
-
" role='complementary'",
|
|
657
|
-
' role="contentinfo"',
|
|
658
|
-
" role='contentinfo'",
|
|
659
|
-
' role="tree"',
|
|
660
|
-
" role='tree'",
|
|
661
|
-
' role="menubar"',
|
|
662
|
-
" role='menubar'",
|
|
663
|
-
' role="menu"',
|
|
664
|
-
" role='menu'",
|
|
665
|
-
' banner',
|
|
666
|
-
' promo',
|
|
667
|
-
' announcement',
|
|
668
|
-
' cta',
|
|
669
|
-
' callout',
|
|
670
|
-
' advert',
|
|
671
|
-
' newsletter',
|
|
672
|
-
' subscribe',
|
|
673
|
-
' cookie',
|
|
674
|
-
' consent',
|
|
675
|
-
' popup',
|
|
676
|
-
' modal',
|
|
677
|
-
' overlay',
|
|
678
|
-
' toast',
|
|
679
|
-
' fixed',
|
|
680
|
-
' sticky',
|
|
681
|
-
' z-50',
|
|
682
|
-
' z-4',
|
|
683
|
-
' isolate',
|
|
684
|
-
' breadcrumb',
|
|
685
|
-
' pagination',
|
|
686
|
-
];
|
|
687
|
-
function mayContainNoise(html) {
|
|
688
|
-
const haystack = html.toLowerCase();
|
|
689
|
-
return NOISE_MARKERS.some((marker) => haystack.includes(marker));
|
|
690
|
-
}
|
|
691
|
-
function isFullDocumentHtml(html) {
|
|
692
|
-
return HTML_DOCUMENT_MARKERS.test(html);
|
|
693
|
-
}
|
|
694
|
-
function isStructuralNoiseTag(tagName) {
|
|
695
|
-
return (STRUCTURAL_TAGS.has(tagName) || tagName === 'svg' || tagName === 'canvas');
|
|
696
|
-
}
|
|
697
|
-
function isElementHidden(element) {
|
|
698
|
-
const style = element.getAttribute('style') ?? '';
|
|
699
|
-
return (element.getAttribute('hidden') !== null ||
|
|
700
|
-
element.getAttribute('aria-hidden') === 'true' ||
|
|
701
|
-
/\bdisplay\s*:\s*none\b/i.test(style) ||
|
|
702
|
-
/\bvisibility\s*:\s*hidden\b/i.test(style));
|
|
703
|
-
}
|
|
704
|
-
function hasNoiseRole(role) {
|
|
705
|
-
return role !== null && NAVIGATION_ROLES.has(role);
|
|
706
|
-
}
|
|
707
|
-
function tokenizeIdentifierLikeText(value) {
|
|
708
|
-
return value
|
|
709
|
-
.toLowerCase()
|
|
710
|
-
.replace(/[^a-z0-9]+/g, ' ')
|
|
711
|
-
.trim()
|
|
712
|
-
.split(' ')
|
|
713
|
-
.filter(Boolean);
|
|
714
|
-
}
|
|
715
|
-
function matchesPromoIdOrClass(className, id) {
|
|
716
|
-
const tokens = tokenizeIdentifierLikeText(`${className} ${id}`);
|
|
717
|
-
return tokens.some((token) => PROMO_TOKENS.has(token));
|
|
718
|
-
}
|
|
719
|
-
function matchesFixedOrHighZIsolate(className) {
|
|
720
|
-
return (FIXED_PATTERN.test(className) ||
|
|
721
|
-
(HIGH_Z_PATTERN.test(className) && ISOLATE_PATTERN.test(className)));
|
|
722
|
-
}
|
|
723
|
-
function readElementMetadata(element) {
|
|
724
|
-
return {
|
|
725
|
-
tagName: element.tagName.toLowerCase(),
|
|
726
|
-
className: element.getAttribute('class') ?? '',
|
|
727
|
-
id: element.getAttribute('id') ?? '',
|
|
728
|
-
role: element.getAttribute('role'),
|
|
729
|
-
isHidden: isElementHidden(element),
|
|
730
|
-
};
|
|
731
|
-
}
|
|
732
|
-
function isBoilerplateHeader({ className, id, role, }) {
|
|
733
|
-
if (hasNoiseRole(role))
|
|
734
|
-
return true;
|
|
735
|
-
const combined = `${className} ${id}`.toLowerCase();
|
|
736
|
-
return HEADER_NOISE_PATTERN.test(combined);
|
|
737
|
-
}
|
|
738
|
-
function isNoiseElement(node) {
|
|
739
|
-
const metadata = readElementMetadata(node);
|
|
740
|
-
return (isStructuralNoiseTag(metadata.tagName) ||
|
|
741
|
-
ALWAYS_NOISE_TAGS.has(metadata.tagName) ||
|
|
742
|
-
(metadata.tagName === 'header' && isBoilerplateHeader(metadata)) ||
|
|
743
|
-
metadata.isHidden ||
|
|
744
|
-
hasNoiseRole(metadata.role) ||
|
|
745
|
-
matchesFixedOrHighZIsolate(metadata.className) ||
|
|
746
|
-
matchesPromoIdOrClass(metadata.className, metadata.id));
|
|
747
|
-
}
|
|
748
|
-
function removeNoiseNodes(nodes) {
|
|
749
|
-
for (let index = nodes.length - 1; index >= 0; index -= 1) {
|
|
750
|
-
const node = typeof nodes.item === 'function' ? nodes.item(index) : nodes[index];
|
|
751
|
-
if (!node)
|
|
752
|
-
continue;
|
|
753
|
-
if (isElement(node) && isNoiseElement(node)) {
|
|
754
|
-
node.remove();
|
|
755
|
-
}
|
|
756
|
-
}
|
|
757
|
-
}
|
|
758
|
-
function stripNoiseNodes(document) {
|
|
759
|
-
// Use targeted selectors for common noise elements instead of querySelectorAll('*')
|
|
760
|
-
const targetSelectors = [
|
|
761
|
-
'nav',
|
|
762
|
-
'footer',
|
|
763
|
-
'aside',
|
|
764
|
-
'header[class*="site"]',
|
|
765
|
-
'header[class*="nav"]',
|
|
766
|
-
'header[class*="menu"]',
|
|
767
|
-
'[role="banner"]',
|
|
768
|
-
'[role="navigation"]',
|
|
769
|
-
'[role="dialog"]',
|
|
770
|
-
'[style*="display: none"]',
|
|
771
|
-
'[style*="display:none"]',
|
|
772
|
-
'[hidden]',
|
|
773
|
-
'[aria-hidden="true"]',
|
|
774
|
-
].join(',');
|
|
775
|
-
const potentialNoiseNodes = document.querySelectorAll(targetSelectors);
|
|
776
|
-
// Remove in reverse order to handle nested elements correctly
|
|
777
|
-
removeNoiseNodes(potentialNoiseNodes);
|
|
778
|
-
// Second pass: check remaining elements for noise patterns (promo, fixed positioning, etc.)
|
|
779
|
-
const candidateSelectors = [
|
|
780
|
-
...STRUCTURAL_TAGS,
|
|
781
|
-
...ALWAYS_NOISE_TAGS,
|
|
782
|
-
'header',
|
|
783
|
-
'canvas',
|
|
784
|
-
'[class]',
|
|
785
|
-
'[id]',
|
|
786
|
-
'[role]',
|
|
787
|
-
'[style]',
|
|
788
|
-
].join(',');
|
|
789
|
-
const allElements = document.querySelectorAll(candidateSelectors);
|
|
790
|
-
removeNoiseNodes(allElements);
|
|
791
|
-
}
|
|
792
|
-
function removeNoiseFromHtml(html, document) {
|
|
793
|
-
const shouldParse = isFullDocumentHtml(html) || mayContainNoise(html);
|
|
794
|
-
if (!shouldParse)
|
|
795
|
-
return html;
|
|
796
|
-
try {
|
|
797
|
-
const resolvedDocument = document ?? parseHTML(html).document;
|
|
798
|
-
stripNoiseNodes(resolvedDocument);
|
|
799
|
-
const bodyInnerHtml = getBodyInnerHtml(resolvedDocument);
|
|
800
|
-
if (bodyInnerHtml)
|
|
801
|
-
return bodyInnerHtml;
|
|
802
|
-
const docToString = getDocumentToString(resolvedDocument);
|
|
803
|
-
if (docToString)
|
|
804
|
-
return docToString();
|
|
805
|
-
const documentElementOuterHtml = getDocumentElementOuterHtml(resolvedDocument);
|
|
806
|
-
if (documentElementOuterHtml)
|
|
807
|
-
return documentElementOuterHtml;
|
|
808
|
-
return html;
|
|
809
|
-
}
|
|
810
|
-
catch {
|
|
811
|
-
return html;
|
|
812
|
-
}
|
|
813
|
-
}
|
|
339
|
+
// DOM noise removal functions moved to ./dom-noise-removal.ts
|
|
814
340
|
function buildInlineCode(content) {
|
|
815
341
|
const runs = content.match(/`+/g);
|
|
816
342
|
let longest = '';
|
|
@@ -821,8 +347,11 @@ function buildInlineCode(content) {
|
|
|
821
347
|
}
|
|
822
348
|
}
|
|
823
349
|
}
|
|
350
|
+
// Use a fence longer than any run of backticks in the content.
|
|
824
351
|
const delimiter = `\`${longest}`;
|
|
825
|
-
|
|
352
|
+
// Only pad when needed to avoid altering code spans unnecessarily.
|
|
353
|
+
// CommonMark recommends padding when the code starts/ends with a backtick.
|
|
354
|
+
const padding = content.startsWith('`') || content.endsWith('`') ? ' ' : '';
|
|
826
355
|
return `${delimiter}${padding}${content}${padding}${delimiter}`;
|
|
827
356
|
}
|
|
828
357
|
function deriveAltFromImageUrl(src) {
|
|
@@ -845,16 +374,13 @@ function deriveAltFromImageUrl(src) {
|
|
|
845
374
|
}
|
|
846
375
|
}
|
|
847
376
|
/**
 * Report whether a node is a preformatted code container.
 *
 * @param {unknown} parent - Candidate parent node of a code element.
 * @returns {boolean} True when the node's `tagName` is a string equal to
 *   `PRE` or `WRAPPED-PRE`, compared case-insensitively.
 */
function isCodeBlock(parent) {
    if (!isObject(parent)) {
        return false;
    }
    // A missing or non-string tag name is treated as "no tag".
    const normalizedTag = typeof parent.tagName === 'string' ? parent.tagName.toUpperCase() : '';
    return normalizedTag === 'PRE' || normalizedTag === 'WRAPPED-PRE';
}
|
|
853
382
|
/**
 * Check whether a value exposes a callable `getAttribute` member.
 *
 * @param {unknown} value - Value to inspect.
 * @returns {boolean} True when `value` is an object with a function-typed `getAttribute`.
 */
function hasGetAttribute(value) {
    if (!isObject(value)) {
        return false;
    }
    return typeof value.getAttribute === 'function';
}
|
|
859
385
|
function buildInlineCodeTranslator() {
|
|
860
386
|
return {
|
|
@@ -871,37 +397,19 @@ function resolveAttributeLanguage(node) {
|
|
|
871
397
|
const dataLanguage = getAttribute?.('data-language') ?? '';
|
|
872
398
|
return resolveLanguageFromAttributes(className, dataLanguage);
|
|
873
399
|
}
|
|
874
|
-
function resolveCodeBlockTranslators(visitor) {
|
|
875
|
-
const childTranslators = isRecord(visitor) ? visitor.instance : null;
|
|
876
|
-
return hasCodeBlockTranslators(childTranslators)
|
|
877
|
-
? childTranslators.codeBlockTranslators
|
|
878
|
-
: null;
|
|
879
|
-
}
|
|
880
|
-
function buildCodeBlockTranslator(attributeLanguage, codeBlockTranslators) {
|
|
881
|
-
return {
|
|
882
|
-
noEscape: true,
|
|
883
|
-
preserveWhitespace: true,
|
|
884
|
-
...(codeBlockTranslators
|
|
885
|
-
? { childTranslators: codeBlockTranslators }
|
|
886
|
-
: null),
|
|
887
|
-
postprocess: ({ content }) => {
|
|
888
|
-
const language = attributeLanguage ?? detectLanguageFromCode(content) ?? '';
|
|
889
|
-
return CODE_BLOCK.format(content, language);
|
|
890
|
-
},
|
|
891
|
-
};
|
|
892
|
-
}
|
|
893
400
|
/**
 * Build the translator configuration for a code element.
 *
 * Code whose parent is a code block (see `isCodeBlock`) is passed through
 * unescaped with whitespace preserved; everything else is handled by the
 * inline-code translator.
 *
 * @param {unknown} ctx - Translator context; its `parent` property is inspected.
 * @returns {object} Translator configuration object.
 */
function buildCodeTranslator(ctx) {
    const isInsideCodeBlock = isObject(ctx) && isCodeBlock(ctx.parent);
    if (!isInsideCodeBlock) {
        return buildInlineCodeTranslator();
    }
    return {
        noEscape: true,
        preserveWhitespace: true,
    };
}
|
|
903
411
|
function buildImageTranslator(ctx) {
|
|
904
|
-
if (!
|
|
412
|
+
if (!isObject(ctx))
|
|
905
413
|
return { content: '' };
|
|
906
414
|
const { node } = ctx;
|
|
907
415
|
const getAttribute = hasGetAttribute(node)
|
|
@@ -914,19 +422,57 @@ function buildImageTranslator(ctx) {
|
|
|
914
422
|
content: ``,
|
|
915
423
|
};
|
|
916
424
|
}
|
|
425
|
+
/**
 * Resolve a language hint from the first direct child whose raw tag name is `CODE`.
 *
 * @param {unknown} node - Node whose `childNodes` array is scanned.
 * @returns {*} The result of `resolveAttributeLanguage` for the first matching
 *   child, or `undefined` when `node` is not an object, has no `childNodes`
 *   array, or has no matching child.
 */
function findLanguageFromCodeChild(node) {
    const children = isObject(node) ? node.childNodes : undefined;
    if (!Array.isArray(children)) {
        return undefined;
    }
    // Only the first matching child is consulted, even if it yields no language.
    const codeChild = children.find((candidate) => isObject(candidate) &&
        typeof candidate.rawTagName === 'string' &&
        candidate.rawTagName.toUpperCase() === 'CODE');
    return codeChild === undefined ? undefined : resolveAttributeLanguage(codeChild);
}
|
|
443
|
+
/**
 * Create a postprocess callback that renders translated content as a code block.
 *
 * @param {string | undefined | null} language - Language hint; when nullish the
 *   language is detected from the code itself, falling back to an empty string.
 * @returns {(arg: { content: string }) => string} Callback that returns `''` for
 *   whitespace-only content and otherwise the output of `CODE_BLOCK.format`.
 */
function createCodeBlockPostprocessor(language) {
    return ({ content }) => {
        const code = content.trim();
        if (code.length === 0) {
            return '';
        }
        // Detection only runs when no explicit language was supplied.
        const fenceLanguage = language ?? detectLanguageFromCode(code) ?? '';
        return CODE_BLOCK.format(code, fenceLanguage);
    };
}
|
|
452
|
+
/**
 * Build the translator configuration for a `pre` element.
 *
 * The language is taken from the element's own attributes first and, failing
 * that, from its first `CODE` child.
 *
 * @param {unknown} ctx - Translator context; its `node` property is inspected.
 * @returns {object} An empty object when `ctx` is not an object, otherwise a
 *   configuration with escaping disabled, whitespace preserved and a
 *   code-block postprocessor.
 */
function buildPreTranslator(ctx) {
    if (!isObject(ctx)) {
        return {};
    }
    const preNode = ctx.node;
    const language = resolveAttributeLanguage(preNode) ?? findLanguageFromCodeChild(preNode);
    const postprocess = createCodeBlockPostprocessor(language);
    return {
        noEscape: true,
        preserveWhitespace: true,
        postprocess,
    };
}
|
|
917
463
|
function createCustomTranslators() {
|
|
918
464
|
return {
|
|
919
465
|
code: (ctx) => buildCodeTranslator(ctx),
|
|
920
466
|
img: (ctx) => buildImageTranslator(ctx),
|
|
921
467
|
dl: (ctx) => {
|
|
922
|
-
if (!
|
|
468
|
+
if (!isObject(ctx) || !isObject(ctx.node)) {
|
|
923
469
|
return { content: '' };
|
|
924
470
|
}
|
|
925
471
|
const node = ctx.node;
|
|
926
472
|
const childNodes = Array.isArray(node.childNodes) ? node.childNodes : [];
|
|
927
473
|
const items = childNodes
|
|
928
474
|
.map((child) => {
|
|
929
|
-
if (!
|
|
475
|
+
if (!isObject(child))
|
|
930
476
|
return '';
|
|
931
477
|
const nodeName = typeof child.nodeName === 'string'
|
|
932
478
|
? child.nodeName.toUpperCase()
|
|
@@ -956,6 +502,8 @@ function createCustomTranslators() {
|
|
|
956
502
|
sup: () => ({
|
|
957
503
|
postprocess: ({ content }) => `^${content}^`,
|
|
958
504
|
}),
|
|
505
|
+
// Fix #6: Handle <pre> without <code> - wrap in fenced code block
|
|
506
|
+
pre: (ctx) => buildPreTranslator(ctx),
|
|
959
507
|
};
|
|
960
508
|
}
|
|
961
509
|
let markdownInstance = null;
|
|
@@ -971,9 +519,11 @@ function getMarkdownConverter() {
|
|
|
971
519
|
markdownInstance ??= createMarkdownInstance();
|
|
972
520
|
return markdownInstance;
|
|
973
521
|
}
|
|
974
|
-
function translateHtmlToMarkdown(html, url, signal, document) {
|
|
522
|
+
function translateHtmlToMarkdown(html, url, signal, document, skipNoiseRemoval) {
|
|
975
523
|
throwIfAborted(signal, url, 'markdown:begin');
|
|
976
|
-
const cleanedHtml =
|
|
524
|
+
const cleanedHtml = skipNoiseRemoval
|
|
525
|
+
? html
|
|
526
|
+
: runTransformStage(url, 'markdown:noise', () => removeNoiseFromHtml(html, document, url));
|
|
977
527
|
throwIfAborted(signal, url, 'markdown:cleaned');
|
|
978
528
|
const content = runTransformStage(url, 'markdown:translate', () => getMarkdownConverter().translate(cleanedHtml).trim());
|
|
979
529
|
throwIfAborted(signal, url, 'markdown:translated');
|
|
@@ -989,151 +539,18 @@ export function htmlToMarkdown(html, metadata, options) {
|
|
|
989
539
|
if (!html)
|
|
990
540
|
return buildMetadataFooter(metadata, url);
|
|
991
541
|
try {
|
|
992
|
-
const content = translateHtmlToMarkdown(html, url, options?.signal, options?.document);
|
|
542
|
+
const content = translateHtmlToMarkdown(html, url, options?.signal, options?.document, options?.skipNoiseRemoval);
|
|
993
543
|
return appendMetadataFooter(content, metadata, url);
|
|
994
544
|
}
|
|
995
545
|
catch (error) {
|
|
996
546
|
if (error instanceof FetchError) {
|
|
997
547
|
throw error;
|
|
998
548
|
}
|
|
549
|
+
logError('Failed to convert HTML to markdown', error instanceof Error ? error : undefined);
|
|
999
550
|
return buildMetadataFooter(metadata, url);
|
|
1000
551
|
}
|
|
1001
552
|
}
|
|
1002
|
-
|
|
1003
|
-
let result = content;
|
|
1004
|
-
const fixOrphanHeadings = (text) => {
|
|
1005
|
-
return text.replace(/^(.*?)(#{1,6})\s*(?:\r?\n){2}([A-Z][^\r\n]+?)(?:\r?\n)/gm, (match, prefix, hashes, heading) => {
|
|
1006
|
-
if (typeof prefix !== 'string' ||
|
|
1007
|
-
typeof hashes !== 'string' ||
|
|
1008
|
-
typeof heading !== 'string') {
|
|
1009
|
-
return match;
|
|
1010
|
-
}
|
|
1011
|
-
if (heading.length > 150) {
|
|
1012
|
-
return match;
|
|
1013
|
-
}
|
|
1014
|
-
const trimmedPrefix = prefix.trim();
|
|
1015
|
-
if (trimmedPrefix === '') {
|
|
1016
|
-
return `${hashes} ${heading}\n\n`;
|
|
1017
|
-
}
|
|
1018
|
-
return `${trimmedPrefix}\n\n${hashes} ${heading}\n\n`;
|
|
1019
|
-
});
|
|
1020
|
-
};
|
|
1021
|
-
result = fixOrphanHeadings(result);
|
|
1022
|
-
result = result.replace(/^#{1,6}[ \t\u00A0]*$\r?\n?/gm, '');
|
|
1023
|
-
const zeroWidthAnchorLink = /\[(?:\s|\u200B)*\]\(#[^)]*\)\s*/g;
|
|
1024
|
-
result = result.replace(zeroWidthAnchorLink, '');
|
|
1025
|
-
result = result.replace(/^\[Skip to (?:main )?content\]\(#[^)]*\)\s*$/gim, '');
|
|
1026
|
-
result = result.replace(/^\[Skip to (?:main )?navigation\]\(#[^)]*\)\s*$/gim, '');
|
|
1027
|
-
result = result.replace(/^\[Skip link\]\(#[^)]*\)\s*$/gim, '');
|
|
1028
|
-
result = result.replace(/(^#{1,6}\s+\w+)```/gm, '$1\n\n```');
|
|
1029
|
-
result = result.replace(/(^#{1,6}\s+\w*[A-Z])([A-Z][a-z])/gm, '$1\n\n$2');
|
|
1030
|
-
result = result.replace(/(^#{1,6}\s[^\n]*)\n([^\n])/gm, '$1\n\n$2');
|
|
1031
|
-
const tocLinkLine = /^- \[[^\]]+\]\(#[^)]+\)\s*$/;
|
|
1032
|
-
const lines = result.split('\n');
|
|
1033
|
-
const filtered = [];
|
|
1034
|
-
let skipTocBlock = false;
|
|
1035
|
-
for (let i = 0; i < lines.length; i += 1) {
|
|
1036
|
-
const line = lines[i] ?? '';
|
|
1037
|
-
const prevLine = i > 0 ? (lines[i - 1] ?? '') : '';
|
|
1038
|
-
const nextLine = i < lines.length - 1 ? (lines[i + 1] ?? '') : '';
|
|
1039
|
-
if (tocLinkLine.test(line)) {
|
|
1040
|
-
const prevIsToc = tocLinkLine.test(prevLine) || prevLine.trim() === '';
|
|
1041
|
-
const nextIsToc = tocLinkLine.test(nextLine) || nextLine.trim() === '';
|
|
1042
|
-
if (prevIsToc || nextIsToc) {
|
|
1043
|
-
skipTocBlock = true;
|
|
1044
|
-
continue;
|
|
1045
|
-
}
|
|
1046
|
-
}
|
|
1047
|
-
else if (line.trim() === '' && skipTocBlock) {
|
|
1048
|
-
skipTocBlock = false;
|
|
1049
|
-
continue;
|
|
1050
|
-
}
|
|
1051
|
-
else {
|
|
1052
|
-
skipTocBlock = false;
|
|
1053
|
-
}
|
|
1054
|
-
filtered.push(line);
|
|
1055
|
-
}
|
|
1056
|
-
result = filtered.join('\n');
|
|
1057
|
-
result = result.replace(/\]\(([^)]+)\)\[/g, ']($1)\n\n[');
|
|
1058
|
-
result = result.replace(/^Was this page helpful\??\s*$/gim, '');
|
|
1059
|
-
result = result.replace(/(`[^`]+`)\s*\\-\s*/g, '$1 - ');
|
|
1060
|
-
result = result.replace(/\\([[]])/g, '$1');
|
|
1061
|
-
result = result.replace(/([^\n])\n([-*+] )/g, '$1\n\n$2');
|
|
1062
|
-
result = result.replace(/(\S)\n(\d+\. )/g, '$1\n\n$2');
|
|
1063
|
-
result = result.replace(/\n{3,}/g, '\n\n');
|
|
1064
|
-
return result.trim();
|
|
1065
|
-
}
|
|
1066
|
-
const HEADING_KEYWORDS = new Set([
|
|
1067
|
-
'overview',
|
|
1068
|
-
'introduction',
|
|
1069
|
-
'summary',
|
|
1070
|
-
'conclusion',
|
|
1071
|
-
'prerequisites',
|
|
1072
|
-
'requirements',
|
|
1073
|
-
'installation',
|
|
1074
|
-
'configuration',
|
|
1075
|
-
'usage',
|
|
1076
|
-
'features',
|
|
1077
|
-
'limitations',
|
|
1078
|
-
'troubleshooting',
|
|
1079
|
-
'faq',
|
|
1080
|
-
'resources',
|
|
1081
|
-
'references',
|
|
1082
|
-
'changelog',
|
|
1083
|
-
'license',
|
|
1084
|
-
'acknowledgments',
|
|
1085
|
-
'appendix',
|
|
1086
|
-
]);
|
|
1087
|
-
function isLikelyHeadingLine(line) {
|
|
1088
|
-
const trimmed = line.trim();
|
|
1089
|
-
if (!trimmed || trimmed.length > 80)
|
|
1090
|
-
return false;
|
|
1091
|
-
if (/^#{1,6}\s/.test(trimmed))
|
|
1092
|
-
return false;
|
|
1093
|
-
if (/^[-*+•]\s/.test(trimmed) || /^\d+\.\s/.test(trimmed))
|
|
1094
|
-
return false;
|
|
1095
|
-
if (/[.!?]$/.test(trimmed))
|
|
1096
|
-
return false;
|
|
1097
|
-
if (/^\[.*\]\(.*\)$/.test(trimmed))
|
|
1098
|
-
return false;
|
|
1099
|
-
if (/^(?:example|note|tip|warning|important|caution):\s+\S/i.test(trimmed)) {
|
|
1100
|
-
return true;
|
|
1101
|
-
}
|
|
1102
|
-
const words = trimmed.split(/\s+/);
|
|
1103
|
-
if (words.length >= 2 && words.length <= 6) {
|
|
1104
|
-
const isTitleCase = words.every((w) => /^[A-Z][a-z]*$/.test(w) || /^(?:and|or|the|of|in|for|to|a)$/i.test(w));
|
|
1105
|
-
if (isTitleCase)
|
|
1106
|
-
return true;
|
|
1107
|
-
}
|
|
1108
|
-
if (words.length === 1) {
|
|
1109
|
-
const lower = trimmed.toLowerCase();
|
|
1110
|
-
if (HEADING_KEYWORDS.has(lower) && /^[A-Z]/.test(trimmed)) {
|
|
1111
|
-
return true;
|
|
1112
|
-
}
|
|
1113
|
-
}
|
|
1114
|
-
return false;
|
|
1115
|
-
}
|
|
1116
|
-
function promoteOrphanHeadings(markdown) {
|
|
1117
|
-
const lines = markdown.split('\n');
|
|
1118
|
-
const result = [];
|
|
1119
|
-
for (let i = 0; i < lines.length; i += 1) {
|
|
1120
|
-
const line = lines[i] ?? '';
|
|
1121
|
-
const prevLine = i > 0 ? lines[i - 1] : '';
|
|
1122
|
-
const nextLine = i < lines.length - 1 ? lines[i + 1] : '';
|
|
1123
|
-
const isStandalone = prevLine?.trim() === '' && nextLine?.trim() === '';
|
|
1124
|
-
const isPrecededByBlank = prevLine?.trim() === '';
|
|
1125
|
-
if ((isStandalone || isPrecededByBlank) && isLikelyHeadingLine(line)) {
|
|
1126
|
-
const trimmed = line.trim();
|
|
1127
|
-
const isExample = /^example:\s/i.test(trimmed);
|
|
1128
|
-
const prefix = isExample ? '### ' : '## ';
|
|
1129
|
-
result.push(prefix + trimmed);
|
|
1130
|
-
}
|
|
1131
|
-
else {
|
|
1132
|
-
result.push(line);
|
|
1133
|
-
}
|
|
1134
|
-
}
|
|
1135
|
-
return result.join('\n');
|
|
1136
|
-
}
|
|
553
|
+
// Markdown cleanup functions moved to ./markdown-cleanup.ts
|
|
1137
554
|
function formatFetchedDate(isoString) {
|
|
1138
555
|
try {
|
|
1139
556
|
const date = new Date(isoString);
|
|
@@ -1382,54 +799,114 @@ function tryTransformRawContent({ html, url, includeMetadata, }) {
|
|
|
1382
799
|
const MIN_CONTENT_RATIO = 0.3;
|
|
1383
800
|
const MIN_HTML_LENGTH_FOR_GATE = 100;
|
|
1384
801
|
const MIN_HEADING_RETENTION_RATIO = 0.7;
|
|
1385
|
-
|
|
1386
|
-
|
|
1387
|
-
|
|
1388
|
-
|
|
1389
|
-
|
|
1390
|
-
|
|
1391
|
-
|
|
1392
|
-
|
|
1393
|
-
|
|
1394
|
-
|
|
1395
|
-
|
|
1396
|
-
|
|
1397
|
-
|
|
1398
|
-
|
|
1399
|
-
|
|
1400
|
-
|
|
1401
|
-
|
|
1402
|
-
|
|
1403
|
-
|
|
1404
|
-
|
|
1405
|
-
|
|
1406
|
-
|
|
1407
|
-
|
|
1408
|
-
|
|
1409
|
-
|
|
1410
|
-
|
|
1411
|
-
|
|
1412
|
-
|
|
1413
|
-
|
|
1414
|
-
|
|
802
|
+
const MIN_CODE_BLOCK_RETENTION_RATIO = 0.5;
|
|
803
|
+
/**
 * Count heading elements (h1-h6) via `querySelectorAll`.
 *
 * Querying the DOM rather than matching text means headings with nested
 * markup, such as <h2><span>Text</span></h2>, are counted once each.
 *
 * @param {string | { querySelectorAll: Function }} htmlOrDocument - HTML source
 *   (fragments are wrapped in a document skeleton before parsing) or an
 *   already-parsed document.
 * @returns {number} Number of matching heading elements.
 */
function countHeadingsDom(htmlOrDocument) {
    let root = htmlOrDocument;
    if (typeof htmlOrDocument === 'string') {
        const markup = needsDocumentWrapper(htmlOrDocument)
            ? wrapHtmlFragment(htmlOrDocument)
            : htmlOrDocument;
        root = parseHTML(markup).document;
    }
    return root.querySelectorAll('h1,h2,h3,h4,h5,h6').length;
}
|
|
818
|
+
/**
 * Count `pre` elements via `querySelectorAll`.
 *
 * @param {string | { querySelectorAll: Function }} htmlOrDocument - HTML source
 *   (fragments are wrapped in a document skeleton before parsing) or an
 *   already-parsed document.
 * @returns {number} Number of `pre` elements found.
 */
function countCodeBlocksDom(htmlOrDocument) {
    let root = htmlOrDocument;
    if (typeof htmlOrDocument === 'string') {
        const markup = needsDocumentWrapper(htmlOrDocument)
            ? wrapHtmlFragment(htmlOrDocument)
            : htmlOrDocument;
        root = parseHTML(markup).document;
    }
    return root.querySelectorAll('pre').length;
}
|
|
829
|
+
/**
 * Decide whether an HTML string must be wrapped in a document skeleton
 * before parsing.
 *
 * @param {string} html - Raw HTML source.
 * @returns {boolean} False when the trimmed, lower-cased input starts with
 *   `<!doctype`, `<html` or `<body`; true for anything else (a fragment).
 */
function needsDocumentWrapper(html) {
    const leading = html.trim().toLowerCase();
    const documentPrefixes = ['<!doctype', '<html', '<body'];
    return !documentPrefixes.some((prefix) => leading.startsWith(prefix));
}
|
|
839
|
+
/**
 * Embed an HTML fragment in a minimal document (doctype, html, body) so it
 * can be parsed as a full document.
 *
 * @param {string} html - Fragment markup, inserted verbatim.
 * @returns {string} The fragment surrounded by the document skeleton.
 */
function wrapHtmlFragment(html) {
    return '<!DOCTYPE html><html><body>'.concat(html, '</body></html>');
}
|
|
845
|
+
/**
 * Measure the visible text of an HTML string or document.
 *
 * `script`, `style` and `noscript` elements are removed before measuring so
 * their contents do not count as visible text, and whitespace runs are
 * collapsed to single spaces.
 *
 * @param {string | object} htmlOrDocument - HTML source (fragments are wrapped
 *   in a document skeleton before parsing) or a parsed document. A document
 *   argument is cloned first so the caller's DOM is not mutated.
 * @returns {number} Length of the normalised visible text.
 */
function getVisibleTextLength(htmlOrDocument) {
    // Strips non-visible elements from `doc` in place and measures what remains.
    const stripAndMeasure = (doc) => {
        for (const hidden of doc.querySelectorAll('script,style,noscript')) {
            hidden.remove();
        }
        // Note: linkedom may return null for body on HTML fragments despite types
        const rawText = doc.body?.textContent ?? doc.documentElement?.textContent ?? '';
        return rawText.replace(/\s+/g, ' ').trim().length;
    };
    if (typeof htmlOrDocument !== 'string') {
        return stripAndMeasure(htmlOrDocument.cloneNode(true));
    }
    const markup = needsDocumentWrapper(htmlOrDocument)
        ? wrapHtmlFragment(htmlOrDocument)
        : htmlOrDocument;
    // A freshly parsed document is private to this call, so no clone is needed.
    return stripAndMeasure(parseHTML(markup).document);
}
|
|
881
|
+
/**
 * Decide whether an extracted article retains enough of the page's visible text.
 *
 * @param {{ textContent: string } | null | undefined} article - Extracted article, if any.
 * @param {string | object} originalHtmlOrDocument - Original page, as HTML or a parsed document.
 * @returns {boolean} False when there is no article; true when the original's
 *   visible text is shorter than MIN_HTML_LENGTH_FOR_GATE; otherwise whether
 *   the article/original length ratio reaches MIN_CONTENT_RATIO.
 */
export function isExtractionSufficient(article, originalHtmlOrDocument) {
    if (!article) {
        return false;
    }
    const extractedLength = article.textContent.length;
    // Use DOM-based visible text length to exclude script/style content
    const visibleLength = getVisibleTextLength(originalHtmlOrDocument);
    const isBelowGateSize = visibleLength < MIN_HTML_LENGTH_FOR_GATE;
    return isBelowGateSize || extractedLength / visibleLength >= MIN_CONTENT_RATIO;
}
|
|
891
|
+
const MIN_LINE_LENGTH_FOR_TRUNCATION_CHECK = 20;
const MAX_TRUNCATED_LINE_RATIO = 0.5;
// Terminal punctuation, optionally followed by closing quotes/brackets, at end of line.
// Allowing the trailing closers keeps complete sentences such as `He said "stop."`
// or `(see below.)` from being misread as truncated.
const SENTENCE_END_PATTERN = /[.!?:;\u2026]["'\u201D\u2019)\]]*$/;
/**
 * Detect if extracted text has many truncated/incomplete sentences.
 *
 * Only lines whose trimmed length exceeds MIN_LINE_LENGTH_FOR_TRUNCATION_CHECK
 * are considered. Such a line counts as incomplete when it does not end with
 * sentence punctuation (`.`, `!`, `?`, `:`, `;` or an ellipsis character),
 * optionally followed by closing quotes or brackets.
 *
 * @param {string} text - Extracted plain text, one candidate sentence per line.
 * @returns {boolean} True when at least three lines are considered and more
 *   than MAX_TRUNCATED_LINE_RATIO of them are incomplete.
 */
function hasTruncatedSentences(text) {
    let consideredLines = 0;
    let incompleteLines = 0;
    for (const rawLine of text.split('\n')) {
        const line = rawLine.trim();
        if (line.length <= MIN_LINE_LENGTH_FOR_TRUNCATION_CHECK) {
            continue;
        }
        consideredLines += 1;
        if (!SENTENCE_END_PATTERN.test(line)) {
            incompleteLines += 1;
        }
    }
    // Too few long lines to draw a conclusion from.
    if (consideredLines < 3) {
        return false;
    }
    return incompleteLines / consideredLines > MAX_TRUNCATED_LINE_RATIO;
}
|
|
1433
910
|
/**
 * Report whether an extracted article is available as the content source.
 *
 * @param {object | null} article - Extracted article, or `null` when there is none.
 * @returns {boolean} True for any value other than `null` (strict comparison,
 *   so `undefined` also yields true).
 */
export function determineContentExtractionSource(article) {
    const hasArticle = article !== null;
    return hasArticle;
}
|
|
@@ -1459,17 +936,84 @@ export function createContentMetadataBlock(url, article, extractedMeta, shouldEx
|
|
|
1459
936
|
}
|
|
1460
937
|
return metadata;
|
|
1461
938
|
}
|
|
939
|
+
/**
 * Selectors that identify a page's main content area, in priority order:
 * earlier entries are tried first.
 */
const CONTENT_ROOT_SELECTORS = [
    'main',
    'article',
    '[role="main"]',
    '#content',
    '#main-content',
    '.content',
    '.main-content',
    '.post-content',
    '.article-content',
    '.entry-content',
    '[itemprop="articleBody"]',
    '[data-content]',
    '.post-body',
    '.article-body',
];
/**
 * Locate the main content root of a document.
 *
 * For each selector, only the first matching element is examined. It is
 * accepted when its `innerHTML` is a string longer than 100 characters after
 * trimming; otherwise the next selector is tried.
 *
 * @param {{ querySelector: Function }} document - Parsed document to search.
 * @returns {string | undefined} The accepted element's innerHTML, or
 *   `undefined` when no selector yields a large enough element.
 */
function findContentRoot(document) {
    for (const selector of CONTENT_ROOT_SELECTORS) {
        const markup = document.querySelector(selector)?.innerHTML;
        // Skip missing elements and ones too small to be real content.
        if (typeof markup === 'string' && markup.trim().length > 100) {
            return markup;
        }
    }
    return undefined;
}
|
|
1462
978
|
/**
 * Choose the HTML that will be converted to markdown, along with its title
 * and metadata block.
 *
 * Order of preference:
 *   1. The extracted article's content, when `useArticleContent` is set and
 *      an article exists.
 *   2. A content root found in a noise-stripped copy of the page (only
 *      attempted when a `document` was supplied); the result is flagged with
 *      `skipNoiseRemoval` because it already came from cleaned HTML.
 *   3. The full original HTML, passing `document` along when present.
 *
 * @param {object} params
 * @param {string} params.html - Original page HTML.
 * @param {string} params.url - Page URL; used for metadata and logging.
 * @param {object | null} params.article - Extracted article, if any.
 * @param {object} params.extractedMeta - Extracted page metadata; `title` is read.
 * @param {boolean} params.includeMetadata - Forwarded to `createContentMetadataBlock`.
 * @param {boolean} params.useArticleContent - Whether the article passed the quality gate.
 * @param {object} [params.document] - Parsed original document, when available.
 * @returns {object} Object with `sourceHtml`, `title`, `metadata` and,
 *   depending on the path taken, `skipNoiseRemoval` or `document`.
 */
function buildContentSource({ html, url, article, extractedMeta, includeMetadata, useArticleContent, document, }) {
    const metadata = createContentMetadataBlock(url, article, extractedMeta, useArticleContent, includeMetadata);
    if (useArticleContent && article) {
        const { content, title } = article;
        return { sourceHtml: content, title, metadata };
    }
    if (!document) {
        // No parsed document available: fall back to the full HTML as-is.
        return { sourceHtml: html, title: extractedMeta.title, metadata };
    }
    // Noise removal is run on the HTML string (no document passed) and the
    // result re-parsed, so the caller's original parsed document is not the
    // one being stripped.
    const cleanedHtml = removeNoiseFromHtml(html, undefined, url);
    const contentRoot = findContentRoot(parseHTML(cleanedHtml).document);
    if (!contentRoot) {
        return { sourceHtml: html, title: extractedMeta.title, metadata, document };
    }
    logDebug('Using content root fallback instead of full HTML', {
        url: url.substring(0, 80),
        contentLength: contentRoot.length,
    });
    return {
        sourceHtml: contentRoot,
        title: extractedMeta.title,
        metadata,
        skipNoiseRemoval: true,
    };
}
|
|
1474
1018
|
function logQualityGateFallback({ url, articleLength, }) {
|
|
1475
1019
|
logDebug('Quality gate: Readability extraction below threshold, using full HTML', {
|
|
@@ -1477,22 +1021,54 @@ function logQualityGateFallback({ url, articleLength, }) {
|
|
|
1477
1021
|
articleLength,
|
|
1478
1022
|
});
|
|
1479
1023
|
}
|
|
1480
|
-
function shouldUseArticleContent(article,
|
|
1481
|
-
|
|
1482
|
-
|
|
1483
|
-
|
|
1484
|
-
|
|
1485
|
-
|
|
1486
|
-
|
|
1487
|
-
|
|
1024
|
+
function shouldUseArticleContent(article, originalHtmlOrDocument, url) {
|
|
1025
|
+
const articleLength = article.textContent.length;
|
|
1026
|
+
const originalLength = getVisibleTextLength(originalHtmlOrDocument);
|
|
1027
|
+
// If the document is tiny, don't gate too aggressively.
|
|
1028
|
+
if (originalLength >= MIN_HTML_LENGTH_FOR_GATE) {
|
|
1029
|
+
const ratio = articleLength / originalLength;
|
|
1030
|
+
if (ratio < MIN_CONTENT_RATIO) {
|
|
1031
|
+
logQualityGateFallback({ url, articleLength });
|
|
1032
|
+
return false;
|
|
1033
|
+
}
|
|
1488
1034
|
}
|
|
1489
|
-
//
|
|
1490
|
-
|
|
1491
|
-
|
|
1035
|
+
// Heading structure retention (compute counts once to avoid repeated DOM queries/parses).
|
|
1036
|
+
const originalHeadings = countHeadingsDom(originalHtmlOrDocument);
|
|
1037
|
+
if (originalHeadings > 0) {
|
|
1038
|
+
const articleHeadings = countHeadingsDom(article.content);
|
|
1039
|
+
const retentionRatio = articleHeadings / originalHeadings;
|
|
1040
|
+
if (retentionRatio < MIN_HEADING_RETENTION_RATIO) {
|
|
1041
|
+
logDebug('Quality gate: Readability broke heading structure, using full HTML', {
|
|
1042
|
+
url: url.substring(0, 80),
|
|
1043
|
+
originalHeadings,
|
|
1044
|
+
articleHeadings,
|
|
1045
|
+
});
|
|
1046
|
+
return false;
|
|
1047
|
+
}
|
|
1048
|
+
}
|
|
1049
|
+
const originalCodeBlocks = countCodeBlocksDom(originalHtmlOrDocument);
|
|
1050
|
+
if (originalCodeBlocks > 0) {
|
|
1051
|
+
const articleCodeBlocks = countCodeBlocksDom(article.content);
|
|
1052
|
+
const codeRetentionRatio = articleCodeBlocks / originalCodeBlocks;
|
|
1053
|
+
// Always log code block counts for debugging
|
|
1054
|
+
logDebug('Code block retention check', {
|
|
1492
1055
|
url: url.substring(0, 80),
|
|
1493
|
-
|
|
1494
|
-
|
|
1056
|
+
originalCodeBlocks,
|
|
1057
|
+
articleCodeBlocks,
|
|
1058
|
+
codeRetentionRatio,
|
|
1495
1059
|
});
|
|
1060
|
+
if (codeRetentionRatio < MIN_CODE_BLOCK_RETENTION_RATIO) {
|
|
1061
|
+
logDebug('Quality gate: Readability removed code blocks, using full HTML', {
|
|
1062
|
+
url: url.substring(0, 80),
|
|
1063
|
+
originalCodeBlocks,
|
|
1064
|
+
articleCodeBlocks,
|
|
1065
|
+
});
|
|
1066
|
+
return false;
|
|
1067
|
+
}
|
|
1068
|
+
}
|
|
1069
|
+
// Layout extraction issue: truncated/fragmented lines.
|
|
1070
|
+
if (hasTruncatedSentences(article.textContent)) {
|
|
1071
|
+
logDebug('Quality gate: Extracted text has many truncated sentences, using full HTML', { url: url.substring(0, 80) });
|
|
1496
1072
|
return false;
|
|
1497
1073
|
}
|
|
1498
1074
|
return true;
|
|
@@ -1502,8 +1078,9 @@ function resolveContentSource({ html, url, includeMetadata, signal, }) {
|
|
|
1502
1078
|
extractArticle: true,
|
|
1503
1079
|
...(signal ? { signal } : {}),
|
|
1504
1080
|
});
|
|
1081
|
+
const originalDocument = parseHTML(html).document;
|
|
1505
1082
|
const useArticleContent = article
|
|
1506
|
-
? shouldUseArticleContent(article,
|
|
1083
|
+
? shouldUseArticleContent(article, originalDocument, url)
|
|
1507
1084
|
: false;
|
|
1508
1085
|
return buildContentSource({
|
|
1509
1086
|
html,
|
|
@@ -1512,7 +1089,7 @@ function resolveContentSource({ html, url, includeMetadata, signal, }) {
|
|
|
1512
1089
|
extractedMeta,
|
|
1513
1090
|
includeMetadata,
|
|
1514
1091
|
useArticleContent,
|
|
1515
|
-
|
|
1092
|
+
document,
|
|
1516
1093
|
});
|
|
1517
1094
|
}
|
|
1518
1095
|
function tryTransformRawStage(html, url, includeMetadata) {
|
|
@@ -1535,6 +1112,7 @@ function buildMarkdownFromContext(context, url, signal) {
|
|
|
1535
1112
|
url,
|
|
1536
1113
|
...(signal ? { signal } : {}),
|
|
1537
1114
|
...(context.document ? { document: context.document } : {}),
|
|
1115
|
+
...(context.skipNoiseRemoval ? { skipNoiseRemoval: true } : {}),
|
|
1538
1116
|
}));
|
|
1539
1117
|
return {
|
|
1540
1118
|
markdown: content,
|
|
@@ -1628,6 +1206,12 @@ class WorkerPool {
|
|
|
1628
1206
|
timeoutMs;
|
|
1629
1207
|
queueMax;
|
|
1630
1208
|
closed = false;
|
|
1209
|
+
createAbortError(url, stage) {
|
|
1210
|
+
return new FetchError('Request was canceled', url, 499, {
|
|
1211
|
+
reason: 'aborted',
|
|
1212
|
+
stage,
|
|
1213
|
+
});
|
|
1214
|
+
}
|
|
1631
1215
|
ensureOpen() {
|
|
1632
1216
|
if (this.closed) {
|
|
1633
1217
|
throw new Error('Transform worker pool closed');
|
|
@@ -1636,10 +1220,7 @@ class WorkerPool {
|
|
|
1636
1220
|
ensureNotAborted(signal, url, stage) {
|
|
1637
1221
|
if (!signal?.aborted)
|
|
1638
1222
|
return;
|
|
1639
|
-
throw
|
|
1640
|
-
reason: 'aborted',
|
|
1641
|
-
stage,
|
|
1642
|
-
});
|
|
1223
|
+
throw this.createAbortError(url, stage);
|
|
1643
1224
|
}
|
|
1644
1225
|
ensureQueueCapacity(url) {
|
|
1645
1226
|
if (this.queue.length < this.queueMax)
|
|
@@ -1704,10 +1285,7 @@ class WorkerPool {
|
|
|
1704
1285
|
abortInflightTask(id, url, workerIndex) {
|
|
1705
1286
|
const slot = this.workers[workerIndex];
|
|
1706
1287
|
this.cancelWorkerTask(slot, id);
|
|
1707
|
-
this.failTask(id,
|
|
1708
|
-
reason: 'aborted',
|
|
1709
|
-
stage: 'transform:signal-abort',
|
|
1710
|
-
}));
|
|
1288
|
+
this.failTask(id, this.createAbortError(url, 'transform:signal-abort'));
|
|
1711
1289
|
if (slot) {
|
|
1712
1290
|
this.restartWorker(workerIndex, slot);
|
|
1713
1291
|
}
|
|
@@ -1717,10 +1295,7 @@ class WorkerPool {
|
|
|
1717
1295
|
if (queuedIndex === -1)
|
|
1718
1296
|
return;
|
|
1719
1297
|
this.queue.splice(queuedIndex, 1);
|
|
1720
|
-
reject(
|
|
1721
|
-
reason: 'aborted',
|
|
1722
|
-
stage: 'transform:queued-abort',
|
|
1723
|
-
}));
|
|
1298
|
+
reject(this.createAbortError(url, 'transform:queued-abort'));
|
|
1724
1299
|
}
|
|
1725
1300
|
createWorkerSlot(worker) {
|
|
1726
1301
|
return {
|
|
@@ -1876,10 +1451,7 @@ class WorkerPool {
|
|
|
1876
1451
|
if (!task.signal?.aborted)
|
|
1877
1452
|
return false;
|
|
1878
1453
|
this.clearAbortListener(task.signal, task.abortListener);
|
|
1879
|
-
task.reject(
|
|
1880
|
-
reason: 'aborted',
|
|
1881
|
-
stage: 'transform:dispatch',
|
|
1882
|
-
}));
|
|
1454
|
+
task.reject(this.createAbortError(task.url, 'transform:dispatch'));
|
|
1883
1455
|
return true;
|
|
1884
1456
|
}
|
|
1885
1457
|
markSlotBusy(slot, task) {
|