@j0hanz/superfetch 2.2.0 → 2.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +363 -614
- package/dist/cache.d.ts +2 -2
- package/dist/cache.d.ts.map +1 -1
- package/dist/cache.js +49 -227
- package/dist/cache.js.map +1 -1
- package/dist/config.d.ts +6 -0
- package/dist/config.d.ts.map +1 -1
- package/dist/config.js +20 -27
- package/dist/config.js.map +1 -1
- package/dist/dom-noise-removal.d.ts +6 -0
- package/dist/dom-noise-removal.d.ts.map +1 -0
- package/dist/dom-noise-removal.js +482 -0
- package/dist/dom-noise-removal.js.map +1 -0
- package/dist/errors.d.ts.map +1 -1
- package/dist/errors.js +8 -5
- package/dist/errors.js.map +1 -1
- package/dist/fetch.d.ts.map +1 -1
- package/dist/fetch.js +26 -32
- package/dist/fetch.js.map +1 -1
- package/dist/http-native.d.ts +6 -0
- package/dist/http-native.d.ts.map +1 -0
- package/dist/http-native.js +645 -0
- package/dist/http-native.js.map +1 -0
- package/dist/http-utils.d.ts +61 -0
- package/dist/http-utils.d.ts.map +1 -0
- package/dist/http-utils.js +252 -0
- package/dist/http-utils.js.map +1 -0
- package/dist/index.js +1 -1
- package/dist/index.js.map +1 -1
- package/dist/instructions.md +41 -39
- package/dist/json.d.ts +2 -0
- package/dist/json.d.ts.map +1 -0
- package/dist/json.js +30 -0
- package/dist/json.js.map +1 -0
- package/dist/language-detection.d.ts +13 -0
- package/dist/language-detection.d.ts.map +1 -0
- package/dist/language-detection.js +283 -0
- package/dist/language-detection.js.map +1 -0
- package/dist/markdown-cleanup.d.ts +19 -0
- package/dist/markdown-cleanup.d.ts.map +1 -0
- package/dist/markdown-cleanup.js +283 -0
- package/dist/markdown-cleanup.js.map +1 -0
- package/dist/observability.d.ts +1 -0
- package/dist/observability.d.ts.map +1 -1
- package/dist/observability.js +10 -0
- package/dist/observability.js.map +1 -1
- package/dist/tools.d.ts.map +1 -1
- package/dist/tools.js +23 -8
- package/dist/tools.js.map +1 -1
- package/dist/transform-types.d.ts +81 -0
- package/dist/transform-types.d.ts.map +1 -0
- package/dist/transform-types.js +6 -0
- package/dist/transform-types.js.map +1 -0
- package/dist/transform.d.ts +8 -52
- package/dist/transform.d.ts.map +1 -1
- package/dist/transform.js +419 -825
- package/dist/transform.js.map +1 -1
- package/dist/type-guards.d.ts +1 -1
- package/dist/type-guards.d.ts.map +1 -1
- package/dist/type-guards.js +1 -1
- package/dist/type-guards.js.map +1 -1
- package/dist/workers/transform-worker.js +23 -24
- package/dist/workers/transform-worker.js.map +1 -1
- package/package.json +85 -86
- package/dist/http.d.ts +0 -90
- package/dist/http.d.ts.map +0 -1
- package/dist/http.js +0 -1576
- package/dist/http.js.map +0 -1
package/dist/transform.js
CHANGED
|
@@ -8,44 +8,25 @@ import { NodeHtmlMarkdown, } from 'node-html-markdown';
|
|
|
8
8
|
import { z } from 'zod';
|
|
9
9
|
import { isProbablyReaderable, Readability } from '@mozilla/readability';
|
|
10
10
|
import { config } from './config.js';
|
|
11
|
+
import { removeNoiseFromHtml } from './dom-noise-removal.js';
|
|
11
12
|
import { FetchError, getErrorMessage } from './errors.js';
|
|
12
13
|
import { isRawTextContentUrl } from './fetch.js';
|
|
14
|
+
import { detectLanguageFromCode, resolveLanguageFromAttributes, } from './language-detection.js';
|
|
15
|
+
import { cleanupMarkdownArtifacts, promoteOrphanHeadings, } from './markdown-cleanup.js';
|
|
13
16
|
import { getOperationId, getRequestId, logDebug, logError, logInfo, logWarn, redactUrl, } from './observability.js';
|
|
14
|
-
import {
|
|
17
|
+
import { isObject } from './type-guards.js';
|
|
18
|
+
// Re-export language detection for backward compatibility
|
|
19
|
+
export { detectLanguageFromCode, resolveLanguageFromAttributes, } from './language-detection.js';
|
|
20
|
+
// Re-export markdown cleanup for backward compatibility
|
|
21
|
+
export { cleanupMarkdownArtifacts, promoteOrphanHeadings, } from './markdown-cleanup.js';
|
|
22
|
+
// Re-export DOM noise removal for backward compatibility
|
|
23
|
+
export { removeNoiseFromHtml } from './dom-noise-removal.js';
|
|
15
24
|
function getAbortReason(signal) {
|
|
16
|
-
if (!
|
|
25
|
+
if (!isObject(signal))
|
|
17
26
|
return undefined;
|
|
18
27
|
return 'reason' in signal ? signal.reason : undefined;
|
|
19
28
|
}
|
|
20
|
-
|
|
21
|
-
if (!isRecord(document))
|
|
22
|
-
return undefined;
|
|
23
|
-
const { body } = document;
|
|
24
|
-
if (!isRecord(body))
|
|
25
|
-
return undefined;
|
|
26
|
-
const { innerHTML } = body;
|
|
27
|
-
return typeof innerHTML === 'string' && innerHTML.length > 0
|
|
28
|
-
? innerHTML
|
|
29
|
-
: undefined;
|
|
30
|
-
}
|
|
31
|
-
function getDocumentToString(document) {
|
|
32
|
-
if (!isRecord(document))
|
|
33
|
-
return undefined;
|
|
34
|
-
if (typeof document.toString !== 'function')
|
|
35
|
-
return undefined;
|
|
36
|
-
return document.toString.bind(document);
|
|
37
|
-
}
|
|
38
|
-
function getDocumentElementOuterHtml(document) {
|
|
39
|
-
if (!isRecord(document))
|
|
40
|
-
return undefined;
|
|
41
|
-
const { documentElement } = document;
|
|
42
|
-
if (!isRecord(documentElement))
|
|
43
|
-
return undefined;
|
|
44
|
-
const { outerHTML } = documentElement;
|
|
45
|
-
return typeof outerHTML === 'string' && outerHTML.length > 0
|
|
46
|
-
? outerHTML
|
|
47
|
-
: undefined;
|
|
48
|
-
}
|
|
29
|
+
// DOM accessor helpers moved to ./dom-noise-removal.ts
|
|
49
30
|
const CODE_BLOCK = {
|
|
50
31
|
fence: '```',
|
|
51
32
|
format: (code, language = '') => {
|
|
@@ -93,9 +74,13 @@ export function endTransformStage(context, options) {
|
|
|
93
74
|
}
|
|
94
75
|
function runTransformStage(url, stage, fn) {
|
|
95
76
|
const context = startTransformStage(url, stage);
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
77
|
+
try {
|
|
78
|
+
return fn();
|
|
79
|
+
}
|
|
80
|
+
finally {
|
|
81
|
+
// Emit duration even if the stage throws; callers decide how to handle the error.
|
|
82
|
+
endTransformStage(context);
|
|
83
|
+
}
|
|
99
84
|
}
|
|
100
85
|
function isTimeoutReason(reason) {
|
|
101
86
|
return reason instanceof Error && reason.name === 'TimeoutError';
|
|
@@ -129,46 +114,105 @@ function truncateHtml(html) {
|
|
|
129
114
|
});
|
|
130
115
|
return html.substring(0, maxSize);
|
|
131
116
|
}
|
|
117
|
+
const META_PROPERTY_HANDLERS = new Map([
|
|
118
|
+
[
|
|
119
|
+
'og:title',
|
|
120
|
+
(ctx, c) => {
|
|
121
|
+
ctx.title.og = c;
|
|
122
|
+
},
|
|
123
|
+
],
|
|
124
|
+
[
|
|
125
|
+
'og:description',
|
|
126
|
+
(ctx, c) => {
|
|
127
|
+
ctx.description.og = c;
|
|
128
|
+
},
|
|
129
|
+
],
|
|
130
|
+
[
|
|
131
|
+
'og:image',
|
|
132
|
+
(ctx, c) => {
|
|
133
|
+
ctx.image = c;
|
|
134
|
+
},
|
|
135
|
+
],
|
|
136
|
+
[
|
|
137
|
+
'article:published_time',
|
|
138
|
+
(ctx, c) => {
|
|
139
|
+
ctx.publishedAt = c;
|
|
140
|
+
},
|
|
141
|
+
],
|
|
142
|
+
[
|
|
143
|
+
'article:modified_time',
|
|
144
|
+
(ctx, c) => {
|
|
145
|
+
ctx.modifiedAt = c;
|
|
146
|
+
},
|
|
147
|
+
],
|
|
148
|
+
]);
|
|
149
|
+
const META_NAME_HANDLERS = new Map([
|
|
150
|
+
[
|
|
151
|
+
'twitter:title',
|
|
152
|
+
(ctx, c) => {
|
|
153
|
+
ctx.title.twitter = c;
|
|
154
|
+
},
|
|
155
|
+
],
|
|
156
|
+
[
|
|
157
|
+
'twitter:description',
|
|
158
|
+
(ctx, c) => {
|
|
159
|
+
ctx.description.twitter = c;
|
|
160
|
+
},
|
|
161
|
+
],
|
|
162
|
+
[
|
|
163
|
+
'description',
|
|
164
|
+
(ctx, c) => {
|
|
165
|
+
ctx.description.standard = c;
|
|
166
|
+
},
|
|
167
|
+
],
|
|
168
|
+
[
|
|
169
|
+
'author',
|
|
170
|
+
(ctx, c) => {
|
|
171
|
+
ctx.author = c;
|
|
172
|
+
},
|
|
173
|
+
],
|
|
174
|
+
]);
|
|
132
175
|
function extractMetadata(document) {
|
|
133
|
-
const
|
|
134
|
-
|
|
135
|
-
|
|
176
|
+
const ctx = {
|
|
177
|
+
title: {},
|
|
178
|
+
description: {},
|
|
179
|
+
};
|
|
136
180
|
for (const tag of document.querySelectorAll('meta')) {
|
|
137
181
|
const content = tag.getAttribute('content')?.trim();
|
|
138
182
|
if (!content)
|
|
139
183
|
continue;
|
|
140
184
|
const property = tag.getAttribute('property');
|
|
185
|
+
if (property) {
|
|
186
|
+
META_PROPERTY_HANDLERS.get(property)?.(ctx, content);
|
|
187
|
+
}
|
|
141
188
|
const name = tag.getAttribute('name');
|
|
142
|
-
if (
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
description.og = content;
|
|
146
|
-
else if (name === 'twitter:title')
|
|
147
|
-
title.twitter = content;
|
|
148
|
-
else if (name === 'twitter:description')
|
|
149
|
-
description.twitter = content;
|
|
150
|
-
else if (name === 'description')
|
|
151
|
-
description.standard = content;
|
|
152
|
-
else if (name === 'author')
|
|
153
|
-
author = content;
|
|
189
|
+
if (name) {
|
|
190
|
+
META_NAME_HANDLERS.get(name)?.(ctx, content);
|
|
191
|
+
}
|
|
154
192
|
}
|
|
155
193
|
const titleEl = document.querySelector('title');
|
|
156
|
-
if (!title.standard && titleEl?.textContent) {
|
|
157
|
-
title.standard = titleEl.textContent.trim();
|
|
194
|
+
if (!ctx.title.standard && titleEl?.textContent) {
|
|
195
|
+
ctx.title.standard = titleEl.textContent.trim();
|
|
158
196
|
}
|
|
159
|
-
const resolvedTitle = title.og ?? title.twitter ?? title.standard;
|
|
160
|
-
const resolvedDesc = description.og ?? description.twitter ?? description.standard;
|
|
197
|
+
const resolvedTitle = ctx.title.og ?? ctx.title.twitter ?? ctx.title.standard;
|
|
198
|
+
const resolvedDesc = ctx.description.og ?? ctx.description.twitter ?? ctx.description.standard;
|
|
161
199
|
const metadata = {};
|
|
162
200
|
if (resolvedTitle)
|
|
163
201
|
metadata.title = resolvedTitle;
|
|
164
202
|
if (resolvedDesc)
|
|
165
203
|
metadata.description = resolvedDesc;
|
|
166
|
-
if (author)
|
|
167
|
-
metadata.author = author;
|
|
204
|
+
if (ctx.author)
|
|
205
|
+
metadata.author = ctx.author;
|
|
206
|
+
if (ctx.image)
|
|
207
|
+
metadata.image = ctx.image;
|
|
208
|
+
if (ctx.publishedAt)
|
|
209
|
+
metadata.publishedAt = ctx.publishedAt;
|
|
210
|
+
if (ctx.modifiedAt)
|
|
211
|
+
metadata.modifiedAt = ctx.modifiedAt;
|
|
168
212
|
return metadata;
|
|
169
213
|
}
|
|
170
214
|
function isReadabilityCompatible(doc) {
|
|
171
|
-
if (!
|
|
215
|
+
if (!isObject(doc))
|
|
172
216
|
return false;
|
|
173
217
|
return hasDocumentElement(doc) && hasQuerySelectors(doc);
|
|
174
218
|
}
|
|
@@ -185,14 +229,18 @@ function extractArticle(document) {
|
|
|
185
229
|
return null;
|
|
186
230
|
}
|
|
187
231
|
try {
|
|
188
|
-
const
|
|
189
|
-
const rawText =
|
|
190
|
-
documentClone.documentElement.textContent;
|
|
232
|
+
const doc = document;
|
|
233
|
+
const rawText = doc.querySelector('body')?.textContent ?? doc.documentElement.textContent;
|
|
191
234
|
const textLength = rawText.replace(/\s+/g, ' ').trim().length;
|
|
192
|
-
if (textLength
|
|
235
|
+
if (textLength < 100) {
|
|
236
|
+
logWarn('Very minimal server-rendered content detected (< 100 chars). ' +
|
|
237
|
+
'This might be a client-side rendered (SPA) application. ' +
|
|
238
|
+
'Content extraction may be incomplete.', { textLength });
|
|
239
|
+
}
|
|
240
|
+
if (textLength >= 400 && !isProbablyReaderable(doc)) {
|
|
193
241
|
return null;
|
|
194
242
|
}
|
|
195
|
-
const reader = new Readability(
|
|
243
|
+
const reader = new Readability(doc, { maxElemsToParse: 20_000 });
|
|
196
244
|
const parsed = reader.parse();
|
|
197
245
|
if (!parsed)
|
|
198
246
|
return null;
|
|
@@ -213,8 +261,13 @@ function extractArticle(document) {
|
|
|
213
261
|
export function extractContent(html, url, options = {
|
|
214
262
|
extractArticle: true,
|
|
215
263
|
}) {
|
|
264
|
+
const result = extractContentWithDocument(html, url, options);
|
|
265
|
+
return { article: result.article, metadata: result.metadata };
|
|
266
|
+
}
|
|
267
|
+
function extractContentWithDocument(html, url, options) {
|
|
216
268
|
if (!isValidInput(html, url)) {
|
|
217
|
-
|
|
269
|
+
const { document } = parseHTML('<html></html>');
|
|
270
|
+
return { article: null, metadata: {}, document };
|
|
218
271
|
}
|
|
219
272
|
return tryExtractContent(html, url, options);
|
|
220
273
|
}
|
|
@@ -229,11 +282,13 @@ function handleExtractionFailure(error, url, signal) {
|
|
|
229
282
|
}
|
|
230
283
|
throwIfAborted(signal, url, 'extract:error');
|
|
231
284
|
logError('Failed to extract content', error instanceof Error ? error : undefined);
|
|
232
|
-
|
|
285
|
+
const { document } = parseHTML('<html></html>');
|
|
286
|
+
return { article: null, metadata: {}, document };
|
|
233
287
|
}
|
|
234
288
|
function extractContentStages(html, url, options) {
|
|
235
289
|
throwIfAborted(options.signal, url, 'extract:begin');
|
|
236
|
-
const
|
|
290
|
+
const truncatedHtml = truncateHtml(html);
|
|
291
|
+
const { document } = runTransformStage(url, 'extract:parse', () => parseHTML(truncatedHtml));
|
|
237
292
|
throwIfAborted(options.signal, url, 'extract:parsed');
|
|
238
293
|
applyBaseUri(document, url);
|
|
239
294
|
const metadata = runTransformStage(url, 'extract:metadata', () => extractMetadata(document));
|
|
@@ -243,6 +298,8 @@ function extractContentStages(html, url, options) {
|
|
|
243
298
|
return {
|
|
244
299
|
article,
|
|
245
300
|
metadata,
|
|
301
|
+
document,
|
|
302
|
+
...(truncatedHtml.length !== html.length ? { truncated: true } : {}),
|
|
246
303
|
};
|
|
247
304
|
}
|
|
248
305
|
function tryExtractContent(html, url, options) {
|
|
@@ -279,522 +336,7 @@ function applyBaseUri(document, url) {
|
|
|
279
336
|
});
|
|
280
337
|
}
|
|
281
338
|
}
|
|
282
|
-
|
|
283
|
-
for (let index = 0; index < code.length - 1; index += 1) {
|
|
284
|
-
if (code[index] !== '<')
|
|
285
|
-
continue;
|
|
286
|
-
const next = code[index + 1];
|
|
287
|
-
if (!next)
|
|
288
|
-
continue;
|
|
289
|
-
if (next >= 'A' && next <= 'Z')
|
|
290
|
-
return true;
|
|
291
|
-
}
|
|
292
|
-
return false;
|
|
293
|
-
}
|
|
294
|
-
function containsWord(source, word) {
|
|
295
|
-
let startIndex = source.indexOf(word);
|
|
296
|
-
while (startIndex !== -1) {
|
|
297
|
-
const before = startIndex === 0 ? '' : source[startIndex - 1];
|
|
298
|
-
const afterIndex = startIndex + word.length;
|
|
299
|
-
const after = afterIndex >= source.length ? '' : source[afterIndex];
|
|
300
|
-
if (!isWordChar(before) && !isWordChar(after))
|
|
301
|
-
return true;
|
|
302
|
-
startIndex = source.indexOf(word, startIndex + word.length);
|
|
303
|
-
}
|
|
304
|
-
return false;
|
|
305
|
-
}
|
|
306
|
-
function splitLines(content) {
|
|
307
|
-
return content.split('\n');
|
|
308
|
-
}
|
|
309
|
-
function extractLanguageFromClassName(className) {
|
|
310
|
-
const tokens = className.match(/\S+/g);
|
|
311
|
-
if (!tokens)
|
|
312
|
-
return undefined;
|
|
313
|
-
for (const token of tokens) {
|
|
314
|
-
const lower = token.toLowerCase();
|
|
315
|
-
if (lower.startsWith('language-'))
|
|
316
|
-
return token.slice('language-'.length);
|
|
317
|
-
if (lower.startsWith('lang-'))
|
|
318
|
-
return token.slice('lang-'.length);
|
|
319
|
-
if (lower.startsWith('highlight-')) {
|
|
320
|
-
return token.slice('highlight-'.length);
|
|
321
|
-
}
|
|
322
|
-
}
|
|
323
|
-
if (tokens.includes('hljs')) {
|
|
324
|
-
const langClass = tokens.find((t) => t !== 'hljs' && !t.startsWith('hljs-'));
|
|
325
|
-
if (langClass)
|
|
326
|
-
return langClass;
|
|
327
|
-
}
|
|
328
|
-
return undefined;
|
|
329
|
-
}
|
|
330
|
-
function resolveLanguageFromDataAttribute(dataLang) {
|
|
331
|
-
const trimmed = dataLang.trim();
|
|
332
|
-
if (!trimmed)
|
|
333
|
-
return undefined;
|
|
334
|
-
for (const char of trimmed) {
|
|
335
|
-
if (!isWordChar(char))
|
|
336
|
-
return undefined;
|
|
337
|
-
}
|
|
338
|
-
return trimmed;
|
|
339
|
-
}
|
|
340
|
-
function isWordChar(char) {
|
|
341
|
-
if (!char)
|
|
342
|
-
return false;
|
|
343
|
-
const code = char.charCodeAt(0);
|
|
344
|
-
return ((code >= 48 && code <= 57) ||
|
|
345
|
-
(code >= 65 && code <= 90) ||
|
|
346
|
-
(code >= 97 && code <= 122) ||
|
|
347
|
-
char === '_');
|
|
348
|
-
}
|
|
349
|
-
const LANGUAGE_PATTERNS = [
|
|
350
|
-
{
|
|
351
|
-
language: 'jsx',
|
|
352
|
-
pattern: {
|
|
353
|
-
keywords: ['classname=', 'jsx:', "from 'react'", 'from "react"'],
|
|
354
|
-
custom: (code) => containsJsxTag(code),
|
|
355
|
-
},
|
|
356
|
-
},
|
|
357
|
-
{
|
|
358
|
-
language: 'typescript',
|
|
359
|
-
pattern: {
|
|
360
|
-
wordBoundary: ['interface', 'type'],
|
|
361
|
-
custom: (_, lower) => [
|
|
362
|
-
': string',
|
|
363
|
-
':string',
|
|
364
|
-
': number',
|
|
365
|
-
':number',
|
|
366
|
-
': boolean',
|
|
367
|
-
':boolean',
|
|
368
|
-
': void',
|
|
369
|
-
':void',
|
|
370
|
-
': any',
|
|
371
|
-
':any',
|
|
372
|
-
': unknown',
|
|
373
|
-
':unknown',
|
|
374
|
-
': never',
|
|
375
|
-
':never',
|
|
376
|
-
].some((hint) => lower.includes(hint)),
|
|
377
|
-
},
|
|
378
|
-
},
|
|
379
|
-
{
|
|
380
|
-
language: 'rust',
|
|
381
|
-
pattern: {
|
|
382
|
-
regex: /\b(?:fn|impl|struct|enum)\b/,
|
|
383
|
-
keywords: ['let mut'],
|
|
384
|
-
custom: (_, lower) => lower.includes('use ') && lower.includes('::'),
|
|
385
|
-
},
|
|
386
|
-
},
|
|
387
|
-
{
|
|
388
|
-
language: 'javascript',
|
|
389
|
-
pattern: {
|
|
390
|
-
regex: /\b(?:const|let|var|function|class|async|await|export|import)\b/,
|
|
391
|
-
},
|
|
392
|
-
},
|
|
393
|
-
{
|
|
394
|
-
language: 'python',
|
|
395
|
-
pattern: {
|
|
396
|
-
regex: /\b(?:def|class|import|from)\b/,
|
|
397
|
-
keywords: ['print(', '__name__'],
|
|
398
|
-
},
|
|
399
|
-
},
|
|
400
|
-
{
|
|
401
|
-
language: 'bash',
|
|
402
|
-
pattern: {
|
|
403
|
-
custom: (code) => detectBashIndicators(code),
|
|
404
|
-
},
|
|
405
|
-
},
|
|
406
|
-
{
|
|
407
|
-
language: 'css',
|
|
408
|
-
pattern: {
|
|
409
|
-
regex: /@media|@import|@keyframes/,
|
|
410
|
-
custom: (code) => detectCssStructure(code),
|
|
411
|
-
},
|
|
412
|
-
},
|
|
413
|
-
{
|
|
414
|
-
language: 'html',
|
|
415
|
-
pattern: {
|
|
416
|
-
keywords: [
|
|
417
|
-
'<!doctype',
|
|
418
|
-
'<html',
|
|
419
|
-
'<head',
|
|
420
|
-
'<body',
|
|
421
|
-
'<div',
|
|
422
|
-
'<span',
|
|
423
|
-
'<p',
|
|
424
|
-
'<a',
|
|
425
|
-
'<script',
|
|
426
|
-
'<style',
|
|
427
|
-
],
|
|
428
|
-
},
|
|
429
|
-
},
|
|
430
|
-
{
|
|
431
|
-
language: 'json',
|
|
432
|
-
pattern: {
|
|
433
|
-
startsWith: ['{', '['],
|
|
434
|
-
},
|
|
435
|
-
},
|
|
436
|
-
{
|
|
437
|
-
language: 'yaml',
|
|
438
|
-
pattern: {
|
|
439
|
-
custom: (code) => detectYamlStructure(code),
|
|
440
|
-
},
|
|
441
|
-
},
|
|
442
|
-
{
|
|
443
|
-
language: 'sql',
|
|
444
|
-
pattern: {
|
|
445
|
-
wordBoundary: [
|
|
446
|
-
'select',
|
|
447
|
-
'insert',
|
|
448
|
-
'update',
|
|
449
|
-
'delete',
|
|
450
|
-
'create',
|
|
451
|
-
'alter',
|
|
452
|
-
'drop',
|
|
453
|
-
],
|
|
454
|
-
},
|
|
455
|
-
},
|
|
456
|
-
{
|
|
457
|
-
language: 'go',
|
|
458
|
-
pattern: {
|
|
459
|
-
wordBoundary: ['package', 'func'],
|
|
460
|
-
keywords: ['import "'],
|
|
461
|
-
},
|
|
462
|
-
},
|
|
463
|
-
];
|
|
464
|
-
// Bash detection constants
|
|
465
|
-
const BASH_COMMANDS = ['sudo', 'chmod', 'mkdir', 'cd', 'ls', 'cat', 'echo'];
|
|
466
|
-
const BASH_PKG_MANAGERS = [
|
|
467
|
-
'npm',
|
|
468
|
-
'yarn',
|
|
469
|
-
'pnpm',
|
|
470
|
-
'npx',
|
|
471
|
-
'brew',
|
|
472
|
-
'apt',
|
|
473
|
-
'pip',
|
|
474
|
-
'cargo',
|
|
475
|
-
'go',
|
|
476
|
-
];
|
|
477
|
-
const BASH_VERBS = ['install', 'add', 'run', 'build', 'start'];
|
|
478
|
-
function isShellPrefix(line) {
|
|
479
|
-
return (line.startsWith('#!') || line.startsWith('$ ') || line.startsWith('# '));
|
|
480
|
-
}
|
|
481
|
-
function matchesBashCommand(line) {
|
|
482
|
-
return BASH_COMMANDS.some((cmd) => line === cmd || line.startsWith(`${cmd} `));
|
|
483
|
-
}
|
|
484
|
-
function matchesPackageManagerVerb(line) {
|
|
485
|
-
for (const mgr of BASH_PKG_MANAGERS) {
|
|
486
|
-
if (!line.startsWith(`${mgr} `))
|
|
487
|
-
continue;
|
|
488
|
-
const rest = line.slice(mgr.length + 1);
|
|
489
|
-
if (BASH_VERBS.some((v) => rest === v || rest.startsWith(`${v} `))) {
|
|
490
|
-
return true;
|
|
491
|
-
}
|
|
492
|
-
}
|
|
493
|
-
return false;
|
|
494
|
-
}
|
|
495
|
-
function detectBashIndicators(code) {
|
|
496
|
-
for (const line of splitLines(code)) {
|
|
497
|
-
const trimmed = line.trimStart();
|
|
498
|
-
if (!trimmed)
|
|
499
|
-
continue;
|
|
500
|
-
if (isShellPrefix(trimmed) ||
|
|
501
|
-
matchesBashCommand(trimmed) ||
|
|
502
|
-
matchesPackageManagerVerb(trimmed)) {
|
|
503
|
-
return true;
|
|
504
|
-
}
|
|
505
|
-
}
|
|
506
|
-
return false;
|
|
507
|
-
}
|
|
508
|
-
function detectCssStructure(code) {
|
|
509
|
-
for (const line of splitLines(code)) {
|
|
510
|
-
const trimmed = line.trimStart();
|
|
511
|
-
if (!trimmed)
|
|
512
|
-
continue;
|
|
513
|
-
const isSelector = (trimmed.startsWith('.') || trimmed.startsWith('#')) &&
|
|
514
|
-
trimmed.includes('{');
|
|
515
|
-
const isProperty = trimmed.includes(':') && trimmed.includes(';');
|
|
516
|
-
if (isSelector || isProperty)
|
|
517
|
-
return true;
|
|
518
|
-
}
|
|
519
|
-
return false;
|
|
520
|
-
}
|
|
521
|
-
function detectYamlStructure(code) {
|
|
522
|
-
for (const line of splitLines(code)) {
|
|
523
|
-
const trimmed = line.trim();
|
|
524
|
-
if (!trimmed)
|
|
525
|
-
continue;
|
|
526
|
-
const colonIdx = trimmed.indexOf(':');
|
|
527
|
-
if (colonIdx <= 0)
|
|
528
|
-
continue;
|
|
529
|
-
const after = trimmed[colonIdx + 1];
|
|
530
|
-
if (after === ' ' || after === '\t')
|
|
531
|
-
return true;
|
|
532
|
-
}
|
|
533
|
-
return false;
|
|
534
|
-
}
|
|
535
|
-
function matchesLanguagePattern(code, lower, pattern) {
|
|
536
|
-
if (pattern.keywords?.some((kw) => lower.includes(kw)))
|
|
537
|
-
return true;
|
|
538
|
-
if (pattern.wordBoundary?.some((w) => containsWord(lower, w)))
|
|
539
|
-
return true;
|
|
540
|
-
if (pattern.regex?.test(lower))
|
|
541
|
-
return true;
|
|
542
|
-
if (pattern.startsWith) {
|
|
543
|
-
const trimmed = code.trimStart();
|
|
544
|
-
if (pattern.startsWith.some((prefix) => trimmed.startsWith(prefix)))
|
|
545
|
-
return true;
|
|
546
|
-
}
|
|
547
|
-
if (pattern.custom?.(code, lower))
|
|
548
|
-
return true;
|
|
549
|
-
return false;
|
|
550
|
-
}
|
|
551
|
-
export function detectLanguageFromCode(code) {
|
|
552
|
-
const lower = code.toLowerCase();
|
|
553
|
-
for (const { language, pattern } of LANGUAGE_PATTERNS) {
|
|
554
|
-
if (matchesLanguagePattern(code, lower, pattern))
|
|
555
|
-
return language;
|
|
556
|
-
}
|
|
557
|
-
return undefined;
|
|
558
|
-
}
|
|
559
|
-
export function resolveLanguageFromAttributes(className, dataLang) {
|
|
560
|
-
const classMatch = extractLanguageFromClassName(className);
|
|
561
|
-
return classMatch ?? resolveLanguageFromDataAttribute(dataLang);
|
|
562
|
-
}
|
|
563
|
-
function isElement(node) {
|
|
564
|
-
return (isRecord(node) &&
|
|
565
|
-
'getAttribute' in node &&
|
|
566
|
-
typeof node.getAttribute === 'function');
|
|
567
|
-
}
|
|
568
|
-
const STRUCTURAL_TAGS = new Set([
|
|
569
|
-
'script',
|
|
570
|
-
'style',
|
|
571
|
-
'noscript',
|
|
572
|
-
'iframe',
|
|
573
|
-
'form',
|
|
574
|
-
'button',
|
|
575
|
-
'input',
|
|
576
|
-
'select',
|
|
577
|
-
'textarea',
|
|
578
|
-
'svg',
|
|
579
|
-
]);
|
|
580
|
-
const ALWAYS_NOISE_TAGS = new Set(['nav', 'footer', 'aside']);
|
|
581
|
-
const NAVIGATION_ROLES = new Set([
|
|
582
|
-
'navigation',
|
|
583
|
-
'banner',
|
|
584
|
-
'complementary',
|
|
585
|
-
'contentinfo',
|
|
586
|
-
'tree',
|
|
587
|
-
'menubar',
|
|
588
|
-
'menu',
|
|
589
|
-
'dialog',
|
|
590
|
-
'alertdialog',
|
|
591
|
-
'search',
|
|
592
|
-
]);
|
|
593
|
-
const PROMO_TOKENS = new Set([
|
|
594
|
-
'banner',
|
|
595
|
-
'promo',
|
|
596
|
-
'announcement',
|
|
597
|
-
'cta',
|
|
598
|
-
'callout',
|
|
599
|
-
'advert',
|
|
600
|
-
'ad',
|
|
601
|
-
'ads',
|
|
602
|
-
'sponsor',
|
|
603
|
-
'newsletter',
|
|
604
|
-
'subscribe',
|
|
605
|
-
'cookie',
|
|
606
|
-
'consent',
|
|
607
|
-
'popup',
|
|
608
|
-
'modal',
|
|
609
|
-
'overlay',
|
|
610
|
-
'toast',
|
|
611
|
-
'share',
|
|
612
|
-
'social',
|
|
613
|
-
'related',
|
|
614
|
-
'recommend',
|
|
615
|
-
'comment',
|
|
616
|
-
'breadcrumb',
|
|
617
|
-
'pagination',
|
|
618
|
-
'pager',
|
|
619
|
-
'taglist',
|
|
620
|
-
]);
|
|
621
|
-
const HEADER_NOISE_PATTERN = /\b(site-header|masthead|topbar|navbar|nav(?:bar)?|menu|header-nav)\b/i;
|
|
622
|
-
const FIXED_PATTERN = /\b(fixed|sticky)\b/;
|
|
623
|
-
const HIGH_Z_PATTERN = /\bz-(?:4\d|50)\b/;
|
|
624
|
-
const ISOLATE_PATTERN = /\bisolate\b/;
|
|
625
|
-
const HTML_DOCUMENT_MARKERS = /<\s*(?:!doctype|html|head|body)\b/i;
|
|
626
|
-
const NOISE_MARKERS = [
|
|
627
|
-
'<script',
|
|
628
|
-
'<style',
|
|
629
|
-
'<noscript',
|
|
630
|
-
'<iframe',
|
|
631
|
-
'<nav',
|
|
632
|
-
'<footer',
|
|
633
|
-
'<aside',
|
|
634
|
-
'<header',
|
|
635
|
-
'<form',
|
|
636
|
-
'<button',
|
|
637
|
-
'<input',
|
|
638
|
-
'<select',
|
|
639
|
-
'<textarea',
|
|
640
|
-
'<svg',
|
|
641
|
-
'<canvas',
|
|
642
|
-
' aria-hidden="true"',
|
|
643
|
-
" aria-hidden='true'",
|
|
644
|
-
' hidden',
|
|
645
|
-
' role="navigation"',
|
|
646
|
-
" role='navigation'",
|
|
647
|
-
' role="banner"',
|
|
648
|
-
" role='banner'",
|
|
649
|
-
' role="complementary"',
|
|
650
|
-
" role='complementary'",
|
|
651
|
-
' role="contentinfo"',
|
|
652
|
-
" role='contentinfo'",
|
|
653
|
-
' role="tree"',
|
|
654
|
-
" role='tree'",
|
|
655
|
-
' role="menubar"',
|
|
656
|
-
" role='menubar'",
|
|
657
|
-
' role="menu"',
|
|
658
|
-
" role='menu'",
|
|
659
|
-
' banner',
|
|
660
|
-
' promo',
|
|
661
|
-
' announcement',
|
|
662
|
-
' cta',
|
|
663
|
-
' callout',
|
|
664
|
-
' advert',
|
|
665
|
-
' newsletter',
|
|
666
|
-
' subscribe',
|
|
667
|
-
' cookie',
|
|
668
|
-
' consent',
|
|
669
|
-
' popup',
|
|
670
|
-
' modal',
|
|
671
|
-
' overlay',
|
|
672
|
-
' toast',
|
|
673
|
-
' fixed',
|
|
674
|
-
' sticky',
|
|
675
|
-
' z-50',
|
|
676
|
-
' z-4',
|
|
677
|
-
' isolate',
|
|
678
|
-
' breadcrumb',
|
|
679
|
-
' pagination',
|
|
680
|
-
];
|
|
681
|
-
function mayContainNoise(html) {
|
|
682
|
-
const haystack = html.toLowerCase();
|
|
683
|
-
return NOISE_MARKERS.some((marker) => haystack.includes(marker));
|
|
684
|
-
}
|
|
685
|
-
function isFullDocumentHtml(html) {
|
|
686
|
-
return HTML_DOCUMENT_MARKERS.test(html);
|
|
687
|
-
}
|
|
688
|
-
function isStructuralNoiseTag(tagName) {
|
|
689
|
-
return (STRUCTURAL_TAGS.has(tagName) || tagName === 'svg' || tagName === 'canvas');
|
|
690
|
-
}
|
|
691
|
-
function isElementHidden(element) {
|
|
692
|
-
const style = element.getAttribute('style') ?? '';
|
|
693
|
-
return (element.getAttribute('hidden') !== null ||
|
|
694
|
-
element.getAttribute('aria-hidden') === 'true' ||
|
|
695
|
-
/\bdisplay\s*:\s*none\b/i.test(style) ||
|
|
696
|
-
/\bvisibility\s*:\s*hidden\b/i.test(style));
|
|
697
|
-
}
|
|
698
|
-
function hasNoiseRole(role) {
|
|
699
|
-
return role !== null && NAVIGATION_ROLES.has(role);
|
|
700
|
-
}
|
|
701
|
-
function tokenizeIdentifierLikeText(value) {
|
|
702
|
-
return value
|
|
703
|
-
.toLowerCase()
|
|
704
|
-
.replace(/[^a-z0-9]+/g, ' ')
|
|
705
|
-
.trim()
|
|
706
|
-
.split(' ')
|
|
707
|
-
.filter(Boolean);
|
|
708
|
-
}
|
|
709
|
-
function matchesPromoIdOrClass(className, id) {
|
|
710
|
-
const tokens = tokenizeIdentifierLikeText(`${className} ${id}`);
|
|
711
|
-
return tokens.some((token) => PROMO_TOKENS.has(token));
|
|
712
|
-
}
|
|
713
|
-
function matchesFixedOrHighZIsolate(className) {
|
|
714
|
-
return (FIXED_PATTERN.test(className) ||
|
|
715
|
-
(HIGH_Z_PATTERN.test(className) && ISOLATE_PATTERN.test(className)));
|
|
716
|
-
}
|
|
717
|
-
function readElementMetadata(element) {
|
|
718
|
-
return {
|
|
719
|
-
tagName: element.tagName.toLowerCase(),
|
|
720
|
-
className: element.getAttribute('class') ?? '',
|
|
721
|
-
id: element.getAttribute('id') ?? '',
|
|
722
|
-
role: element.getAttribute('role'),
|
|
723
|
-
isHidden: isElementHidden(element),
|
|
724
|
-
};
|
|
725
|
-
}
|
|
726
|
-
function isBoilerplateHeader({ className, id, role, }) {
|
|
727
|
-
if (hasNoiseRole(role))
|
|
728
|
-
return true;
|
|
729
|
-
const combined = `${className} ${id}`.toLowerCase();
|
|
730
|
-
return HEADER_NOISE_PATTERN.test(combined);
|
|
731
|
-
}
|
|
732
|
-
function isNoiseElement(node) {
|
|
733
|
-
const metadata = readElementMetadata(node);
|
|
734
|
-
return (isStructuralNoiseTag(metadata.tagName) ||
|
|
735
|
-
ALWAYS_NOISE_TAGS.has(metadata.tagName) ||
|
|
736
|
-
(metadata.tagName === 'header' && isBoilerplateHeader(metadata)) ||
|
|
737
|
-
metadata.isHidden ||
|
|
738
|
-
hasNoiseRole(metadata.role) ||
|
|
739
|
-
matchesFixedOrHighZIsolate(metadata.className) ||
|
|
740
|
-
matchesPromoIdOrClass(metadata.className, metadata.id));
|
|
741
|
-
}
|
|
742
|
-
function removeNoiseNodes(nodes) {
|
|
743
|
-
for (let index = nodes.length - 1; index >= 0; index -= 1) {
|
|
744
|
-
const node = typeof nodes.item === 'function' ? nodes.item(index) : nodes[index];
|
|
745
|
-
if (!node)
|
|
746
|
-
continue;
|
|
747
|
-
if (isElement(node) && isNoiseElement(node)) {
|
|
748
|
-
node.remove();
|
|
749
|
-
}
|
|
750
|
-
}
|
|
751
|
-
}
|
|
752
|
-
function stripNoiseNodes(document) {
|
|
753
|
-
// Use targeted selectors for common noise elements instead of querySelectorAll('*')
|
|
754
|
-
const targetSelectors = [
|
|
755
|
-
'nav',
|
|
756
|
-
'footer',
|
|
757
|
-
'aside',
|
|
758
|
-
'header[class*="site"]',
|
|
759
|
-
'header[class*="nav"]',
|
|
760
|
-
'header[class*="menu"]',
|
|
761
|
-
'[role="banner"]',
|
|
762
|
-
'[role="navigation"]',
|
|
763
|
-
'[role="dialog"]',
|
|
764
|
-
'[style*="display: none"]',
|
|
765
|
-
'[style*="display:none"]',
|
|
766
|
-
'[hidden]',
|
|
767
|
-
'[aria-hidden="true"]',
|
|
768
|
-
].join(',');
|
|
769
|
-
const potentialNoiseNodes = document.querySelectorAll(targetSelectors);
|
|
770
|
-
// Remove in reverse order to handle nested elements correctly
|
|
771
|
-
removeNoiseNodes(potentialNoiseNodes);
|
|
772
|
-
// Second pass: check remaining elements for noise patterns (promo, fixed positioning, etc.)
|
|
773
|
-
const allElements = document.querySelectorAll('*');
|
|
774
|
-
removeNoiseNodes(allElements);
|
|
775
|
-
}
|
|
776
|
-
function removeNoiseFromHtml(html) {
|
|
777
|
-
const shouldParse = isFullDocumentHtml(html) || mayContainNoise(html);
|
|
778
|
-
if (!shouldParse)
|
|
779
|
-
return html;
|
|
780
|
-
try {
|
|
781
|
-
const { document } = parseHTML(html);
|
|
782
|
-
stripNoiseNodes(document);
|
|
783
|
-
const bodyInnerHtml = getBodyInnerHtml(document);
|
|
784
|
-
if (bodyInnerHtml)
|
|
785
|
-
return bodyInnerHtml;
|
|
786
|
-
const docToString = getDocumentToString(document);
|
|
787
|
-
if (docToString)
|
|
788
|
-
return docToString();
|
|
789
|
-
const documentElementOuterHtml = getDocumentElementOuterHtml(document);
|
|
790
|
-
if (documentElementOuterHtml)
|
|
791
|
-
return documentElementOuterHtml;
|
|
792
|
-
return html;
|
|
793
|
-
}
|
|
794
|
-
catch {
|
|
795
|
-
return html;
|
|
796
|
-
}
|
|
797
|
-
}
|
|
339
|
+
// DOM noise removal functions moved to ./dom-noise-removal.ts
|
|
798
340
|
function buildInlineCode(content) {
|
|
799
341
|
const runs = content.match(/`+/g);
|
|
800
342
|
let longest = '';
|
|
@@ -805,8 +347,11 @@ function buildInlineCode(content) {
|
|
|
805
347
|
}
|
|
806
348
|
}
|
|
807
349
|
}
|
|
350
|
+
// Use a fence longer than any run of backticks in the content.
|
|
808
351
|
const delimiter = `\`${longest}`;
|
|
809
|
-
|
|
352
|
+
// Only pad when needed to avoid altering code spans unnecessarily.
|
|
353
|
+
// CommonMark recommends padding when the code starts/ends with a backtick.
|
|
354
|
+
const padding = content.startsWith('`') || content.endsWith('`') ? ' ' : '';
|
|
810
355
|
return `${delimiter}${padding}${content}${padding}${delimiter}`;
|
|
811
356
|
}
|
|
812
357
|
function deriveAltFromImageUrl(src) {
|
|
@@ -829,16 +374,13 @@ function deriveAltFromImageUrl(src) {
|
|
|
829
374
|
}
|
|
830
375
|
}
|
|
831
376
|
function isCodeBlock(parent) {
|
|
832
|
-
if (!
|
|
377
|
+
if (!isObject(parent))
|
|
833
378
|
return false;
|
|
834
379
|
const tagName = typeof parent.tagName === 'string' ? parent.tagName.toUpperCase() : '';
|
|
835
380
|
return ['PRE', 'WRAPPED-PRE'].includes(tagName);
|
|
836
381
|
}
|
|
837
382
|
function hasGetAttribute(value) {
|
|
838
|
-
return
|
|
839
|
-
}
|
|
840
|
-
function hasCodeBlockTranslators(value) {
|
|
841
|
-
return isRecord(value) && isRecord(value.codeBlockTranslators);
|
|
383
|
+
return isObject(value) && typeof value.getAttribute === 'function';
|
|
842
384
|
}
|
|
843
385
|
function buildInlineCodeTranslator() {
|
|
844
386
|
return {
|
|
@@ -855,37 +397,19 @@ function resolveAttributeLanguage(node) {
|
|
|
855
397
|
const dataLanguage = getAttribute?.('data-language') ?? '';
|
|
856
398
|
return resolveLanguageFromAttributes(className, dataLanguage);
|
|
857
399
|
}
|
|
858
|
-
function resolveCodeBlockTranslators(visitor) {
|
|
859
|
-
const childTranslators = isRecord(visitor) ? visitor.instance : null;
|
|
860
|
-
return hasCodeBlockTranslators(childTranslators)
|
|
861
|
-
? childTranslators.codeBlockTranslators
|
|
862
|
-
: null;
|
|
863
|
-
}
|
|
864
|
-
function buildCodeBlockTranslator(attributeLanguage, codeBlockTranslators) {
|
|
865
|
-
return {
|
|
866
|
-
noEscape: true,
|
|
867
|
-
preserveWhitespace: true,
|
|
868
|
-
...(codeBlockTranslators
|
|
869
|
-
? { childTranslators: codeBlockTranslators }
|
|
870
|
-
: null),
|
|
871
|
-
postprocess: ({ content }) => {
|
|
872
|
-
const language = attributeLanguage ?? detectLanguageFromCode(content) ?? '';
|
|
873
|
-
return CODE_BLOCK.format(content, language);
|
|
874
|
-
},
|
|
875
|
-
};
|
|
876
|
-
}
|
|
877
400
|
function buildCodeTranslator(ctx) {
|
|
878
|
-
if (!
|
|
401
|
+
if (!isObject(ctx))
|
|
879
402
|
return buildInlineCodeTranslator();
|
|
880
|
-
const {
|
|
403
|
+
const { parent } = ctx;
|
|
881
404
|
if (!isCodeBlock(parent))
|
|
882
405
|
return buildInlineCodeTranslator();
|
|
883
|
-
|
|
884
|
-
|
|
885
|
-
|
|
406
|
+
return {
|
|
407
|
+
noEscape: true,
|
|
408
|
+
preserveWhitespace: true,
|
|
409
|
+
};
|
|
886
410
|
}
|
|
887
411
|
function buildImageTranslator(ctx) {
|
|
888
|
-
if (!
|
|
412
|
+
if (!isObject(ctx))
|
|
889
413
|
return { content: '' };
|
|
890
414
|
const { node } = ctx;
|
|
891
415
|
const getAttribute = hasGetAttribute(node)
|
|
@@ -898,19 +422,57 @@ function buildImageTranslator(ctx) {
|
|
|
898
422
|
content: ``,
|
|
899
423
|
};
|
|
900
424
|
}
|
|
425
|
+
function findLanguageFromCodeChild(node) {
|
|
426
|
+
if (!isObject(node))
|
|
427
|
+
return undefined;
|
|
428
|
+
const { childNodes } = node;
|
|
429
|
+
if (!Array.isArray(childNodes))
|
|
430
|
+
return undefined;
|
|
431
|
+
for (const child of childNodes) {
|
|
432
|
+
if (!isObject(child))
|
|
433
|
+
continue;
|
|
434
|
+
const tagName = typeof child.rawTagName === 'string'
|
|
435
|
+
? child.rawTagName.toUpperCase()
|
|
436
|
+
: '';
|
|
437
|
+
if (tagName === 'CODE') {
|
|
438
|
+
return resolveAttributeLanguage(child);
|
|
439
|
+
}
|
|
440
|
+
}
|
|
441
|
+
return undefined;
|
|
442
|
+
}
|
|
443
|
+
function createCodeBlockPostprocessor(language) {
|
|
444
|
+
return ({ content }) => {
|
|
445
|
+
const trimmed = content.trim();
|
|
446
|
+
if (!trimmed)
|
|
447
|
+
return '';
|
|
448
|
+
const resolvedLanguage = language ?? detectLanguageFromCode(trimmed) ?? '';
|
|
449
|
+
return CODE_BLOCK.format(trimmed, resolvedLanguage);
|
|
450
|
+
};
|
|
451
|
+
}
|
|
452
|
+
function buildPreTranslator(ctx) {
|
|
453
|
+
if (!isObject(ctx))
|
|
454
|
+
return {};
|
|
455
|
+
const { node } = ctx;
|
|
456
|
+
const attributeLanguage = resolveAttributeLanguage(node) ?? findLanguageFromCodeChild(node);
|
|
457
|
+
return {
|
|
458
|
+
noEscape: true,
|
|
459
|
+
preserveWhitespace: true,
|
|
460
|
+
postprocess: createCodeBlockPostprocessor(attributeLanguage),
|
|
461
|
+
};
|
|
462
|
+
}
|
|
901
463
|
function createCustomTranslators() {
|
|
902
464
|
return {
|
|
903
465
|
code: (ctx) => buildCodeTranslator(ctx),
|
|
904
466
|
img: (ctx) => buildImageTranslator(ctx),
|
|
905
467
|
dl: (ctx) => {
|
|
906
|
-
if (!
|
|
468
|
+
if (!isObject(ctx) || !isObject(ctx.node)) {
|
|
907
469
|
return { content: '' };
|
|
908
470
|
}
|
|
909
471
|
const node = ctx.node;
|
|
910
472
|
const childNodes = Array.isArray(node.childNodes) ? node.childNodes : [];
|
|
911
473
|
const items = childNodes
|
|
912
474
|
.map((child) => {
|
|
913
|
-
if (!
|
|
475
|
+
if (!isObject(child))
|
|
914
476
|
return '';
|
|
915
477
|
const nodeName = typeof child.nodeName === 'string'
|
|
916
478
|
? child.nodeName.toUpperCase()
|
|
@@ -940,6 +502,8 @@ function createCustomTranslators() {
|
|
|
940
502
|
sup: () => ({
|
|
941
503
|
postprocess: ({ content }) => `^${content}^`,
|
|
942
504
|
}),
|
|
505
|
+
// Fix #6: Handle <pre> without <code> - wrap in fenced code block
|
|
506
|
+
pre: (ctx) => buildPreTranslator(ctx),
|
|
943
507
|
};
|
|
944
508
|
}
|
|
945
509
|
let markdownInstance = null;
|
|
@@ -955,9 +519,11 @@ function getMarkdownConverter() {
|
|
|
955
519
|
markdownInstance ??= createMarkdownInstance();
|
|
956
520
|
return markdownInstance;
|
|
957
521
|
}
|
|
958
|
-
function translateHtmlToMarkdown(html, url, signal) {
|
|
522
|
+
function translateHtmlToMarkdown(html, url, signal, document, skipNoiseRemoval) {
|
|
959
523
|
throwIfAborted(signal, url, 'markdown:begin');
|
|
960
|
-
const cleanedHtml =
|
|
524
|
+
const cleanedHtml = skipNoiseRemoval
|
|
525
|
+
? html
|
|
526
|
+
: runTransformStage(url, 'markdown:noise', () => removeNoiseFromHtml(html, document, url));
|
|
961
527
|
throwIfAborted(signal, url, 'markdown:cleaned');
|
|
962
528
|
const content = runTransformStage(url, 'markdown:translate', () => getMarkdownConverter().translate(cleanedHtml).trim());
|
|
963
529
|
throwIfAborted(signal, url, 'markdown:translated');
|
|
@@ -973,151 +539,18 @@ export function htmlToMarkdown(html, metadata, options) {
|
|
|
973
539
|
if (!html)
|
|
974
540
|
return buildMetadataFooter(metadata, url);
|
|
975
541
|
try {
|
|
976
|
-
const content = translateHtmlToMarkdown(html, url, options?.signal);
|
|
542
|
+
const content = translateHtmlToMarkdown(html, url, options?.signal, options?.document, options?.skipNoiseRemoval);
|
|
977
543
|
return appendMetadataFooter(content, metadata, url);
|
|
978
544
|
}
|
|
979
545
|
catch (error) {
|
|
980
546
|
if (error instanceof FetchError) {
|
|
981
547
|
throw error;
|
|
982
548
|
}
|
|
549
|
+
logError('Failed to convert HTML to markdown', error instanceof Error ? error : undefined);
|
|
983
550
|
return buildMetadataFooter(metadata, url);
|
|
984
551
|
}
|
|
985
552
|
}
|
|
986
|
-
|
|
987
|
-
let result = content;
|
|
988
|
-
const fixOrphanHeadings = (text) => {
|
|
989
|
-
return text.replace(/^(.*?)(#{1,6})\s*(?:\r?\n){2}([A-Z][^\r\n]+?)(?:\r?\n)/gm, (match, prefix, hashes, heading) => {
|
|
990
|
-
if (typeof prefix !== 'string' ||
|
|
991
|
-
typeof hashes !== 'string' ||
|
|
992
|
-
typeof heading !== 'string') {
|
|
993
|
-
return match;
|
|
994
|
-
}
|
|
995
|
-
if (heading.length > 150) {
|
|
996
|
-
return match;
|
|
997
|
-
}
|
|
998
|
-
const trimmedPrefix = prefix.trim();
|
|
999
|
-
if (trimmedPrefix === '') {
|
|
1000
|
-
return `${hashes} ${heading}\n\n`;
|
|
1001
|
-
}
|
|
1002
|
-
return `${trimmedPrefix}\n\n${hashes} ${heading}\n\n`;
|
|
1003
|
-
});
|
|
1004
|
-
};
|
|
1005
|
-
result = fixOrphanHeadings(result);
|
|
1006
|
-
result = result.replace(/^#{1,6}[ \t\u00A0]*$\r?\n?/gm, '');
|
|
1007
|
-
const zeroWidthAnchorLink = /\[(?:\s|\u200B)*\]\(#[^)]*\)\s*/g;
|
|
1008
|
-
result = result.replace(zeroWidthAnchorLink, '');
|
|
1009
|
-
result = result.replace(/^\[Skip to (?:main )?content\]\(#[^)]*\)\s*$/gim, '');
|
|
1010
|
-
result = result.replace(/^\[Skip to (?:main )?navigation\]\(#[^)]*\)\s*$/gim, '');
|
|
1011
|
-
result = result.replace(/^\[Skip link\]\(#[^)]*\)\s*$/gim, '');
|
|
1012
|
-
result = result.replace(/(^#{1,6}\s+\w+)```/gm, '$1\n\n```');
|
|
1013
|
-
result = result.replace(/(^#{1,6}\s+\w*[A-Z])([A-Z][a-z])/gm, '$1\n\n$2');
|
|
1014
|
-
result = result.replace(/(^#{1,6}\s[^\n]*)\n([^\n])/gm, '$1\n\n$2');
|
|
1015
|
-
const tocLinkLine = /^- \[[^\]]+\]\(#[^)]+\)\s*$/;
|
|
1016
|
-
const lines = result.split('\n');
|
|
1017
|
-
const filtered = [];
|
|
1018
|
-
let skipTocBlock = false;
|
|
1019
|
-
for (let i = 0; i < lines.length; i += 1) {
|
|
1020
|
-
const line = lines[i] ?? '';
|
|
1021
|
-
const prevLine = i > 0 ? (lines[i - 1] ?? '') : '';
|
|
1022
|
-
const nextLine = i < lines.length - 1 ? (lines[i + 1] ?? '') : '';
|
|
1023
|
-
if (tocLinkLine.test(line)) {
|
|
1024
|
-
const prevIsToc = tocLinkLine.test(prevLine) || prevLine.trim() === '';
|
|
1025
|
-
const nextIsToc = tocLinkLine.test(nextLine) || nextLine.trim() === '';
|
|
1026
|
-
if (prevIsToc || nextIsToc) {
|
|
1027
|
-
skipTocBlock = true;
|
|
1028
|
-
continue;
|
|
1029
|
-
}
|
|
1030
|
-
}
|
|
1031
|
-
else if (line.trim() === '' && skipTocBlock) {
|
|
1032
|
-
skipTocBlock = false;
|
|
1033
|
-
continue;
|
|
1034
|
-
}
|
|
1035
|
-
else {
|
|
1036
|
-
skipTocBlock = false;
|
|
1037
|
-
}
|
|
1038
|
-
filtered.push(line);
|
|
1039
|
-
}
|
|
1040
|
-
result = filtered.join('\n');
|
|
1041
|
-
result = result.replace(/\]\(([^)]+)\)\[/g, ']($1)\n\n[');
|
|
1042
|
-
result = result.replace(/^Was this page helpful\??\s*$/gim, '');
|
|
1043
|
-
result = result.replace(/(`[^`]+`)\s*\\-\s*/g, '$1 - ');
|
|
1044
|
-
result = result.replace(/\\([[]])/g, '$1');
|
|
1045
|
-
result = result.replace(/([^\n])\n([-*+] )/g, '$1\n\n$2');
|
|
1046
|
-
result = result.replace(/(\S)\n(\d+\. )/g, '$1\n\n$2');
|
|
1047
|
-
result = result.replace(/\n{3,}/g, '\n\n');
|
|
1048
|
-
return result.trim();
|
|
1049
|
-
}
|
|
1050
|
-
const HEADING_KEYWORDS = new Set([
|
|
1051
|
-
'overview',
|
|
1052
|
-
'introduction',
|
|
1053
|
-
'summary',
|
|
1054
|
-
'conclusion',
|
|
1055
|
-
'prerequisites',
|
|
1056
|
-
'requirements',
|
|
1057
|
-
'installation',
|
|
1058
|
-
'configuration',
|
|
1059
|
-
'usage',
|
|
1060
|
-
'features',
|
|
1061
|
-
'limitations',
|
|
1062
|
-
'troubleshooting',
|
|
1063
|
-
'faq',
|
|
1064
|
-
'resources',
|
|
1065
|
-
'references',
|
|
1066
|
-
'changelog',
|
|
1067
|
-
'license',
|
|
1068
|
-
'acknowledgments',
|
|
1069
|
-
'appendix',
|
|
1070
|
-
]);
|
|
1071
|
-
function isLikelyHeadingLine(line) {
|
|
1072
|
-
const trimmed = line.trim();
|
|
1073
|
-
if (!trimmed || trimmed.length > 80)
|
|
1074
|
-
return false;
|
|
1075
|
-
if (/^#{1,6}\s/.test(trimmed))
|
|
1076
|
-
return false;
|
|
1077
|
-
if (/^[-*+•]\s/.test(trimmed) || /^\d+\.\s/.test(trimmed))
|
|
1078
|
-
return false;
|
|
1079
|
-
if (/[.!?]$/.test(trimmed))
|
|
1080
|
-
return false;
|
|
1081
|
-
if (/^\[.*\]\(.*\)$/.test(trimmed))
|
|
1082
|
-
return false;
|
|
1083
|
-
if (/^(?:example|note|tip|warning|important|caution):\s+\S/i.test(trimmed)) {
|
|
1084
|
-
return true;
|
|
1085
|
-
}
|
|
1086
|
-
const words = trimmed.split(/\s+/);
|
|
1087
|
-
if (words.length >= 2 && words.length <= 6) {
|
|
1088
|
-
const isTitleCase = words.every((w) => /^[A-Z][a-z]*$/.test(w) || /^(?:and|or|the|of|in|for|to|a)$/i.test(w));
|
|
1089
|
-
if (isTitleCase)
|
|
1090
|
-
return true;
|
|
1091
|
-
}
|
|
1092
|
-
if (words.length === 1) {
|
|
1093
|
-
const lower = trimmed.toLowerCase();
|
|
1094
|
-
if (HEADING_KEYWORDS.has(lower) && /^[A-Z]/.test(trimmed)) {
|
|
1095
|
-
return true;
|
|
1096
|
-
}
|
|
1097
|
-
}
|
|
1098
|
-
return false;
|
|
1099
|
-
}
|
|
1100
|
-
function promoteOrphanHeadings(markdown) {
|
|
1101
|
-
const lines = markdown.split('\n');
|
|
1102
|
-
const result = [];
|
|
1103
|
-
for (let i = 0; i < lines.length; i += 1) {
|
|
1104
|
-
const line = lines[i] ?? '';
|
|
1105
|
-
const prevLine = i > 0 ? lines[i - 1] : '';
|
|
1106
|
-
const nextLine = i < lines.length - 1 ? lines[i + 1] : '';
|
|
1107
|
-
const isStandalone = prevLine?.trim() === '' && nextLine?.trim() === '';
|
|
1108
|
-
const isPrecededByBlank = prevLine?.trim() === '';
|
|
1109
|
-
if ((isStandalone || isPrecededByBlank) && isLikelyHeadingLine(line)) {
|
|
1110
|
-
const trimmed = line.trim();
|
|
1111
|
-
const isExample = /^example:\s/i.test(trimmed);
|
|
1112
|
-
const prefix = isExample ? '### ' : '## ';
|
|
1113
|
-
result.push(prefix + trimmed);
|
|
1114
|
-
}
|
|
1115
|
-
else {
|
|
1116
|
-
result.push(line);
|
|
1117
|
-
}
|
|
1118
|
-
}
|
|
1119
|
-
return result.join('\n');
|
|
1120
|
-
}
|
|
553
|
+
// Markdown cleanup functions moved to ./markdown-cleanup.ts
|
|
1121
554
|
function formatFetchedDate(isoString) {
|
|
1122
555
|
try {
|
|
1123
556
|
const date = new Date(isoString);
|
|
@@ -1366,54 +799,114 @@ function tryTransformRawContent({ html, url, includeMetadata, }) {
|
|
|
1366
799
|
const MIN_CONTENT_RATIO = 0.3;
|
|
1367
800
|
const MIN_HTML_LENGTH_FOR_GATE = 100;
|
|
1368
801
|
const MIN_HEADING_RETENTION_RATIO = 0.7;
|
|
1369
|
-
|
|
1370
|
-
|
|
1371
|
-
|
|
1372
|
-
|
|
1373
|
-
|
|
1374
|
-
|
|
1375
|
-
|
|
1376
|
-
|
|
1377
|
-
|
|
1378
|
-
|
|
1379
|
-
|
|
1380
|
-
|
|
1381
|
-
|
|
1382
|
-
|
|
1383
|
-
|
|
1384
|
-
|
|
1385
|
-
|
|
1386
|
-
|
|
1387
|
-
|
|
1388
|
-
|
|
1389
|
-
|
|
1390
|
-
|
|
1391
|
-
|
|
1392
|
-
|
|
1393
|
-
|
|
1394
|
-
|
|
1395
|
-
|
|
802
|
+
const MIN_CODE_BLOCK_RETENTION_RATIO = 0.5;
|
|
803
|
+
/**
|
|
804
|
+
* Count headings using DOM querySelectorAll.
|
|
805
|
+
* Handles nested content like <h2><span>Text</span></h2> correctly.
|
|
806
|
+
*/
|
|
807
|
+
function countHeadingsDom(htmlOrDocument) {
|
|
808
|
+
if (typeof htmlOrDocument === 'string') {
|
|
809
|
+
// Wrap fragments in document structure for proper parsing
|
|
810
|
+
const htmlToParse = needsDocumentWrapper(htmlOrDocument)
|
|
811
|
+
? wrapHtmlFragment(htmlOrDocument)
|
|
812
|
+
: htmlOrDocument;
|
|
813
|
+
const { document: doc } = parseHTML(htmlToParse);
|
|
814
|
+
return doc.querySelectorAll('h1,h2,h3,h4,h5,h6').length;
|
|
815
|
+
}
|
|
816
|
+
return htmlOrDocument.querySelectorAll('h1,h2,h3,h4,h5,h6').length;
|
|
817
|
+
}
|
|
818
|
+
function countCodeBlocksDom(htmlOrDocument) {
|
|
819
|
+
if (typeof htmlOrDocument === 'string') {
|
|
820
|
+
// Wrap fragments in document structure for proper parsing
|
|
821
|
+
const htmlToParse = needsDocumentWrapper(htmlOrDocument)
|
|
822
|
+
? wrapHtmlFragment(htmlOrDocument)
|
|
823
|
+
: htmlOrDocument;
|
|
824
|
+
const { document: doc } = parseHTML(htmlToParse);
|
|
825
|
+
return doc.querySelectorAll('pre').length;
|
|
826
|
+
}
|
|
827
|
+
return htmlOrDocument.querySelectorAll('pre').length;
|
|
828
|
+
}
|
|
829
|
+
/**
|
|
830
|
+
* Check if HTML string needs document wrapper for proper parsing.
|
|
831
|
+
* Fragments without doctype/html/body tags need wrapping.
|
|
832
|
+
*/
|
|
833
|
+
function needsDocumentWrapper(html) {
|
|
834
|
+
const trimmed = html.trim().toLowerCase();
|
|
835
|
+
return (!trimmed.startsWith('<!doctype') &&
|
|
836
|
+
!trimmed.startsWith('<html') &&
|
|
837
|
+
!trimmed.startsWith('<body'));
|
|
838
|
+
}
|
|
839
|
+
/**
|
|
840
|
+
* Wrap HTML fragment in minimal document structure for proper parsing.
|
|
841
|
+
*/
|
|
842
|
+
function wrapHtmlFragment(html) {
|
|
843
|
+
return `<!DOCTYPE html><html><body>${html}</body></html>`;
|
|
844
|
+
}
|
|
845
|
+
/**
|
|
846
|
+
* Get visible text length from HTML, excluding script/style/noscript content.
|
|
847
|
+
* Fixes the bug where stripHtmlTagsForLength() counted JS/CSS as visible text.
|
|
848
|
+
*/
|
|
849
|
+
function getVisibleTextLength(htmlOrDocument) {
|
|
850
|
+
// For string input, parse the HTML
|
|
851
|
+
if (typeof htmlOrDocument === 'string') {
|
|
852
|
+
// Wrap fragments in document structure for proper parsing
|
|
853
|
+
const htmlToParse = needsDocumentWrapper(htmlOrDocument)
|
|
854
|
+
? wrapHtmlFragment(htmlOrDocument)
|
|
855
|
+
: htmlOrDocument;
|
|
856
|
+
const { document: doc } = parseHTML(htmlToParse);
|
|
857
|
+
// Remove non-visible content that inflates text length
|
|
858
|
+
for (const el of doc.querySelectorAll('script,style,noscript')) {
|
|
859
|
+
el.remove();
|
|
1396
860
|
}
|
|
1397
|
-
|
|
1398
|
-
|
|
1399
|
-
|
|
1400
|
-
|
|
1401
|
-
|
|
1402
|
-
|
|
1403
|
-
}
|
|
1404
|
-
|
|
1405
|
-
|
|
1406
|
-
|
|
861
|
+
// Get text content from body or documentElement
|
|
862
|
+
// Note: linkedom may return null for body on HTML fragments despite types
|
|
863
|
+
const body = doc.body;
|
|
864
|
+
const docElement = doc.documentElement;
|
|
865
|
+
const text = body?.textContent ?? docElement?.textContent ?? '';
|
|
866
|
+
return text.replace(/\s+/g, ' ').trim().length;
|
|
867
|
+
}
|
|
868
|
+
// For Document input, clone to avoid mutation
|
|
869
|
+
const workDoc = htmlOrDocument.cloneNode(true);
|
|
870
|
+
// Remove non-visible content that inflates text length
|
|
871
|
+
for (const el of workDoc.querySelectorAll('script,style,noscript')) {
|
|
872
|
+
el.remove();
|
|
873
|
+
}
|
|
874
|
+
// Get text content from body or documentElement
|
|
875
|
+
// Note: linkedom may return null for body on HTML fragments despite types
|
|
876
|
+
const body = workDoc.body;
|
|
877
|
+
const docElement = workDoc.documentElement;
|
|
878
|
+
const text = body?.textContent ?? docElement?.textContent ?? '';
|
|
879
|
+
return text.replace(/\s+/g, ' ').trim().length;
|
|
880
|
+
}
|
|
881
|
+
export function isExtractionSufficient(article, originalHtmlOrDocument) {
|
|
1407
882
|
if (!article)
|
|
1408
883
|
return false;
|
|
1409
884
|
const articleLength = article.textContent.length;
|
|
1410
|
-
|
|
1411
|
-
|
|
1412
|
-
.trim().length;
|
|
885
|
+
// Use DOM-based visible text length to exclude script/style content
|
|
886
|
+
const originalLength = getVisibleTextLength(originalHtmlOrDocument);
|
|
1413
887
|
if (originalLength < MIN_HTML_LENGTH_FOR_GATE)
|
|
1414
888
|
return true;
|
|
1415
889
|
return articleLength / originalLength >= MIN_CONTENT_RATIO;
|
|
1416
890
|
}
|
|
891
|
+
const MIN_LINE_LENGTH_FOR_TRUNCATION_CHECK = 20;
|
|
892
|
+
const MAX_TRUNCATED_LINE_RATIO = 0.5;
|
|
893
|
+
/**
|
|
894
|
+
* Detect if extracted text has many truncated/incomplete sentences.
|
|
895
|
+
* Lines longer than 20 chars that don't end with sentence punctuation
|
|
896
|
+
* are considered potentially truncated.
|
|
897
|
+
*/
|
|
898
|
+
function hasTruncatedSentences(text) {
|
|
899
|
+
const lines = text
|
|
900
|
+
.split('\n')
|
|
901
|
+
.filter((line) => line.trim().length > MIN_LINE_LENGTH_FOR_TRUNCATION_CHECK);
|
|
902
|
+
if (lines.length < 3)
|
|
903
|
+
return false;
|
|
904
|
+
const incompleteLines = lines.filter((line) => {
|
|
905
|
+
const trimmed = line.trim();
|
|
906
|
+
return !/[.!?:;]$/.test(trimmed);
|
|
907
|
+
});
|
|
908
|
+
return incompleteLines.length / lines.length > MAX_TRUNCATED_LINE_RATIO;
|
|
909
|
+
}
|
|
1417
910
|
export function determineContentExtractionSource(article) {
|
|
1418
911
|
return article !== null;
|
|
1419
912
|
}
|
|
@@ -1443,12 +936,83 @@ export function createContentMetadataBlock(url, article, extractedMeta, shouldEx
|
|
|
1443
936
|
}
|
|
1444
937
|
return metadata;
|
|
1445
938
|
}
|
|
1446
|
-
|
|
939
|
+
/**
|
|
940
|
+
* Content root selectors in priority order.
|
|
941
|
+
* These identify the main content area on a page.
|
|
942
|
+
*/
|
|
943
|
+
const CONTENT_ROOT_SELECTORS = [
|
|
944
|
+
'main',
|
|
945
|
+
'article',
|
|
946
|
+
'[role="main"]',
|
|
947
|
+
'#content',
|
|
948
|
+
'#main-content',
|
|
949
|
+
'.content',
|
|
950
|
+
'.main-content',
|
|
951
|
+
'.post-content',
|
|
952
|
+
'.article-content',
|
|
953
|
+
'.entry-content',
|
|
954
|
+
'[itemprop="articleBody"]',
|
|
955
|
+
'[data-content]',
|
|
956
|
+
'.post-body',
|
|
957
|
+
'.article-body',
|
|
958
|
+
];
|
|
959
|
+
/**
|
|
960
|
+
* Find the main content root element in a document.
|
|
961
|
+
* Returns the innerHTML if found, undefined otherwise.
|
|
962
|
+
*/
|
|
963
|
+
function findContentRoot(document) {
|
|
964
|
+
for (const selector of CONTENT_ROOT_SELECTORS) {
|
|
965
|
+
const element = document.querySelector(selector);
|
|
966
|
+
if (!element)
|
|
967
|
+
continue;
|
|
968
|
+
// Check if element has meaningful content
|
|
969
|
+
const innerHTML = typeof element.innerHTML === 'string'
|
|
970
|
+
? element.innerHTML
|
|
971
|
+
: undefined;
|
|
972
|
+
if (innerHTML && innerHTML.trim().length > 100) {
|
|
973
|
+
return innerHTML;
|
|
974
|
+
}
|
|
975
|
+
}
|
|
976
|
+
return undefined;
|
|
977
|
+
}
|
|
978
|
+
function buildContentSource({ html, url, article, extractedMeta, includeMetadata, useArticleContent, document, }) {
|
|
1447
979
|
const metadata = createContentMetadataBlock(url, article, extractedMeta, useArticleContent, includeMetadata);
|
|
980
|
+
// If using article content, return it directly
|
|
981
|
+
if (useArticleContent && article) {
|
|
982
|
+
return {
|
|
983
|
+
sourceHtml: article.content,
|
|
984
|
+
title: article.title,
|
|
985
|
+
metadata,
|
|
986
|
+
};
|
|
987
|
+
}
|
|
988
|
+
// Try content root fallback before using full HTML
|
|
989
|
+
if (document) {
|
|
990
|
+
// Apply noise removal to HTML first (without passing document) to get cleaned HTML,
|
|
991
|
+
// then parse and find content root. This prevents the aggressive DOM stripping that
|
|
992
|
+
// happens when noise removal is given the original parsed document.
|
|
993
|
+
const cleanedHtml = removeNoiseFromHtml(html, undefined, url);
|
|
994
|
+
const { document: cleanedDoc } = parseHTML(cleanedHtml);
|
|
995
|
+
const contentRoot = findContentRoot(cleanedDoc);
|
|
996
|
+
if (contentRoot) {
|
|
997
|
+
logDebug('Using content root fallback instead of full HTML', {
|
|
998
|
+
url: url.substring(0, 80),
|
|
999
|
+
contentLength: contentRoot.length,
|
|
1000
|
+
});
|
|
1001
|
+
return {
|
|
1002
|
+
sourceHtml: contentRoot,
|
|
1003
|
+
title: extractedMeta.title,
|
|
1004
|
+
metadata,
|
|
1005
|
+
// Skip noise removal - this HTML is already from a cleaned document
|
|
1006
|
+
skipNoiseRemoval: true,
|
|
1007
|
+
};
|
|
1008
|
+
}
|
|
1009
|
+
}
|
|
1010
|
+
// Fall back to full HTML
|
|
1448
1011
|
return {
|
|
1449
|
-
sourceHtml:
|
|
1450
|
-
title:
|
|
1012
|
+
sourceHtml: html,
|
|
1013
|
+
title: extractedMeta.title,
|
|
1451
1014
|
metadata,
|
|
1015
|
+
...(document ? { document } : {}),
|
|
1452
1016
|
};
|
|
1453
1017
|
}
|
|
1454
1018
|
function logQualityGateFallback({ url, articleLength, }) {
|
|
@@ -1457,33 +1021,66 @@ function logQualityGateFallback({ url, articleLength, }) {
|
|
|
1457
1021
|
articleLength,
|
|
1458
1022
|
});
|
|
1459
1023
|
}
|
|
1460
|
-
function shouldUseArticleContent(article,
|
|
1461
|
-
|
|
1462
|
-
|
|
1463
|
-
|
|
1464
|
-
|
|
1465
|
-
|
|
1466
|
-
|
|
1467
|
-
|
|
1024
|
+
function shouldUseArticleContent(article, originalHtmlOrDocument, url) {
|
|
1025
|
+
const articleLength = article.textContent.length;
|
|
1026
|
+
const originalLength = getVisibleTextLength(originalHtmlOrDocument);
|
|
1027
|
+
// If the document is tiny, don't gate too aggressively.
|
|
1028
|
+
if (originalLength >= MIN_HTML_LENGTH_FOR_GATE) {
|
|
1029
|
+
const ratio = articleLength / originalLength;
|
|
1030
|
+
if (ratio < MIN_CONTENT_RATIO) {
|
|
1031
|
+
logQualityGateFallback({ url, articleLength });
|
|
1032
|
+
return false;
|
|
1033
|
+
}
|
|
1468
1034
|
}
|
|
1469
|
-
//
|
|
1470
|
-
|
|
1471
|
-
|
|
1035
|
+
// Heading structure retention (compute counts once to avoid repeated DOM queries/parses).
|
|
1036
|
+
const originalHeadings = countHeadingsDom(originalHtmlOrDocument);
|
|
1037
|
+
if (originalHeadings > 0) {
|
|
1038
|
+
const articleHeadings = countHeadingsDom(article.content);
|
|
1039
|
+
const retentionRatio = articleHeadings / originalHeadings;
|
|
1040
|
+
if (retentionRatio < MIN_HEADING_RETENTION_RATIO) {
|
|
1041
|
+
logDebug('Quality gate: Readability broke heading structure, using full HTML', {
|
|
1042
|
+
url: url.substring(0, 80),
|
|
1043
|
+
originalHeadings,
|
|
1044
|
+
articleHeadings,
|
|
1045
|
+
});
|
|
1046
|
+
return false;
|
|
1047
|
+
}
|
|
1048
|
+
}
|
|
1049
|
+
const originalCodeBlocks = countCodeBlocksDom(originalHtmlOrDocument);
|
|
1050
|
+
if (originalCodeBlocks > 0) {
|
|
1051
|
+
const articleCodeBlocks = countCodeBlocksDom(article.content);
|
|
1052
|
+
const codeRetentionRatio = articleCodeBlocks / originalCodeBlocks;
|
|
1053
|
+
// Always log code block counts for debugging
|
|
1054
|
+
logDebug('Code block retention check', {
|
|
1472
1055
|
url: url.substring(0, 80),
|
|
1473
|
-
|
|
1474
|
-
|
|
1056
|
+
originalCodeBlocks,
|
|
1057
|
+
articleCodeBlocks,
|
|
1058
|
+
codeRetentionRatio,
|
|
1475
1059
|
});
|
|
1060
|
+
if (codeRetentionRatio < MIN_CODE_BLOCK_RETENTION_RATIO) {
|
|
1061
|
+
logDebug('Quality gate: Readability removed code blocks, using full HTML', {
|
|
1062
|
+
url: url.substring(0, 80),
|
|
1063
|
+
originalCodeBlocks,
|
|
1064
|
+
articleCodeBlocks,
|
|
1065
|
+
});
|
|
1066
|
+
return false;
|
|
1067
|
+
}
|
|
1068
|
+
}
|
|
1069
|
+
// Layout extraction issue: truncated/fragmented lines.
|
|
1070
|
+
if (hasTruncatedSentences(article.textContent)) {
|
|
1071
|
+
logDebug('Quality gate: Extracted text has many truncated sentences, using full HTML', { url: url.substring(0, 80) });
|
|
1476
1072
|
return false;
|
|
1477
1073
|
}
|
|
1478
1074
|
return true;
|
|
1479
1075
|
}
|
|
1480
1076
|
function resolveContentSource({ html, url, includeMetadata, signal, }) {
|
|
1481
|
-
const { article, metadata: extractedMeta } =
|
|
1077
|
+
const { article, metadata: extractedMeta, document, } = extractContentWithDocument(html, url, {
|
|
1482
1078
|
extractArticle: true,
|
|
1483
1079
|
...(signal ? { signal } : {}),
|
|
1484
1080
|
});
|
|
1081
|
+
const originalDocument = parseHTML(html).document;
|
|
1485
1082
|
const useArticleContent = article
|
|
1486
|
-
? shouldUseArticleContent(article,
|
|
1083
|
+
? shouldUseArticleContent(article, originalDocument, url)
|
|
1487
1084
|
: false;
|
|
1488
1085
|
return buildContentSource({
|
|
1489
1086
|
html,
|
|
@@ -1492,6 +1089,7 @@ function resolveContentSource({ html, url, includeMetadata, signal, }) {
|
|
|
1492
1089
|
extractedMeta,
|
|
1493
1090
|
includeMetadata,
|
|
1494
1091
|
useArticleContent,
|
|
1092
|
+
document,
|
|
1495
1093
|
});
|
|
1496
1094
|
}
|
|
1497
1095
|
function tryTransformRawStage(html, url, includeMetadata) {
|
|
@@ -1513,6 +1111,8 @@ function buildMarkdownFromContext(context, url, signal) {
|
|
|
1513
1111
|
const content = runTransformStage(url, 'transform:markdown', () => htmlToMarkdown(context.sourceHtml, context.metadata, {
|
|
1514
1112
|
url,
|
|
1515
1113
|
...(signal ? { signal } : {}),
|
|
1114
|
+
...(context.document ? { document: context.document } : {}),
|
|
1115
|
+
...(context.skipNoiseRemoval ? { skipNoiseRemoval: true } : {}),
|
|
1516
1116
|
}));
|
|
1517
1117
|
return {
|
|
1518
1118
|
markdown: content,
|
|
@@ -1606,6 +1206,12 @@ class WorkerPool {
|
|
|
1606
1206
|
timeoutMs;
|
|
1607
1207
|
queueMax;
|
|
1608
1208
|
closed = false;
|
|
1209
|
+
createAbortError(url, stage) {
|
|
1210
|
+
return new FetchError('Request was canceled', url, 499, {
|
|
1211
|
+
reason: 'aborted',
|
|
1212
|
+
stage,
|
|
1213
|
+
});
|
|
1214
|
+
}
|
|
1609
1215
|
ensureOpen() {
|
|
1610
1216
|
if (this.closed) {
|
|
1611
1217
|
throw new Error('Transform worker pool closed');
|
|
@@ -1614,10 +1220,7 @@ class WorkerPool {
|
|
|
1614
1220
|
ensureNotAborted(signal, url, stage) {
|
|
1615
1221
|
if (!signal?.aborted)
|
|
1616
1222
|
return;
|
|
1617
|
-
throw
|
|
1618
|
-
reason: 'aborted',
|
|
1619
|
-
stage,
|
|
1620
|
-
});
|
|
1223
|
+
throw this.createAbortError(url, stage);
|
|
1621
1224
|
}
|
|
1622
1225
|
ensureQueueCapacity(url) {
|
|
1623
1226
|
if (this.queue.length < this.queueMax)
|
|
@@ -1682,10 +1285,7 @@ class WorkerPool {
|
|
|
1682
1285
|
abortInflightTask(id, url, workerIndex) {
|
|
1683
1286
|
const slot = this.workers[workerIndex];
|
|
1684
1287
|
this.cancelWorkerTask(slot, id);
|
|
1685
|
-
this.failTask(id,
|
|
1686
|
-
reason: 'aborted',
|
|
1687
|
-
stage: 'transform:signal-abort',
|
|
1688
|
-
}));
|
|
1288
|
+
this.failTask(id, this.createAbortError(url, 'transform:signal-abort'));
|
|
1689
1289
|
if (slot) {
|
|
1690
1290
|
this.restartWorker(workerIndex, slot);
|
|
1691
1291
|
}
|
|
@@ -1695,10 +1295,7 @@ class WorkerPool {
|
|
|
1695
1295
|
if (queuedIndex === -1)
|
|
1696
1296
|
return;
|
|
1697
1297
|
this.queue.splice(queuedIndex, 1);
|
|
1698
|
-
reject(
|
|
1699
|
-
reason: 'aborted',
|
|
1700
|
-
stage: 'transform:queued-abort',
|
|
1701
|
-
}));
|
|
1298
|
+
reject(this.createAbortError(url, 'transform:queued-abort'));
|
|
1702
1299
|
}
|
|
1703
1300
|
createWorkerSlot(worker) {
|
|
1704
1301
|
return {
|
|
@@ -1854,10 +1451,7 @@ class WorkerPool {
|
|
|
1854
1451
|
if (!task.signal?.aborted)
|
|
1855
1452
|
return false;
|
|
1856
1453
|
this.clearAbortListener(task.signal, task.abortListener);
|
|
1857
|
-
task.reject(
|
|
1858
|
-
reason: 'aborted',
|
|
1859
|
-
stage: 'transform:dispatch',
|
|
1860
|
-
}));
|
|
1454
|
+
task.reject(this.createAbortError(task.url, 'transform:dispatch'));
|
|
1861
1455
|
return true;
|
|
1862
1456
|
}
|
|
1863
1457
|
markSlotBusy(slot, task) {
|