@j0hanz/superfetch 2.1.8 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +33 -34
- package/dist/cache.d.ts.map +1 -1
- package/dist/cache.js +54 -11
- package/dist/cache.js.map +1 -1
- package/dist/config.d.ts +3 -0
- package/dist/config.d.ts.map +1 -1
- package/dist/config.js +6 -0
- package/dist/config.js.map +1 -1
- package/dist/errors.d.ts.map +1 -1
- package/dist/errors.js +14 -1
- package/dist/errors.js.map +1 -1
- package/dist/fetch.d.ts.map +1 -1
- package/dist/fetch.js +6 -3
- package/dist/fetch.js.map +1 -1
- package/dist/http.d.ts +1 -1
- package/dist/http.d.ts.map +1 -1
- package/dist/http.js +50 -25
- package/dist/http.js.map +1 -1
- package/dist/index.js +8 -11
- package/dist/index.js.map +1 -1
- package/dist/mcp.d.ts.map +1 -1
- package/dist/mcp.js +6 -5
- package/dist/mcp.js.map +1 -1
- package/dist/observability.d.ts.map +1 -1
- package/dist/observability.js +9 -12
- package/dist/observability.js.map +1 -1
- package/dist/tools.d.ts.map +1 -1
- package/dist/tools.js +26 -28
- package/dist/tools.js.map +1 -1
- package/dist/transform.d.ts.map +1 -1
- package/dist/transform.js +462 -354
- package/dist/transform.js.map +1 -1
- package/dist/type-guards.js +1 -1
- package/dist/type-guards.js.map +1 -1
- package/package.json +1 -1
package/dist/transform.js
CHANGED
|
@@ -129,83 +129,41 @@ function truncateHtml(html) {
|
|
|
129
129
|
});
|
|
130
130
|
return html.substring(0, maxSize);
|
|
131
131
|
}
|
|
132
|
-
function
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
}
|
|
155
|
-
function parseStandardKey(name) {
|
|
156
|
-
if (name === 'description')
|
|
157
|
-
return 'description';
|
|
158
|
-
if (name === 'author')
|
|
159
|
-
return 'author';
|
|
160
|
-
return null;
|
|
161
|
-
}
|
|
162
|
-
function collectMetaTag(state, tag) {
|
|
163
|
-
const content = tag.getAttribute('content')?.trim();
|
|
164
|
-
if (!content)
|
|
165
|
-
return;
|
|
166
|
-
const ogKey = parseOpenGraphKey(tag.getAttribute('property'));
|
|
167
|
-
if (ogKey) {
|
|
168
|
-
state[ogKey].og = content;
|
|
169
|
-
return;
|
|
170
|
-
}
|
|
171
|
-
const name = tag.getAttribute('name');
|
|
172
|
-
const twitterKey = parseTwitterKey(name);
|
|
173
|
-
if (twitterKey) {
|
|
174
|
-
state[twitterKey].twitter = content;
|
|
175
|
-
return;
|
|
176
|
-
}
|
|
177
|
-
const standardKey = parseStandardKey(name);
|
|
178
|
-
if (standardKey) {
|
|
179
|
-
state[standardKey].standard = content;
|
|
180
|
-
}
|
|
181
|
-
}
|
|
182
|
-
function scanMetaTags(document, state) {
|
|
183
|
-
const metaTags = document.querySelectorAll('meta');
|
|
184
|
-
for (const tag of metaTags) {
|
|
185
|
-
collectMetaTag(state, tag);
|
|
132
|
+
function extractMetadata(document) {
|
|
133
|
+
const title = {};
|
|
134
|
+
const description = {};
|
|
135
|
+
let author;
|
|
136
|
+
for (const tag of document.querySelectorAll('meta')) {
|
|
137
|
+
const content = tag.getAttribute('content')?.trim();
|
|
138
|
+
if (!content)
|
|
139
|
+
continue;
|
|
140
|
+
const property = tag.getAttribute('property');
|
|
141
|
+
const name = tag.getAttribute('name');
|
|
142
|
+
if (property === 'og:title')
|
|
143
|
+
title.og = content;
|
|
144
|
+
else if (property === 'og:description')
|
|
145
|
+
description.og = content;
|
|
146
|
+
else if (name === 'twitter:title')
|
|
147
|
+
title.twitter = content;
|
|
148
|
+
else if (name === 'twitter:description')
|
|
149
|
+
description.twitter = content;
|
|
150
|
+
else if (name === 'description')
|
|
151
|
+
description.standard = content;
|
|
152
|
+
else if (name === 'author')
|
|
153
|
+
author = content;
|
|
186
154
|
}
|
|
187
|
-
}
|
|
188
|
-
function ensureTitleFallback(document, state) {
|
|
189
|
-
if (state.title.standard)
|
|
190
|
-
return;
|
|
191
155
|
const titleEl = document.querySelector('title');
|
|
192
|
-
if (titleEl?.textContent) {
|
|
193
|
-
|
|
156
|
+
if (!title.standard && titleEl?.textContent) {
|
|
157
|
+
title.standard = titleEl.textContent.trim();
|
|
194
158
|
}
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
const state = createMetaCollectorState();
|
|
198
|
-
scanMetaTags(document, state);
|
|
199
|
-
ensureTitleFallback(document, state);
|
|
159
|
+
const resolvedTitle = title.og ?? title.twitter ?? title.standard;
|
|
160
|
+
const resolvedDesc = description.og ?? description.twitter ?? description.standard;
|
|
200
161
|
const metadata = {};
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
if (description !== undefined)
|
|
207
|
-
metadata.description = description;
|
|
208
|
-
if (author !== undefined)
|
|
162
|
+
if (resolvedTitle)
|
|
163
|
+
metadata.title = resolvedTitle;
|
|
164
|
+
if (resolvedDesc)
|
|
165
|
+
metadata.description = resolvedDesc;
|
|
166
|
+
if (author)
|
|
209
167
|
metadata.author = author;
|
|
210
168
|
return metadata;
|
|
211
169
|
}
|
|
@@ -226,9 +184,6 @@ function extractArticle(document) {
|
|
|
226
184
|
logWarn('Document not compatible with Readability');
|
|
227
185
|
return null;
|
|
228
186
|
}
|
|
229
|
-
return mapParsedArticle(parseReadabilityArticle(document));
|
|
230
|
-
}
|
|
231
|
-
function parseReadabilityArticle(document) {
|
|
232
187
|
try {
|
|
233
188
|
const documentClone = document.cloneNode(true);
|
|
234
189
|
const rawText = documentClone.body.textContent ||
|
|
@@ -238,54 +193,31 @@ function parseReadabilityArticle(document) {
|
|
|
238
193
|
return null;
|
|
239
194
|
}
|
|
240
195
|
const reader = new Readability(documentClone, { maxElemsToParse: 20_000 });
|
|
241
|
-
|
|
196
|
+
const parsed = reader.parse();
|
|
197
|
+
if (!parsed)
|
|
198
|
+
return null;
|
|
199
|
+
return {
|
|
200
|
+
content: parsed.content ?? '',
|
|
201
|
+
textContent: parsed.textContent ?? '',
|
|
202
|
+
...(parsed.title != null && { title: parsed.title }),
|
|
203
|
+
...(parsed.byline != null && { byline: parsed.byline }),
|
|
204
|
+
...(parsed.excerpt != null && { excerpt: parsed.excerpt }),
|
|
205
|
+
...(parsed.siteName != null && { siteName: parsed.siteName }),
|
|
206
|
+
};
|
|
242
207
|
}
|
|
243
208
|
catch (error) {
|
|
244
|
-
logError('Failed to extract article with Readability',
|
|
209
|
+
logError('Failed to extract article with Readability', error instanceof Error ? error : undefined);
|
|
245
210
|
return null;
|
|
246
211
|
}
|
|
247
212
|
}
|
|
248
|
-
function asError(error) {
|
|
249
|
-
if (error instanceof Error) {
|
|
250
|
-
return error;
|
|
251
|
-
}
|
|
252
|
-
return undefined;
|
|
253
|
-
}
|
|
254
|
-
function mapParsedArticle(parsed) {
|
|
255
|
-
return parsed ? mapReadabilityResult(parsed) : null;
|
|
256
|
-
}
|
|
257
|
-
function mapReadabilityResult(parsed) {
|
|
258
|
-
return {
|
|
259
|
-
content: parsed.content ?? '',
|
|
260
|
-
textContent: parsed.textContent ?? '',
|
|
261
|
-
...buildOptionalArticleFields(parsed),
|
|
262
|
-
};
|
|
263
|
-
}
|
|
264
|
-
function buildOptionalArticleFields(parsed) {
|
|
265
|
-
const optional = {};
|
|
266
|
-
addOptionalField(optional, 'title', parsed.title);
|
|
267
|
-
addOptionalField(optional, 'byline', parsed.byline);
|
|
268
|
-
addOptionalField(optional, 'excerpt', parsed.excerpt);
|
|
269
|
-
addOptionalField(optional, 'siteName', parsed.siteName);
|
|
270
|
-
return optional;
|
|
271
|
-
}
|
|
272
|
-
function addOptionalField(target, key, value) {
|
|
273
|
-
if (value == null)
|
|
274
|
-
return;
|
|
275
|
-
target[key] = value;
|
|
276
|
-
}
|
|
277
213
|
export function extractContent(html, url, options = {
|
|
278
214
|
extractArticle: true,
|
|
279
215
|
}) {
|
|
280
|
-
const emptyResult = createEmptyExtractionResult();
|
|
281
216
|
if (!isValidInput(html, url)) {
|
|
282
|
-
return
|
|
217
|
+
return { article: null, metadata: {} };
|
|
283
218
|
}
|
|
284
219
|
return tryExtractContent(html, url, options);
|
|
285
220
|
}
|
|
286
|
-
function createEmptyExtractionResult() {
|
|
287
|
-
return { article: null, metadata: {} };
|
|
288
|
-
}
|
|
289
221
|
function extractArticleWithStage(document, url, shouldExtract) {
|
|
290
222
|
if (!shouldExtract)
|
|
291
223
|
return null;
|
|
@@ -297,7 +229,7 @@ function handleExtractionFailure(error, url, signal) {
|
|
|
297
229
|
}
|
|
298
230
|
throwIfAborted(signal, url, 'extract:error');
|
|
299
231
|
logError('Failed to extract content', error instanceof Error ? error : undefined);
|
|
300
|
-
return
|
|
232
|
+
return { article: null, metadata: {} };
|
|
301
233
|
}
|
|
302
234
|
function extractContentStages(html, url, options) {
|
|
303
235
|
throwIfAborted(options.signal, url, 'extract:begin');
|
|
@@ -325,14 +257,11 @@ function isValidInput(html, url) {
|
|
|
325
257
|
return (validateRequiredString(html, 'extractContent called with invalid HTML input') && validateRequiredString(url, 'extractContent called with invalid URL'));
|
|
326
258
|
}
|
|
327
259
|
function validateRequiredString(value, message) {
|
|
328
|
-
if (
|
|
260
|
+
if (typeof value === 'string' && value.length > 0)
|
|
329
261
|
return true;
|
|
330
262
|
logWarn(message);
|
|
331
263
|
return false;
|
|
332
264
|
}
|
|
333
|
-
function isNonEmptyString(value) {
|
|
334
|
-
return typeof value === 'string' && value.length > 0;
|
|
335
|
-
}
|
|
336
265
|
function resolveArticleExtraction(document, shouldExtract) {
|
|
337
266
|
return shouldExtract ? extractArticle(document) : null;
|
|
338
267
|
}
|
|
@@ -417,7 +346,124 @@ function isWordChar(char) {
|
|
|
417
346
|
(code >= 97 && code <= 122) ||
|
|
418
347
|
char === '_');
|
|
419
348
|
}
|
|
420
|
-
const
|
|
349
|
+
const LANGUAGE_PATTERNS = [
|
|
350
|
+
{
|
|
351
|
+
language: 'jsx',
|
|
352
|
+
pattern: {
|
|
353
|
+
keywords: ['classname=', 'jsx:', "from 'react'", 'from "react"'],
|
|
354
|
+
custom: (code) => containsJsxTag(code),
|
|
355
|
+
},
|
|
356
|
+
},
|
|
357
|
+
{
|
|
358
|
+
language: 'typescript',
|
|
359
|
+
pattern: {
|
|
360
|
+
wordBoundary: ['interface', 'type'],
|
|
361
|
+
custom: (_, lower) => [
|
|
362
|
+
': string',
|
|
363
|
+
':string',
|
|
364
|
+
': number',
|
|
365
|
+
':number',
|
|
366
|
+
': boolean',
|
|
367
|
+
':boolean',
|
|
368
|
+
': void',
|
|
369
|
+
':void',
|
|
370
|
+
': any',
|
|
371
|
+
':any',
|
|
372
|
+
': unknown',
|
|
373
|
+
':unknown',
|
|
374
|
+
': never',
|
|
375
|
+
':never',
|
|
376
|
+
].some((hint) => lower.includes(hint)),
|
|
377
|
+
},
|
|
378
|
+
},
|
|
379
|
+
{
|
|
380
|
+
language: 'rust',
|
|
381
|
+
pattern: {
|
|
382
|
+
regex: /\b(?:fn|impl|struct|enum)\b/,
|
|
383
|
+
keywords: ['let mut'],
|
|
384
|
+
custom: (_, lower) => lower.includes('use ') && lower.includes('::'),
|
|
385
|
+
},
|
|
386
|
+
},
|
|
387
|
+
{
|
|
388
|
+
language: 'javascript',
|
|
389
|
+
pattern: {
|
|
390
|
+
regex: /\b(?:const|let|var|function|class|async|await|export|import)\b/,
|
|
391
|
+
},
|
|
392
|
+
},
|
|
393
|
+
{
|
|
394
|
+
language: 'python',
|
|
395
|
+
pattern: {
|
|
396
|
+
regex: /\b(?:def|class|import|from)\b/,
|
|
397
|
+
keywords: ['print(', '__name__'],
|
|
398
|
+
},
|
|
399
|
+
},
|
|
400
|
+
{
|
|
401
|
+
language: 'bash',
|
|
402
|
+
pattern: {
|
|
403
|
+
custom: (code) => detectBashIndicators(code),
|
|
404
|
+
},
|
|
405
|
+
},
|
|
406
|
+
{
|
|
407
|
+
language: 'css',
|
|
408
|
+
pattern: {
|
|
409
|
+
regex: /@media|@import|@keyframes/,
|
|
410
|
+
custom: (code) => detectCssStructure(code),
|
|
411
|
+
},
|
|
412
|
+
},
|
|
413
|
+
{
|
|
414
|
+
language: 'html',
|
|
415
|
+
pattern: {
|
|
416
|
+
keywords: [
|
|
417
|
+
'<!doctype',
|
|
418
|
+
'<html',
|
|
419
|
+
'<head',
|
|
420
|
+
'<body',
|
|
421
|
+
'<div',
|
|
422
|
+
'<span',
|
|
423
|
+
'<p',
|
|
424
|
+
'<a',
|
|
425
|
+
'<script',
|
|
426
|
+
'<style',
|
|
427
|
+
],
|
|
428
|
+
},
|
|
429
|
+
},
|
|
430
|
+
{
|
|
431
|
+
language: 'json',
|
|
432
|
+
pattern: {
|
|
433
|
+
startsWith: ['{', '['],
|
|
434
|
+
},
|
|
435
|
+
},
|
|
436
|
+
{
|
|
437
|
+
language: 'yaml',
|
|
438
|
+
pattern: {
|
|
439
|
+
custom: (code) => detectYamlStructure(code),
|
|
440
|
+
},
|
|
441
|
+
},
|
|
442
|
+
{
|
|
443
|
+
language: 'sql',
|
|
444
|
+
pattern: {
|
|
445
|
+
wordBoundary: [
|
|
446
|
+
'select',
|
|
447
|
+
'insert',
|
|
448
|
+
'update',
|
|
449
|
+
'delete',
|
|
450
|
+
'create',
|
|
451
|
+
'alter',
|
|
452
|
+
'drop',
|
|
453
|
+
],
|
|
454
|
+
},
|
|
455
|
+
},
|
|
456
|
+
{
|
|
457
|
+
language: 'go',
|
|
458
|
+
pattern: {
|
|
459
|
+
wordBoundary: ['package', 'func'],
|
|
460
|
+
keywords: ['import "'],
|
|
461
|
+
},
|
|
462
|
+
},
|
|
463
|
+
];
|
|
464
|
+
// Bash detection constants
|
|
465
|
+
const BASH_COMMANDS = ['sudo', 'chmod', 'mkdir', 'cd', 'ls', 'cat', 'echo'];
|
|
466
|
+
const BASH_PKG_MANAGERS = [
|
|
421
467
|
'npm',
|
|
422
468
|
'yarn',
|
|
423
469
|
'pnpm',
|
|
@@ -429,184 +475,83 @@ const BASH_PACKAGE_MANAGERS = [
|
|
|
429
475
|
'go',
|
|
430
476
|
];
|
|
431
477
|
const BASH_VERBS = ['install', 'add', 'run', 'build', 'start'];
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
478
|
+
function isShellPrefix(line) {
|
|
479
|
+
return (line.startsWith('#!') || line.startsWith('$ ') || line.startsWith('# '));
|
|
480
|
+
}
|
|
481
|
+
function matchesBashCommand(line) {
|
|
482
|
+
return BASH_COMMANDS.some((cmd) => line === cmd || line.startsWith(`${cmd} `));
|
|
483
|
+
}
|
|
484
|
+
function matchesPackageManagerVerb(line) {
|
|
485
|
+
for (const mgr of BASH_PKG_MANAGERS) {
|
|
486
|
+
if (!line.startsWith(`${mgr} `))
|
|
438
487
|
continue;
|
|
439
|
-
|
|
488
|
+
const rest = line.slice(mgr.length + 1);
|
|
489
|
+
if (BASH_VERBS.some((v) => rest === v || rest.startsWith(`${v} `))) {
|
|
440
490
|
return true;
|
|
491
|
+
}
|
|
441
492
|
}
|
|
442
493
|
return false;
|
|
443
494
|
}
|
|
444
|
-
function
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
return line.startsWith('#!');
|
|
455
|
-
}
|
|
456
|
-
function isPromptLine(line) {
|
|
457
|
-
return line.startsWith('$ ') || line.startsWith('# ');
|
|
458
|
-
}
|
|
459
|
-
function startsWithPackageManagerCommand(line) {
|
|
460
|
-
return BASH_PACKAGE_MANAGERS.some((manager) => {
|
|
461
|
-
if (!line.startsWith(`${manager} `))
|
|
462
|
-
return false;
|
|
463
|
-
const rest = line.slice(manager.length + 1);
|
|
464
|
-
return BASH_VERBS.some((verb) => rest === verb || rest.startsWith(`${verb} `));
|
|
465
|
-
});
|
|
466
|
-
}
|
|
467
|
-
const TYPE_HINTS = [
|
|
468
|
-
'string',
|
|
469
|
-
'number',
|
|
470
|
-
'boolean',
|
|
471
|
-
'void',
|
|
472
|
-
'any',
|
|
473
|
-
'unknown',
|
|
474
|
-
'never',
|
|
475
|
-
];
|
|
476
|
-
const HTML_TAGS = [
|
|
477
|
-
'<!doctype',
|
|
478
|
-
'<html',
|
|
479
|
-
'<head',
|
|
480
|
-
'<body',
|
|
481
|
-
'<div',
|
|
482
|
-
'<span',
|
|
483
|
-
'<p',
|
|
484
|
-
'<a',
|
|
485
|
-
'<script',
|
|
486
|
-
'<style',
|
|
487
|
-
];
|
|
488
|
-
const SQL_KEYWORDS = [
|
|
489
|
-
'select',
|
|
490
|
-
'insert',
|
|
491
|
-
'update',
|
|
492
|
-
'delete',
|
|
493
|
-
'create',
|
|
494
|
-
'alter',
|
|
495
|
-
'drop',
|
|
496
|
-
];
|
|
497
|
-
const JS_WORD_REGEX = /\b(?:const|let|var|function|class|async|await|export|import)\b/;
|
|
498
|
-
const PYTHON_WORD_REGEX = /\b(?:def|class|import|from)\b/;
|
|
499
|
-
const RUST_WORD_REGEX = /\b(?:fn|impl|struct|enum)\b/;
|
|
500
|
-
const CSS_DIRECTIVE_REGEX = /@media|@import|@keyframes/;
|
|
501
|
-
const CODE_DETECTORS = [
|
|
502
|
-
{ language: 'jsx', detect: detectJsx },
|
|
503
|
-
{ language: 'typescript', detect: detectTypescript },
|
|
504
|
-
{ language: 'rust', detect: detectRust },
|
|
505
|
-
{ language: 'javascript', detect: detectJavascript },
|
|
506
|
-
{ language: 'python', detect: detectPython },
|
|
507
|
-
{ language: 'bash', detect: detectBash },
|
|
508
|
-
{ language: 'css', detect: detectCss },
|
|
509
|
-
{ language: 'html', detect: detectHtml },
|
|
510
|
-
{ language: 'json', detect: detectJson },
|
|
511
|
-
{ language: 'yaml', detect: detectYaml },
|
|
512
|
-
{ language: 'sql', detect: detectSql },
|
|
513
|
-
{ language: 'go', detect: detectGo },
|
|
514
|
-
];
|
|
515
|
-
function detectJsx(code) {
|
|
516
|
-
const lower = code.toLowerCase();
|
|
517
|
-
if (lower.includes('classname='))
|
|
518
|
-
return true;
|
|
519
|
-
if (lower.includes('jsx:'))
|
|
520
|
-
return true;
|
|
521
|
-
if (lower.includes("from 'react'") || lower.includes('from "react"')) {
|
|
522
|
-
return true;
|
|
495
|
+
function detectBashIndicators(code) {
|
|
496
|
+
for (const line of splitLines(code)) {
|
|
497
|
+
const trimmed = line.trimStart();
|
|
498
|
+
if (!trimmed)
|
|
499
|
+
continue;
|
|
500
|
+
if (isShellPrefix(trimmed) ||
|
|
501
|
+
matchesBashCommand(trimmed) ||
|
|
502
|
+
matchesPackageManagerVerb(trimmed)) {
|
|
503
|
+
return true;
|
|
504
|
+
}
|
|
523
505
|
}
|
|
524
|
-
return
|
|
525
|
-
}
|
|
526
|
-
function detectTypescript(code) {
|
|
527
|
-
const lower = code.toLowerCase();
|
|
528
|
-
if (containsWord(lower, 'interface'))
|
|
529
|
-
return true;
|
|
530
|
-
if (containsWord(lower, 'type'))
|
|
531
|
-
return true;
|
|
532
|
-
return TYPE_HINTS.some((hint) => lower.includes(`: ${hint}`) || lower.includes(`:${hint}`));
|
|
533
|
-
}
|
|
534
|
-
function detectRust(code) {
|
|
535
|
-
const lower = code.toLowerCase();
|
|
536
|
-
return (RUST_WORD_REGEX.test(lower) ||
|
|
537
|
-
lower.includes('let mut') ||
|
|
538
|
-
(lower.includes('use ') && lower.includes('::')));
|
|
539
|
-
}
|
|
540
|
-
function detectJavascript(code) {
|
|
541
|
-
const lower = code.toLowerCase();
|
|
542
|
-
return JS_WORD_REGEX.test(lower);
|
|
543
|
-
}
|
|
544
|
-
function detectPython(code) {
|
|
545
|
-
const lower = code.toLowerCase();
|
|
546
|
-
return (PYTHON_WORD_REGEX.test(lower) ||
|
|
547
|
-
lower.includes('print(') ||
|
|
548
|
-
lower.includes('__name__'));
|
|
506
|
+
return false;
|
|
549
507
|
}
|
|
550
|
-
function
|
|
551
|
-
const
|
|
552
|
-
if (CSS_DIRECTIVE_REGEX.test(lower))
|
|
553
|
-
return true;
|
|
554
|
-
const lines = splitLines(code);
|
|
555
|
-
for (const line of lines) {
|
|
508
|
+
function detectCssStructure(code) {
|
|
509
|
+
for (const line of splitLines(code)) {
|
|
556
510
|
const trimmed = line.trimStart();
|
|
557
511
|
if (!trimmed)
|
|
558
512
|
continue;
|
|
559
|
-
|
|
513
|
+
const isSelector = (trimmed.startsWith('.') || trimmed.startsWith('#')) &&
|
|
514
|
+
trimmed.includes('{');
|
|
515
|
+
const isProperty = trimmed.includes(':') && trimmed.includes(';');
|
|
516
|
+
if (isSelector || isProperty)
|
|
560
517
|
return true;
|
|
561
518
|
}
|
|
562
519
|
return false;
|
|
563
520
|
}
|
|
564
|
-
function
|
|
565
|
-
const
|
|
566
|
-
return HTML_TAGS.some((tag) => lower.includes(tag));
|
|
567
|
-
}
|
|
568
|
-
function detectJson(code) {
|
|
569
|
-
const trimmed = code.trimStart();
|
|
570
|
-
if (!trimmed)
|
|
571
|
-
return false;
|
|
572
|
-
return trimmed.startsWith('{') || trimmed.startsWith('[');
|
|
573
|
-
}
|
|
574
|
-
function detectYaml(code) {
|
|
575
|
-
const lines = splitLines(code);
|
|
576
|
-
for (const line of lines) {
|
|
521
|
+
function detectYamlStructure(code) {
|
|
522
|
+
for (const line of splitLines(code)) {
|
|
577
523
|
const trimmed = line.trim();
|
|
578
524
|
if (!trimmed)
|
|
579
525
|
continue;
|
|
580
|
-
const
|
|
581
|
-
if (
|
|
526
|
+
const colonIdx = trimmed.indexOf(':');
|
|
527
|
+
if (colonIdx <= 0)
|
|
582
528
|
continue;
|
|
583
|
-
const after = trimmed[
|
|
529
|
+
const after = trimmed[colonIdx + 1];
|
|
584
530
|
if (after === ' ' || after === '\t')
|
|
585
531
|
return true;
|
|
586
532
|
}
|
|
587
533
|
return false;
|
|
588
534
|
}
|
|
589
|
-
function
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
function isCssPropertyLine(line) {
|
|
605
|
-
return line.includes(':') && line.includes(';');
|
|
535
|
+
function matchesLanguagePattern(code, lower, pattern) {
|
|
536
|
+
if (pattern.keywords?.some((kw) => lower.includes(kw)))
|
|
537
|
+
return true;
|
|
538
|
+
if (pattern.wordBoundary?.some((w) => containsWord(lower, w)))
|
|
539
|
+
return true;
|
|
540
|
+
if (pattern.regex?.test(lower))
|
|
541
|
+
return true;
|
|
542
|
+
if (pattern.startsWith) {
|
|
543
|
+
const trimmed = code.trimStart();
|
|
544
|
+
if (pattern.startsWith.some((prefix) => trimmed.startsWith(prefix)))
|
|
545
|
+
return true;
|
|
546
|
+
}
|
|
547
|
+
if (pattern.custom?.(code, lower))
|
|
548
|
+
return true;
|
|
549
|
+
return false;
|
|
606
550
|
}
|
|
607
551
|
export function detectLanguageFromCode(code) {
|
|
608
|
-
|
|
609
|
-
|
|
552
|
+
const lower = code.toLowerCase();
|
|
553
|
+
for (const { language, pattern } of LANGUAGE_PATTERNS) {
|
|
554
|
+
if (matchesLanguagePattern(code, lower, pattern))
|
|
610
555
|
return language;
|
|
611
556
|
}
|
|
612
557
|
return undefined;
|
|
@@ -630,6 +575,7 @@ const STRUCTURAL_TAGS = new Set([
|
|
|
630
575
|
'input',
|
|
631
576
|
'select',
|
|
632
577
|
'textarea',
|
|
578
|
+
'svg',
|
|
633
579
|
]);
|
|
634
580
|
const ALWAYS_NOISE_TAGS = new Set(['nav', 'footer', 'aside']);
|
|
635
581
|
const NAVIGATION_ROLES = new Set([
|
|
@@ -642,6 +588,7 @@ const NAVIGATION_ROLES = new Set([
|
|
|
642
588
|
'menu',
|
|
643
589
|
'dialog',
|
|
644
590
|
'alertdialog',
|
|
591
|
+
'search',
|
|
645
592
|
]);
|
|
646
593
|
const PROMO_TOKENS = new Set([
|
|
647
594
|
'banner',
|
|
@@ -669,6 +616,7 @@ const PROMO_TOKENS = new Set([
|
|
|
669
616
|
'breadcrumb',
|
|
670
617
|
'pagination',
|
|
671
618
|
'pager',
|
|
619
|
+
'taglist',
|
|
672
620
|
]);
|
|
673
621
|
const HEADER_NOISE_PATTERN = /\b(site-header|masthead|topbar|navbar|nav(?:bar)?|menu|header-nav)\b/i;
|
|
674
622
|
const FIXED_PATTERN = /\b(fixed|sticky)\b/;
|
|
@@ -727,6 +675,8 @@ const NOISE_MARKERS = [
|
|
|
727
675
|
' z-50',
|
|
728
676
|
' z-4',
|
|
729
677
|
' isolate',
|
|
678
|
+
' breadcrumb',
|
|
679
|
+
' pagination',
|
|
730
680
|
];
|
|
731
681
|
function mayContainNoise(html) {
|
|
732
682
|
const haystack = html.toLowerCase();
|
|
@@ -760,11 +710,9 @@ function matchesPromoIdOrClass(className, id) {
|
|
|
760
710
|
const tokens = tokenizeIdentifierLikeText(`${className} ${id}`);
|
|
761
711
|
return tokens.some((token) => PROMO_TOKENS.has(token));
|
|
762
712
|
}
|
|
763
|
-
function matchesHighZIsolate(className) {
|
|
764
|
-
return HIGH_Z_PATTERN.test(className) && ISOLATE_PATTERN.test(className);
|
|
765
|
-
}
|
|
766
713
|
function matchesFixedOrHighZIsolate(className) {
|
|
767
|
-
return FIXED_PATTERN.test(className) ||
|
|
714
|
+
return (FIXED_PATTERN.test(className) ||
|
|
715
|
+
(HIGH_Z_PATTERN.test(className) && ISOLATE_PATTERN.test(className)));
|
|
768
716
|
}
|
|
769
717
|
function readElementMetadata(element) {
|
|
770
718
|
return {
|
|
@@ -791,8 +739,7 @@ function isNoiseElement(node) {
|
|
|
791
739
|
matchesFixedOrHighZIsolate(metadata.className) ||
|
|
792
740
|
matchesPromoIdOrClass(metadata.className, metadata.id));
|
|
793
741
|
}
|
|
794
|
-
function
|
|
795
|
-
const nodes = document.querySelectorAll('*');
|
|
742
|
+
function removeNoiseNodes(nodes) {
|
|
796
743
|
for (let index = nodes.length - 1; index >= 0; index -= 1) {
|
|
797
744
|
const node = typeof nodes.item === 'function' ? nodes.item(index) : nodes[index];
|
|
798
745
|
if (!node)
|
|
@@ -802,6 +749,30 @@ function stripNoiseNodes(document) {
|
|
|
802
749
|
}
|
|
803
750
|
}
|
|
804
751
|
}
|
|
752
|
+
function stripNoiseNodes(document) {
|
|
753
|
+
// Use targeted selectors for common noise elements instead of querySelectorAll('*')
|
|
754
|
+
const targetSelectors = [
|
|
755
|
+
'nav',
|
|
756
|
+
'footer',
|
|
757
|
+
'aside',
|
|
758
|
+
'header[class*="site"]',
|
|
759
|
+
'header[class*="nav"]',
|
|
760
|
+
'header[class*="menu"]',
|
|
761
|
+
'[role="banner"]',
|
|
762
|
+
'[role="navigation"]',
|
|
763
|
+
'[role="dialog"]',
|
|
764
|
+
'[style*="display: none"]',
|
|
765
|
+
'[style*="display:none"]',
|
|
766
|
+
'[hidden]',
|
|
767
|
+
'[aria-hidden="true"]',
|
|
768
|
+
].join(',');
|
|
769
|
+
const potentialNoiseNodes = document.querySelectorAll(targetSelectors);
|
|
770
|
+
// Remove in reverse order to handle nested elements correctly
|
|
771
|
+
removeNoiseNodes(potentialNoiseNodes);
|
|
772
|
+
// Second pass: check remaining elements for noise patterns (promo, fixed positioning, etc.)
|
|
773
|
+
const allElements = document.querySelectorAll('*');
|
|
774
|
+
removeNoiseNodes(allElements);
|
|
775
|
+
}
|
|
805
776
|
function removeNoiseFromHtml(html) {
|
|
806
777
|
const shouldParse = isFullDocumentHtml(html) || mayContainNoise(html);
|
|
807
778
|
if (!shouldParse)
|
|
@@ -826,7 +797,14 @@ function removeNoiseFromHtml(html) {
|
|
|
826
797
|
}
|
|
827
798
|
function buildInlineCode(content) {
|
|
828
799
|
const runs = content.match(/`+/g);
|
|
829
|
-
|
|
800
|
+
let longest = '';
|
|
801
|
+
if (runs) {
|
|
802
|
+
for (const run of runs) {
|
|
803
|
+
if (run.length > longest.length) {
|
|
804
|
+
longest = run;
|
|
805
|
+
}
|
|
806
|
+
}
|
|
807
|
+
}
|
|
830
808
|
const delimiter = `\`${longest}`;
|
|
831
809
|
const padding = delimiter.length > 1 ? ' ' : '';
|
|
832
810
|
return `${delimiter}${padding}${content}${padding}${delimiter}`;
|
|
@@ -983,11 +961,8 @@ function translateHtmlToMarkdown(html, url, signal) {
|
|
|
983
961
|
throwIfAborted(signal, url, 'markdown:cleaned');
|
|
984
962
|
const content = runTransformStage(url, 'markdown:translate', () => getMarkdownConverter().translate(cleanedHtml).trim());
|
|
985
963
|
throwIfAborted(signal, url, 'markdown:translated');
|
|
986
|
-
|
|
987
|
-
|
|
988
|
-
finalMarkdown = normalizeTableWhitespace(finalMarkdown);
|
|
989
|
-
finalMarkdown = normalizeLineEndings(finalMarkdown);
|
|
990
|
-
return finalMarkdown;
|
|
964
|
+
const cleaned = cleanupMarkdownArtifacts(content);
|
|
965
|
+
return promoteOrphanHeadings(cleaned);
|
|
991
966
|
}
|
|
992
967
|
function appendMetadataFooter(content, metadata, url) {
|
|
993
968
|
const footer = buildMetadataFooter(metadata, url);
|
|
@@ -1010,37 +985,146 @@ export function htmlToMarkdown(html, metadata, options) {
|
|
|
1010
985
|
}
|
|
1011
986
|
function cleanupMarkdownArtifacts(content) {
|
|
1012
987
|
let result = content;
|
|
988
|
+
const fixOrphanHeadings = (text) => {
|
|
989
|
+
return text.replace(/^(.*?)(#{1,6})\s*(?:\r?\n){2}([A-Z][^\r\n]+?)(?:\r?\n)/gm, (match, prefix, hashes, heading) => {
|
|
990
|
+
if (typeof prefix !== 'string' ||
|
|
991
|
+
typeof hashes !== 'string' ||
|
|
992
|
+
typeof heading !== 'string') {
|
|
993
|
+
return match;
|
|
994
|
+
}
|
|
995
|
+
if (heading.length > 150) {
|
|
996
|
+
return match;
|
|
997
|
+
}
|
|
998
|
+
const trimmedPrefix = prefix.trim();
|
|
999
|
+
if (trimmedPrefix === '') {
|
|
1000
|
+
return `${hashes} ${heading}\n\n`;
|
|
1001
|
+
}
|
|
1002
|
+
return `${trimmedPrefix}\n\n${hashes} ${heading}\n\n`;
|
|
1003
|
+
});
|
|
1004
|
+
};
|
|
1005
|
+
result = fixOrphanHeadings(result);
|
|
1013
1006
|
result = result.replace(/^#{1,6}[ \t\u00A0]*$\r?\n?/gm, '');
|
|
1014
1007
|
const zeroWidthAnchorLink = /\[(?:\s|\u200B)*\]\(#[^)]*\)\s*/g;
|
|
1015
1008
|
result = result.replace(zeroWidthAnchorLink, '');
|
|
1009
|
+
result = result.replace(/^\[Skip to (?:main )?content\]\(#[^)]*\)\s*$/gim, '');
|
|
1010
|
+
result = result.replace(/^\[Skip to (?:main )?navigation\]\(#[^)]*\)\s*$/gim, '');
|
|
1011
|
+
result = result.replace(/^\[Skip link\]\(#[^)]*\)\s*$/gim, '');
|
|
1012
|
+
result = result.replace(/(^#{1,6}\s+\w+)```/gm, '$1\n\n```');
|
|
1013
|
+
result = result.replace(/(^#{1,6}\s+\w*[A-Z])([A-Z][a-z])/gm, '$1\n\n$2');
|
|
1014
|
+
result = result.replace(/(^#{1,6}\s[^\n]*)\n([^\n])/gm, '$1\n\n$2');
|
|
1015
|
+
const tocLinkLine = /^- \[[^\]]+\]\(#[^)]+\)\s*$/;
|
|
1016
|
+
const lines = result.split('\n');
|
|
1017
|
+
const filtered = [];
|
|
1018
|
+
let skipTocBlock = false;
|
|
1019
|
+
for (let i = 0; i < lines.length; i += 1) {
|
|
1020
|
+
const line = lines[i] ?? '';
|
|
1021
|
+
const prevLine = i > 0 ? (lines[i - 1] ?? '') : '';
|
|
1022
|
+
const nextLine = i < lines.length - 1 ? (lines[i + 1] ?? '') : '';
|
|
1023
|
+
if (tocLinkLine.test(line)) {
|
|
1024
|
+
const prevIsToc = tocLinkLine.test(prevLine) || prevLine.trim() === '';
|
|
1025
|
+
const nextIsToc = tocLinkLine.test(nextLine) || nextLine.trim() === '';
|
|
1026
|
+
if (prevIsToc || nextIsToc) {
|
|
1027
|
+
skipTocBlock = true;
|
|
1028
|
+
continue;
|
|
1029
|
+
}
|
|
1030
|
+
}
|
|
1031
|
+
else if (line.trim() === '' && skipTocBlock) {
|
|
1032
|
+
skipTocBlock = false;
|
|
1033
|
+
continue;
|
|
1034
|
+
}
|
|
1035
|
+
else {
|
|
1036
|
+
skipTocBlock = false;
|
|
1037
|
+
}
|
|
1038
|
+
filtered.push(line);
|
|
1039
|
+
}
|
|
1040
|
+
result = filtered.join('\n');
|
|
1016
1041
|
result = result.replace(/\]\(([^)]+)\)\[/g, ']($1)\n\n[');
|
|
1017
1042
|
result = result.replace(/^Was this page helpful\??\s*$/gim, '');
|
|
1043
|
+
result = result.replace(/(`[^`]+`)\s*\\-\s*/g, '$1 - ');
|
|
1044
|
+
result = result.replace(/\\([[]])/g, '$1');
|
|
1045
|
+
result = result.replace(/([^\n])\n([-*+] )/g, '$1\n\n$2');
|
|
1046
|
+
result = result.replace(/(\S)\n(\d+\. )/g, '$1\n\n$2');
|
|
1018
1047
|
result = result.replace(/\n{3,}/g, '\n\n');
|
|
1019
1048
|
return result.trim();
|
|
1020
1049
|
}
|
|
1021
|
-
|
|
1022
|
-
|
|
1023
|
-
|
|
1024
|
-
|
|
1025
|
-
|
|
1026
|
-
|
|
1027
|
-
|
|
1028
|
-
|
|
1029
|
-
|
|
1030
|
-
|
|
1050
|
+
const HEADING_KEYWORDS = new Set([
|
|
1051
|
+
'overview',
|
|
1052
|
+
'introduction',
|
|
1053
|
+
'summary',
|
|
1054
|
+
'conclusion',
|
|
1055
|
+
'prerequisites',
|
|
1056
|
+
'requirements',
|
|
1057
|
+
'installation',
|
|
1058
|
+
'configuration',
|
|
1059
|
+
'usage',
|
|
1060
|
+
'features',
|
|
1061
|
+
'limitations',
|
|
1062
|
+
'troubleshooting',
|
|
1063
|
+
'faq',
|
|
1064
|
+
'resources',
|
|
1065
|
+
'references',
|
|
1066
|
+
'changelog',
|
|
1067
|
+
'license',
|
|
1068
|
+
'acknowledgments',
|
|
1069
|
+
'appendix',
|
|
1070
|
+
]);
|
|
1071
|
+
function isLikelyHeadingLine(line) {
|
|
1072
|
+
const trimmed = line.trim();
|
|
1073
|
+
if (!trimmed || trimmed.length > 80)
|
|
1074
|
+
return false;
|
|
1075
|
+
if (/^#{1,6}\s/.test(trimmed))
|
|
1076
|
+
return false;
|
|
1077
|
+
if (/^[-*+•]\s/.test(trimmed) || /^\d+\.\s/.test(trimmed))
|
|
1078
|
+
return false;
|
|
1079
|
+
if (/[.!?]$/.test(trimmed))
|
|
1080
|
+
return false;
|
|
1081
|
+
if (/^\[.*\]\(.*\)$/.test(trimmed))
|
|
1082
|
+
return false;
|
|
1083
|
+
if (/^(?:example|note|tip|warning|important|caution):\s+\S/i.test(trimmed)) {
|
|
1084
|
+
return true;
|
|
1085
|
+
}
|
|
1086
|
+
const words = trimmed.split(/\s+/);
|
|
1087
|
+
if (words.length >= 2 && words.length <= 6) {
|
|
1088
|
+
const isTitleCase = words.every((w) => /^[A-Z][a-z]*$/.test(w) || /^(?:and|or|the|of|in|for|to|a)$/i.test(w));
|
|
1089
|
+
if (isTitleCase)
|
|
1090
|
+
return true;
|
|
1091
|
+
}
|
|
1092
|
+
if (words.length === 1) {
|
|
1093
|
+
const lower = trimmed.toLowerCase();
|
|
1094
|
+
if (HEADING_KEYWORDS.has(lower) && /^[A-Z]/.test(trimmed)) {
|
|
1095
|
+
return true;
|
|
1096
|
+
}
|
|
1097
|
+
}
|
|
1098
|
+
return false;
|
|
1031
1099
|
}
|
|
1032
|
-
function
|
|
1033
|
-
|
|
1100
|
+
/**
 * Promotes plain-text lines that look like section titles to markdown headings.
 *
 * A line is promoted when the previous line is blank (the first line of the
 * document counts as preceded by a blank) and isLikelyHeadingLine() accepts
 * it. Lines starting with "Example:" become level-3 headings; every other
 * promoted line becomes a level-2 heading. Promoted lines are emitted trimmed.
 *
 * Lines inside fenced code blocks (``` or ~~~) are copied verbatim so that
 * title-like text inside a code sample is never rewritten into a heading.
 *
 * @param {string} markdown - Markdown document text using "\n" line breaks.
 * @returns {string} The markdown with orphan title lines turned into headings.
 */
function promoteOrphanHeadings(markdown) {
    const lines = markdown.split('\n');
    const result = [];
    // Up to three leading spaces followed by a run of 3+ backticks or tildes.
    const fencePattern = /^ {0,3}(`{3,}|~{3,})/;
    // Marker that opened the current fenced block, or null when outside one.
    let openFence = null;
    for (let i = 0; i < lines.length; i += 1) {
        const line = lines[i] ?? '';
        const wasInsideFence = openFence !== null;
        const fence = fencePattern.exec(line);
        if (fence) {
            const marker = fence[1];
            const rest = line.slice(fence[0].length);
            if (openFence === null) {
                // A backtick run followed by more backticks on the same line is
                // inline code, not the start of a fenced block.
                const isInlineCode = marker[0] === '`' && rest.includes('`');
                if (!isInlineCode) {
                    openFence = marker;
                }
            }
            else if (marker[0] === openFence[0] &&
                marker.length >= openFence.length &&
                rest.trim() === '') {
                // Closing fence: same character, at least as long, nothing after it.
                openFence = null;
            }
        }
        if (wasInsideFence || openFence !== null) {
            // Fence delimiters and fenced content pass through untouched.
            result.push(line);
            continue;
        }
        const prevLine = i > 0 ? (lines[i - 1] ?? '') : '';
        const isPrecededByBlank = prevLine.trim() === '';
        if (isPrecededByBlank && isLikelyHeadingLine(line)) {
            const trimmed = line.trim();
            const prefix = /^example:\s/i.test(trimmed) ? '### ' : '## ';
            result.push(prefix + trimmed);
        }
        else {
            result.push(line);
        }
    }
    return result.join('\n');
}
|
|
1035
1121
|
function formatFetchedDate(isoString) {
|
|
1036
1122
|
try {
|
|
1037
1123
|
const date = new Date(isoString);
|
|
1038
|
-
const
|
|
1039
|
-
|
|
1040
|
-
|
|
1041
|
-
|
|
1042
|
-
};
|
|
1043
|
-
return date.toLocaleDateString('en-US', options);
|
|
1124
|
+
const day = String(date.getDate()).padStart(2, '0');
|
|
1125
|
+
const month = String(date.getMonth() + 1).padStart(2, '0');
|
|
1126
|
+
const year = date.getFullYear();
|
|
1127
|
+
return `${day}-${month}-${year}`;
|
|
1044
1128
|
}
|
|
1045
1129
|
catch {
|
|
1046
1130
|
return isoString;
|
|
@@ -1049,20 +1133,24 @@ function formatFetchedDate(isoString) {
|
|
|
1049
1133
|
/**
 * Renders the metadata footer appended to generated markdown.
 *
 * The footer starts with a horizontal rule and a blank line. It is followed by
 * one line of " | "-separated italic fields (title, author, a link to the
 * source URL, fetch date formatted by formatFetchedDate) when at least one is
 * available, and then by the description wrapped in <sub> when present.
 *
 * @param {object|null|undefined} metadata - Metadata block; falsy yields ''.
 * @param {string} [fallbackUrl] - URL used when metadata.url is empty.
 * @returns {string} Footer text, or an empty string when there is no metadata.
 */
function buildMetadataFooter(metadata, fallbackUrl) {
    if (!metadata) {
        return '';
    }
    const footer = ['---', ''];
    const link = metadata.url || fallbackUrl;
    // Candidate fields in display order; absent ones are dropped below.
    const segments = [
        metadata.title ? `_${metadata.title}_` : undefined,
        metadata.author ? `_${metadata.author}_` : undefined,
        link ? `[_Original Source_](${link})` : undefined,
        metadata.fetchedAt
            ? `_${formatFetchedDate(metadata.fetchedAt)}_`
            : undefined,
    ].filter((segment) => segment !== undefined);
    if (segments.length !== 0) {
        footer.push(` ${segments.join(' | ')}`);
    }
    if (metadata.description) {
        footer.push(` <sub>${metadata.description}</sub>`);
    }
    return footer.join('\n');
}
|
|
@@ -1277,69 +1365,82 @@ function tryTransformRawContent({ html, url, includeMetadata, }) {
|
|
|
1277
1365
|
}
|
|
1278
1366
|
const MIN_CONTENT_RATIO = 0.3;
|
|
1279
1367
|
const MIN_HTML_LENGTH_FOR_GATE = 100;
|
|
1280
|
-
|
|
1281
|
-
|
|
1368
|
+
const MIN_HEADING_RETENTION_RATIO = 0.7;
|
|
1369
|
+
/**
 * Counts heading elements (<h1> through <h6>) in an HTML string.
 *
 * Only opening tags are matched, so headings whose content contains inline
 * markup (links, <code>, <span>, ...) are counted as well as text-only ones.
 *
 * @param {string|null|undefined} html - HTML markup to scan.
 * @returns {number} Number of heading opening tags; 0 for empty input.
 */
function countHeadings(html) {
    if (!html)
        return 0;
    // The tag name must be followed by ">" or whitespace, so tags such as
    // <hr>, <head> and <header> never match.
    const openingHeadingTag = /<h[1-6](?:\s[^>]*)?>/gi;
    return html.match(openingHeadingTag)?.length ?? 0;
}
|
|
1377
|
+
/**
 * Checks whether the extracted article kept enough of the page's headings.
 *
 * Headings are counted with countHeadings() in both the original HTML and the
 * article content. A page without headings is always considered preserved;
 * otherwise the share of headings retained by the article must be at least
 * MIN_HEADING_RETENTION_RATIO.
 *
 * @param {object|null|undefined} article - Extracted article exposing `content`.
 * @param {string} originalHtml - The page HTML the article was extracted from.
 * @returns {boolean} False when there is no article or too many headings were lost.
 */
function isHeadingStructurePreserved(article, originalHtml) {
    if (!article) {
        return false;
    }
    const sourceCount = countHeadings(originalHtml);
    const extractedCount = countHeadings(article.content);
    return sourceCount === 0
        ? true
        : extractedCount / sourceCount >= MIN_HEADING_RETENTION_RATIO;
}
|
|
1390
|
+
/**
 * Removes markup from an HTML string with a single character scan.
 *
 * Everything from a "<" up to the next ">" is dropped, including both
 * delimiters. A ">" outside a tag is dropped too, and an unterminated "<"
 * swallows the rest of the input. Entities are not decoded.
 *
 * @param {string} html - HTML markup.
 * @returns {string} The characters found outside of tags.
 */
function stripHtmlTagsForLength(html) {
    const kept = [];
    let insideTag = false;
    for (const symbol of html) {
        switch (symbol) {
            case '<':
                insideTag = true;
                break;
            case '>':
                insideTag = false;
                break;
            default:
                if (!insideTag) {
                    kept.push(symbol);
                }
        }
    }
    return kept.join('');
}
|
|
1301
1406
|
/**
 * Length-based quality gate for an extracted article.
 *
 * Compares the article's text length with the length of the original HTML
 * after tags are stripped and whitespace runs are collapsed. Pages whose
 * stripped text is shorter than MIN_HTML_LENGTH_FOR_GATE always pass;
 * otherwise the article must cover at least MIN_CONTENT_RATIO of that text.
 *
 * @param {object|null|undefined} article - Extracted article exposing `textContent`.
 * @param {string} originalHtml - The page HTML the article was extracted from.
 * @returns {boolean} True when the article is long enough to be used.
 */
export function isExtractionSufficient(article, originalHtml) {
    if (!article) {
        return false;
    }
    const extractedChars = article.textContent.length;
    const collapsedSource = stripHtmlTagsForLength(originalHtml)
        .replace(/\s+/g, ' ')
        .trim();
    const sourceChars = collapsedSource.length;
    return (sourceChars < MIN_HTML_LENGTH_FOR_GATE ||
        extractedChars / sourceChars >= MIN_CONTENT_RATIO);
}
|
|
1310
1417
|
/**
 * Reports whether an extracted article is available to serve as the content
 * source.
 *
 * Both `null` and `undefined` count as "no article", in line with the falsy
 * `!article` guards used by the other quality-gate helpers in this module.
 *
 * @param {object|null|undefined} article - Result of article extraction.
 * @returns {boolean} True when an article object is present.
 */
export function determineContentExtractionSource(article) {
    return article !== null && article !== undefined;
}
|
|
1329
1420
|
/**
 * Builds the metadata block attached to transformed content.
 *
 * The block always carries `type`, `url` and a `fetchedAt` timestamp taken at
 * call time. Optional fields come from exactly one source:
 *  - the article (title, byline as author) when `shouldExtractFromArticle` is
 *    truthy and an article exists;
 *  - otherwise the extracted page metadata (title, description, author).
 * Fields whose value is `undefined` are left off the block.
 *
 * @param {string} url - URL the content was fetched from.
 * @param {object|null} article - Extracted article, if any.
 * @param {object} extractedMeta - Metadata extracted from the page.
 * @param {boolean} shouldExtractFromArticle - Prefer the article's fields.
 * @param {boolean} includeMetadata - When falsy no block is produced.
 * @returns {object|undefined} The metadata block, or undefined when disabled.
 */
export function createContentMetadataBlock(url, article, extractedMeta, shouldExtractFromArticle, includeMetadata) {
    if (!includeMetadata) {
        return undefined;
    }
    const block = {
        type: 'metadata',
        url,
        fetchedAt: new Date().toISOString(),
    };
    // [target key, value] pairs, in the order they are assigned to the block.
    const candidates = shouldExtractFromArticle && article
        ? [
            ['title', article.title],
            ['author', article.byline],
        ]
        : [
            ['title', extractedMeta.title],
            ['description', extractedMeta.description],
            ['author', extractedMeta.author],
        ];
    for (const [key, value] of candidates) {
        if (value !== undefined) {
            block[key] = value;
        }
    }
    return block;
}
|
|
1345
1446
|
function buildContentSource({ html, url, article, extractedMeta, includeMetadata, useArticleContent, }) {
|
|
@@ -1357,17 +1458,24 @@ function logQualityGateFallback({ url, articleLength, }) {
|
|
|
1357
1458
|
});
|
|
1358
1459
|
}
|
|
1359
1460
|
/**
 * Decides whether the extracted article should replace the full page HTML.
 *
 * The article is used only when it passes both quality gates:
 *  1. isExtractionSufficient() - the article holds enough of the page's text;
 *  2. isHeadingStructurePreserved() - the article kept enough headings.
 * Each failed gate is logged before falling back to the full HTML.
 *
 * @param {object|null|undefined} article - Extracted article, if any.
 * @param {string} html - Original page HTML.
 * @param {string} url - Page URL, used for logging only.
 * @returns {boolean} True when the article content should be used.
 */
function shouldUseArticleContent(article, html, url) {
    // Without an article there is nothing to evaluate. Bail out first, because
    // the fallback logging below reads article.textContent and would throw.
    if (!article) {
        return false;
    }
    // Check content sufficiency (length-based quality gate)
    if (!isExtractionSufficient(article, html)) {
        logQualityGateFallback({
            url,
            articleLength: article.textContent.length,
        });
        return false;
    }
    // Check heading structure preservation
    if (!isHeadingStructurePreserved(article, html)) {
        logDebug('Quality gate: Readability broke heading structure, using full HTML', {
            url: url.substring(0, 80),
            originalHeadings: countHeadings(html),
            articleHeadings: countHeadings(article.content),
        });
        return false;
    }
    return true;
}
|
|
1372
1480
|
function resolveContentSource({ html, url, includeMetadata, signal, }) {
|
|
1373
1481
|
const { article, metadata: extractedMeta } = extractContent(html, url, {
|