@j0hanz/superfetch 2.1.7 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +33 -34
- package/dist/cache.d.ts.map +1 -1
- package/dist/cache.js +54 -11
- package/dist/cache.js.map +1 -1
- package/dist/config.d.ts +3 -0
- package/dist/config.d.ts.map +1 -1
- package/dist/config.js +6 -0
- package/dist/config.js.map +1 -1
- package/dist/errors.d.ts.map +1 -1
- package/dist/errors.js +14 -1
- package/dist/errors.js.map +1 -1
- package/dist/fetch.d.ts.map +1 -1
- package/dist/fetch.js +6 -3
- package/dist/fetch.js.map +1 -1
- package/dist/http.d.ts +1 -1
- package/dist/http.d.ts.map +1 -1
- package/dist/http.js +50 -25
- package/dist/http.js.map +1 -1
- package/dist/index.js +8 -11
- package/dist/index.js.map +1 -1
- package/dist/mcp.d.ts.map +1 -1
- package/dist/mcp.js +6 -5
- package/dist/mcp.js.map +1 -1
- package/dist/observability.d.ts.map +1 -1
- package/dist/observability.js +9 -12
- package/dist/observability.js.map +1 -1
- package/dist/tools.d.ts.map +1 -1
- package/dist/tools.js +26 -28
- package/dist/tools.js.map +1 -1
- package/dist/transform.d.ts.map +1 -1
- package/dist/transform.js +462 -350
- package/dist/transform.js.map +1 -1
- package/dist/type-guards.js +1 -1
- package/dist/type-guards.js.map +1 -1
- package/package.json +1 -1
package/dist/transform.js
CHANGED
|
@@ -129,83 +129,41 @@ function truncateHtml(html) {
|
|
|
129
129
|
});
|
|
130
130
|
return html.substring(0, maxSize);
|
|
131
131
|
}
|
|
132
|
-
function
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
}
|
|
155
|
-
function parseStandardKey(name) {
|
|
156
|
-
if (name === 'description')
|
|
157
|
-
return 'description';
|
|
158
|
-
if (name === 'author')
|
|
159
|
-
return 'author';
|
|
160
|
-
return null;
|
|
161
|
-
}
|
|
162
|
-
function collectMetaTag(state, tag) {
|
|
163
|
-
const content = tag.getAttribute('content')?.trim();
|
|
164
|
-
if (!content)
|
|
165
|
-
return;
|
|
166
|
-
const ogKey = parseOpenGraphKey(tag.getAttribute('property'));
|
|
167
|
-
if (ogKey) {
|
|
168
|
-
state[ogKey].og = content;
|
|
169
|
-
return;
|
|
170
|
-
}
|
|
171
|
-
const name = tag.getAttribute('name');
|
|
172
|
-
const twitterKey = parseTwitterKey(name);
|
|
173
|
-
if (twitterKey) {
|
|
174
|
-
state[twitterKey].twitter = content;
|
|
175
|
-
return;
|
|
176
|
-
}
|
|
177
|
-
const standardKey = parseStandardKey(name);
|
|
178
|
-
if (standardKey) {
|
|
179
|
-
state[standardKey].standard = content;
|
|
180
|
-
}
|
|
181
|
-
}
|
|
182
|
-
function scanMetaTags(document, state) {
|
|
183
|
-
const metaTags = document.querySelectorAll('meta');
|
|
184
|
-
for (const tag of metaTags) {
|
|
185
|
-
collectMetaTag(state, tag);
|
|
132
|
+
function extractMetadata(document) {
|
|
133
|
+
const title = {};
|
|
134
|
+
const description = {};
|
|
135
|
+
let author;
|
|
136
|
+
for (const tag of document.querySelectorAll('meta')) {
|
|
137
|
+
const content = tag.getAttribute('content')?.trim();
|
|
138
|
+
if (!content)
|
|
139
|
+
continue;
|
|
140
|
+
const property = tag.getAttribute('property');
|
|
141
|
+
const name = tag.getAttribute('name');
|
|
142
|
+
if (property === 'og:title')
|
|
143
|
+
title.og = content;
|
|
144
|
+
else if (property === 'og:description')
|
|
145
|
+
description.og = content;
|
|
146
|
+
else if (name === 'twitter:title')
|
|
147
|
+
title.twitter = content;
|
|
148
|
+
else if (name === 'twitter:description')
|
|
149
|
+
description.twitter = content;
|
|
150
|
+
else if (name === 'description')
|
|
151
|
+
description.standard = content;
|
|
152
|
+
else if (name === 'author')
|
|
153
|
+
author = content;
|
|
186
154
|
}
|
|
187
|
-
}
|
|
188
|
-
function ensureTitleFallback(document, state) {
|
|
189
|
-
if (state.title.standard)
|
|
190
|
-
return;
|
|
191
155
|
const titleEl = document.querySelector('title');
|
|
192
|
-
if (titleEl?.textContent) {
|
|
193
|
-
|
|
156
|
+
if (!title.standard && titleEl?.textContent) {
|
|
157
|
+
title.standard = titleEl.textContent.trim();
|
|
194
158
|
}
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
const state = createMetaCollectorState();
|
|
198
|
-
scanMetaTags(document, state);
|
|
199
|
-
ensureTitleFallback(document, state);
|
|
159
|
+
const resolvedTitle = title.og ?? title.twitter ?? title.standard;
|
|
160
|
+
const resolvedDesc = description.og ?? description.twitter ?? description.standard;
|
|
200
161
|
const metadata = {};
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
if (description !== undefined)
|
|
207
|
-
metadata.description = description;
|
|
208
|
-
if (author !== undefined)
|
|
162
|
+
if (resolvedTitle)
|
|
163
|
+
metadata.title = resolvedTitle;
|
|
164
|
+
if (resolvedDesc)
|
|
165
|
+
metadata.description = resolvedDesc;
|
|
166
|
+
if (author)
|
|
209
167
|
metadata.author = author;
|
|
210
168
|
return metadata;
|
|
211
169
|
}
|
|
@@ -226,9 +184,6 @@ function extractArticle(document) {
|
|
|
226
184
|
logWarn('Document not compatible with Readability');
|
|
227
185
|
return null;
|
|
228
186
|
}
|
|
229
|
-
return mapParsedArticle(parseReadabilityArticle(document));
|
|
230
|
-
}
|
|
231
|
-
function parseReadabilityArticle(document) {
|
|
232
187
|
try {
|
|
233
188
|
const documentClone = document.cloneNode(true);
|
|
234
189
|
const rawText = documentClone.body.textContent ||
|
|
@@ -238,54 +193,31 @@ function parseReadabilityArticle(document) {
|
|
|
238
193
|
return null;
|
|
239
194
|
}
|
|
240
195
|
const reader = new Readability(documentClone, { maxElemsToParse: 20_000 });
|
|
241
|
-
|
|
196
|
+
const parsed = reader.parse();
|
|
197
|
+
if (!parsed)
|
|
198
|
+
return null;
|
|
199
|
+
return {
|
|
200
|
+
content: parsed.content ?? '',
|
|
201
|
+
textContent: parsed.textContent ?? '',
|
|
202
|
+
...(parsed.title != null && { title: parsed.title }),
|
|
203
|
+
...(parsed.byline != null && { byline: parsed.byline }),
|
|
204
|
+
...(parsed.excerpt != null && { excerpt: parsed.excerpt }),
|
|
205
|
+
...(parsed.siteName != null && { siteName: parsed.siteName }),
|
|
206
|
+
};
|
|
242
207
|
}
|
|
243
208
|
catch (error) {
|
|
244
|
-
logError('Failed to extract article with Readability',
|
|
209
|
+
logError('Failed to extract article with Readability', error instanceof Error ? error : undefined);
|
|
245
210
|
return null;
|
|
246
211
|
}
|
|
247
212
|
}
|
|
248
|
-
function asError(error) {
|
|
249
|
-
if (error instanceof Error) {
|
|
250
|
-
return error;
|
|
251
|
-
}
|
|
252
|
-
return undefined;
|
|
253
|
-
}
|
|
254
|
-
function mapParsedArticle(parsed) {
|
|
255
|
-
return parsed ? mapReadabilityResult(parsed) : null;
|
|
256
|
-
}
|
|
257
|
-
function mapReadabilityResult(parsed) {
|
|
258
|
-
return {
|
|
259
|
-
content: parsed.content ?? '',
|
|
260
|
-
textContent: parsed.textContent ?? '',
|
|
261
|
-
...buildOptionalArticleFields(parsed),
|
|
262
|
-
};
|
|
263
|
-
}
|
|
264
|
-
function buildOptionalArticleFields(parsed) {
|
|
265
|
-
const optional = {};
|
|
266
|
-
addOptionalField(optional, 'title', parsed.title);
|
|
267
|
-
addOptionalField(optional, 'byline', parsed.byline);
|
|
268
|
-
addOptionalField(optional, 'excerpt', parsed.excerpt);
|
|
269
|
-
addOptionalField(optional, 'siteName', parsed.siteName);
|
|
270
|
-
return optional;
|
|
271
|
-
}
|
|
272
|
-
function addOptionalField(target, key, value) {
|
|
273
|
-
if (value == null)
|
|
274
|
-
return;
|
|
275
|
-
target[key] = value;
|
|
276
|
-
}
|
|
277
213
|
export function extractContent(html, url, options = {
|
|
278
214
|
extractArticle: true,
|
|
279
215
|
}) {
|
|
280
|
-
const emptyResult = createEmptyExtractionResult();
|
|
281
216
|
if (!isValidInput(html, url)) {
|
|
282
|
-
return
|
|
217
|
+
return { article: null, metadata: {} };
|
|
283
218
|
}
|
|
284
219
|
return tryExtractContent(html, url, options);
|
|
285
220
|
}
|
|
286
|
-
function createEmptyExtractionResult() {
|
|
287
|
-
return { article: null, metadata: {} };
|
|
288
|
-
}
|
|
289
221
|
function extractArticleWithStage(document, url, shouldExtract) {
|
|
290
222
|
if (!shouldExtract)
|
|
291
223
|
return null;
|
|
@@ -297,7 +229,7 @@ function handleExtractionFailure(error, url, signal) {
|
|
|
297
229
|
}
|
|
298
230
|
throwIfAborted(signal, url, 'extract:error');
|
|
299
231
|
logError('Failed to extract content', error instanceof Error ? error : undefined);
|
|
300
|
-
return
|
|
232
|
+
return { article: null, metadata: {} };
|
|
301
233
|
}
|
|
302
234
|
function extractContentStages(html, url, options) {
|
|
303
235
|
throwIfAborted(options.signal, url, 'extract:begin');
|
|
@@ -325,14 +257,11 @@ function isValidInput(html, url) {
|
|
|
325
257
|
return (validateRequiredString(html, 'extractContent called with invalid HTML input') && validateRequiredString(url, 'extractContent called with invalid URL'));
|
|
326
258
|
}
|
|
327
259
|
function validateRequiredString(value, message) {
|
|
328
|
-
if (
|
|
260
|
+
if (typeof value === 'string' && value.length > 0)
|
|
329
261
|
return true;
|
|
330
262
|
logWarn(message);
|
|
331
263
|
return false;
|
|
332
264
|
}
|
|
333
|
-
function isNonEmptyString(value) {
|
|
334
|
-
return typeof value === 'string' && value.length > 0;
|
|
335
|
-
}
|
|
336
265
|
function resolveArticleExtraction(document, shouldExtract) {
|
|
337
266
|
return shouldExtract ? extractArticle(document) : null;
|
|
338
267
|
}
|
|
@@ -417,7 +346,124 @@ function isWordChar(char) {
|
|
|
417
346
|
(code >= 97 && code <= 122) ||
|
|
418
347
|
char === '_');
|
|
419
348
|
}
|
|
420
|
-
const
|
|
349
|
+
const LANGUAGE_PATTERNS = [
|
|
350
|
+
{
|
|
351
|
+
language: 'jsx',
|
|
352
|
+
pattern: {
|
|
353
|
+
keywords: ['classname=', 'jsx:', "from 'react'", 'from "react"'],
|
|
354
|
+
custom: (code) => containsJsxTag(code),
|
|
355
|
+
},
|
|
356
|
+
},
|
|
357
|
+
{
|
|
358
|
+
language: 'typescript',
|
|
359
|
+
pattern: {
|
|
360
|
+
wordBoundary: ['interface', 'type'],
|
|
361
|
+
custom: (_, lower) => [
|
|
362
|
+
': string',
|
|
363
|
+
':string',
|
|
364
|
+
': number',
|
|
365
|
+
':number',
|
|
366
|
+
': boolean',
|
|
367
|
+
':boolean',
|
|
368
|
+
': void',
|
|
369
|
+
':void',
|
|
370
|
+
': any',
|
|
371
|
+
':any',
|
|
372
|
+
': unknown',
|
|
373
|
+
':unknown',
|
|
374
|
+
': never',
|
|
375
|
+
':never',
|
|
376
|
+
].some((hint) => lower.includes(hint)),
|
|
377
|
+
},
|
|
378
|
+
},
|
|
379
|
+
{
|
|
380
|
+
language: 'rust',
|
|
381
|
+
pattern: {
|
|
382
|
+
regex: /\b(?:fn|impl|struct|enum)\b/,
|
|
383
|
+
keywords: ['let mut'],
|
|
384
|
+
custom: (_, lower) => lower.includes('use ') && lower.includes('::'),
|
|
385
|
+
},
|
|
386
|
+
},
|
|
387
|
+
{
|
|
388
|
+
language: 'javascript',
|
|
389
|
+
pattern: {
|
|
390
|
+
regex: /\b(?:const|let|var|function|class|async|await|export|import)\b/,
|
|
391
|
+
},
|
|
392
|
+
},
|
|
393
|
+
{
|
|
394
|
+
language: 'python',
|
|
395
|
+
pattern: {
|
|
396
|
+
regex: /\b(?:def|class|import|from)\b/,
|
|
397
|
+
keywords: ['print(', '__name__'],
|
|
398
|
+
},
|
|
399
|
+
},
|
|
400
|
+
{
|
|
401
|
+
language: 'bash',
|
|
402
|
+
pattern: {
|
|
403
|
+
custom: (code) => detectBashIndicators(code),
|
|
404
|
+
},
|
|
405
|
+
},
|
|
406
|
+
{
|
|
407
|
+
language: 'css',
|
|
408
|
+
pattern: {
|
|
409
|
+
regex: /@media|@import|@keyframes/,
|
|
410
|
+
custom: (code) => detectCssStructure(code),
|
|
411
|
+
},
|
|
412
|
+
},
|
|
413
|
+
{
|
|
414
|
+
language: 'html',
|
|
415
|
+
pattern: {
|
|
416
|
+
keywords: [
|
|
417
|
+
'<!doctype',
|
|
418
|
+
'<html',
|
|
419
|
+
'<head',
|
|
420
|
+
'<body',
|
|
421
|
+
'<div',
|
|
422
|
+
'<span',
|
|
423
|
+
'<p',
|
|
424
|
+
'<a',
|
|
425
|
+
'<script',
|
|
426
|
+
'<style',
|
|
427
|
+
],
|
|
428
|
+
},
|
|
429
|
+
},
|
|
430
|
+
{
|
|
431
|
+
language: 'json',
|
|
432
|
+
pattern: {
|
|
433
|
+
startsWith: ['{', '['],
|
|
434
|
+
},
|
|
435
|
+
},
|
|
436
|
+
{
|
|
437
|
+
language: 'yaml',
|
|
438
|
+
pattern: {
|
|
439
|
+
custom: (code) => detectYamlStructure(code),
|
|
440
|
+
},
|
|
441
|
+
},
|
|
442
|
+
{
|
|
443
|
+
language: 'sql',
|
|
444
|
+
pattern: {
|
|
445
|
+
wordBoundary: [
|
|
446
|
+
'select',
|
|
447
|
+
'insert',
|
|
448
|
+
'update',
|
|
449
|
+
'delete',
|
|
450
|
+
'create',
|
|
451
|
+
'alter',
|
|
452
|
+
'drop',
|
|
453
|
+
],
|
|
454
|
+
},
|
|
455
|
+
},
|
|
456
|
+
{
|
|
457
|
+
language: 'go',
|
|
458
|
+
pattern: {
|
|
459
|
+
wordBoundary: ['package', 'func'],
|
|
460
|
+
keywords: ['import "'],
|
|
461
|
+
},
|
|
462
|
+
},
|
|
463
|
+
];
|
|
464
|
+
// Bash detection constants
|
|
465
|
+
const BASH_COMMANDS = ['sudo', 'chmod', 'mkdir', 'cd', 'ls', 'cat', 'echo'];
|
|
466
|
+
const BASH_PKG_MANAGERS = [
|
|
421
467
|
'npm',
|
|
422
468
|
'yarn',
|
|
423
469
|
'pnpm',
|
|
@@ -429,184 +475,83 @@ const BASH_PACKAGE_MANAGERS = [
|
|
|
429
475
|
'go',
|
|
430
476
|
];
|
|
431
477
|
const BASH_VERBS = ['install', 'add', 'run', 'build', 'start'];
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
478
|
+
function isShellPrefix(line) {
|
|
479
|
+
return (line.startsWith('#!') || line.startsWith('$ ') || line.startsWith('# '));
|
|
480
|
+
}
|
|
481
|
+
function matchesBashCommand(line) {
|
|
482
|
+
return BASH_COMMANDS.some((cmd) => line === cmd || line.startsWith(`${cmd} `));
|
|
483
|
+
}
|
|
484
|
+
function matchesPackageManagerVerb(line) {
|
|
485
|
+
for (const mgr of BASH_PKG_MANAGERS) {
|
|
486
|
+
if (!line.startsWith(`${mgr} `))
|
|
438
487
|
continue;
|
|
439
|
-
|
|
488
|
+
const rest = line.slice(mgr.length + 1);
|
|
489
|
+
if (BASH_VERBS.some((v) => rest === v || rest.startsWith(`${v} `))) {
|
|
440
490
|
return true;
|
|
491
|
+
}
|
|
441
492
|
}
|
|
442
493
|
return false;
|
|
443
494
|
}
|
|
444
|
-
function
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
return line.startsWith('#!');
|
|
455
|
-
}
|
|
456
|
-
function isPromptLine(line) {
|
|
457
|
-
return line.startsWith('$ ') || line.startsWith('# ');
|
|
458
|
-
}
|
|
459
|
-
function startsWithPackageManagerCommand(line) {
|
|
460
|
-
return BASH_PACKAGE_MANAGERS.some((manager) => {
|
|
461
|
-
if (!line.startsWith(`${manager} `))
|
|
462
|
-
return false;
|
|
463
|
-
const rest = line.slice(manager.length + 1);
|
|
464
|
-
return BASH_VERBS.some((verb) => rest === verb || rest.startsWith(`${verb} `));
|
|
465
|
-
});
|
|
466
|
-
}
|
|
467
|
-
const TYPE_HINTS = [
|
|
468
|
-
'string',
|
|
469
|
-
'number',
|
|
470
|
-
'boolean',
|
|
471
|
-
'void',
|
|
472
|
-
'any',
|
|
473
|
-
'unknown',
|
|
474
|
-
'never',
|
|
475
|
-
];
|
|
476
|
-
const HTML_TAGS = [
|
|
477
|
-
'<!doctype',
|
|
478
|
-
'<html',
|
|
479
|
-
'<head',
|
|
480
|
-
'<body',
|
|
481
|
-
'<div',
|
|
482
|
-
'<span',
|
|
483
|
-
'<p',
|
|
484
|
-
'<a',
|
|
485
|
-
'<script',
|
|
486
|
-
'<style',
|
|
487
|
-
];
|
|
488
|
-
const SQL_KEYWORDS = [
|
|
489
|
-
'select',
|
|
490
|
-
'insert',
|
|
491
|
-
'update',
|
|
492
|
-
'delete',
|
|
493
|
-
'create',
|
|
494
|
-
'alter',
|
|
495
|
-
'drop',
|
|
496
|
-
];
|
|
497
|
-
const JS_WORD_REGEX = /\b(?:const|let|var|function|class|async|await|export|import)\b/;
|
|
498
|
-
const PYTHON_WORD_REGEX = /\b(?:def|class|import|from)\b/;
|
|
499
|
-
const RUST_WORD_REGEX = /\b(?:fn|impl|struct|enum)\b/;
|
|
500
|
-
const CSS_DIRECTIVE_REGEX = /@media|@import|@keyframes/;
|
|
501
|
-
const CODE_DETECTORS = [
|
|
502
|
-
{ language: 'jsx', detect: detectJsx },
|
|
503
|
-
{ language: 'typescript', detect: detectTypescript },
|
|
504
|
-
{ language: 'rust', detect: detectRust },
|
|
505
|
-
{ language: 'javascript', detect: detectJavascript },
|
|
506
|
-
{ language: 'python', detect: detectPython },
|
|
507
|
-
{ language: 'bash', detect: detectBash },
|
|
508
|
-
{ language: 'css', detect: detectCss },
|
|
509
|
-
{ language: 'html', detect: detectHtml },
|
|
510
|
-
{ language: 'json', detect: detectJson },
|
|
511
|
-
{ language: 'yaml', detect: detectYaml },
|
|
512
|
-
{ language: 'sql', detect: detectSql },
|
|
513
|
-
{ language: 'go', detect: detectGo },
|
|
514
|
-
];
|
|
515
|
-
function detectJsx(code) {
|
|
516
|
-
const lower = code.toLowerCase();
|
|
517
|
-
if (lower.includes('classname='))
|
|
518
|
-
return true;
|
|
519
|
-
if (lower.includes('jsx:'))
|
|
520
|
-
return true;
|
|
521
|
-
if (lower.includes("from 'react'") || lower.includes('from "react"')) {
|
|
522
|
-
return true;
|
|
495
|
+
function detectBashIndicators(code) {
|
|
496
|
+
for (const line of splitLines(code)) {
|
|
497
|
+
const trimmed = line.trimStart();
|
|
498
|
+
if (!trimmed)
|
|
499
|
+
continue;
|
|
500
|
+
if (isShellPrefix(trimmed) ||
|
|
501
|
+
matchesBashCommand(trimmed) ||
|
|
502
|
+
matchesPackageManagerVerb(trimmed)) {
|
|
503
|
+
return true;
|
|
504
|
+
}
|
|
523
505
|
}
|
|
524
|
-
return
|
|
525
|
-
}
|
|
526
|
-
function detectTypescript(code) {
|
|
527
|
-
const lower = code.toLowerCase();
|
|
528
|
-
if (containsWord(lower, 'interface'))
|
|
529
|
-
return true;
|
|
530
|
-
if (containsWord(lower, 'type'))
|
|
531
|
-
return true;
|
|
532
|
-
return TYPE_HINTS.some((hint) => lower.includes(`: ${hint}`) || lower.includes(`:${hint}`));
|
|
533
|
-
}
|
|
534
|
-
function detectRust(code) {
|
|
535
|
-
const lower = code.toLowerCase();
|
|
536
|
-
return (RUST_WORD_REGEX.test(lower) ||
|
|
537
|
-
lower.includes('let mut') ||
|
|
538
|
-
(lower.includes('use ') && lower.includes('::')));
|
|
539
|
-
}
|
|
540
|
-
function detectJavascript(code) {
|
|
541
|
-
const lower = code.toLowerCase();
|
|
542
|
-
return JS_WORD_REGEX.test(lower);
|
|
543
|
-
}
|
|
544
|
-
function detectPython(code) {
|
|
545
|
-
const lower = code.toLowerCase();
|
|
546
|
-
return (PYTHON_WORD_REGEX.test(lower) ||
|
|
547
|
-
lower.includes('print(') ||
|
|
548
|
-
lower.includes('__name__'));
|
|
506
|
+
return false;
|
|
549
507
|
}
|
|
550
|
-
function
|
|
551
|
-
const
|
|
552
|
-
if (CSS_DIRECTIVE_REGEX.test(lower))
|
|
553
|
-
return true;
|
|
554
|
-
const lines = splitLines(code);
|
|
555
|
-
for (const line of lines) {
|
|
508
|
+
function detectCssStructure(code) {
|
|
509
|
+
for (const line of splitLines(code)) {
|
|
556
510
|
const trimmed = line.trimStart();
|
|
557
511
|
if (!trimmed)
|
|
558
512
|
continue;
|
|
559
|
-
|
|
513
|
+
const isSelector = (trimmed.startsWith('.') || trimmed.startsWith('#')) &&
|
|
514
|
+
trimmed.includes('{');
|
|
515
|
+
const isProperty = trimmed.includes(':') && trimmed.includes(';');
|
|
516
|
+
if (isSelector || isProperty)
|
|
560
517
|
return true;
|
|
561
518
|
}
|
|
562
519
|
return false;
|
|
563
520
|
}
|
|
564
|
-
function
|
|
565
|
-
const
|
|
566
|
-
return HTML_TAGS.some((tag) => lower.includes(tag));
|
|
567
|
-
}
|
|
568
|
-
function detectJson(code) {
|
|
569
|
-
const trimmed = code.trimStart();
|
|
570
|
-
if (!trimmed)
|
|
571
|
-
return false;
|
|
572
|
-
return trimmed.startsWith('{') || trimmed.startsWith('[');
|
|
573
|
-
}
|
|
574
|
-
function detectYaml(code) {
|
|
575
|
-
const lines = splitLines(code);
|
|
576
|
-
for (const line of lines) {
|
|
521
|
+
function detectYamlStructure(code) {
|
|
522
|
+
for (const line of splitLines(code)) {
|
|
577
523
|
const trimmed = line.trim();
|
|
578
524
|
if (!trimmed)
|
|
579
525
|
continue;
|
|
580
|
-
const
|
|
581
|
-
if (
|
|
526
|
+
const colonIdx = trimmed.indexOf(':');
|
|
527
|
+
if (colonIdx <= 0)
|
|
582
528
|
continue;
|
|
583
|
-
const after = trimmed[
|
|
529
|
+
const after = trimmed[colonIdx + 1];
|
|
584
530
|
if (after === ' ' || after === '\t')
|
|
585
531
|
return true;
|
|
586
532
|
}
|
|
587
533
|
return false;
|
|
588
534
|
}
|
|
589
|
-
function
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
function isCssPropertyLine(line) {
|
|
605
|
-
return line.includes(':') && line.includes(';');
|
|
535
|
+
function matchesLanguagePattern(code, lower, pattern) {
|
|
536
|
+
if (pattern.keywords?.some((kw) => lower.includes(kw)))
|
|
537
|
+
return true;
|
|
538
|
+
if (pattern.wordBoundary?.some((w) => containsWord(lower, w)))
|
|
539
|
+
return true;
|
|
540
|
+
if (pattern.regex?.test(lower))
|
|
541
|
+
return true;
|
|
542
|
+
if (pattern.startsWith) {
|
|
543
|
+
const trimmed = code.trimStart();
|
|
544
|
+
if (pattern.startsWith.some((prefix) => trimmed.startsWith(prefix)))
|
|
545
|
+
return true;
|
|
546
|
+
}
|
|
547
|
+
if (pattern.custom?.(code, lower))
|
|
548
|
+
return true;
|
|
549
|
+
return false;
|
|
606
550
|
}
|
|
607
551
|
export function detectLanguageFromCode(code) {
|
|
608
|
-
|
|
609
|
-
|
|
552
|
+
const lower = code.toLowerCase();
|
|
553
|
+
for (const { language, pattern } of LANGUAGE_PATTERNS) {
|
|
554
|
+
if (matchesLanguagePattern(code, lower, pattern))
|
|
610
555
|
return language;
|
|
611
556
|
}
|
|
612
557
|
return undefined;
|
|
@@ -630,6 +575,7 @@ const STRUCTURAL_TAGS = new Set([
|
|
|
630
575
|
'input',
|
|
631
576
|
'select',
|
|
632
577
|
'textarea',
|
|
578
|
+
'svg',
|
|
633
579
|
]);
|
|
634
580
|
const ALWAYS_NOISE_TAGS = new Set(['nav', 'footer', 'aside']);
|
|
635
581
|
const NAVIGATION_ROLES = new Set([
|
|
@@ -642,6 +588,7 @@ const NAVIGATION_ROLES = new Set([
|
|
|
642
588
|
'menu',
|
|
643
589
|
'dialog',
|
|
644
590
|
'alertdialog',
|
|
591
|
+
'search',
|
|
645
592
|
]);
|
|
646
593
|
const PROMO_TOKENS = new Set([
|
|
647
594
|
'banner',
|
|
@@ -669,6 +616,7 @@ const PROMO_TOKENS = new Set([
|
|
|
669
616
|
'breadcrumb',
|
|
670
617
|
'pagination',
|
|
671
618
|
'pager',
|
|
619
|
+
'taglist',
|
|
672
620
|
]);
|
|
673
621
|
const HEADER_NOISE_PATTERN = /\b(site-header|masthead|topbar|navbar|nav(?:bar)?|menu|header-nav)\b/i;
|
|
674
622
|
const FIXED_PATTERN = /\b(fixed|sticky)\b/;
|
|
@@ -727,6 +675,8 @@ const NOISE_MARKERS = [
|
|
|
727
675
|
' z-50',
|
|
728
676
|
' z-4',
|
|
729
677
|
' isolate',
|
|
678
|
+
' breadcrumb',
|
|
679
|
+
' pagination',
|
|
730
680
|
];
|
|
731
681
|
function mayContainNoise(html) {
|
|
732
682
|
const haystack = html.toLowerCase();
|
|
@@ -760,11 +710,9 @@ function matchesPromoIdOrClass(className, id) {
|
|
|
760
710
|
const tokens = tokenizeIdentifierLikeText(`${className} ${id}`);
|
|
761
711
|
return tokens.some((token) => PROMO_TOKENS.has(token));
|
|
762
712
|
}
|
|
763
|
-
function matchesHighZIsolate(className) {
|
|
764
|
-
return HIGH_Z_PATTERN.test(className) && ISOLATE_PATTERN.test(className);
|
|
765
|
-
}
|
|
766
713
|
function matchesFixedOrHighZIsolate(className) {
|
|
767
|
-
return FIXED_PATTERN.test(className) ||
|
|
714
|
+
return (FIXED_PATTERN.test(className) ||
|
|
715
|
+
(HIGH_Z_PATTERN.test(className) && ISOLATE_PATTERN.test(className)));
|
|
768
716
|
}
|
|
769
717
|
function readElementMetadata(element) {
|
|
770
718
|
return {
|
|
@@ -791,8 +739,7 @@ function isNoiseElement(node) {
|
|
|
791
739
|
matchesFixedOrHighZIsolate(metadata.className) ||
|
|
792
740
|
matchesPromoIdOrClass(metadata.className, metadata.id));
|
|
793
741
|
}
|
|
794
|
-
function
|
|
795
|
-
const nodes = document.querySelectorAll('*');
|
|
742
|
+
function removeNoiseNodes(nodes) {
|
|
796
743
|
for (let index = nodes.length - 1; index >= 0; index -= 1) {
|
|
797
744
|
const node = typeof nodes.item === 'function' ? nodes.item(index) : nodes[index];
|
|
798
745
|
if (!node)
|
|
@@ -802,6 +749,30 @@ function stripNoiseNodes(document) {
|
|
|
802
749
|
}
|
|
803
750
|
}
|
|
804
751
|
}
|
|
752
|
+
function stripNoiseNodes(document) {
|
|
753
|
+
// Use targeted selectors for common noise elements instead of querySelectorAll('*')
|
|
754
|
+
const targetSelectors = [
|
|
755
|
+
'nav',
|
|
756
|
+
'footer',
|
|
757
|
+
'aside',
|
|
758
|
+
'header[class*="site"]',
|
|
759
|
+
'header[class*="nav"]',
|
|
760
|
+
'header[class*="menu"]',
|
|
761
|
+
'[role="banner"]',
|
|
762
|
+
'[role="navigation"]',
|
|
763
|
+
'[role="dialog"]',
|
|
764
|
+
'[style*="display: none"]',
|
|
765
|
+
'[style*="display:none"]',
|
|
766
|
+
'[hidden]',
|
|
767
|
+
'[aria-hidden="true"]',
|
|
768
|
+
].join(',');
|
|
769
|
+
const potentialNoiseNodes = document.querySelectorAll(targetSelectors);
|
|
770
|
+
// Remove in reverse order to handle nested elements correctly
|
|
771
|
+
removeNoiseNodes(potentialNoiseNodes);
|
|
772
|
+
// Second pass: check remaining elements for noise patterns (promo, fixed positioning, etc.)
|
|
773
|
+
const allElements = document.querySelectorAll('*');
|
|
774
|
+
removeNoiseNodes(allElements);
|
|
775
|
+
}
|
|
805
776
|
function removeNoiseFromHtml(html) {
|
|
806
777
|
const shouldParse = isFullDocumentHtml(html) || mayContainNoise(html);
|
|
807
778
|
if (!shouldParse)
|
|
@@ -826,7 +797,14 @@ function removeNoiseFromHtml(html) {
|
|
|
826
797
|
}
|
|
827
798
|
function buildInlineCode(content) {
|
|
828
799
|
const runs = content.match(/`+/g);
|
|
829
|
-
|
|
800
|
+
let longest = '';
|
|
801
|
+
if (runs) {
|
|
802
|
+
for (const run of runs) {
|
|
803
|
+
if (run.length > longest.length) {
|
|
804
|
+
longest = run;
|
|
805
|
+
}
|
|
806
|
+
}
|
|
807
|
+
}
|
|
830
808
|
const delimiter = `\`${longest}`;
|
|
831
809
|
const padding = delimiter.length > 1 ? ' ' : '';
|
|
832
810
|
return `${delimiter}${padding}${content}${padding}${delimiter}`;
|
|
@@ -983,10 +961,8 @@ function translateHtmlToMarkdown(html, url, signal) {
|
|
|
983
961
|
throwIfAborted(signal, url, 'markdown:cleaned');
|
|
984
962
|
const content = runTransformStage(url, 'markdown:translate', () => getMarkdownConverter().translate(cleanedHtml).trim());
|
|
985
963
|
throwIfAborted(signal, url, 'markdown:translated');
|
|
986
|
-
|
|
987
|
-
|
|
988
|
-
finalMarkdown = normalizeTableWhitespace(finalMarkdown);
|
|
989
|
-
return finalMarkdown;
|
|
964
|
+
const cleaned = cleanupMarkdownArtifacts(content);
|
|
965
|
+
return promoteOrphanHeadings(cleaned);
|
|
990
966
|
}
|
|
991
967
|
function appendMetadataFooter(content, metadata, url) {
|
|
992
968
|
const footer = buildMetadataFooter(metadata, url);
|
|
@@ -1009,34 +985,146 @@ export function htmlToMarkdown(html, metadata, options) {
|
|
|
1009
985
|
}
|
|
1010
986
|
function cleanupMarkdownArtifacts(content) {
|
|
1011
987
|
let result = content;
|
|
988
|
+
const fixOrphanHeadings = (text) => {
|
|
989
|
+
return text.replace(/^(.*?)(#{1,6})\s*(?:\r?\n){2}([A-Z][^\r\n]+?)(?:\r?\n)/gm, (match, prefix, hashes, heading) => {
|
|
990
|
+
if (typeof prefix !== 'string' ||
|
|
991
|
+
typeof hashes !== 'string' ||
|
|
992
|
+
typeof heading !== 'string') {
|
|
993
|
+
return match;
|
|
994
|
+
}
|
|
995
|
+
if (heading.length > 150) {
|
|
996
|
+
return match;
|
|
997
|
+
}
|
|
998
|
+
const trimmedPrefix = prefix.trim();
|
|
999
|
+
if (trimmedPrefix === '') {
|
|
1000
|
+
return `${hashes} ${heading}\n\n`;
|
|
1001
|
+
}
|
|
1002
|
+
return `${trimmedPrefix}\n\n${hashes} ${heading}\n\n`;
|
|
1003
|
+
});
|
|
1004
|
+
};
|
|
1005
|
+
result = fixOrphanHeadings(result);
|
|
1012
1006
|
result = result.replace(/^#{1,6}[ \t\u00A0]*$\r?\n?/gm, '');
|
|
1013
1007
|
const zeroWidthAnchorLink = /\[(?:\s|\u200B)*\]\(#[^)]*\)\s*/g;
|
|
1014
1008
|
result = result.replace(zeroWidthAnchorLink, '');
|
|
1009
|
+
result = result.replace(/^\[Skip to (?:main )?content\]\(#[^)]*\)\s*$/gim, '');
|
|
1010
|
+
result = result.replace(/^\[Skip to (?:main )?navigation\]\(#[^)]*\)\s*$/gim, '');
|
|
1011
|
+
result = result.replace(/^\[Skip link\]\(#[^)]*\)\s*$/gim, '');
|
|
1012
|
+
result = result.replace(/(^#{1,6}\s+\w+)```/gm, '$1\n\n```');
|
|
1013
|
+
result = result.replace(/(^#{1,6}\s+\w*[A-Z])([A-Z][a-z])/gm, '$1\n\n$2');
|
|
1014
|
+
result = result.replace(/(^#{1,6}\s[^\n]*)\n([^\n])/gm, '$1\n\n$2');
|
|
1015
|
+
const tocLinkLine = /^- \[[^\]]+\]\(#[^)]+\)\s*$/;
|
|
1016
|
+
const lines = result.split('\n');
|
|
1017
|
+
const filtered = [];
|
|
1018
|
+
let skipTocBlock = false;
|
|
1019
|
+
for (let i = 0; i < lines.length; i += 1) {
|
|
1020
|
+
const line = lines[i] ?? '';
|
|
1021
|
+
const prevLine = i > 0 ? (lines[i - 1] ?? '') : '';
|
|
1022
|
+
const nextLine = i < lines.length - 1 ? (lines[i + 1] ?? '') : '';
|
|
1023
|
+
if (tocLinkLine.test(line)) {
|
|
1024
|
+
const prevIsToc = tocLinkLine.test(prevLine) || prevLine.trim() === '';
|
|
1025
|
+
const nextIsToc = tocLinkLine.test(nextLine) || nextLine.trim() === '';
|
|
1026
|
+
if (prevIsToc || nextIsToc) {
|
|
1027
|
+
skipTocBlock = true;
|
|
1028
|
+
continue;
|
|
1029
|
+
}
|
|
1030
|
+
}
|
|
1031
|
+
else if (line.trim() === '' && skipTocBlock) {
|
|
1032
|
+
skipTocBlock = false;
|
|
1033
|
+
continue;
|
|
1034
|
+
}
|
|
1035
|
+
else {
|
|
1036
|
+
skipTocBlock = false;
|
|
1037
|
+
}
|
|
1038
|
+
filtered.push(line);
|
|
1039
|
+
}
|
|
1040
|
+
result = filtered.join('\n');
|
|
1015
1041
|
result = result.replace(/\]\(([^)]+)\)\[/g, ']($1)\n\n[');
|
|
1016
1042
|
result = result.replace(/^Was this page helpful\??\s*$/gim, '');
|
|
1043
|
+
result = result.replace(/(`[^`]+`)\s*\\-\s*/g, '$1 - ');
|
|
1044
|
+
result = result.replace(/\\([[]])/g, '$1');
|
|
1045
|
+
result = result.replace(/([^\n])\n([-*+] )/g, '$1\n\n$2');
|
|
1046
|
+
result = result.replace(/(\S)\n(\d+\. )/g, '$1\n\n$2');
|
|
1017
1047
|
result = result.replace(/\n{3,}/g, '\n\n');
|
|
1018
1048
|
return result.trim();
|
|
1019
1049
|
}
|
|
1020
|
-
|
|
1021
|
-
|
|
1022
|
-
|
|
1023
|
-
|
|
1050
|
+
const HEADING_KEYWORDS = new Set([
|
|
1051
|
+
'overview',
|
|
1052
|
+
'introduction',
|
|
1053
|
+
'summary',
|
|
1054
|
+
'conclusion',
|
|
1055
|
+
'prerequisites',
|
|
1056
|
+
'requirements',
|
|
1057
|
+
'installation',
|
|
1058
|
+
'configuration',
|
|
1059
|
+
'usage',
|
|
1060
|
+
'features',
|
|
1061
|
+
'limitations',
|
|
1062
|
+
'troubleshooting',
|
|
1063
|
+
'faq',
|
|
1064
|
+
'resources',
|
|
1065
|
+
'references',
|
|
1066
|
+
'changelog',
|
|
1067
|
+
'license',
|
|
1068
|
+
'acknowledgments',
|
|
1069
|
+
'appendix',
|
|
1070
|
+
]);
|
|
1071
|
+
function isLikelyHeadingLine(line) {
|
|
1072
|
+
const trimmed = line.trim();
|
|
1073
|
+
if (!trimmed || trimmed.length > 80)
|
|
1074
|
+
return false;
|
|
1075
|
+
if (/^#{1,6}\s/.test(trimmed))
|
|
1076
|
+
return false;
|
|
1077
|
+
if (/^[-*+•]\s/.test(trimmed) || /^\d+\.\s/.test(trimmed))
|
|
1078
|
+
return false;
|
|
1079
|
+
if (/[.!?]$/.test(trimmed))
|
|
1080
|
+
return false;
|
|
1081
|
+
if (/^\[.*\]\(.*\)$/.test(trimmed))
|
|
1082
|
+
return false;
|
|
1083
|
+
if (/^(?:example|note|tip|warning|important|caution):\s+\S/i.test(trimmed)) {
|
|
1084
|
+
return true;
|
|
1085
|
+
}
|
|
1086
|
+
const words = trimmed.split(/\s+/);
|
|
1087
|
+
if (words.length >= 2 && words.length <= 6) {
|
|
1088
|
+
const isTitleCase = words.every((w) => /^[A-Z][a-z]*$/.test(w) || /^(?:and|or|the|of|in|for|to|a)$/i.test(w));
|
|
1089
|
+
if (isTitleCase)
|
|
1090
|
+
return true;
|
|
1091
|
+
}
|
|
1092
|
+
if (words.length === 1) {
|
|
1093
|
+
const lower = trimmed.toLowerCase();
|
|
1094
|
+
if (HEADING_KEYWORDS.has(lower) && /^[A-Z]/.test(trimmed)) {
|
|
1095
|
+
return true;
|
|
1096
|
+
}
|
|
1097
|
+
}
|
|
1098
|
+
return false;
|
|
1024
1099
|
}
|
|
1025
|
-
function
|
|
1026
|
-
|
|
1027
|
-
|
|
1028
|
-
|
|
1029
|
-
|
|
1100
|
+
/**
 * Adds Markdown heading markers to lines that look like headings but have
 * none.
 *
 * A line is promoted when the line before it is blank (the first line of the
 * document counts as preceded by a blank) and isLikelyHeadingLine() accepts
 * it. "Example: ..." labels become `###`; everything else becomes `##`.
 * Lines inside fenced code blocks (``` or ~~~) are never touched, so code is
 * not rewritten into headings.
 *
 * @param {string} markdown - Markdown document text.
 * @returns {string} The document with orphan headings promoted.
 */
function promoteOrphanHeadings(markdown) {
    const lines = markdown.split('\n');
    const result = [];
    // Marker of the currently open fence (e.g. "```"), or null outside a fence.
    let openFence = null;
    for (let i = 0; i < lines.length; i += 1) {
        const line = lines[i];
        const trimmed = line.trim();
        const fenceMarker = /^(`{3,}|~{3,})/.exec(trimmed)?.[1] ?? null;
        if (openFence !== null) {
            // A closing fence uses the same character, is at least as long as
            // the opener, and carries no info string.
            if (fenceMarker !== null &&
                fenceMarker[0] === openFence[0] &&
                fenceMarker.length >= openFence.length &&
                trimmed === fenceMarker) {
                openFence = null;
            }
            result.push(line);
            continue;
        }
        if (fenceMarker !== null) {
            openFence = fenceMarker;
            result.push(line);
            continue;
        }
        const prevLine = i > 0 ? lines[i - 1] : '';
        const isPrecededByBlank = prevLine.trim() === '';
        if (isPrecededByBlank && isLikelyHeadingLine(line)) {
            const prefix = /^example:\s/i.test(trimmed) ? '### ' : '## ';
            result.push(prefix + trimmed);
        }
        else {
            result.push(line);
        }
    }
    return result.join('\n');
}
|
|
1031
1121
|
/**
 * Formats an ISO timestamp as `DD-MM-YYYY`.
 *
 * The day, month and year are read with the local-time getters, so the result
 * depends on the time zone of the running process.
 *
 * @param {string} isoString - Timestamp to format.
 * @returns {string} The formatted date, or the input unchanged when it cannot
 *   be parsed.
 */
function formatFetchedDate(isoString) {
    try {
        const date = new Date(isoString);
        // `new Date()` does not throw on unparsable input; it yields an
        // Invalid Date whose getters return NaN. Fall back explicitly so the
        // result is never "NaN-NaN-NaN".
        if (Number.isNaN(date.getTime()))
            return isoString;
        const day = String(date.getDate()).padStart(2, '0');
        const month = String(date.getMonth() + 1).padStart(2, '0');
        const year = date.getFullYear();
        return `${day}-${month}-${year}`;
    }
    catch {
        return isoString;
    }
}
|
|
@@ -1045,20 +1133,24 @@ function formatFetchedDate(isoString) {
|
|
|
1045
1133
|
/**
 * Builds the Markdown footer appended after converted content.
 *
 * Layout: a `---` rule, a blank line, one line of ` | `-separated italic
 * fields (title, author, source link, fetch date), then the description
 * wrapped in `<sub>`.
 *
 * @param {object|null|undefined} metadata - Metadata block; the fields read
 *   are `title`, `author`, `url`, `fetchedAt` and `description`.
 * @param {string} [fallbackUrl] - URL used when `metadata.url` is empty.
 * @returns {string} The footer, or '' when there is nothing to render.
 */
function buildMetadataFooter(metadata, fallbackUrl) {
    if (!metadata)
        return '';
    // `||` is deliberate: an empty-string URL should also fall back.
    const url = metadata.url || fallbackUrl;
    const parts = [];
    if (metadata.title)
        parts.push(`_${metadata.title}_`);
    if (metadata.author)
        parts.push(`_${metadata.author}_`);
    if (url)
        parts.push(`[_Original Source_](${url})`);
    if (metadata.fetchedAt) {
        const formattedDate = formatFetchedDate(metadata.fetchedAt);
        parts.push(`_${formattedDate}_`);
    }
    // Avoid emitting a dangling horizontal rule with no content under it.
    if (parts.length === 0 && !metadata.description)
        return '';
    const lines = ['---', ''];
    if (parts.length > 0) {
        lines.push(` ${parts.join(' | ')}`);
    }
    if (metadata.description) {
        lines.push(` <sub>${metadata.description}</sub>`);
    }
    return lines.join('\n');
}
|
|
@@ -1273,69 +1365,82 @@ function tryTransformRawContent({ html, url, includeMetadata, }) {
|
|
|
1273
1365
|
}
|
|
1274
1366
|
// Minimum ratio of extracted article text length to the original page's text
// length for the extraction to be considered sufficient.
const MIN_CONTENT_RATIO = 0.3;
|
|
1275
1367
|
// Pages whose tag-stripped text is shorter than this many characters skip the
// content-ratio gate and are always treated as sufficiently extracted.
const MIN_HTML_LENGTH_FOR_GATE = 100;
|
|
1276
|
-
|
|
1277
|
-
|
|
1368
|
+
// Minimum fraction of the original page's headings that the extracted article
// must still contain for its heading structure to count as preserved.
const MIN_HEADING_RETENTION_RATIO = 0.7;
|
|
1369
|
+
/**
 * Counts heading elements (`<h1>` through `<h6>`) in an HTML string.
 *
 * Only opening tags are counted, so headings that contain nested markup
 * (for example `<h2><a href="#x">Title</a></h2>`) are included. Tags such as
 * `<hr>`, `<head>` or `<header>` do not match.
 *
 * @param {string|null|undefined} html - HTML to scan.
 * @returns {number} Number of heading opening tags found.
 */
function countHeadings(html) {
    if (!html)
        return 0;
    // The tag name must be followed by whitespace (attributes) or `>`.
    const headingOpenTag = /<h[1-6](?:\s[^>]*)?>/gi;
    const matches = html.match(headingOpenTag);
    return matches ? matches.length : 0;
}
|
|
1377
|
+
/**
 * Checks whether the extracted article kept enough of the original page's
 * headings.
 *
 * @param {object|null|undefined} article - Extracted article; its `content`
 *   HTML is scanned for headings.
 * @param {string} originalHtml - The full original page HTML.
 * @returns {boolean} False when there is no article; true when the original
 *   has no headings; otherwise whether the article retains at least
 *   MIN_HEADING_RETENTION_RATIO of the original heading count.
 */
function isHeadingStructurePreserved(article, originalHtml) {
    if (!article)
        return false;
    const originalHeadingCount = countHeadings(originalHtml);
    // With no headings in the original there is nothing to lose, so skip
    // scanning the article altogether.
    if (originalHeadingCount === 0)
        return true;
    const articleHeadingCount = countHeadings(article.content);
    const retentionRatio = articleHeadingCount / originalHeadingCount;
    return retentionRatio >= MIN_HEADING_RETENTION_RATIO;
}
|
|
1390
|
+
/**
 * Removes everything between `<` and `>` from an HTML string so the remaining
 * text can be measured.
 *
 * This is a single-pass character scan, not an HTML parser: entities are left
 * as written and an unclosed `<` discards the rest of the input.
 *
 * @param {string} html - HTML to strip.
 * @returns {string} The text found outside of tags.
 */
function stripHtmlTagsForLength(html) {
    let result = '';
    let inTag = false;
    for (const char of html) {
        if (inTag) {
            if (char === '>')
                inTag = false;
        }
        else if (char === '<') {
            inTag = true;
        }
        else {
            // A `>` outside a tag is ordinary text ("a > b") and is kept.
            result += char;
        }
    }
    return result;
}
|
|
1297
1406
|
/**
 * Length-based quality gate for an extracted article.
 *
 * Compares the article's text length with the whitespace-collapsed,
 * tag-stripped text length of the original HTML.
 *
 * @param {object|null|undefined} article - Extracted article; `textContent`
 *   is read.
 * @param {string} originalHtml - The full original page HTML.
 * @returns {boolean} False without an article; true for pages below
 *   MIN_HTML_LENGTH_FOR_GATE; otherwise whether the length ratio reaches
 *   MIN_CONTENT_RATIO.
 */
export function isExtractionSufficient(article, originalHtml) {
    if (!article) {
        return false;
    }
    const extractedChars = article.textContent.length;
    const pageText = stripHtmlTagsForLength(originalHtml);
    const pageChars = pageText.replace(/\s+/g, ' ').trim().length;
    // Tiny pages are accepted as-is; the ratio is meaningless for them.
    if (pageChars < MIN_HTML_LENGTH_FOR_GATE) {
        return true;
    }
    const ratio = extractedChars / pageChars;
    return ratio >= MIN_CONTENT_RATIO;
}
|
|
1306
1417
|
/**
 * Reports whether an extracted article is available to use as the content
 * source.
 *
 * @param {object|null|undefined} article - Extraction result.
 * @returns {boolean} True when an article exists; false for both `null` and
 *   `undefined`, matching the `!article` guards used elsewhere in this file.
 */
export function determineContentExtractionSource(article) {
    // `!= null` intentionally covers both null and undefined.
    return article != null;
}
|
|
1325
1420
|
/**
 * Builds the metadata block attached to transformed content.
 *
 * When the article is the content source, its `title` and `byline` take
 * precedence; page-level metadata fills in whatever the article does not
 * provide. The page description is always carried over when present, so it is
 * not lost on the article path.
 *
 * @param {string} url - Source URL recorded in the block.
 * @param {object|null|undefined} article - Extracted article (`title`,
 *   `byline` are read).
 * @param {object|null|undefined} extractedMeta - Page metadata (`title`,
 *   `description`, `author` are read).
 * @param {boolean} shouldExtractFromArticle - Whether the article is the
 *   content source.
 * @param {boolean} includeMetadata - When false, no block is produced.
 * @returns {object|undefined} The metadata block, or undefined when metadata
 *   is disabled.
 */
export function createContentMetadataBlock(url, article, extractedMeta, shouldExtractFromArticle, includeMetadata) {
    if (!includeMetadata)
        return undefined;
    const metadata = {
        type: 'metadata',
        url,
        fetchedAt: new Date().toISOString(),
    };
    const pageMeta = extractedMeta ?? {};
    const articleMeta = shouldExtractFromArticle && article ? article : undefined;
    const title = articleMeta?.title ?? pageMeta.title;
    const author = articleMeta?.byline ?? pageMeta.author;
    if (title !== undefined)
        metadata.title = title;
    if (author !== undefined)
        metadata.author = author;
    if (pageMeta.description !== undefined) {
        metadata.description = pageMeta.description;
    }
    return metadata;
}
|
|
1341
1446
|
function buildContentSource({ html, url, article, extractedMeta, includeMetadata, useArticleContent, }) {
|
|
@@ -1353,17 +1458,24 @@ function logQualityGateFallback({ url, articleLength, }) {
|
|
|
1353
1458
|
});
|
|
1354
1459
|
}
|
|
1355
1460
|
/**
 * Decides whether the extracted article should replace the full page HTML as
 * the content source.
 *
 * The article is rejected when it is missing, when it fails the length-based
 * sufficiency gate, or when it dropped too many of the page's headings. The
 * two gate failures are logged.
 *
 * @param {object|null|undefined} article - Extracted article.
 * @param {string} html - The full original page HTML.
 * @param {string} url - Page URL, used for logging only.
 * @returns {boolean} True when the article passes every quality gate.
 */
function shouldUseArticleContent(article, html, url) {
    // Without this guard a missing article fails the sufficiency check and
    // the logging branch below dereferences `article.textContent`.
    if (!article)
        return false;
    // Check content sufficiency (length-based quality gate)
    if (!isExtractionSufficient(article, html)) {
        logQualityGateFallback({
            url,
            articleLength: article.textContent.length,
        });
        return false;
    }
    // Check heading structure preservation
    if (!isHeadingStructurePreserved(article, html)) {
        logDebug('Quality gate: Readability broke heading structure, using full HTML', {
            url: url.substring(0, 80),
            originalHeadings: countHeadings(html),
            articleHeadings: countHeadings(article.content),
        });
        return false;
    }
    return true;
}
|
|
1368
1480
|
function resolveContentSource({ html, url, includeMetadata, signal, }) {
|
|
1369
1481
|
const { article, metadata: extractedMeta } = extractContent(html, url, {
|