@j0hanz/superfetch 2.1.8 → 2.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +33 -34
- package/dist/cache.d.ts.map +1 -1
- package/dist/cache.js +57 -14
- package/dist/cache.js.map +1 -1
- package/dist/config.d.ts +3 -0
- package/dist/config.d.ts.map +1 -1
- package/dist/config.js +6 -0
- package/dist/config.js.map +1 -1
- package/dist/errors.d.ts.map +1 -1
- package/dist/errors.js +14 -1
- package/dist/errors.js.map +1 -1
- package/dist/fetch.d.ts.map +1 -1
- package/dist/fetch.js +6 -3
- package/dist/fetch.js.map +1 -1
- package/dist/http.d.ts +1 -1
- package/dist/http.d.ts.map +1 -1
- package/dist/http.js +50 -25
- package/dist/http.js.map +1 -1
- package/dist/index.js +8 -11
- package/dist/index.js.map +1 -1
- package/dist/mcp.d.ts.map +1 -1
- package/dist/mcp.js +6 -5
- package/dist/mcp.js.map +1 -1
- package/dist/observability.d.ts.map +1 -1
- package/dist/observability.js +9 -12
- package/dist/observability.js.map +1 -1
- package/dist/tools.d.ts.map +1 -1
- package/dist/tools.js +45 -32
- package/dist/tools.js.map +1 -1
- package/dist/transform.d.ts +1 -0
- package/dist/transform.d.ts.map +1 -1
- package/dist/transform.js +498 -368
- package/dist/transform.js.map +1 -1
- package/dist/type-guards.js +1 -1
- package/dist/type-guards.js.map +1 -1
- package/package.json +1 -1
package/dist/transform.js
CHANGED
|
@@ -129,83 +129,41 @@ function truncateHtml(html) {
|
|
|
129
129
|
});
|
|
130
130
|
return html.substring(0, maxSize);
|
|
131
131
|
}
|
|
132
|
-
function
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
}
|
|
155
|
-
function parseStandardKey(name) {
|
|
156
|
-
if (name === 'description')
|
|
157
|
-
return 'description';
|
|
158
|
-
if (name === 'author')
|
|
159
|
-
return 'author';
|
|
160
|
-
return null;
|
|
161
|
-
}
|
|
162
|
-
function collectMetaTag(state, tag) {
|
|
163
|
-
const content = tag.getAttribute('content')?.trim();
|
|
164
|
-
if (!content)
|
|
165
|
-
return;
|
|
166
|
-
const ogKey = parseOpenGraphKey(tag.getAttribute('property'));
|
|
167
|
-
if (ogKey) {
|
|
168
|
-
state[ogKey].og = content;
|
|
169
|
-
return;
|
|
170
|
-
}
|
|
171
|
-
const name = tag.getAttribute('name');
|
|
172
|
-
const twitterKey = parseTwitterKey(name);
|
|
173
|
-
if (twitterKey) {
|
|
174
|
-
state[twitterKey].twitter = content;
|
|
175
|
-
return;
|
|
176
|
-
}
|
|
177
|
-
const standardKey = parseStandardKey(name);
|
|
178
|
-
if (standardKey) {
|
|
179
|
-
state[standardKey].standard = content;
|
|
180
|
-
}
|
|
181
|
-
}
|
|
182
|
-
function scanMetaTags(document, state) {
|
|
183
|
-
const metaTags = document.querySelectorAll('meta');
|
|
184
|
-
for (const tag of metaTags) {
|
|
185
|
-
collectMetaTag(state, tag);
|
|
132
|
+
function extractMetadata(document) {
|
|
133
|
+
const title = {};
|
|
134
|
+
const description = {};
|
|
135
|
+
let author;
|
|
136
|
+
for (const tag of document.querySelectorAll('meta')) {
|
|
137
|
+
const content = tag.getAttribute('content')?.trim();
|
|
138
|
+
if (!content)
|
|
139
|
+
continue;
|
|
140
|
+
const property = tag.getAttribute('property');
|
|
141
|
+
const name = tag.getAttribute('name');
|
|
142
|
+
if (property === 'og:title')
|
|
143
|
+
title.og = content;
|
|
144
|
+
else if (property === 'og:description')
|
|
145
|
+
description.og = content;
|
|
146
|
+
else if (name === 'twitter:title')
|
|
147
|
+
title.twitter = content;
|
|
148
|
+
else if (name === 'twitter:description')
|
|
149
|
+
description.twitter = content;
|
|
150
|
+
else if (name === 'description')
|
|
151
|
+
description.standard = content;
|
|
152
|
+
else if (name === 'author')
|
|
153
|
+
author = content;
|
|
186
154
|
}
|
|
187
|
-
}
|
|
188
|
-
function ensureTitleFallback(document, state) {
|
|
189
|
-
if (state.title.standard)
|
|
190
|
-
return;
|
|
191
155
|
const titleEl = document.querySelector('title');
|
|
192
|
-
if (titleEl?.textContent) {
|
|
193
|
-
|
|
156
|
+
if (!title.standard && titleEl?.textContent) {
|
|
157
|
+
title.standard = titleEl.textContent.trim();
|
|
194
158
|
}
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
const state = createMetaCollectorState();
|
|
198
|
-
scanMetaTags(document, state);
|
|
199
|
-
ensureTitleFallback(document, state);
|
|
159
|
+
const resolvedTitle = title.og ?? title.twitter ?? title.standard;
|
|
160
|
+
const resolvedDesc = description.og ?? description.twitter ?? description.standard;
|
|
200
161
|
const metadata = {};
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
if (description !== undefined)
|
|
207
|
-
metadata.description = description;
|
|
208
|
-
if (author !== undefined)
|
|
162
|
+
if (resolvedTitle)
|
|
163
|
+
metadata.title = resolvedTitle;
|
|
164
|
+
if (resolvedDesc)
|
|
165
|
+
metadata.description = resolvedDesc;
|
|
166
|
+
if (author)
|
|
209
167
|
metadata.author = author;
|
|
210
168
|
return metadata;
|
|
211
169
|
}
|
|
@@ -226,66 +184,44 @@ function extractArticle(document) {
|
|
|
226
184
|
logWarn('Document not compatible with Readability');
|
|
227
185
|
return null;
|
|
228
186
|
}
|
|
229
|
-
return mapParsedArticle(parseReadabilityArticle(document));
|
|
230
|
-
}
|
|
231
|
-
function parseReadabilityArticle(document) {
|
|
232
187
|
try {
|
|
233
188
|
const documentClone = document.cloneNode(true);
|
|
234
|
-
const rawText = documentClone.body
|
|
189
|
+
const rawText = documentClone.querySelector('body')?.textContent ??
|
|
235
190
|
documentClone.documentElement.textContent;
|
|
236
191
|
const textLength = rawText.replace(/\s+/g, ' ').trim().length;
|
|
237
192
|
if (textLength >= 400 && !isProbablyReaderable(documentClone)) {
|
|
238
193
|
return null;
|
|
239
194
|
}
|
|
240
195
|
const reader = new Readability(documentClone, { maxElemsToParse: 20_000 });
|
|
241
|
-
|
|
196
|
+
const parsed = reader.parse();
|
|
197
|
+
if (!parsed)
|
|
198
|
+
return null;
|
|
199
|
+
return {
|
|
200
|
+
content: parsed.content ?? '',
|
|
201
|
+
textContent: parsed.textContent ?? '',
|
|
202
|
+
...(parsed.title != null && { title: parsed.title }),
|
|
203
|
+
...(parsed.byline != null && { byline: parsed.byline }),
|
|
204
|
+
...(parsed.excerpt != null && { excerpt: parsed.excerpt }),
|
|
205
|
+
...(parsed.siteName != null && { siteName: parsed.siteName }),
|
|
206
|
+
};
|
|
242
207
|
}
|
|
243
208
|
catch (error) {
|
|
244
|
-
logError('Failed to extract article with Readability',
|
|
209
|
+
logError('Failed to extract article with Readability', error instanceof Error ? error : undefined);
|
|
245
210
|
return null;
|
|
246
211
|
}
|
|
247
212
|
}
|
|
248
|
-
function asError(error) {
|
|
249
|
-
if (error instanceof Error) {
|
|
250
|
-
return error;
|
|
251
|
-
}
|
|
252
|
-
return undefined;
|
|
253
|
-
}
|
|
254
|
-
function mapParsedArticle(parsed) {
|
|
255
|
-
return parsed ? mapReadabilityResult(parsed) : null;
|
|
256
|
-
}
|
|
257
|
-
function mapReadabilityResult(parsed) {
|
|
258
|
-
return {
|
|
259
|
-
content: parsed.content ?? '',
|
|
260
|
-
textContent: parsed.textContent ?? '',
|
|
261
|
-
...buildOptionalArticleFields(parsed),
|
|
262
|
-
};
|
|
263
|
-
}
|
|
264
|
-
function buildOptionalArticleFields(parsed) {
|
|
265
|
-
const optional = {};
|
|
266
|
-
addOptionalField(optional, 'title', parsed.title);
|
|
267
|
-
addOptionalField(optional, 'byline', parsed.byline);
|
|
268
|
-
addOptionalField(optional, 'excerpt', parsed.excerpt);
|
|
269
|
-
addOptionalField(optional, 'siteName', parsed.siteName);
|
|
270
|
-
return optional;
|
|
271
|
-
}
|
|
272
|
-
function addOptionalField(target, key, value) {
|
|
273
|
-
if (value == null)
|
|
274
|
-
return;
|
|
275
|
-
target[key] = value;
|
|
276
|
-
}
|
|
277
213
|
export function extractContent(html, url, options = {
|
|
278
214
|
extractArticle: true,
|
|
279
215
|
}) {
|
|
280
|
-
const
|
|
216
|
+
const result = extractContentWithDocument(html, url, options);
|
|
217
|
+
return { article: result.article, metadata: result.metadata };
|
|
218
|
+
}
|
|
219
|
+
function extractContentWithDocument(html, url, options) {
|
|
281
220
|
if (!isValidInput(html, url)) {
|
|
282
|
-
return
|
|
221
|
+
return { article: null, metadata: {} };
|
|
283
222
|
}
|
|
284
223
|
return tryExtractContent(html, url, options);
|
|
285
224
|
}
|
|
286
|
-
function createEmptyExtractionResult() {
|
|
287
|
-
return { article: null, metadata: {} };
|
|
288
|
-
}
|
|
289
225
|
function extractArticleWithStage(document, url, shouldExtract) {
|
|
290
226
|
if (!shouldExtract)
|
|
291
227
|
return null;
|
|
@@ -297,11 +233,12 @@ function handleExtractionFailure(error, url, signal) {
|
|
|
297
233
|
}
|
|
298
234
|
throwIfAborted(signal, url, 'extract:error');
|
|
299
235
|
logError('Failed to extract content', error instanceof Error ? error : undefined);
|
|
300
|
-
return
|
|
236
|
+
return { article: null, metadata: {} };
|
|
301
237
|
}
|
|
302
238
|
function extractContentStages(html, url, options) {
|
|
303
239
|
throwIfAborted(options.signal, url, 'extract:begin');
|
|
304
|
-
const
|
|
240
|
+
const truncatedHtml = truncateHtml(html);
|
|
241
|
+
const { document } = runTransformStage(url, 'extract:parse', () => parseHTML(truncatedHtml));
|
|
305
242
|
throwIfAborted(options.signal, url, 'extract:parsed');
|
|
306
243
|
applyBaseUri(document, url);
|
|
307
244
|
const metadata = runTransformStage(url, 'extract:metadata', () => extractMetadata(document));
|
|
@@ -311,6 +248,7 @@ function extractContentStages(html, url, options) {
|
|
|
311
248
|
return {
|
|
312
249
|
article,
|
|
313
250
|
metadata,
|
|
251
|
+
...(truncatedHtml.length === html.length ? { document } : {}),
|
|
314
252
|
};
|
|
315
253
|
}
|
|
316
254
|
function tryExtractContent(html, url, options) {
|
|
@@ -325,14 +263,11 @@ function isValidInput(html, url) {
|
|
|
325
263
|
return (validateRequiredString(html, 'extractContent called with invalid HTML input') && validateRequiredString(url, 'extractContent called with invalid URL'));
|
|
326
264
|
}
|
|
327
265
|
function validateRequiredString(value, message) {
|
|
328
|
-
if (
|
|
266
|
+
if (typeof value === 'string' && value.length > 0)
|
|
329
267
|
return true;
|
|
330
268
|
logWarn(message);
|
|
331
269
|
return false;
|
|
332
270
|
}
|
|
333
|
-
function isNonEmptyString(value) {
|
|
334
|
-
return typeof value === 'string' && value.length > 0;
|
|
335
|
-
}
|
|
336
271
|
function resolveArticleExtraction(document, shouldExtract) {
|
|
337
272
|
return shouldExtract ? extractArticle(document) : null;
|
|
338
273
|
}
|
|
@@ -417,7 +352,124 @@ function isWordChar(char) {
|
|
|
417
352
|
(code >= 97 && code <= 122) ||
|
|
418
353
|
char === '_');
|
|
419
354
|
}
|
|
420
|
-
const
|
|
355
|
+
const LANGUAGE_PATTERNS = [
|
|
356
|
+
{
|
|
357
|
+
language: 'jsx',
|
|
358
|
+
pattern: {
|
|
359
|
+
keywords: ['classname=', 'jsx:', "from 'react'", 'from "react"'],
|
|
360
|
+
custom: (code) => containsJsxTag(code),
|
|
361
|
+
},
|
|
362
|
+
},
|
|
363
|
+
{
|
|
364
|
+
language: 'typescript',
|
|
365
|
+
pattern: {
|
|
366
|
+
wordBoundary: ['interface', 'type'],
|
|
367
|
+
custom: (_, lower) => [
|
|
368
|
+
': string',
|
|
369
|
+
':string',
|
|
370
|
+
': number',
|
|
371
|
+
':number',
|
|
372
|
+
': boolean',
|
|
373
|
+
':boolean',
|
|
374
|
+
': void',
|
|
375
|
+
':void',
|
|
376
|
+
': any',
|
|
377
|
+
':any',
|
|
378
|
+
': unknown',
|
|
379
|
+
':unknown',
|
|
380
|
+
': never',
|
|
381
|
+
':never',
|
|
382
|
+
].some((hint) => lower.includes(hint)),
|
|
383
|
+
},
|
|
384
|
+
},
|
|
385
|
+
{
|
|
386
|
+
language: 'rust',
|
|
387
|
+
pattern: {
|
|
388
|
+
regex: /\b(?:fn|impl|struct|enum)\b/,
|
|
389
|
+
keywords: ['let mut'],
|
|
390
|
+
custom: (_, lower) => lower.includes('use ') && lower.includes('::'),
|
|
391
|
+
},
|
|
392
|
+
},
|
|
393
|
+
{
|
|
394
|
+
language: 'javascript',
|
|
395
|
+
pattern: {
|
|
396
|
+
regex: /\b(?:const|let|var|function|class|async|await|export|import)\b/,
|
|
397
|
+
},
|
|
398
|
+
},
|
|
399
|
+
{
|
|
400
|
+
language: 'python',
|
|
401
|
+
pattern: {
|
|
402
|
+
regex: /\b(?:def|class|import|from)\b/,
|
|
403
|
+
keywords: ['print(', '__name__'],
|
|
404
|
+
},
|
|
405
|
+
},
|
|
406
|
+
{
|
|
407
|
+
language: 'bash',
|
|
408
|
+
pattern: {
|
|
409
|
+
custom: (code) => detectBashIndicators(code),
|
|
410
|
+
},
|
|
411
|
+
},
|
|
412
|
+
{
|
|
413
|
+
language: 'css',
|
|
414
|
+
pattern: {
|
|
415
|
+
regex: /@media|@import|@keyframes/,
|
|
416
|
+
custom: (code) => detectCssStructure(code),
|
|
417
|
+
},
|
|
418
|
+
},
|
|
419
|
+
{
|
|
420
|
+
language: 'html',
|
|
421
|
+
pattern: {
|
|
422
|
+
keywords: [
|
|
423
|
+
'<!doctype',
|
|
424
|
+
'<html',
|
|
425
|
+
'<head',
|
|
426
|
+
'<body',
|
|
427
|
+
'<div',
|
|
428
|
+
'<span',
|
|
429
|
+
'<p',
|
|
430
|
+
'<a',
|
|
431
|
+
'<script',
|
|
432
|
+
'<style',
|
|
433
|
+
],
|
|
434
|
+
},
|
|
435
|
+
},
|
|
436
|
+
{
|
|
437
|
+
language: 'json',
|
|
438
|
+
pattern: {
|
|
439
|
+
startsWith: ['{', '['],
|
|
440
|
+
},
|
|
441
|
+
},
|
|
442
|
+
{
|
|
443
|
+
language: 'yaml',
|
|
444
|
+
pattern: {
|
|
445
|
+
custom: (code) => detectYamlStructure(code),
|
|
446
|
+
},
|
|
447
|
+
},
|
|
448
|
+
{
|
|
449
|
+
language: 'sql',
|
|
450
|
+
pattern: {
|
|
451
|
+
wordBoundary: [
|
|
452
|
+
'select',
|
|
453
|
+
'insert',
|
|
454
|
+
'update',
|
|
455
|
+
'delete',
|
|
456
|
+
'create',
|
|
457
|
+
'alter',
|
|
458
|
+
'drop',
|
|
459
|
+
],
|
|
460
|
+
},
|
|
461
|
+
},
|
|
462
|
+
{
|
|
463
|
+
language: 'go',
|
|
464
|
+
pattern: {
|
|
465
|
+
wordBoundary: ['package', 'func'],
|
|
466
|
+
keywords: ['import "'],
|
|
467
|
+
},
|
|
468
|
+
},
|
|
469
|
+
];
|
|
470
|
+
// Bash detection constants
|
|
471
|
+
const BASH_COMMANDS = ['sudo', 'chmod', 'mkdir', 'cd', 'ls', 'cat', 'echo'];
|
|
472
|
+
const BASH_PKG_MANAGERS = [
|
|
421
473
|
'npm',
|
|
422
474
|
'yarn',
|
|
423
475
|
'pnpm',
|
|
@@ -429,184 +481,83 @@ const BASH_PACKAGE_MANAGERS = [
|
|
|
429
481
|
'go',
|
|
430
482
|
];
|
|
431
483
|
const BASH_VERBS = ['install', 'add', 'run', 'build', 'start'];
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
484
|
+
function isShellPrefix(line) {
|
|
485
|
+
return (line.startsWith('#!') || line.startsWith('$ ') || line.startsWith('# '));
|
|
486
|
+
}
|
|
487
|
+
function matchesBashCommand(line) {
|
|
488
|
+
return BASH_COMMANDS.some((cmd) => line === cmd || line.startsWith(`${cmd} `));
|
|
489
|
+
}
|
|
490
|
+
function matchesPackageManagerVerb(line) {
|
|
491
|
+
for (const mgr of BASH_PKG_MANAGERS) {
|
|
492
|
+
if (!line.startsWith(`${mgr} `))
|
|
438
493
|
continue;
|
|
439
|
-
|
|
494
|
+
const rest = line.slice(mgr.length + 1);
|
|
495
|
+
if (BASH_VERBS.some((v) => rest === v || rest.startsWith(`${v} `))) {
|
|
440
496
|
return true;
|
|
497
|
+
}
|
|
441
498
|
}
|
|
442
499
|
return false;
|
|
443
500
|
}
|
|
444
|
-
function
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
return line.startsWith('#!');
|
|
455
|
-
}
|
|
456
|
-
function isPromptLine(line) {
|
|
457
|
-
return line.startsWith('$ ') || line.startsWith('# ');
|
|
458
|
-
}
|
|
459
|
-
function startsWithPackageManagerCommand(line) {
|
|
460
|
-
return BASH_PACKAGE_MANAGERS.some((manager) => {
|
|
461
|
-
if (!line.startsWith(`${manager} `))
|
|
462
|
-
return false;
|
|
463
|
-
const rest = line.slice(manager.length + 1);
|
|
464
|
-
return BASH_VERBS.some((verb) => rest === verb || rest.startsWith(`${verb} `));
|
|
465
|
-
});
|
|
466
|
-
}
|
|
467
|
-
const TYPE_HINTS = [
|
|
468
|
-
'string',
|
|
469
|
-
'number',
|
|
470
|
-
'boolean',
|
|
471
|
-
'void',
|
|
472
|
-
'any',
|
|
473
|
-
'unknown',
|
|
474
|
-
'never',
|
|
475
|
-
];
|
|
476
|
-
const HTML_TAGS = [
|
|
477
|
-
'<!doctype',
|
|
478
|
-
'<html',
|
|
479
|
-
'<head',
|
|
480
|
-
'<body',
|
|
481
|
-
'<div',
|
|
482
|
-
'<span',
|
|
483
|
-
'<p',
|
|
484
|
-
'<a',
|
|
485
|
-
'<script',
|
|
486
|
-
'<style',
|
|
487
|
-
];
|
|
488
|
-
const SQL_KEYWORDS = [
|
|
489
|
-
'select',
|
|
490
|
-
'insert',
|
|
491
|
-
'update',
|
|
492
|
-
'delete',
|
|
493
|
-
'create',
|
|
494
|
-
'alter',
|
|
495
|
-
'drop',
|
|
496
|
-
];
|
|
497
|
-
const JS_WORD_REGEX = /\b(?:const|let|var|function|class|async|await|export|import)\b/;
|
|
498
|
-
const PYTHON_WORD_REGEX = /\b(?:def|class|import|from)\b/;
|
|
499
|
-
const RUST_WORD_REGEX = /\b(?:fn|impl|struct|enum)\b/;
|
|
500
|
-
const CSS_DIRECTIVE_REGEX = /@media|@import|@keyframes/;
|
|
501
|
-
const CODE_DETECTORS = [
|
|
502
|
-
{ language: 'jsx', detect: detectJsx },
|
|
503
|
-
{ language: 'typescript', detect: detectTypescript },
|
|
504
|
-
{ language: 'rust', detect: detectRust },
|
|
505
|
-
{ language: 'javascript', detect: detectJavascript },
|
|
506
|
-
{ language: 'python', detect: detectPython },
|
|
507
|
-
{ language: 'bash', detect: detectBash },
|
|
508
|
-
{ language: 'css', detect: detectCss },
|
|
509
|
-
{ language: 'html', detect: detectHtml },
|
|
510
|
-
{ language: 'json', detect: detectJson },
|
|
511
|
-
{ language: 'yaml', detect: detectYaml },
|
|
512
|
-
{ language: 'sql', detect: detectSql },
|
|
513
|
-
{ language: 'go', detect: detectGo },
|
|
514
|
-
];
|
|
515
|
-
function detectJsx(code) {
|
|
516
|
-
const lower = code.toLowerCase();
|
|
517
|
-
if (lower.includes('classname='))
|
|
518
|
-
return true;
|
|
519
|
-
if (lower.includes('jsx:'))
|
|
520
|
-
return true;
|
|
521
|
-
if (lower.includes("from 'react'") || lower.includes('from "react"')) {
|
|
522
|
-
return true;
|
|
501
|
+
function detectBashIndicators(code) {
|
|
502
|
+
for (const line of splitLines(code)) {
|
|
503
|
+
const trimmed = line.trimStart();
|
|
504
|
+
if (!trimmed)
|
|
505
|
+
continue;
|
|
506
|
+
if (isShellPrefix(trimmed) ||
|
|
507
|
+
matchesBashCommand(trimmed) ||
|
|
508
|
+
matchesPackageManagerVerb(trimmed)) {
|
|
509
|
+
return true;
|
|
510
|
+
}
|
|
523
511
|
}
|
|
524
|
-
return
|
|
525
|
-
}
|
|
526
|
-
function detectTypescript(code) {
|
|
527
|
-
const lower = code.toLowerCase();
|
|
528
|
-
if (containsWord(lower, 'interface'))
|
|
529
|
-
return true;
|
|
530
|
-
if (containsWord(lower, 'type'))
|
|
531
|
-
return true;
|
|
532
|
-
return TYPE_HINTS.some((hint) => lower.includes(`: ${hint}`) || lower.includes(`:${hint}`));
|
|
533
|
-
}
|
|
534
|
-
function detectRust(code) {
|
|
535
|
-
const lower = code.toLowerCase();
|
|
536
|
-
return (RUST_WORD_REGEX.test(lower) ||
|
|
537
|
-
lower.includes('let mut') ||
|
|
538
|
-
(lower.includes('use ') && lower.includes('::')));
|
|
539
|
-
}
|
|
540
|
-
function detectJavascript(code) {
|
|
541
|
-
const lower = code.toLowerCase();
|
|
542
|
-
return JS_WORD_REGEX.test(lower);
|
|
543
|
-
}
|
|
544
|
-
function detectPython(code) {
|
|
545
|
-
const lower = code.toLowerCase();
|
|
546
|
-
return (PYTHON_WORD_REGEX.test(lower) ||
|
|
547
|
-
lower.includes('print(') ||
|
|
548
|
-
lower.includes('__name__'));
|
|
512
|
+
return false;
|
|
549
513
|
}
|
|
550
|
-
function
|
|
551
|
-
const
|
|
552
|
-
if (CSS_DIRECTIVE_REGEX.test(lower))
|
|
553
|
-
return true;
|
|
554
|
-
const lines = splitLines(code);
|
|
555
|
-
for (const line of lines) {
|
|
514
|
+
function detectCssStructure(code) {
|
|
515
|
+
for (const line of splitLines(code)) {
|
|
556
516
|
const trimmed = line.trimStart();
|
|
557
517
|
if (!trimmed)
|
|
558
518
|
continue;
|
|
559
|
-
|
|
519
|
+
const isSelector = (trimmed.startsWith('.') || trimmed.startsWith('#')) &&
|
|
520
|
+
trimmed.includes('{');
|
|
521
|
+
const isProperty = trimmed.includes(':') && trimmed.includes(';');
|
|
522
|
+
if (isSelector || isProperty)
|
|
560
523
|
return true;
|
|
561
524
|
}
|
|
562
525
|
return false;
|
|
563
526
|
}
|
|
564
|
-
function
|
|
565
|
-
const
|
|
566
|
-
return HTML_TAGS.some((tag) => lower.includes(tag));
|
|
567
|
-
}
|
|
568
|
-
function detectJson(code) {
|
|
569
|
-
const trimmed = code.trimStart();
|
|
570
|
-
if (!trimmed)
|
|
571
|
-
return false;
|
|
572
|
-
return trimmed.startsWith('{') || trimmed.startsWith('[');
|
|
573
|
-
}
|
|
574
|
-
function detectYaml(code) {
|
|
575
|
-
const lines = splitLines(code);
|
|
576
|
-
for (const line of lines) {
|
|
527
|
+
function detectYamlStructure(code) {
|
|
528
|
+
for (const line of splitLines(code)) {
|
|
577
529
|
const trimmed = line.trim();
|
|
578
530
|
if (!trimmed)
|
|
579
531
|
continue;
|
|
580
|
-
const
|
|
581
|
-
if (
|
|
532
|
+
const colonIdx = trimmed.indexOf(':');
|
|
533
|
+
if (colonIdx <= 0)
|
|
582
534
|
continue;
|
|
583
|
-
const after = trimmed[
|
|
535
|
+
const after = trimmed[colonIdx + 1];
|
|
584
536
|
if (after === ' ' || after === '\t')
|
|
585
537
|
return true;
|
|
586
538
|
}
|
|
587
539
|
return false;
|
|
588
540
|
}
|
|
589
|
-
function
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
function isCssPropertyLine(line) {
|
|
605
|
-
return line.includes(':') && line.includes(';');
|
|
541
|
+
function matchesLanguagePattern(code, lower, pattern) {
|
|
542
|
+
if (pattern.keywords?.some((kw) => lower.includes(kw)))
|
|
543
|
+
return true;
|
|
544
|
+
if (pattern.wordBoundary?.some((w) => containsWord(lower, w)))
|
|
545
|
+
return true;
|
|
546
|
+
if (pattern.regex?.test(lower))
|
|
547
|
+
return true;
|
|
548
|
+
if (pattern.startsWith) {
|
|
549
|
+
const trimmed = code.trimStart();
|
|
550
|
+
if (pattern.startsWith.some((prefix) => trimmed.startsWith(prefix)))
|
|
551
|
+
return true;
|
|
552
|
+
}
|
|
553
|
+
if (pattern.custom?.(code, lower))
|
|
554
|
+
return true;
|
|
555
|
+
return false;
|
|
606
556
|
}
|
|
607
557
|
export function detectLanguageFromCode(code) {
|
|
608
|
-
|
|
609
|
-
|
|
558
|
+
const lower = code.toLowerCase();
|
|
559
|
+
for (const { language, pattern } of LANGUAGE_PATTERNS) {
|
|
560
|
+
if (matchesLanguagePattern(code, lower, pattern))
|
|
610
561
|
return language;
|
|
611
562
|
}
|
|
612
563
|
return undefined;
|
|
@@ -630,6 +581,7 @@ const STRUCTURAL_TAGS = new Set([
|
|
|
630
581
|
'input',
|
|
631
582
|
'select',
|
|
632
583
|
'textarea',
|
|
584
|
+
'svg',
|
|
633
585
|
]);
|
|
634
586
|
const ALWAYS_NOISE_TAGS = new Set(['nav', 'footer', 'aside']);
|
|
635
587
|
const NAVIGATION_ROLES = new Set([
|
|
@@ -642,6 +594,7 @@ const NAVIGATION_ROLES = new Set([
|
|
|
642
594
|
'menu',
|
|
643
595
|
'dialog',
|
|
644
596
|
'alertdialog',
|
|
597
|
+
'search',
|
|
645
598
|
]);
|
|
646
599
|
const PROMO_TOKENS = new Set([
|
|
647
600
|
'banner',
|
|
@@ -669,6 +622,7 @@ const PROMO_TOKENS = new Set([
|
|
|
669
622
|
'breadcrumb',
|
|
670
623
|
'pagination',
|
|
671
624
|
'pager',
|
|
625
|
+
'taglist',
|
|
672
626
|
]);
|
|
673
627
|
const HEADER_NOISE_PATTERN = /\b(site-header|masthead|topbar|navbar|nav(?:bar)?|menu|header-nav)\b/i;
|
|
674
628
|
const FIXED_PATTERN = /\b(fixed|sticky)\b/;
|
|
@@ -727,6 +681,8 @@ const NOISE_MARKERS = [
|
|
|
727
681
|
' z-50',
|
|
728
682
|
' z-4',
|
|
729
683
|
' isolate',
|
|
684
|
+
' breadcrumb',
|
|
685
|
+
' pagination',
|
|
730
686
|
];
|
|
731
687
|
function mayContainNoise(html) {
|
|
732
688
|
const haystack = html.toLowerCase();
|
|
@@ -760,11 +716,9 @@ function matchesPromoIdOrClass(className, id) {
|
|
|
760
716
|
const tokens = tokenizeIdentifierLikeText(`${className} ${id}`);
|
|
761
717
|
return tokens.some((token) => PROMO_TOKENS.has(token));
|
|
762
718
|
}
|
|
763
|
-
function matchesHighZIsolate(className) {
|
|
764
|
-
return HIGH_Z_PATTERN.test(className) && ISOLATE_PATTERN.test(className);
|
|
765
|
-
}
|
|
766
719
|
function matchesFixedOrHighZIsolate(className) {
|
|
767
|
-
return FIXED_PATTERN.test(className) ||
|
|
720
|
+
return (FIXED_PATTERN.test(className) ||
|
|
721
|
+
(HIGH_Z_PATTERN.test(className) && ISOLATE_PATTERN.test(className)));
|
|
768
722
|
}
|
|
769
723
|
function readElementMetadata(element) {
|
|
770
724
|
return {
|
|
@@ -791,8 +745,7 @@ function isNoiseElement(node) {
|
|
|
791
745
|
matchesFixedOrHighZIsolate(metadata.className) ||
|
|
792
746
|
matchesPromoIdOrClass(metadata.className, metadata.id));
|
|
793
747
|
}
|
|
794
|
-
function
|
|
795
|
-
const nodes = document.querySelectorAll('*');
|
|
748
|
+
function removeNoiseNodes(nodes) {
|
|
796
749
|
for (let index = nodes.length - 1; index >= 0; index -= 1) {
|
|
797
750
|
const node = typeof nodes.item === 'function' ? nodes.item(index) : nodes[index];
|
|
798
751
|
if (!node)
|
|
@@ -802,20 +755,54 @@ function stripNoiseNodes(document) {
|
|
|
802
755
|
}
|
|
803
756
|
}
|
|
804
757
|
}
|
|
805
|
-
function
|
|
758
|
+
function stripNoiseNodes(document) {
|
|
759
|
+
// Use targeted selectors for common noise elements instead of querySelectorAll('*')
|
|
760
|
+
const targetSelectors = [
|
|
761
|
+
'nav',
|
|
762
|
+
'footer',
|
|
763
|
+
'aside',
|
|
764
|
+
'header[class*="site"]',
|
|
765
|
+
'header[class*="nav"]',
|
|
766
|
+
'header[class*="menu"]',
|
|
767
|
+
'[role="banner"]',
|
|
768
|
+
'[role="navigation"]',
|
|
769
|
+
'[role="dialog"]',
|
|
770
|
+
'[style*="display: none"]',
|
|
771
|
+
'[style*="display:none"]',
|
|
772
|
+
'[hidden]',
|
|
773
|
+
'[aria-hidden="true"]',
|
|
774
|
+
].join(',');
|
|
775
|
+
const potentialNoiseNodes = document.querySelectorAll(targetSelectors);
|
|
776
|
+
// Remove in reverse order to handle nested elements correctly
|
|
777
|
+
removeNoiseNodes(potentialNoiseNodes);
|
|
778
|
+
// Second pass: check remaining elements for noise patterns (promo, fixed positioning, etc.)
|
|
779
|
+
const candidateSelectors = [
|
|
780
|
+
...STRUCTURAL_TAGS,
|
|
781
|
+
...ALWAYS_NOISE_TAGS,
|
|
782
|
+
'header',
|
|
783
|
+
'canvas',
|
|
784
|
+
'[class]',
|
|
785
|
+
'[id]',
|
|
786
|
+
'[role]',
|
|
787
|
+
'[style]',
|
|
788
|
+
].join(',');
|
|
789
|
+
const allElements = document.querySelectorAll(candidateSelectors);
|
|
790
|
+
removeNoiseNodes(allElements);
|
|
791
|
+
}
|
|
792
|
+
function removeNoiseFromHtml(html, document) {
|
|
806
793
|
const shouldParse = isFullDocumentHtml(html) || mayContainNoise(html);
|
|
807
794
|
if (!shouldParse)
|
|
808
795
|
return html;
|
|
809
796
|
try {
|
|
810
|
-
const
|
|
811
|
-
stripNoiseNodes(
|
|
812
|
-
const bodyInnerHtml = getBodyInnerHtml(
|
|
797
|
+
const resolvedDocument = document ?? parseHTML(html).document;
|
|
798
|
+
stripNoiseNodes(resolvedDocument);
|
|
799
|
+
const bodyInnerHtml = getBodyInnerHtml(resolvedDocument);
|
|
813
800
|
if (bodyInnerHtml)
|
|
814
801
|
return bodyInnerHtml;
|
|
815
|
-
const docToString = getDocumentToString(
|
|
802
|
+
const docToString = getDocumentToString(resolvedDocument);
|
|
816
803
|
if (docToString)
|
|
817
804
|
return docToString();
|
|
818
|
-
const documentElementOuterHtml = getDocumentElementOuterHtml(
|
|
805
|
+
const documentElementOuterHtml = getDocumentElementOuterHtml(resolvedDocument);
|
|
819
806
|
if (documentElementOuterHtml)
|
|
820
807
|
return documentElementOuterHtml;
|
|
821
808
|
return html;
|
|
@@ -826,7 +813,14 @@ function removeNoiseFromHtml(html) {
|
|
|
826
813
|
}
|
|
827
814
|
function buildInlineCode(content) {
|
|
828
815
|
const runs = content.match(/`+/g);
|
|
829
|
-
|
|
816
|
+
let longest = '';
|
|
817
|
+
if (runs) {
|
|
818
|
+
for (const run of runs) {
|
|
819
|
+
if (run.length > longest.length) {
|
|
820
|
+
longest = run;
|
|
821
|
+
}
|
|
822
|
+
}
|
|
823
|
+
}
|
|
830
824
|
const delimiter = `\`${longest}`;
|
|
831
825
|
const padding = delimiter.length > 1 ? ' ' : '';
|
|
832
826
|
return `${delimiter}${padding}${content}${padding}${delimiter}`;
|
|
@@ -977,17 +971,14 @@ function getMarkdownConverter() {
|
|
|
977
971
|
markdownInstance ??= createMarkdownInstance();
|
|
978
972
|
return markdownInstance;
|
|
979
973
|
}
|
|
980
|
-
function translateHtmlToMarkdown(html, url, signal) {
|
|
974
|
+
function translateHtmlToMarkdown(html, url, signal, document) {
|
|
981
975
|
throwIfAborted(signal, url, 'markdown:begin');
|
|
982
|
-
const cleanedHtml = runTransformStage(url, 'markdown:noise', () => removeNoiseFromHtml(html));
|
|
976
|
+
const cleanedHtml = runTransformStage(url, 'markdown:noise', () => removeNoiseFromHtml(html, document));
|
|
983
977
|
throwIfAborted(signal, url, 'markdown:cleaned');
|
|
984
978
|
const content = runTransformStage(url, 'markdown:translate', () => getMarkdownConverter().translate(cleanedHtml).trim());
|
|
985
979
|
throwIfAborted(signal, url, 'markdown:translated');
|
|
986
|
-
|
|
987
|
-
|
|
988
|
-
finalMarkdown = normalizeTableWhitespace(finalMarkdown);
|
|
989
|
-
finalMarkdown = normalizeLineEndings(finalMarkdown);
|
|
990
|
-
return finalMarkdown;
|
|
980
|
+
const cleaned = cleanupMarkdownArtifacts(content);
|
|
981
|
+
return promoteOrphanHeadings(cleaned);
|
|
991
982
|
}
|
|
992
983
|
function appendMetadataFooter(content, metadata, url) {
|
|
993
984
|
const footer = buildMetadataFooter(metadata, url);
|
|
@@ -998,7 +989,7 @@ export function htmlToMarkdown(html, metadata, options) {
|
|
|
998
989
|
if (!html)
|
|
999
990
|
return buildMetadataFooter(metadata, url);
|
|
1000
991
|
try {
|
|
1001
|
-
const content = translateHtmlToMarkdown(html, url, options?.signal);
|
|
992
|
+
const content = translateHtmlToMarkdown(html, url, options?.signal, options?.document);
|
|
1002
993
|
return appendMetadataFooter(content, metadata, url);
|
|
1003
994
|
}
|
|
1004
995
|
catch (error) {
|
|
@@ -1010,37 +1001,146 @@ export function htmlToMarkdown(html, metadata, options) {
|
|
|
1010
1001
|
}
|
|
1011
1002
|
function cleanupMarkdownArtifacts(content) {
|
|
1012
1003
|
let result = content;
|
|
1004
|
+
const fixOrphanHeadings = (text) => {
|
|
1005
|
+
return text.replace(/^(.*?)(#{1,6})\s*(?:\r?\n){2}([A-Z][^\r\n]+?)(?:\r?\n)/gm, (match, prefix, hashes, heading) => {
|
|
1006
|
+
if (typeof prefix !== 'string' ||
|
|
1007
|
+
typeof hashes !== 'string' ||
|
|
1008
|
+
typeof heading !== 'string') {
|
|
1009
|
+
return match;
|
|
1010
|
+
}
|
|
1011
|
+
if (heading.length > 150) {
|
|
1012
|
+
return match;
|
|
1013
|
+
}
|
|
1014
|
+
const trimmedPrefix = prefix.trim();
|
|
1015
|
+
if (trimmedPrefix === '') {
|
|
1016
|
+
return `${hashes} ${heading}\n\n`;
|
|
1017
|
+
}
|
|
1018
|
+
return `${trimmedPrefix}\n\n${hashes} ${heading}\n\n`;
|
|
1019
|
+
});
|
|
1020
|
+
};
|
|
1021
|
+
result = fixOrphanHeadings(result);
|
|
1013
1022
|
result = result.replace(/^#{1,6}[ \t\u00A0]*$\r?\n?/gm, '');
|
|
1014
1023
|
const zeroWidthAnchorLink = /\[(?:\s|\u200B)*\]\(#[^)]*\)\s*/g;
|
|
1015
1024
|
result = result.replace(zeroWidthAnchorLink, '');
|
|
1025
|
+
result = result.replace(/^\[Skip to (?:main )?content\]\(#[^)]*\)\s*$/gim, '');
|
|
1026
|
+
result = result.replace(/^\[Skip to (?:main )?navigation\]\(#[^)]*\)\s*$/gim, '');
|
|
1027
|
+
result = result.replace(/^\[Skip link\]\(#[^)]*\)\s*$/gim, '');
|
|
1028
|
+
result = result.replace(/(^#{1,6}\s+\w+)```/gm, '$1\n\n```');
|
|
1029
|
+
result = result.replace(/(^#{1,6}\s+\w*[A-Z])([A-Z][a-z])/gm, '$1\n\n$2');
|
|
1030
|
+
result = result.replace(/(^#{1,6}\s[^\n]*)\n([^\n])/gm, '$1\n\n$2');
|
|
1031
|
+
const tocLinkLine = /^- \[[^\]]+\]\(#[^)]+\)\s*$/;
|
|
1032
|
+
const lines = result.split('\n');
|
|
1033
|
+
const filtered = [];
|
|
1034
|
+
let skipTocBlock = false;
|
|
1035
|
+
for (let i = 0; i < lines.length; i += 1) {
|
|
1036
|
+
const line = lines[i] ?? '';
|
|
1037
|
+
const prevLine = i > 0 ? (lines[i - 1] ?? '') : '';
|
|
1038
|
+
const nextLine = i < lines.length - 1 ? (lines[i + 1] ?? '') : '';
|
|
1039
|
+
if (tocLinkLine.test(line)) {
|
|
1040
|
+
const prevIsToc = tocLinkLine.test(prevLine) || prevLine.trim() === '';
|
|
1041
|
+
const nextIsToc = tocLinkLine.test(nextLine) || nextLine.trim() === '';
|
|
1042
|
+
if (prevIsToc || nextIsToc) {
|
|
1043
|
+
skipTocBlock = true;
|
|
1044
|
+
continue;
|
|
1045
|
+
}
|
|
1046
|
+
}
|
|
1047
|
+
else if (line.trim() === '' && skipTocBlock) {
|
|
1048
|
+
skipTocBlock = false;
|
|
1049
|
+
continue;
|
|
1050
|
+
}
|
|
1051
|
+
else {
|
|
1052
|
+
skipTocBlock = false;
|
|
1053
|
+
}
|
|
1054
|
+
filtered.push(line);
|
|
1055
|
+
}
|
|
1056
|
+
result = filtered.join('\n');
|
|
1016
1057
|
result = result.replace(/\]\(([^)]+)\)\[/g, ']($1)\n\n[');
|
|
1017
1058
|
result = result.replace(/^Was this page helpful\??\s*$/gim, '');
|
|
1059
|
+
result = result.replace(/(`[^`]+`)\s*\\-\s*/g, '$1 - ');
|
|
1060
|
+
result = result.replace(/\\([[]])/g, '$1');
|
|
1061
|
+
result = result.replace(/([^\n])\n([-*+] )/g, '$1\n\n$2');
|
|
1062
|
+
result = result.replace(/(\S)\n(\d+\. )/g, '$1\n\n$2');
|
|
1018
1063
|
result = result.replace(/\n{3,}/g, '\n\n');
|
|
1019
1064
|
return result.trim();
|
|
1020
1065
|
}
|
|
1021
|
-
|
|
1022
|
-
|
|
1023
|
-
|
|
1024
|
-
|
|
1025
|
-
|
|
1026
|
-
|
|
1027
|
-
|
|
1028
|
-
|
|
1029
|
-
|
|
1030
|
-
|
|
1066
|
+
/**
 * Lower-case single words that `isLikelyHeadingLine` accepts as a heading
 * when the word stands alone on a line and starts with an upper-case letter.
 */
const HEADING_KEYWORDS = new Set([
    'overview',
    'introduction',
    'summary',
    'conclusion',
    'prerequisites',
    'requirements',
    'installation',
    'configuration',
    'usage',
    'features',
    'limitations',
    'troubleshooting',
    'faq',
    'resources',
    'references',
    'changelog',
    'license',
    'acknowledgments',
    'appendix',
]);

/**
 * Heuristically decides whether a plain-text line reads like a section title.
 *
 * @param {string} line - A single line of Markdown text.
 * @returns {boolean} `true` when the line looks like a heading, otherwise `false`.
 */
function isLikelyHeadingLine(line) {
    const text = line.trim();
    if (text.length === 0 || text.length > 80) {
        return false;
    }
    // Anything already structured (heading, list item, link) or ending like a
    // sentence is never treated as a heading candidate.
    const disqualifiers = [
        /^#{1,6}\s/,
        /^[-*+•]\s/,
        /^\d+\.\s/,
        /[.!?]$/,
        /^\[.*\]\(.*\)$/,
    ];
    if (disqualifiers.some((pattern) => pattern.test(text))) {
        return false;
    }
    // Labelled call-outs such as "Note: ..." always qualify.
    if (/^(?:example|note|tip|warning|important|caution):\s+\S/i.test(text)) {
        return true;
    }
    const tokens = text.split(/\s+/);
    if (tokens.length === 1) {
        // A lone word qualifies only if it is a known, capitalised keyword.
        return /^[A-Z]/.test(text) && HEADING_KEYWORDS.has(text.toLowerCase());
    }
    if (tokens.length > 6) {
        return false;
    }
    // Two to six words: each must be Capitalised or a small connecting word.
    return tokens.every((token) => /^[A-Z][a-z]*$/.test(token) ||
        /^(?:and|or|the|of|in|for|to|a)$/i.test(token));
}
|
|
1032
|
-
function
|
|
1033
|
-
|
|
1116
|
+
/**
 * Promotes heading-like plain-text lines to Markdown headings.
 *
 * A line is promoted when the previous line is blank (or it is the first
 * line) and `isLikelyHeadingLine` accepts it. Lines starting with
 * "Example:" become `###` headings; everything else becomes `##`.
 *
 * Lines inside fenced code blocks (``` or ~~~) are passed through untouched
 * so that code samples are never rewritten into headings.
 *
 * @param {string} markdown - Markdown text to process.
 * @returns {string} The Markdown with qualifying lines promoted to headings.
 */
function promoteOrphanHeadings(markdown) {
    const lines = markdown.split('\n');
    const result = [];
    // Fence character (` or ~) of the code block currently open, or null.
    let openFence = null;
    for (let i = 0; i < lines.length; i += 1) {
        const line = lines[i] ?? '';
        const fence = /^\s*(`{3,}|~{3,})/.exec(line);
        if (fence) {
            const marker = fence[1][0];
            if (openFence === null) {
                openFence = marker;
            }
            else if (openFence === marker) {
                openFence = null;
            }
            result.push(line);
            continue;
        }
        if (openFence !== null) {
            // Inside a code block: never reinterpret code as prose.
            result.push(line);
            continue;
        }
        const prevLine = i > 0 ? (lines[i - 1] ?? '') : '';
        if (prevLine.trim() === '' && isLikelyHeadingLine(line)) {
            const trimmed = line.trim();
            const prefix = /^example:\s/i.test(trimmed) ? '### ' : '## ';
            result.push(`${prefix}${trimmed}`);
        }
        else {
            result.push(line);
        }
    }
    return result.join('\n');
}
|
|
1035
1137
|
function formatFetchedDate(isoString) {
|
|
1036
1138
|
try {
|
|
1037
1139
|
const date = new Date(isoString);
|
|
1038
|
-
const
|
|
1039
|
-
|
|
1040
|
-
|
|
1041
|
-
|
|
1042
|
-
};
|
|
1043
|
-
return date.toLocaleDateString('en-US', options);
|
|
1140
|
+
const day = String(date.getDate()).padStart(2, '0');
|
|
1141
|
+
const month = String(date.getMonth() + 1).padStart(2, '0');
|
|
1142
|
+
const year = date.getFullYear();
|
|
1143
|
+
return `${day}-${month}-${year}`;
|
|
1044
1144
|
}
|
|
1045
1145
|
catch {
|
|
1046
1146
|
return isoString;
|
|
@@ -1049,20 +1149,24 @@ function formatFetchedDate(isoString) {
|
|
|
1049
1149
|
/**
 * Renders the Markdown footer appended after converted content.
 *
 * The footer starts with a horizontal rule, followed by a single line of
 * ` | `-separated italic fields (title, author, source link, fetch date) and
 * an optional `<sub>` description line.
 *
 * @param {object|undefined|null} metadata - Metadata block; falsy yields ''.
 * @param {string} [fallbackUrl] - URL used when `metadata.url` is falsy.
 * @returns {string} The footer text, or '' when there is no metadata.
 */
function buildMetadataFooter(metadata, fallbackUrl) {
    if (!metadata) {
        return '';
    }
    const { title, author, fetchedAt, description } = metadata;
    const sourceUrl = metadata.url || fallbackUrl;
    const segments = [
        title ? `_${title}_` : '',
        author ? `_${author}_` : '',
        sourceUrl ? `[_Original Source_](${sourceUrl})` : '',
        fetchedAt ? `_${formatFetchedDate(fetchedAt)}_` : '',
    ].filter((segment) => segment !== '');
    const footer = ['---', ''];
    if (segments.length > 0) {
        footer.push(` ${segments.join(' | ')}`);
    }
    if (description) {
        footer.push(` <sub>${description}</sub>`);
    }
    return footer.join('\n');
}
|
|
@@ -1277,78 +1381,95 @@ function tryTransformRawContent({ html, url, includeMetadata, }) {
|
|
|
1277
1381
|
}
|
|
1278
1382
|
// Minimum ratio of extracted article text length to the original page's
// tag-stripped text length for the extraction to count as sufficient.
const MIN_CONTENT_RATIO = 0.3;
|
|
1279
1383
|
// Pages whose tag-stripped, whitespace-collapsed text is shorter than this
// many characters bypass the content-ratio check and are treated as sufficient.
const MIN_HTML_LENGTH_FOR_GATE = 100;
|
|
1280
|
-
|
|
1281
|
-
|
|
1384
|
+
// Minimum fraction of the original page's headings that the extracted
// article must retain for its heading structure to count as preserved.
const MIN_HEADING_RETENTION_RATIO = 0.7;
|
|
1385
|
+
/**
 * Counts heading elements (`<h1>` through `<h6>`) in an HTML string.
 *
 * Headings are counted by their opening tag, so headings whose content
 * contains nested markup (anchors, `<code>`, `<span>`, ...) are included.
 *
 * @param {string|undefined|null} html - HTML to inspect.
 * @returns {number} Number of heading opening tags found; 0 for empty input.
 */
function countHeadings(html) {
    if (!html) {
        return 0;
    }
    // Match opening heading tags <h1> through <h6>, with or without attributes.
    const openingHeadingTag = /<h[1-6](?:\s[^>]*)?>/gi;
    return html.match(openingHeadingTag)?.length ?? 0;
}
|
|
1393
|
+
/**
 * Checks whether the extracted article kept enough of the page's headings.
 *
 * @param {object|null|undefined} article - Extracted article; its `content`
 *   HTML is scanned for headings.
 * @param {string} originalHtml - The full original HTML of the page.
 * @returns {boolean} `false` when there is no article; `true` when the
 *   original has no headings or the article retains at least
 *   `MIN_HEADING_RETENTION_RATIO` of them.
 */
function isHeadingStructurePreserved(article, originalHtml) {
    if (!article) {
        return false;
    }
    const originalHeadingCount = countHeadings(originalHtml);
    // If the original has no headings, structure is trivially preserved and
    // there is no need to scan the article at all.
    if (originalHeadingCount === 0) {
        return true;
    }
    const articleHeadingCount = countHeadings(article.content);
    // Structure counts as broken once the share of retained headings drops
    // below MIN_HEADING_RETENTION_RATIO.
    return articleHeadingCount / originalHeadingCount >= MIN_HEADING_RETENTION_RATIO;
}
|
|
1406
|
+
/**
 * Removes markup from an HTML string so that its text length can be measured.
 *
 * Everything from a `<` up to and including the next `>` is dropped (or up to
 * the end of the input when no `>` follows), and any `>` that appears outside
 * a tag is dropped as well. No entity decoding is performed.
 *
 * @param {string} html - HTML to strip.
 * @returns {string} The characters that lie outside of tags.
 */
function stripHtmlTagsForLength(html) {
    // First alternative: a tag, possibly unterminated. Second: a stray '>'.
    return html.replace(/<[^>]*>?|>/g, '');
}
|
|
1301
1422
|
/**
 * Length-based quality gate for an extracted article.
 *
 * Compares the article's text length with the length of the original page's
 * tag-stripped, whitespace-collapsed text.
 *
 * @param {object|null|undefined} article - Extracted article exposing `textContent`.
 * @param {string} originalHtml - The full original HTML of the page.
 * @returns {boolean} `false` without an article; `true` when the original text
 *   is shorter than `MIN_HTML_LENGTH_FOR_GATE` or the length ratio reaches
 *   `MIN_CONTENT_RATIO`.
 */
export function isExtractionSufficient(article, originalHtml) {
    if (!article) {
        return false;
    }
    const extractedLength = article.textContent.length;
    const visibleText = stripHtmlTagsForLength(originalHtml);
    const originalLength = visibleText.replace(/\s+/g, ' ').trim().length;
    // Very small pages skip the ratio check entirely.
    return (originalLength < MIN_HTML_LENGTH_FOR_GATE ||
        extractedLength / originalLength >= MIN_CONTENT_RATIO);
}
|
|
1310
1433
|
/**
 * Reports whether an extracted article is available to serve as the content
 * source.
 *
 * Both `null` and `undefined` count as "no article", matching the `!article`
 * guards used by the other helpers in this file.
 *
 * @param {object|null|undefined} article - Result of article extraction.
 * @returns {boolean} `true` when an article object is present.
 */
export function determineContentExtractionSource(article) {
    // Loose comparison on purpose: `!= null` rejects both null and undefined.
    return article != null;
}
|
|
1329
1436
|
/**
 * Builds the metadata block attached to converted content.
 *
 * Title and author come from the article when `shouldExtractFromArticle` is
 * set and an article exists; otherwise title, description and author come
 * from `extractedMeta`. Only fields that are not `undefined` are copied.
 *
 * @param {string} url - URL of the fetched page.
 * @param {object|null|undefined} article - Extracted article (`title`, `byline`).
 * @param {object} extractedMeta - Page metadata (`title`, `description`, `author`).
 * @param {boolean} shouldExtractFromArticle - Prefer the article's fields.
 * @param {boolean} includeMetadata - When falsy, no block is produced.
 * @returns {object|undefined} The metadata block, or `undefined` when
 *   `includeMetadata` is falsy.
 */
export function createContentMetadataBlock(url, article, extractedMeta, shouldExtractFromArticle, includeMetadata) {
    if (!includeMetadata) {
        return undefined;
    }
    const block = {
        type: 'metadata',
        url,
        fetchedAt: new Date().toISOString(),
    };
    // Ordered [field, value] pairs from whichever source is in effect.
    const candidates = shouldExtractFromArticle && article
        ? [
            ['title', article.title],
            ['author', article.byline],
        ]
        : [
            ['title', extractedMeta.title],
            ['description', extractedMeta.description],
            ['author', extractedMeta.author],
        ];
    for (const [field, value] of candidates) {
        if (value !== undefined) {
            block[field] = value;
        }
    }
    return block;
}
|
|
1345
|
-
function buildContentSource({ html, url, article, extractedMeta, includeMetadata, useArticleContent, }) {
|
|
1462
|
+
/**
 * Assembles the source object that is handed to the Markdown conversion.
 *
 * When `useArticleContent` is set and an article exists, HTML and title come
 * from the article; otherwise the raw page HTML and the extracted metadata
 * title are used. The `document` is attached only when the article content
 * is not being used and a document was supplied.
 *
 * @param {object} params
 * @param {string} params.html - Raw page HTML.
 * @param {string} params.url - URL of the page.
 * @param {object|null|undefined} params.article - Extracted article.
 * @param {object} params.extractedMeta - Extracted page metadata.
 * @param {boolean} params.includeMetadata - Whether to build a metadata block.
 * @param {boolean} params.useArticleContent - Whether to prefer the article.
 * @param {object} [params.document] - Optional document for the raw HTML.
 * @returns {object} `{ sourceHtml, title, metadata }`, plus `document` when applicable.
 */
function buildContentSource({ html, url, article, extractedMeta, includeMetadata, useArticleContent, document, }) {
    const metadata = createContentMetadataBlock(url, article, extractedMeta, useArticleContent, includeMetadata);
    const fromArticle = Boolean(useArticleContent && article);
    const source = fromArticle
        ? { sourceHtml: article.content, title: article.title, metadata }
        : { sourceHtml: html, title: extractedMeta.title, metadata };
    return !useArticleContent && document ? { ...source, document } : source;
}
|
|
1353
1474
|
function logQualityGateFallback({ url, articleLength, }) {
|
|
1354
1475
|
logDebug('Quality gate: Readability extraction below threshold, using full HTML', {
|
|
@@ -1357,20 +1478,27 @@ function logQualityGateFallback({ url, articleLength, }) {
|
|
|
1357
1478
|
});
|
|
1358
1479
|
}
|
|
1359
1480
|
/**
 * Decides whether the extracted article should replace the full page HTML.
 *
 * The article is rejected when it is missing, when it fails the length-based
 * gate (`isExtractionSufficient`), or when too many headings were lost
 * (`isHeadingStructurePreserved`). Gate failures are logged.
 *
 * @param {object|null|undefined} article - Extracted article, if any.
 * @param {string} html - The full original HTML of the page.
 * @param {string} url - URL of the page, used for logging only.
 * @returns {boolean} `true` when the article content should be used.
 */
function shouldUseArticleContent(article, html, url) {
    // Without an article there is nothing to evaluate. The fallback logging
    // below dereferences `article`, so bail out before reaching it.
    if (!article) {
        return false;
    }
    // Check content sufficiency (length-based quality gate)
    if (!isExtractionSufficient(article, html)) {
        logQualityGateFallback({
            url,
            articleLength: article.textContent.length,
        });
        return false;
    }
    // Check heading structure preservation
    if (!isHeadingStructurePreserved(article, html)) {
        logDebug('Quality gate: Readability broke heading structure, using full HTML', {
            url: url.substring(0, 80),
            originalHeadings: countHeadings(html),
            articleHeadings: countHeadings(article.content),
        });
        return false;
    }
    return true;
}
|
|
1372
1500
|
function resolveContentSource({ html, url, includeMetadata, signal, }) {
|
|
1373
|
-
const { article, metadata: extractedMeta } =
|
|
1501
|
+
const { article, metadata: extractedMeta, document, } = extractContentWithDocument(html, url, {
|
|
1374
1502
|
extractArticle: true,
|
|
1375
1503
|
...(signal ? { signal } : {}),
|
|
1376
1504
|
});
|
|
@@ -1384,6 +1512,7 @@ function resolveContentSource({ html, url, includeMetadata, signal, }) {
|
|
|
1384
1512
|
extractedMeta,
|
|
1385
1513
|
includeMetadata,
|
|
1386
1514
|
useArticleContent,
|
|
1515
|
+
...(document ? { document } : {}),
|
|
1387
1516
|
});
|
|
1388
1517
|
}
|
|
1389
1518
|
function tryTransformRawStage(html, url, includeMetadata) {
|
|
@@ -1405,6 +1534,7 @@ function buildMarkdownFromContext(context, url, signal) {
|
|
|
1405
1534
|
const content = runTransformStage(url, 'transform:markdown', () => htmlToMarkdown(context.sourceHtml, context.metadata, {
|
|
1406
1535
|
url,
|
|
1407
1536
|
...(signal ? { signal } : {}),
|
|
1537
|
+
...(context.document ? { document: context.document } : {}),
|
|
1408
1538
|
}));
|
|
1409
1539
|
return {
|
|
1410
1540
|
markdown: content,
|