@j0hanz/superfetch 2.0.1 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +120 -38
- package/dist/cache.d.ts +42 -0
- package/dist/cache.js +565 -0
- package/dist/config/env-parsers.d.ts +1 -0
- package/dist/config/env-parsers.js +12 -0
- package/dist/config/index.d.ts +7 -0
- package/dist/config/index.js +10 -3
- package/dist/config/types/content.d.ts +1 -0
- package/dist/config.d.ts +77 -0
- package/dist/config.js +261 -0
- package/dist/crypto.d.ts +2 -0
- package/dist/crypto.js +32 -0
- package/dist/errors.d.ts +10 -0
- package/dist/errors.js +28 -0
- package/dist/fetch.d.ts +40 -0
- package/dist/fetch.js +910 -0
- package/dist/http/base-middleware.d.ts +7 -0
- package/dist/http/base-middleware.js +143 -0
- package/dist/http/cors.d.ts +0 -5
- package/dist/http/cors.js +0 -6
- package/dist/http/download-routes.js +6 -2
- package/dist/http/error-handler.d.ts +2 -0
- package/dist/http/error-handler.js +55 -0
- package/dist/http/mcp-routes.js +2 -2
- package/dist/http/mcp-sessions.d.ts +3 -5
- package/dist/http/mcp-sessions.js +8 -8
- package/dist/http/server-tuning.d.ts +9 -0
- package/dist/http/server-tuning.js +45 -0
- package/dist/http/server.d.ts +0 -10
- package/dist/http/server.js +33 -333
- package/dist/http.d.ts +78 -0
- package/dist/http.js +1437 -0
- package/dist/index.js +3 -3
- package/dist/mcp.d.ts +3 -0
- package/dist/mcp.js +94 -0
- package/dist/observability.d.ts +16 -0
- package/dist/observability.js +78 -0
- package/dist/server.js +20 -5
- package/dist/services/cache.d.ts +1 -1
- package/dist/services/context.d.ts +2 -0
- package/dist/services/context.js +3 -0
- package/dist/services/extractor.d.ts +1 -0
- package/dist/services/extractor.js +28 -2
- package/dist/services/fetcher.d.ts +2 -0
- package/dist/services/fetcher.js +35 -14
- package/dist/services/logger.js +4 -1
- package/dist/services/telemetry.d.ts +19 -0
- package/dist/services/telemetry.js +43 -0
- package/dist/services/transform-worker-pool.d.ts +10 -3
- package/dist/services/transform-worker-pool.js +213 -184
- package/dist/tools/handlers/fetch-url.tool.js +8 -6
- package/dist/tools/index.d.ts +1 -0
- package/dist/tools/index.js +13 -1
- package/dist/tools/schemas.d.ts +2 -0
- package/dist/tools/schemas.js +8 -0
- package/dist/tools/utils/content-transform-core.d.ts +5 -0
- package/dist/tools/utils/content-transform-core.js +180 -0
- package/dist/tools/utils/content-transform-workers.d.ts +1 -0
- package/dist/tools/utils/content-transform-workers.js +1 -0
- package/dist/tools/utils/content-transform.d.ts +3 -5
- package/dist/tools/utils/content-transform.js +35 -148
- package/dist/tools/utils/raw-markdown.js +15 -1
- package/dist/tools.d.ts +104 -0
- package/dist/tools.js +421 -0
- package/dist/transform.d.ts +69 -0
- package/dist/transform.js +1509 -0
- package/dist/transformers/markdown.d.ts +4 -1
- package/dist/transformers/markdown.js +182 -53
- package/dist/utils/cancellation.d.ts +1 -0
- package/dist/utils/cancellation.js +18 -0
- package/dist/utils/code-language.d.ts +0 -9
- package/dist/utils/code-language.js +5 -5
- package/dist/utils/host-normalizer.d.ts +1 -0
- package/dist/utils/host-normalizer.js +37 -0
- package/dist/utils/url-redactor.d.ts +1 -0
- package/dist/utils/url-redactor.js +13 -0
- package/dist/utils/url-validator.js +8 -5
- package/dist/workers/transform-worker.js +82 -38
- package/package.json +8 -7
|
@@ -0,0 +1,1509 @@
|
|
|
1
|
+
import { randomUUID } from 'node:crypto';
|
|
2
|
+
import diagnosticsChannel from 'node:diagnostics_channel';
|
|
3
|
+
import os from 'node:os';
|
|
4
|
+
import { performance } from 'node:perf_hooks';
|
|
5
|
+
import { Worker } from 'node:worker_threads';
|
|
6
|
+
import { parseHTML } from 'linkedom';
|
|
7
|
+
import { NodeHtmlMarkdown, } from 'node-html-markdown';
|
|
8
|
+
import { Readability } from '@mozilla/readability';
|
|
9
|
+
import { config } from './config.js';
|
|
10
|
+
import { FetchError, getErrorMessage } from './errors.js';
|
|
11
|
+
import { isRawTextContentUrl } from './fetch.js';
|
|
12
|
+
import { getOperationId, getRequestId, logDebug, logError, logInfo, logWarn, redactUrl, } from './observability.js';
|
|
13
|
+
/** Reports whether `value` is a non-null value whose `typeof` is 'object'. */
function isRecord(value) {
    if (value === null) {
        return false;
    }
    return typeof value === 'object';
}
|
|
16
|
+
const FRONTMATTER_DELIMITER = '---';
|
|
17
|
+
const CODE_BLOCK = {
|
|
18
|
+
fence: '```',
|
|
19
|
+
format: (code, language = '') => {
|
|
20
|
+
return `\`\`\`${language}\n${code}\n\`\`\``;
|
|
21
|
+
},
|
|
22
|
+
};
|
|
23
|
+
const transformChannel = diagnosticsChannel.channel('superfetch.transform');
|
|
24
|
+
/**
 * Publishes `event` on the transform diagnostics channel, but only when the
 * channel currently has subscribers. Anything thrown while publishing is
 * deliberately discarded.
 */
function publishTransformEvent(event) {
    if (transformChannel.hasSubscribers) {
        try {
            transformChannel.publish(event);
        }
        catch {
            // Avoid crashing the publisher if a subscriber throws.
        }
    }
}
|
|
34
|
+
/**
 * Opens a timing context for a transform stage.
 *
 * @param url   URL being processed; stored after passing through `redactUrl`.
 * @param stage Stage label copied into the context.
 * @returns `{ stage, startTime, url }`, or `null` when the diagnostics channel
 *          has no subscribers.
 */
export function startTransformStage(url, stage) {
    if (!transformChannel.hasSubscribers) {
        return null;
    }
    const startTime = performance.now();
    const redactedUrl = redactUrl(url);
    return { stage, startTime, url: redactedUrl };
}
|
|
43
|
+
/**
 * Closes a stage context created by `startTransformStage` and publishes a
 * version-1 `stage` event with the elapsed time.
 *
 * @param context Stage context; a falsy value makes this a no-op.
 * @param options Optional; when `options.truncated` is defined it is copied
 *                onto the event.
 */
export function endTransformStage(context, options) {
    if (!context) {
        return;
    }
    const requestId = getRequestId();
    const operationId = getOperationId();
    const event = {
        v: 1,
        type: 'stage',
        stage: context.stage,
        durationMs: performance.now() - context.startTime,
        url: context.url,
    };
    // Optional fields are attached only when present.
    if (requestId) {
        event.requestId = requestId;
    }
    if (operationId) {
        event.operationId = operationId;
    }
    if (options?.truncated !== undefined) {
        event.truncated = options.truncated;
    }
    publishTransformEvent(event);
}
|
|
62
|
+
/** True when `reason` is an `Error` instance whose `name` is 'TimeoutError'. */
function isTimeoutReason(reason) {
    if (!(reason instanceof Error)) {
        return false;
    }
    return reason.name === 'TimeoutError';
}
|
|
65
|
+
/**
 * Throws a `FetchError` when `signal` has been aborted; otherwise returns.
 *
 * A timeout abort reason produces status 504 ('Request timeout'); any other
 * abort produces status 499 ('Request was canceled'). The `stage` label is
 * attached to the error details in both cases.
 */
function throwIfAborted(signal, url, stage) {
    if (!signal?.aborted) {
        return;
    }
    const timedOut = isTimeoutReason(signal.reason);
    const [message, status, reason] = timedOut
        ? ['Request timeout', 504, 'timeout']
        : ['Request was canceled', 499, 'aborted'];
    throw new FetchError(message, url, status, { reason, stage });
}
|
|
83
|
+
/**
 * Caps `html` at `config.constants.maxHtmlSize` UTF-16 code units.
 *
 * Input within the limit is returned unchanged. Oversized input is logged
 * with a warning and cut; if the cut would fall between the two halves of a
 * surrogate pair, one extra code unit is dropped so the result never ends in
 * a lone high surrogate.
 *
 * @param html HTML text.
 * @returns The original or truncated string.
 */
function truncateHtml(html) {
    const maxSize = config.constants.maxHtmlSize;
    if (html.length <= maxSize) {
        return html;
    }
    logWarn('HTML content exceeds maximum size, truncating', {
        size: html.length,
        maxSize,
    });
    let end = maxSize;
    const lastUnit = html.charCodeAt(end - 1);
    // 0xD800-0xDBFF is the high-surrogate range: its partner would be cut off.
    if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
        end -= 1;
    }
    return html.substring(0, end);
}
|
|
94
|
+
/**
 * Creates an empty collector with one fresh, independent bucket object for
 * each of the `title`, `description` and `author` fields.
 */
function createMetaCollectorState() {
    const state = {};
    for (const field of ['title', 'description', 'author']) {
        state[field] = {};
    }
    return state;
}
|
|
101
|
+
/**
 * Picks the value for `field` from the collector, preferring the `og` source,
 * then `twitter`, then `standard`. Returns `undefined` when none is set.
 */
function resolveMetaField(state, field) {
    const { og, twitter, standard } = state[field];
    return og ?? twitter ?? standard;
}
|
|
105
|
+
/**
 * Maps an `og:`-prefixed property name to a collector field.
 *
 * @returns 'title' or 'description' for `og:title` / `og:description`;
 *          `null` for anything else, including a missing property.
 */
function parseOpenGraphKey(property) {
    const prefix = 'og:';
    if (!property?.startsWith(prefix)) {
        return null;
    }
    const key = property.slice(prefix.length);
    return ['title', 'description'].includes(key) ? key : null;
}
|
|
111
|
+
/**
 * Maps a `twitter:`-prefixed meta name to a collector field.
 *
 * @returns 'title' or 'description' for `twitter:title` / `twitter:description`;
 *          `null` for anything else, including a missing name.
 */
function parseTwitterKey(name) {
    const prefix = 'twitter:';
    if (!name?.startsWith(prefix)) {
        return null;
    }
    const key = name.slice(prefix.length);
    return ['title', 'description'].includes(key) ? key : null;
}
|
|
117
|
+
/**
 * Recognises the plain meta names 'description' and 'author' and returns
 * them unchanged; every other value yields `null`.
 */
function parseStandardKey(name) {
    return name === 'description' || name === 'author' ? name : null;
}
|
|
124
|
+
/**
 * Records the trimmed `content` of one meta tag in the collector.
 *
 * Tags with empty content are ignored. An Open Graph `property` match wins;
 * otherwise the `name` attribute is tried as a Twitter key and then as a
 * standard key. At most one bucket is written per tag.
 */
function collectMetaTag(state, tag) {
    const text = tag.getAttribute('content')?.trim();
    if (!text) {
        return;
    }
    const ogField = parseOpenGraphKey(tag.getAttribute('property'));
    if (ogField) {
        state[ogField].og = text;
        return;
    }
    const nameAttr = tag.getAttribute('name');
    const twitterField = parseTwitterKey(nameAttr);
    if (twitterField) {
        state[twitterField].twitter = text;
        return;
    }
    const standardField = parseStandardKey(nameAttr);
    if (!standardField) {
        return;
    }
    state[standardField].standard = text;
}
|
|
144
|
+
/** Feeds every `<meta>` element of `document` into the collector `state`. */
function scanMetaTags(document, state) {
    const tags = document.querySelectorAll('meta');
    for (const metaTag of tags) {
        collectMetaTag(state, metaTag);
    }
}
|
|
150
|
+
/**
 * Fills `state.title.standard` from the document's `<title>` element when no
 * standard title has been collected yet.
 *
 * The text is trimmed first, and a title that is empty or whitespace-only is
 * ignored so that an empty string is never stored as the title.
 *
 * @param document Object exposing `querySelector`.
 * @param state    Collector whose `title` bucket may be updated in place.
 */
function ensureTitleFallback(document, state) {
    if (state.title.standard) {
        return;
    }
    const text = document.querySelector('title')?.textContent?.trim();
    if (text) {
        state.title.standard = text;
    }
}
|
|
158
|
+
/**
 * Collects title, description and author from a parsed document.
 *
 * Meta tags are scanned first, then the `<title>` element is used as a title
 * fallback. Only fields that resolve to a value other than `undefined`
 * appear on the returned object.
 */
function extractMetadata(document) {
    const state = createMetaCollectorState();
    scanMetaTags(document, state);
    ensureTitleFallback(document, state);
    const metadata = {};
    for (const field of ['title', 'description', 'author']) {
        const value = resolveMetaField(state, field);
        if (value !== undefined) {
            metadata[field] = value;
        }
    }
    return metadata;
}
|
|
174
|
+
/**
 * Duck-type check used before handing a document to Readability: `doc` must
 * be an object with a `documentElement` property and both query-selector
 * functions.
 */
function isReadabilityCompatible(doc) {
    return isRecord(doc) && hasDocumentElement(doc) && hasQuerySelectors(doc);
}
|
|
179
|
+
/** True when `record` has a `documentElement` property (own or inherited). */
function hasDocumentElement(record) {
    return Reflect.has(record, 'documentElement');
}
|
|
182
|
+
/** True when both `querySelectorAll` and `querySelector` are functions on `record`. */
function hasQuerySelectors(record) {
    return ['querySelectorAll', 'querySelector'].every((method) => typeof record[method] === 'function');
}
|
|
186
|
+
/**
 * Runs Readability over `document` and maps the result to the article shape.
 * Logs a warning and returns `null` when the document fails the
 * compatibility check.
 */
function extractArticle(document) {
    if (isReadabilityCompatible(document)) {
        const parsed = parseReadabilityArticle(document);
        return mapParsedArticle(parsed);
    }
    logWarn('Document not compatible with Readability');
    return null;
}
|
|
193
|
+
/**
 * Invokes `Readability#parse` on `document`. Any exception is logged and
 * converted into a `null` result.
 */
function parseReadabilityArticle(document) {
    try {
        // Callers are expected to have run isReadabilityCompatible first.
        return new Readability(document).parse();
    }
    catch (error) {
        logError('Failed to extract article with Readability', asError(error));
        return null;
    }
}
|
|
204
|
+
/** Returns `error` when it is an `Error` instance, otherwise `undefined`. */
function asError(error) {
    return error instanceof Error ? error : undefined;
}
|
|
210
|
+
/** Maps a Readability result to the article shape; a falsy result becomes `null`. */
function mapParsedArticle(parsed) {
    if (!parsed) {
        return null;
    }
    return mapReadabilityResult(parsed);
}
|
|
213
|
+
/**
 * Builds the article object: `content` and `textContent` default to '' when
 * nullish, followed by whichever optional fields are present on `parsed`.
 */
function mapReadabilityResult(parsed) {
    const { content, textContent } = parsed;
    const required = {
        content: content ?? '',
        textContent: textContent ?? '',
    };
    return Object.assign(required, buildOptionalArticleFields(parsed));
}
|
|
220
|
+
/**
 * Copies `title`, `byline`, `excerpt` and `siteName` from `parsed` into a new
 * object, skipping any that are `null` or `undefined`.
 */
function buildOptionalArticleFields(parsed) {
    const optional = {};
    for (const key of ['title', 'byline', 'excerpt', 'siteName']) {
        const value = parsed[key];
        if (value != null) {
            optional[key] = value;
        }
    }
    return optional;
}
|
|
228
|
+
/** Assigns `target[key] = value` unless `value` is `null` or `undefined`. */
function addOptionalField(target, key, value) {
    if (value != null) {
        target[key] = value;
    }
}
|
|
233
|
+
/**
 * Extracts metadata and, optionally, the main article from an HTML string.
 *
 * @param html    HTML source; must be a non-empty string.
 * @param url     Page URL; must be a non-empty string.
 * @param options Defaults to `{ extractArticle: true }` when omitted.
 * @returns `{ article, metadata }`; `{ article: null, metadata: {} }` when
 *          either input fails validation.
 */
export function extractContent(html, url, options = {
    extractArticle: true,
}) {
    return isValidInput(html, url)
        ? tryExtractContent(html, url, options)
        : { article: null, metadata: {} };
}
|
|
241
|
+
/**
 * Performs the parse -> metadata -> article pipeline, checking the abort
 * signal between stages and publishing a timing event for each stage.
 *
 * `FetchError`s (including those raised by the abort checks) propagate to the
 * caller. Any other failure is logged and turned into an empty result, unless
 * the signal was aborted in the meantime, in which case the abort error wins.
 */
function tryExtractContent(html, url, options) {
    const checkpoint = (stage) => throwIfAborted(options.signal, url, stage);
    try {
        checkpoint('extract:begin');

        const parseStage = startTransformStage(url, 'extract:parse');
        const { document } = parseHTML(truncateHtml(html));
        endTransformStage(parseStage);
        checkpoint('extract:parsed');

        applyBaseUri(document, url);

        const metadataStage = startTransformStage(url, 'extract:metadata');
        const metadata = extractMetadata(document);
        endTransformStage(metadataStage);
        checkpoint('extract:metadata');

        let article = null;
        if (options.extractArticle) {
            const articleStage = startTransformStage(url, 'extract:article');
            article = resolveArticleExtraction(document, options.extractArticle);
            endTransformStage(articleStage);
        }
        checkpoint('extract:article');

        return { article, metadata };
    }
    catch (error) {
        if (error instanceof FetchError) {
            throw error;
        }
        checkpoint('extract:error');
        logError('Failed to extract content', error instanceof Error ? error : undefined);
        return { article: null, metadata: {} };
    }
}
|
|
277
|
+
/**
 * Validates the `extractContent` inputs. The HTML is checked first; the URL
 * is only checked (and only warned about) when the HTML is valid.
 */
function isValidInput(html, url) {
    if (!validateRequiredString(html, 'extractContent called with invalid HTML input')) {
        return false;
    }
    return validateRequiredString(url, 'extractContent called with invalid URL');
}
|
|
280
|
+
/**
 * Returns whether `value` is a non-empty string, logging `message` as a
 * warning when it is not.
 */
function validateRequiredString(value, message) {
    const valid = isNonEmptyString(value);
    if (!valid) {
        logWarn(message);
    }
    return valid;
}
|
|
286
|
+
/** True only for primitive strings containing at least one character. */
function isNonEmptyString(value) {
    if (typeof value !== 'string') {
        return false;
    }
    return value.length > 0;
}
|
|
289
|
+
/** Extracts the article when `shouldExtract` is truthy; otherwise returns `null`. */
function resolveArticleExtraction(document, shouldExtract) {
    if (!shouldExtract) {
        return null;
    }
    return extractArticle(document);
}
|
|
292
|
+
/**
 * Defines a writable `baseURI` property with value `url` on `document`.
 * A failure to define the property is treated as non-critical: it is logged
 * at info level (with the URL cut to 100 characters) and otherwise ignored.
 */
function applyBaseUri(document, url) {
    const descriptor = { value: url, writable: true };
    try {
        Object.defineProperty(document, 'baseURI', descriptor);
    }
    catch (error) {
        const details = {
            url: url.substring(0, 100),
            error: getErrorMessage(error),
        };
        logInfo('Failed to set baseURI (non-critical)', details);
    }
}
|
|
306
|
+
/**
 * Heuristic JSX check: true when `code` contains a `<` immediately followed
 * by an ASCII uppercase letter (e.g. `<Button`).
 */
function containsJsxTag(code) {
    return /<[A-Z]/.test(code);
}
|
|
318
|
+
/**
 * Tests whether `word` occurs in `source` as a whole word, i.e. with no word
 * character (as defined by `isWordChar`) directly before or after it.
 *
 * An empty `word` never matches. Without that guard the search position
 * would not advance (`indexOf('', i)` returns `i`) and the loop could spin
 * forever.
 *
 * @param source Text to search.
 * @param word   Word to look for; matching is case-sensitive.
 * @returns `true` when a whole-word occurrence exists.
 */
function containsWord(source, word) {
    if (word.length === 0) {
        return false;
    }
    let startIndex = source.indexOf(word);
    while (startIndex !== -1) {
        const before = startIndex === 0 ? '' : source[startIndex - 1];
        const afterIndex = startIndex + word.length;
        const after = afterIndex >= source.length ? '' : source[afterIndex];
        if (!isWordChar(before) && !isWordChar(after)) {
            return true;
        }
        // Resume after this occurrence so overlapping text is not rescanned.
        startIndex = source.indexOf(word, afterIndex);
    }
    return false;
}
|
|
330
|
+
/**
 * Splits text into lines on LF or CRLF.
 *
 * Accepting CRLF keeps a stray trailing `\r` out of each line, so exact
 * comparisons made by callers (for example `line === command`) also work for
 * Windows-style input.
 *
 * @param content Text to split.
 * @returns Array of lines without their terminators; `['']` for empty input.
 */
function splitLines(content) {
    return content.split(/\r?\n/);
}
|
|
333
|
+
/**
 * Finds a language hint in a `class` attribute value.
 *
 * Tokens are examined left to right; the first one starting
 * (case-insensitively) with `language-`, `lang-` or `highlight-` wins, and
 * the remainder of that token is returned with its original casing.
 *
 * @returns The language suffix, or `undefined` when no token matches.
 */
function extractLanguageFromClassName(className) {
    const prefixes = ['language-', 'lang-', 'highlight-'];
    for (const token of className.match(/\S+/g) ?? []) {
        const lowered = token.toLowerCase();
        const prefix = prefixes.find((candidate) => lowered.startsWith(candidate));
        if (prefix !== undefined) {
            return token.slice(prefix.length);
        }
    }
    return undefined;
}
|
|
349
|
+
/**
 * Validates a data-attribute language hint.
 *
 * @returns The trimmed value when it is non-empty and made up solely of
 *          ASCII letters, digits and underscores; otherwise `undefined`.
 */
function resolveLanguageFromDataAttribute(dataLang) {
    const candidate = dataLang.trim();
    return /^[A-Za-z0-9_]+$/.test(candidate) ? candidate : undefined;
}
|
|
359
|
+
/**
 * Reports whether `char` counts as a word character: an ASCII digit or
 * letter (judged by its first UTF-16 code unit) or exactly '_'.
 * Falsy input (e.g. '' or `undefined`) is never a word character.
 */
function isWordChar(char) {
    if (!char) {
        return false;
    }
    if (char === '_') {
        return true;
    }
    const unit = char.charCodeAt(0);
    const isDigit = unit >= 48 && unit <= 57;
    const isUpper = unit >= 65 && unit <= 90;
    const isLower = unit >= 97 && unit <= 122;
    return isDigit || isUpper || isLower;
}
|
|
368
|
+
const BASH_PACKAGE_MANAGERS = [
|
|
369
|
+
'npm',
|
|
370
|
+
'yarn',
|
|
371
|
+
'pnpm',
|
|
372
|
+
'npx',
|
|
373
|
+
'brew',
|
|
374
|
+
'apt',
|
|
375
|
+
'pip',
|
|
376
|
+
'cargo',
|
|
377
|
+
'go',
|
|
378
|
+
];
|
|
379
|
+
const BASH_VERBS = ['install', 'add', 'run', 'build', 'start'];
|
|
380
|
+
const BASH_COMMANDS = ['sudo', 'chmod', 'mkdir', 'cd', 'ls', 'cat', 'echo'];
|
|
381
|
+
/**
 * Heuristic shell detection: true when any non-blank line, ignoring leading
 * whitespace, satisfies `isBashIndicator`.
 */
function detectBash(code) {
    return splitLines(code).some((line) => {
        const text = line.trimStart();
        return text !== '' && isBashIndicator(text);
    });
}
|
|
392
|
+
/**
 * True when `line` is exactly one of `commands`, or begins with one of them
 * followed by a space.
 */
function startsWithCommand(line, commands) {
    for (const command of commands) {
        if (line === command || line.startsWith(`${command} `)) {
            return true;
        }
    }
    return false;
}
|
|
395
|
+
/**
 * True when a line looks like shell: a shebang, a prompt line, a known shell
 * command, or a package-manager invocation.
 */
function isBashIndicator(line) {
    if (isShebang(line) || isPromptLine(line)) {
        return true;
    }
    if (startsWithCommand(line, BASH_COMMANDS)) {
        return true;
    }
    return startsWithPackageManagerCommand(line);
}
|
|
401
|
+
/** True when the line begins with the two characters `#!`. */
function isShebang(line) {
    return line.slice(0, 2) === '#!';
}
|
|
404
|
+
/** True when the line begins with `$ ` or `# `. */
function isPromptLine(line) {
    return ['$ ', '# '].some((prompt) => line.startsWith(prompt));
}
|
|
407
|
+
/**
 * True when the line starts with a package manager from
 * `BASH_PACKAGE_MANAGERS`, a space, and then a verb from `BASH_VERBS` (either
 * ending the line or followed by a space).
 */
function startsWithPackageManagerCommand(line) {
    for (const manager of BASH_PACKAGE_MANAGERS) {
        const prefix = `${manager} `;
        if (!line.startsWith(prefix)) {
            continue;
        }
        const remainder = line.slice(prefix.length);
        const hasVerb = BASH_VERBS.some((verb) => remainder === verb || remainder.startsWith(`${verb} `));
        if (hasVerb) {
            return true;
        }
    }
    return false;
}
|
|
415
|
+
const TYPE_HINTS = [
|
|
416
|
+
'string',
|
|
417
|
+
'number',
|
|
418
|
+
'boolean',
|
|
419
|
+
'void',
|
|
420
|
+
'any',
|
|
421
|
+
'unknown',
|
|
422
|
+
'never',
|
|
423
|
+
];
|
|
424
|
+
const HTML_TAGS = [
|
|
425
|
+
'<!doctype',
|
|
426
|
+
'<html',
|
|
427
|
+
'<head',
|
|
428
|
+
'<body',
|
|
429
|
+
'<div',
|
|
430
|
+
'<span',
|
|
431
|
+
'<p',
|
|
432
|
+
'<a',
|
|
433
|
+
'<script',
|
|
434
|
+
'<style',
|
|
435
|
+
];
|
|
436
|
+
const SQL_KEYWORDS = [
|
|
437
|
+
'select',
|
|
438
|
+
'insert',
|
|
439
|
+
'update',
|
|
440
|
+
'delete',
|
|
441
|
+
'create',
|
|
442
|
+
'alter',
|
|
443
|
+
'drop',
|
|
444
|
+
];
|
|
445
|
+
const JS_WORD_REGEX = /\b(?:const|let|var|function|class|async|await|export|import)\b/;
|
|
446
|
+
const PYTHON_WORD_REGEX = /\b(?:def|class|import|from)\b/;
|
|
447
|
+
const RUST_WORD_REGEX = /\b(?:fn|impl|struct|enum)\b/;
|
|
448
|
+
const CSS_DIRECTIVE_REGEX = /@media|@import|@keyframes/;
|
|
449
|
+
const CODE_DETECTORS = [
|
|
450
|
+
{ language: 'jsx', detect: detectJsx },
|
|
451
|
+
{ language: 'typescript', detect: detectTypescript },
|
|
452
|
+
{ language: 'rust', detect: detectRust },
|
|
453
|
+
{ language: 'javascript', detect: detectJavascript },
|
|
454
|
+
{ language: 'python', detect: detectPython },
|
|
455
|
+
{ language: 'bash', detect: detectBash },
|
|
456
|
+
{ language: 'css', detect: detectCss },
|
|
457
|
+
{ language: 'html', detect: detectHtml },
|
|
458
|
+
{ language: 'json', detect: detectJson },
|
|
459
|
+
{ language: 'yaml', detect: detectYaml },
|
|
460
|
+
{ language: 'sql', detect: detectSql },
|
|
461
|
+
{ language: 'go', detect: detectGo },
|
|
462
|
+
];
|
|
463
|
+
/**
 * Heuristic JSX detection: looks (case-insensitively) for `classname=`,
 * `jsx:` or a React import, then falls back to `containsJsxTag` on the
 * original text.
 */
function detectJsx(code) {
    const lowered = code.toLowerCase();
    const markers = ['classname=', 'jsx:', "from 'react'", 'from "react"'];
    if (markers.some((marker) => lowered.includes(marker))) {
        return true;
    }
    return containsJsxTag(code);
}
|
|
474
|
+
/**
 * Heuristic TypeScript detection: the whole words `interface` or `type`, or
 * a `:`/`: ` followed by one of `TYPE_HINTS`, all matched case-insensitively.
 */
function detectTypescript(code) {
    const lowered = code.toLowerCase();
    if (['interface', 'type'].some((keyword) => containsWord(lowered, keyword))) {
        return true;
    }
    for (const hint of TYPE_HINTS) {
        if (lowered.includes(`: ${hint}`) || lowered.includes(`:${hint}`)) {
            return true;
        }
    }
    return false;
}
|
|
482
|
+
/**
 * Heuristic Rust detection on the lowercased text: a keyword from
 * `RUST_WORD_REGEX`, the phrase `let mut`, or `use ` together with `::`.
 */
function detectRust(code) {
    const lowered = code.toLowerCase();
    if (RUST_WORD_REGEX.test(lowered)) {
        return true;
    }
    if (lowered.includes('let mut')) {
        return true;
    }
    return lowered.includes('use ') && lowered.includes('::');
}
|
|
488
|
+
/** Heuristic JavaScript detection: `JS_WORD_REGEX` applied to the lowercased text. */
function detectJavascript(code) {
    return JS_WORD_REGEX.test(code.toLowerCase());
}
|
|
492
|
+
/**
 * Heuristic Python detection on the lowercased text: a keyword from
 * `PYTHON_WORD_REGEX`, a `print(` call, or `__name__`.
 */
function detectPython(code) {
    const lowered = code.toLowerCase();
    if (PYTHON_WORD_REGEX.test(lowered)) {
        return true;
    }
    return lowered.includes('print(') || lowered.includes('__name__');
}
|
|
498
|
+
/**
 * Heuristic CSS detection: an at-rule matched by `CSS_DIRECTIVE_REGEX`, or
 * any non-blank line that looks like a selector or a property declaration.
 */
function detectCss(code) {
    if (CSS_DIRECTIVE_REGEX.test(code.toLowerCase())) {
        return true;
    }
    return splitLines(code).some((line) => {
        const text = line.trimStart();
        if (text === '') {
            return false;
        }
        return isCssSelectorLine(text) || isCssPropertyLine(text);
    });
}
|
|
512
|
+
/** Heuristic HTML detection: any entry of `HTML_TAGS` occurs in the lowercased text. */
function detectHtml(code) {
    const lowered = code.toLowerCase();
    for (const tag of HTML_TAGS) {
        if (lowered.includes(tag)) {
            return true;
        }
    }
    return false;
}
|
|
516
|
+
/**
 * Heuristic JSON detection: the first non-whitespace character is `{` or `[`.
 */
function detectJson(code) {
    const first = code.trimStart()[0];
    return first === '{' || first === '[';
}
|
|
522
|
+
/**
 * Heuristic YAML detection: some trimmed line has its first `:` after at
 * least one character, followed immediately by a space or tab.
 */
function detectYaml(code) {
    return splitLines(code).some((line) => {
        const text = line.trim();
        const colonAt = text.indexOf(':');
        if (colonAt <= 0) {
            return false;
        }
        const following = text.charAt(colonAt + 1);
        return following === ' ' || following === '\t';
    });
}
|
|
537
|
+
/** Heuristic SQL detection: any `SQL_KEYWORDS` entry appears as a whole word, ignoring case. */
function detectSql(code) {
    const lowered = code.toLowerCase();
    for (const keyword of SQL_KEYWORDS) {
        if (containsWord(lowered, keyword)) {
            return true;
        }
    }
    return false;
}
|
|
541
|
+
/**
 * Heuristic Go detection on the lowercased text: the whole words `package`
 * or `func`, or the substring `import "`.
 */
function detectGo(code) {
    const lowered = code.toLowerCase();
    if (containsWord(lowered, 'package')) {
        return true;
    }
    if (containsWord(lowered, 'func')) {
        return true;
    }
    return lowered.includes('import "');
}
|
|
547
|
+
/** True when the line starts with `.` or `#` and contains an opening brace. */
function isCssSelectorLine(line) {
    const first = line[0];
    const startsLikeSelector = first === '.' || first === '#';
    return startsLikeSelector && line.includes('{');
}
|
|
552
|
+
/** True when the line contains both a colon and a semicolon. */
function isCssPropertyLine(line) {
    return [':', ';'].every((symbol) => line.includes(symbol));
}
|
|
555
|
+
/**
 * Guesses the language of a code snippet by running the detectors in
 * `CODE_DETECTORS` in order.
 *
 * @returns The `language` of the first detector that matches, or `undefined`.
 */
export function detectLanguageFromCode(code) {
    const match = CODE_DETECTORS.find(({ detect }) => detect(code));
    return match?.language;
}
|
|
562
|
+
/**
 * Resolves a language from element attributes. A hint found in `className`
 * takes precedence; otherwise the data-attribute value is validated and used.
 */
export function resolveLanguageFromAttributes(className, dataLang) {
    const fromClass = extractLanguageFromClassName(className);
    if (fromClass !== undefined && fromClass !== null) {
        return fromClass;
    }
    return resolveLanguageFromDataAttribute(dataLang);
}
|
|
566
|
+
// Characters that force a value to be quoted wherever they appear.
const YAML_SPECIAL_CHARS = /[:[\]{}"\r\t'|>&*!?,#]|\n/;
// Digits-and-dots values, optionally signed and/or with an exponent, which a
// YAML parser could otherwise read as a number instead of a string.
const YAML_NUMERIC = /^[+-]?[\d.]+(?:e[+-]?\d+)?$/i;
// Bare words (and `~`) that YAML resolves to booleans or null.
const YAML_RESERVED_WORDS = /^(true|false|null|yes|no|on|off|~)$/i;
// Indicator characters that are only problematic at the start of a value.
const YAML_LEADING_INDICATORS = /^[-@`%]/;
const ESCAPE_PATTERNS = {
    backslash: /\\/g,
    quote: /"/g,
    newline: /\n/g,
    carriageReturn: /\r/g,
    tab: /\t/g,
};
const YAML_QUOTE_CHECKS = [
    (input) => YAML_SPECIAL_CHARS.test(input),
    (input) => YAML_LEADING_INDICATORS.test(input),
    (input) => input.startsWith(' ') || input.endsWith(' '),
    (input) => input === '',
    (input) => YAML_NUMERIC.test(input),
    (input) => YAML_RESERVED_WORDS.test(input),
];
/** True when any quoting rule applies to `value`. */
function needsYamlQuotes(value) {
    return YAML_QUOTE_CHECKS.some((check) => check(value));
}
/**
 * Renders a string as a YAML scalar.
 *
 * Values that are safe as plain scalars are returned unchanged. Everything
 * else is wrapped in double quotes with backslash, double quote, newline,
 * carriage return and tab escaped. Carriage returns are escaped rather than
 * emitted raw so the quoted value stays on a single line.
 *
 * @param value String to render.
 * @returns The plain or double-quoted representation.
 */
function escapeYamlValue(value) {
    if (!needsYamlQuotes(value)) {
        return value;
    }
    // Backslashes go first so the escapes added below are not doubled.
    const escaped = value
        .replace(ESCAPE_PATTERNS.backslash, '\\\\')
        .replace(ESCAPE_PATTERNS.quote, '\\"')
        .replace(ESCAPE_PATTERNS.newline, '\\n')
        .replace(ESCAPE_PATTERNS.carriageReturn, '\\r')
        .replace(ESCAPE_PATTERNS.tab, '\\t');
    return `"${escaped}"`;
}
|
|
596
|
+
/**
 * Appends a `key: value` line to `lines`, with the value passed through
 * `escapeYamlValue`. Falsy values (missing or empty) add nothing.
 */
function appendFrontmatterField(lines, key, value) {
    if (value) {
        lines.push(`${key}: ${escapeYamlValue(value)}`);
    }
}
|
|
601
|
+
/** Joins the given lines into one string separated by line feeds. */
function joinLines(lines) {
    const separator = '\n';
    return lines.join(separator);
}
|
|
604
|
+
/**
 * Builds a frontmatter block delimited by `FRONTMATTER_DELIMITER` lines.
 *
 * Fields are emitted in the order title, source (from `metadata.url`),
 * author, description, fetchedAt; falsy fields are omitted.
 *
 * @returns The frontmatter text, or '' when `metadata` is falsy.
 */
function buildFrontmatter(metadata) {
    if (!metadata) {
        return '';
    }
    const fields = [
        ['title', 'title'],
        ['source', 'url'],
        ['author', 'author'],
        ['description', 'description'],
        ['fetchedAt', 'fetchedAt'],
    ];
    const lines = [FRONTMATTER_DELIMITER];
    for (const [label, property] of fields) {
        appendFrontmatterField(lines, label, metadata[property]);
    }
    lines.push(FRONTMATTER_DELIMITER);
    return joinLines(lines);
}
|
|
616
|
+
/** Duck-type check: `node` is an object exposing a callable `getAttribute`. */
function isElement(node) {
    if (!isRecord(node)) {
        return false;
    }
    return 'getAttribute' in node && typeof node.getAttribute === 'function';
}
|
|
621
|
+
const STRUCTURAL_TAGS = new Set([
|
|
622
|
+
'script',
|
|
623
|
+
'style',
|
|
624
|
+
'noscript',
|
|
625
|
+
'iframe',
|
|
626
|
+
'nav',
|
|
627
|
+
'footer',
|
|
628
|
+
'aside',
|
|
629
|
+
'header',
|
|
630
|
+
'form',
|
|
631
|
+
'button',
|
|
632
|
+
'input',
|
|
633
|
+
'select',
|
|
634
|
+
'textarea',
|
|
635
|
+
]);
|
|
636
|
+
const NAVIGATION_ROLES = new Set([
|
|
637
|
+
'navigation',
|
|
638
|
+
'banner',
|
|
639
|
+
'complementary',
|
|
640
|
+
'contentinfo',
|
|
641
|
+
'tree',
|
|
642
|
+
'menubar',
|
|
643
|
+
'menu',
|
|
644
|
+
]);
|
|
645
|
+
const PROMO_PATTERN = /banner|promo|announcement|cta|callout|advert|newsletter|subscribe|cookie|consent|popup|modal|overlay|toast/;
|
|
646
|
+
const FIXED_PATTERN = /\b(fixed|sticky)\b/;
|
|
647
|
+
const HIGH_Z_PATTERN = /\bz-(?:4\d|50)\b/;
|
|
648
|
+
const ISOLATE_PATTERN = /\bisolate\b/;
|
|
649
|
+
const HTML_DOCUMENT_MARKERS = /<\s*(?:!doctype|html|head|body)\b/i;
|
|
650
|
+
const NOISE_MARKERS = [
|
|
651
|
+
'<script',
|
|
652
|
+
'<style',
|
|
653
|
+
'<noscript',
|
|
654
|
+
'<iframe',
|
|
655
|
+
'<nav',
|
|
656
|
+
'<footer',
|
|
657
|
+
'<aside',
|
|
658
|
+
'<header',
|
|
659
|
+
'<form',
|
|
660
|
+
'<button',
|
|
661
|
+
'<input',
|
|
662
|
+
'<select',
|
|
663
|
+
'<textarea',
|
|
664
|
+
'<svg',
|
|
665
|
+
'<canvas',
|
|
666
|
+
' aria-hidden="true"',
|
|
667
|
+
" aria-hidden='true'",
|
|
668
|
+
' hidden',
|
|
669
|
+
' role="navigation"',
|
|
670
|
+
" role='navigation'",
|
|
671
|
+
' role="banner"',
|
|
672
|
+
" role='banner'",
|
|
673
|
+
' role="complementary"',
|
|
674
|
+
" role='complementary'",
|
|
675
|
+
' role="contentinfo"',
|
|
676
|
+
" role='contentinfo'",
|
|
677
|
+
' role="tree"',
|
|
678
|
+
" role='tree'",
|
|
679
|
+
' role="menubar"',
|
|
680
|
+
" role='menubar'",
|
|
681
|
+
' role="menu"',
|
|
682
|
+
" role='menu'",
|
|
683
|
+
' banner',
|
|
684
|
+
' promo',
|
|
685
|
+
' announcement',
|
|
686
|
+
' cta',
|
|
687
|
+
' callout',
|
|
688
|
+
' advert',
|
|
689
|
+
' newsletter',
|
|
690
|
+
' subscribe',
|
|
691
|
+
' cookie',
|
|
692
|
+
' consent',
|
|
693
|
+
' popup',
|
|
694
|
+
' modal',
|
|
695
|
+
' overlay',
|
|
696
|
+
' toast',
|
|
697
|
+
' fixed',
|
|
698
|
+
' sticky',
|
|
699
|
+
' z-50',
|
|
700
|
+
' z-4',
|
|
701
|
+
' isolate',
|
|
702
|
+
];
|
|
703
|
+
/**
 * Cheap pre-check: true when the lowercased HTML contains any entry of
 * `NOISE_MARKERS`.
 */
function mayContainNoise(html) {
    const lowered = html.toLowerCase();
    for (const marker of NOISE_MARKERS) {
        if (lowered.includes(marker)) {
            return true;
        }
    }
    return false;
}
|
|
707
|
+
/** True when `html` matches `HTML_DOCUMENT_MARKERS` (doctype, html, head or body tag). */
function isFullDocumentHtml(html) {
    const looksLikeDocument = HTML_DOCUMENT_MARKERS.test(html);
    return looksLikeDocument;
}
|
|
710
|
+
/** True for tags listed in `STRUCTURAL_TAGS`, plus `svg` and `canvas`. */
function isStructuralNoiseTag(tagName) {
    if (STRUCTURAL_TAGS.has(tagName)) {
        return true;
    }
    return tagName === 'svg' || tagName === 'canvas';
}
|
|
713
|
+
/**
 * True when the element carries a `hidden` attribute (any value) or has
 * `aria-hidden` set to exactly 'true'.
 */
function isElementHidden(element) {
    if (element.getAttribute('hidden') !== null) {
        return true;
    }
    return element.getAttribute('aria-hidden') === 'true';
}
|
|
717
|
+
/** True when `role` is not `null` and is listed in `NAVIGATION_ROLES`. */
function hasNoiseRole(role) {
    if (role === null) {
        return false;
    }
    return NAVIGATION_ROLES.has(role);
}
|
|
720
|
+
/**
 * True when the class list or id, compared case-insensitively, contains a
 * fragment matched by `PROMO_PATTERN`.
 */
function matchesPromoIdOrClass(className, id) {
    const haystack = `${className} ${id}`.toLowerCase();
    return PROMO_PATTERN.test(haystack);
}
|
|
724
|
+
/** True when the class list matches both `HIGH_Z_PATTERN` and `ISOLATE_PATTERN`. */
function matchesHighZIsolate(className) {
    if (!HIGH_Z_PATTERN.test(className)) {
        return false;
    }
    return ISOLATE_PATTERN.test(className);
}
|
|
727
|
+
/**
 * True when the class list matches `FIXED_PATTERN`, or satisfies
 * `matchesHighZIsolate`.
 */
function matchesFixedOrHighZIsolate(className) {
    if (FIXED_PATTERN.test(className)) {
        return true;
    }
    return matchesHighZIsolate(className);
}
|
|
730
|
+
/**
 * Snapshots the attributes used for noise classification: lowercased tag
 * name, `class` and `id` (defaulting to ''), the raw `role` attribute, and
 * the hidden flag.
 */
function readElementMetadata(element) {
    const tagName = element.tagName.toLowerCase();
    const className = element.getAttribute('class') ?? '';
    const id = element.getAttribute('id') ?? '';
    const role = element.getAttribute('role');
    const isHidden = isElementHidden(element);
    return { tagName, className, id, role, isHidden };
}
|
|
739
|
+
/**
 * Classifies an element as noise when any of these hold: structural tag,
 * hidden, noise role, fixed/sticky or high-z isolate classes, or a
 * promo-style class or id.
 */
function isNoiseElement(node) {
    const { tagName, isHidden, role, className, id } = readElementMetadata(node);
    if (isStructuralNoiseTag(tagName) || isHidden) {
        return true;
    }
    if (hasNoiseRole(role)) {
        return true;
    }
    return (matchesFixedOrHighZIsolate(className) ||
        matchesPromoIdOrClass(className, id));
}
|
|
747
|
+
/**
 * Strips noise elements (as classified by `isNoiseElement`) from HTML.
 *
 * The input is returned untouched when it neither looks like a full document
 * nor contains any noise marker. Otherwise it is parsed, noise elements are
 * removed if markers were found, and the result is serialized: the body's
 * inner HTML when non-empty, else the document's string form, else the root
 * element's outer HTML. Any exception makes the function return the original
 * input (best effort by design).
 *
 * The marker scan lowercases the whole input, so it is computed once and
 * reused for both the "should parse" and "should remove" decisions.
 *
 * @param html HTML fragment or document.
 * @returns Cleaned HTML, or the original string.
 */
function removeNoiseFromHtml(html) {
    const shouldRemove = mayContainNoise(html);
    if (!shouldRemove && !isFullDocumentHtml(html)) {
        return html;
    }
    try {
        const { document } = parseHTML(html);
        if (shouldRemove) {
            const nodes = Array.from(document.querySelectorAll('*'));
            // Iterate in reverse over the snapshot taken above.
            for (let index = nodes.length - 1; index >= 0; index -= 1) {
                const node = nodes[index];
                if (!node) {
                    continue;
                }
                if (isElement(node) && isNoiseElement(node)) {
                    node.remove();
                }
            }
        }
        const { body } = document;
        if (body?.innerHTML) {
            return body.innerHTML;
        }
        if (typeof document.toString === 'function') {
            return document.toString();
        }
        const { documentElement } = document;
        if (documentElement?.outerHTML) {
            return documentElement.outerHTML;
        }
        return html;
    }
    catch {
        // Best effort: fall back to the unmodified input.
        return html;
    }
}
|
|
781
|
+
/**
 * Wraps text in a markdown inline-code span whose delimiter is one backtick
 * longer than the longest backtick run inside the text. When the text contains
 * backticks, the content is padded with a single space on each side.
 *
 * @param {string} content - Raw code text.
 * @returns {string} The delimited inline-code span.
 */
function buildInlineCode(content) {
    let longestRun = 0;
    for (const run of content.match(/`+/g) ?? []) {
        longestRun = Math.max(longestRun, run.length);
    }
    const fence = '`'.repeat(longestRun + 1);
    const gap = longestRun > 0 ? ' ' : '';
    return `${fence}${gap}${content}${gap}${fence}`;
}
|
|
788
|
+
/**
 * Reports whether a parent node is a code-block container, i.e. its tag name
 * (compared case-insensitively) is PRE or WRAPPED-PRE.
 *
 * @param {unknown} parent - Candidate parent node.
 * @returns {boolean} False for non-records and for nodes without a string tag name.
 */
function isCodeBlock(parent) {
    if (!isRecord(parent)) {
        return false;
    }
    const { tagName } = parent;
    if (typeof tagName !== 'string') {
        return false;
    }
    const upperTag = tagName.toUpperCase();
    return upperTag === 'PRE' || upperTag === 'WRAPPED-PRE';
}
|
|
794
|
+
/**
 * Builds the custom translator map for `<code>` elements.
 *
 * A `<code>` whose parent is not a code-block container, or whose context is
 * not a record, is rendered as an inline code span. Otherwise it is rendered
 * as a fenced block whose language comes from the element's `class` /
 * `data-language` attributes, falling back to detection from the code text.
 *
 * @returns {{ code: (ctx: unknown) => object }} Translator map keyed by tag name.
 */
function createCodeTranslator() {
    // A fresh config object is produced for every inline `<code>` element.
    const inlineCodeConfig = () => ({
        spaceIfRepeatingChar: true,
        noEscape: true,
        postprocess: ({ content }) => buildInlineCode(content),
    });
    const translateCode = (ctx) => {
        if (!isRecord(ctx)) {
            return inlineCodeConfig();
        }
        const { node, parent, visitor } = ctx;
        const canReadAttributes = isRecord(node) && typeof node.getAttribute === 'function';
        const getAttribute = canReadAttributes ? node.getAttribute.bind(node) : undefined;
        if (!isCodeBlock(parent)) {
            return inlineCodeConfig();
        }
        const className = getAttribute?.('class') ?? '';
        const dataLanguage = getAttribute?.('data-language') ?? '';
        const attributeLanguage = resolveLanguageFromAttributes(className, dataLanguage);
        const childTranslators = isRecord(visitor) ? visitor.instance : null;
        let codeBlockTranslators = null;
        if (isRecord(childTranslators) && isRecord(childTranslators.codeBlockTranslators)) {
            codeBlockTranslators = childTranslators.codeBlockTranslators;
        }
        const blockConfig = {
            noEscape: true,
            preserveWhitespace: true,
        };
        if (codeBlockTranslators) {
            blockConfig.childTranslators = codeBlockTranslators;
        }
        blockConfig.postprocess = ({ content }) => {
            const language = attributeLanguage ?? detectLanguageFromCode(content) ?? '';
            return CODE_BLOCK.format(content, language);
        };
        return blockConfig;
    };
    return { code: translateCode };
}
|
|
838
|
+
// Lazily created converter shared by every call; null until first use.
let markdownInstance = null;
/**
 * Constructs a NodeHtmlMarkdown converter with the project's formatting
 * options and the custom `<code>` translator.
 *
 * @returns {NodeHtmlMarkdown} A new converter instance.
 */
function createMarkdownInstance() {
    const converterOptions = {
        codeFence: CODE_BLOCK.fence,
        codeBlockStyle: 'fenced',
        emDelimiter: '_',
        bulletMarker: '-',
    };
    const customTranslators = createCodeTranslator();
    return new NodeHtmlMarkdown(converterOptions, customTranslators);
}
|
|
847
|
+
/**
 * Returns the shared markdown converter, creating it on first use.
 *
 * @returns {NodeHtmlMarkdown} The cached converter instance.
 */
function getMarkdownConverter() {
    if (markdownInstance === null || markdownInstance === undefined) {
        markdownInstance = createMarkdownInstance();
    }
    return markdownInstance;
}
|
|
851
|
+
/**
 * Converts HTML to markdown, prefixed with frontmatter built from `metadata`.
 *
 * Noise removal and translation each run inside their own transform stage, and
 * the abort signal is checked before and after each step. A FetchError (such
 * as the one raised for an aborted request) is rethrown; any other failure
 * degrades to returning only the frontmatter.
 *
 * @param {string} html - Markup to convert; empty input yields only the frontmatter.
 * @param {object} [metadata] - Metadata used for the frontmatter and as a URL fallback.
 * @param {{ url?: string, signal?: AbortSignal }} [options] - URL override and abort signal.
 * @returns {string} Frontmatter followed by the markdown body, or whichever of the two exists.
 * @throws {FetchError} Propagated from the abort checks or the conversion steps.
 */
export function htmlToMarkdown(html, metadata, options) {
    const url = options?.url ?? metadata?.url ?? '';
    const frontmatter = buildFrontmatter(metadata);
    if (!html) {
        return frontmatter;
    }
    const signal = options?.signal;
    try {
        throwIfAborted(signal, url, 'markdown:begin');
        const noiseStage = startTransformStage(url, 'markdown:noise');
        const cleanedHtml = removeNoiseFromHtml(html);
        endTransformStage(noiseStage);
        throwIfAborted(signal, url, 'markdown:cleaned');
        const translateStage = startTransformStage(url, 'markdown:translate');
        const translated = getMarkdownConverter().translate(cleanedHtml);
        const content = translated.trim();
        endTransformStage(translateStage);
        throwIfAborted(signal, url, 'markdown:translated');
        if (!frontmatter) {
            return content;
        }
        return `${frontmatter}\n${content}`;
    }
    catch (error) {
        if (error instanceof FetchError) {
            throw error;
        }
        // Best effort: conversion problems still yield the metadata header.
        return frontmatter;
    }
}
|
|
875
|
+
// One to six '#' characters followed by whitespace, at the start of any line.
const HEADING_PATTERN = /^#{1,6}\s/m;
// A '-', '*' or '+' bullet followed by whitespace, at the start of any line.
const LIST_PATTERN = /^(?:[-*+])\s/m;
// Input that begins with a doctype declaration or an <html> tag (case-insensitive).
const HTML_DOCUMENT_PATTERN = /^(<!doctype|<html)/i;
/**
 * Reports whether any line of the text starts with an ATX-style heading marker.
 *
 * @param {string} content - Text to inspect.
 * @returns {boolean} True when a heading marker is found.
 */
function containsMarkdownHeading(content) {
    const headingMatch = HEADING_PATTERN.exec(content);
    return headingMatch !== null;
}
|
|
881
|
+
/**
 * Reports whether any line of the text starts with a bullet-list marker.
 *
 * @param {string} content - Text to inspect.
 * @returns {boolean} True when LIST_PATTERN matches.
 */
function containsMarkdownList(content) {
    const listMatch = LIST_PATTERN.exec(content);
    return listMatch !== null;
}
|
|
884
|
+
/**
 * Reports whether the text contains at least two non-overlapping triple-backtick
 * fences, i.e. something that could open and close a fenced code block.
 *
 * @param {string} content - Text to inspect.
 * @returns {boolean} True when a second fence follows the first.
 */
function containsFencedCodeBlock(content) {
    const fence = '```';
    const opening = content.indexOf(fence);
    if (opening === -1) {
        return false;
    }
    const closing = content.indexOf(fence, opening + fence.length);
    return closing !== -1;
}
|
|
890
|
+
/**
 * Heuristic markdown detector: true when the text contains a heading, a bullet
 * list, or a pair of code fences. Checks stop at the first positive result.
 *
 * @param {string} content - Text to inspect.
 * @returns {boolean} True when any markdown signal is present.
 */
function looksLikeMarkdown(content) {
    if (containsMarkdownHeading(content)) {
        return true;
    }
    if (containsMarkdownList(content)) {
        return true;
    }
    return containsFencedCodeBlock(content);
}
|
|
895
|
+
/**
 * Picks the line terminator to use for a document: CRLF when the text contains
 * at least one CRLF sequence, LF otherwise.
 *
 * @param {string} content - Text to inspect.
 * @returns {'\r\n' | '\n'} The detected terminator.
 */
function detectLineEnding(content) {
    const crlf = '\r\n';
    if (content.includes(crlf)) {
        return crlf;
    }
    return '\n';
}
|
|
898
|
+
/**
 * Locates a frontmatter block at the very start of the text.
 *
 * The text is split on its detected line ending. A block exists when the first
 * line equals FRONTMATTER_DELIMITER and a later line equals it as well.
 *
 * @param {string} content - Raw document text.
 * @returns {{ lineEnding: string, lines: string[], endIndex: number } | null}
 *   The split lines, the index of the closing delimiter and the line ending
 *   used, or null when there is no complete frontmatter block.
 */
function findFrontmatterLines(content) {
    const lineEnding = detectLineEnding(content);
    const lines = content.split(lineEnding);
    const [firstLine] = lines;
    if (firstLine !== FRONTMATTER_DELIMITER) {
        return null;
    }
    const endIndex = lines.indexOf(FRONTMATTER_DELIMITER, 1);
    return endIndex === -1 ? null : { lineEnding, lines, endIndex };
}
|
|
908
|
+
/**
 * Trims a value and, when it is wrapped in a matching pair of double or single
 * quotes, removes the quotes and trims again.
 *
 * @param {string} value - Raw value text.
 * @returns {string} The unquoted, trimmed value. Strings shorter than two
 *   characters after trimming are returned as-is.
 */
function stripOptionalQuotes(value) {
    const text = value.trim();
    if (text.length >= 2) {
        for (const quote of ['"', "'"]) {
            if (text.startsWith(quote) && text.endsWith(quote)) {
                return text.slice(1, -1).trim();
            }
        }
    }
    return text;
}
|
|
919
|
+
/**
 * Splits one frontmatter line into a key/value pair at its first colon.
 *
 * @param {string} line - A single line from inside the frontmatter block.
 * @returns {{ key: string, value: string } | null} The lower-cased, trimmed key
 *   and the untrimmed text after the colon; null for blank lines and for lines
 *   with no colon or an empty key.
 */
function parseFrontmatterEntry(line) {
    const text = line.trim();
    const colonAt = text.indexOf(':');
    // colonAt is -1 for blank lines too, so this also covers empty input.
    if (colonAt <= 0) {
        return null;
    }
    return {
        key: text.slice(0, colonAt).trim().toLowerCase(),
        value: text.slice(colonAt + 1),
    };
}
|
|
930
|
+
/**
 * Reports whether a frontmatter key names the document title.
 *
 * @param {string} key - Key as produced by the frontmatter parser.
 * @returns {boolean} True for exactly 'title' or 'name'.
 */
function isTitleKey(key) {
    switch (key) {
        case 'title':
        case 'name':
            return true;
        default:
            return false;
    }
}
|
|
933
|
+
/**
 * Reads the document title from a markdown file's frontmatter.
 *
 * The first `title` or `name` entry between the delimiters wins; its value is
 * trimmed and unquoted.
 *
 * @param {string} content - Raw markdown text.
 * @returns {string | undefined} The title, or undefined when there is no
 *   frontmatter, no title entry, or the value is empty.
 */
function extractTitleFromRawMarkdown(content) {
    const frontmatter = findFrontmatterLines(content);
    if (!frontmatter) {
        return undefined;
    }
    const { lines, endIndex } = frontmatter;
    for (const line of lines.slice(1, endIndex)) {
        const parsed = parseFrontmatterEntry(line);
        if (parsed === null || !isTitleKey(parsed.key)) {
            continue;
        }
        const title = stripOptionalQuotes(parsed.value);
        return title || undefined;
    }
    return undefined;
}
|
|
947
|
+
/**
 * Ensures a markdown document records its source URL in frontmatter.
 *
 * Without existing frontmatter a new block holding only `source` is prepended.
 * With frontmatter that already has a `source:` entry the text is returned
 * unchanged. Otherwise a `source` line is inserted just before the closing
 * delimiter, joined with the document's own line ending.
 *
 * @param {string} content - Raw markdown text.
 * @param {string} url - Source URL to record.
 * @returns {string} The markdown with a `source` frontmatter entry.
 */
function addSourceToMarkdown(content, url) {
    const frontmatter = findFrontmatterLines(content);
    if (!frontmatter) {
        return `---\nsource: "${url}"\n---\n\n${content}`;
    }
    const { lineEnding, lines, endIndex } = frontmatter;
    const alreadyHasSource = lines
        .slice(1, endIndex)
        .some((line) => line.trimStart().toLowerCase().startsWith('source:'));
    if (alreadyHasSource) {
        return content;
    }
    const withSource = [...lines];
    // Insert immediately before the closing delimiter.
    withSource.splice(endIndex, 0, `source: "${url}"`);
    return withSource.join(lineEnding);
}
|
|
965
|
+
/**
 * Reports whether text opens with a frontmatter delimiter line (`---` followed
 * by LF or CRLF).
 *
 * @param {string} trimmed - Text to inspect; callers in this file pass it already trimmed.
 * @returns {boolean} True when the text starts with the delimiter line.
 */
function hasFrontmatter(trimmed) {
    const openers = ['---\n', '---\r\n'];
    return openers.some((opener) => trimmed.startsWith(opener));
}
|
|
968
|
+
/**
 * Reports whether text matches HTML_DOCUMENT_PATTERN.
 *
 * @param {string} trimmed - Text to inspect; callers in this file pass it already trimmed.
 * @returns {boolean} True when the pattern matches.
 */
function looksLikeHtmlDocument(trimmed) {
    const documentMatch = HTML_DOCUMENT_PATTERN.exec(trimmed);
    return documentMatch !== null;
}
|
|
971
|
+
/**
 * Counts opening tags of common HTML elements (html, head, body, div, span,
 * script, style, meta, link), case-insensitively. Closing tags are not counted.
 *
 * @param {string} content - Text to scan.
 * @returns {number} Number of matching opening tags.
 */
function countCommonHtmlTags(content) {
    const found = content.match(/<(html|head|body|div|span|script|style|meta|link)\b/gi);
    if (found === null) {
        return 0;
    }
    return found.length;
}
|
|
976
|
+
/**
 * Decides from content alone whether a payload is raw markdown/text.
 *
 * An HTML document is never raw. Otherwise a frontmatter block is enough;
 * without one, the text must look like markdown and contain at most two common
 * HTML tags.
 *
 * @param {string} content - Fetched body text.
 * @returns {boolean} True when the content should be treated as raw text.
 */
function isRawTextContent(content) {
    const trimmed = content.trim();
    if (looksLikeHtmlDocument(trimmed)) {
        return false;
    }
    if (hasFrontmatter(trimmed)) {
        return true;
    }
    if (countCommonHtmlTags(content) > 2) {
        return false;
    }
    return looksLikeMarkdown(content);
}
|
|
985
|
+
/**
 * Heuristic HTML detector: true when the text opens like an HTML document or
 * contains more than two common HTML tags. Blank input is never HTML.
 *
 * @param {string} content - Fetched body text.
 * @returns {boolean} True when the content is probably HTML.
 */
function isLikelyHtmlContent(content) {
    const trimmed = content.trim();
    if (trimmed.length === 0) {
        return false;
    }
    return looksLikeHtmlDocument(trimmed) || countCommonHtmlTags(content) > 2;
}
|
|
993
|
+
/**
 * Decides whether fetched content should bypass HTML-to-markdown conversion.
 *
 * For URLs recognised by isRawTextContentUrl the content is preserved unless
 * it looks like HTML; for all other URLs the content itself must look like raw
 * text.
 *
 * @param {string} url - URL the content was fetched from.
 * @param {string} content - Fetched body text.
 * @returns {boolean} True when the content should be passed through as-is.
 */
function shouldPreserveRawContent(url, content) {
    return isRawTextContentUrl(url)
        ? !isLikelyHtmlContent(content)
        : isRawTextContent(content);
}
|
|
999
|
+
/**
 * Prepares raw markdown for return: extracts its title and, when metadata is
 * requested, records the source URL in frontmatter.
 *
 * @param {{ rawContent: string, url: string, includeMetadata: boolean }} params
 * @returns {{ content: string, title: string | undefined }} The (possibly
 *   augmented) markdown and the title taken from the original text.
 */
function buildRawMarkdownPayload({ rawContent, url, includeMetadata, }) {
    const title = extractTitleFromRawMarkdown(rawContent);
    let content = rawContent;
    if (includeMetadata) {
        content = addSourceToMarkdown(rawContent, url);
    }
    return { content, title };
}
|
|
1006
|
+
/**
 * Short-circuits the transform for content that is already markdown/text.
 *
 * @param {{ html: string, url: string, includeMetadata: boolean }} params
 * @returns {{ markdown: string, title: string | undefined, truncated: false } | null}
 *   The pass-through result, or null when the content must go through the
 *   normal HTML pipeline.
 */
function tryTransformRawContent({ html, url, includeMetadata, }) {
    const preserve = shouldPreserveRawContent(url, html);
    if (!preserve) {
        return null;
    }
    // Only the first 80 characters of the URL are logged.
    logDebug('Preserving raw markdown content', { url: url.substring(0, 80) });
    const payload = buildRawMarkdownPayload({ rawContent: html, url, includeMetadata });
    return {
        markdown: payload.content,
        title: payload.title,
        truncated: false,
    };
}
|
|
1022
|
+
// Minimum ratio of extracted article text to estimated page text for the
// article extraction to be accepted.
const MIN_CONTENT_RATIO = 0.3;
// Pages whose estimated text is shorter than this always pass the quality gate.
const MIN_HTML_LENGTH_FOR_GATE = 100;
/**
 * Removes everything between `<` and `>` from markup, keeping the text outside tags.
 *
 * This is a lightweight scanner for length estimation, not an HTML parser: it
 * does not decode entities and keeps the text content of every element.
 * A `>` that appears outside a tag (for example in `1 > 0`) is ordinary text
 * and is preserved.
 *
 * @param {string} html - Markup to strip.
 * @returns {string} The text found outside of tags.
 */
function stripHtmlTags(html) {
    let text = '';
    let inTag = false;
    for (const char of html) {
        if (inTag) {
            // Only '>' ends a tag; everything else inside it is discarded.
            if (char === '>') {
                inTag = false;
            }
            continue;
        }
        if (char === '<') {
            inTag = true;
            continue;
        }
        text += char;
    }
    return text;
}
|
|
1042
|
+
/**
 * Estimates how much visible text a page holds: tags are stripped, whitespace
 * runs are collapsed to single spaces, and the result is trimmed.
 *
 * @param {string} html - Page markup.
 * @returns {number} Length of the normalised text.
 */
function estimateTextLength(html) {
    const withoutTags = stripHtmlTags(html);
    const collapsed = withoutTags.replace(/\s+/g, ' ');
    return collapsed.trim().length;
}
|
|
1045
|
+
/**
 * Quality gate for article extraction: accepts the article when it retains at
 * least MIN_CONTENT_RATIO of the page's estimated text. Pages with less than
 * MIN_HTML_LENGTH_FOR_GATE characters of text always pass.
 *
 * @param {{ textContent: string } | null | undefined} article - Extracted article, if any.
 * @param {string} originalHtml - Markup the article was extracted from.
 * @returns {boolean} False when there is no article or it retains too little text.
 */
export function isExtractionSufficient(article, originalHtml) {
    if (!article) {
        return false;
    }
    const articleLength = article.textContent.length;
    const originalLength = estimateTextLength(originalHtml);
    const isTooShortToJudge = originalLength < MIN_HTML_LENGTH_FOR_GATE;
    return isTooShortToJudge || articleLength / originalLength >= MIN_CONTENT_RATIO;
}
|
|
1054
|
+
/**
 * Reports whether an extracted article is available to use as the content source.
 *
 * @param {unknown} article - Extracted article, if any.
 * @returns {boolean} True for any truthy article value.
 */
export function determineContentExtractionSource(article) {
    return Boolean(article);
}
|
|
1057
|
+
/**
 * Copies article fields onto a metadata object in place: `title` stays `title`
 * and `byline` becomes `author`. Fields that are undefined are skipped.
 *
 * @param {object} metadata - Metadata object to mutate.
 * @param {{ title?: unknown, byline?: unknown }} article - Extracted article.
 * @returns {void}
 */
function applyArticleMetadata(metadata, article) {
    const { title } = article;
    if (title !== undefined) {
        metadata.title = title;
    }
    const { byline } = article;
    if (byline !== undefined) {
        metadata.author = byline;
    }
}
|
|
1063
|
+
/**
 * Copies `title`, `description` and `author` from page-level metadata onto a
 * metadata object in place. Fields that are undefined are skipped.
 *
 * @param {object} metadata - Metadata object to mutate.
 * @param {{ title?: unknown, description?: unknown, author?: unknown }} extractedMeta - Page metadata.
 * @returns {void}
 */
function applyExtractedMetadata(metadata, extractedMeta) {
    for (const field of ['title', 'description', 'author']) {
        const value = extractedMeta[field];
        if (value !== undefined) {
            metadata[field] = value;
        }
    }
}
|
|
1073
|
+
/**
 * Builds the metadata block attached to transformed content.
 *
 * The block always carries `type`, `url` and a `fetchedAt` ISO timestamp taken
 * at call time. Title and author come from the article when it is both
 * requested and available; otherwise title, description and author come from
 * the page-level metadata.
 *
 * @param {string} url - URL the content was fetched from.
 * @param {object | null | undefined} article - Extracted article, if any.
 * @param {object} extractedMeta - Page-level metadata.
 * @param {boolean} shouldExtractFromArticle - Prefer the article's fields.
 * @param {boolean} includeMetadata - When false no block is produced.
 * @returns {object | undefined} The metadata block, or undefined when metadata is disabled.
 */
export function createContentMetadataBlock(url, article, extractedMeta, shouldExtractFromArticle, includeMetadata) {
    if (!includeMetadata) {
        return undefined;
    }
    const metadata = {
        type: 'metadata',
        url,
        fetchedAt: new Date().toISOString(),
    };
    if (shouldExtractFromArticle && article) {
        applyArticleMetadata(metadata, article);
    }
    else {
        applyExtractedMetadata(metadata, extractedMeta);
    }
    return metadata;
}
|
|
1089
|
+
/**
 * Builds a content source backed by the extracted article: its HTML, its
 * title, and metadata that prefers the article's fields.
 *
 * @param {{ url: string, article: object, extractedMeta: object, includeMetadata: boolean }} params
 * @returns {{ sourceHtml: string, title: string | undefined, metadata: object | undefined }}
 */
function buildArticleContentSource({ url, article, extractedMeta, includeMetadata, }) {
    const preferArticleFields = true;
    const metadata = createContentMetadataBlock(url, article, extractedMeta, preferArticleFields, includeMetadata);
    const { content: sourceHtml, title } = article;
    return { sourceHtml, title, metadata };
}
|
|
1097
|
+
/**
 * Builds a content source backed by the full page HTML, with title and
 * metadata taken from the page-level metadata.
 *
 * @param {{ html: string, url: string, article: object | null, extractedMeta: object, includeMetadata: boolean }} params
 * @returns {{ sourceHtml: string, title: string | undefined, metadata: object | undefined }}
 */
function buildFullHtmlContentSource({ html, url, article, extractedMeta, includeMetadata, }) {
    const preferArticleFields = false;
    const metadata = createContentMetadataBlock(url, article, extractedMeta, preferArticleFields, includeMetadata);
    const { title } = extractedMeta;
    return { sourceHtml: html, title, metadata };
}
|
|
1105
|
+
/**
 * Emits a debug log entry noting that the article extraction failed the
 * quality gate and the full HTML is used instead.
 *
 * @param {{ url: string, articleLength: number }} params - URL (logged truncated
 *   to 80 characters) and the length of the rejected article text.
 * @returns {void}
 */
function logQualityGateFallback({ url, articleLength, }) {
    const context = {
        url: url.substring(0, 80),
        articleLength,
    };
    logDebug('Quality gate: Readability extraction below threshold, using full HTML', context);
}
|
|
1111
|
+
/**
 * Returns an article-backed content source when an article exists and passes
 * the quality gate. An article that fails the gate is logged and rejected.
 *
 * @param {{ html: string, url: string, article: object | null, extractedMeta: object, includeMetadata: boolean }} params
 * @returns {object | null} The article content source, or null when the caller
 *   should fall back to the full HTML.
 */
function tryBuildExtractedArticleContentSource({ html, url, article, extractedMeta, includeMetadata, }) {
    if (!article) {
        return null;
    }
    const shouldExtractFromArticle = determineContentExtractionSource(article);
    if (!shouldExtractFromArticle) {
        return null;
    }
    if (isExtractionSufficient(article, html)) {
        return buildArticleContentSource({ url, article, extractedMeta, includeMetadata });
    }
    logQualityGateFallback({ url, articleLength: article.textContent.length });
    return null;
}
|
|
1131
|
+
/**
 * Chooses the HTML that will be converted to markdown: the extracted article
 * when it is good enough, otherwise the full page.
 *
 * @param {{ html: string, url: string, includeMetadata: boolean, signal?: AbortSignal }} params
 * @returns {{ sourceHtml: string, title: string | undefined, metadata: object | undefined }}
 */
function resolveContentSource({ html, url, includeMetadata, signal, }) {
    const extractOptions = { extractArticle: true };
    // The signal key is only added when a signal was supplied.
    if (signal) {
        extractOptions.signal = signal;
    }
    const { article, metadata: extractedMeta } = extractContent(html, url, extractOptions);
    const sourceParams = { html, url, article, extractedMeta, includeMetadata };
    const fromArticle = tryBuildExtractedArticleContentSource(sourceParams);
    return fromArticle ? fromArticle : buildFullHtmlContentSource(sourceParams);
}
|
|
1153
|
+
/**
 * Runs the whole HTML-to-markdown transform synchronously on the calling thread.
 *
 * Raw markdown/text is passed through. Otherwise the best content source is
 * resolved and converted. Each phase runs inside its own transform stage; the
 * enclosing `transform:total` stage is only ended when the transform succeeds.
 *
 * @param {string} html - Fetched body.
 * @param {string} url - URL the body was fetched from.
 * @param {{ includeMetadata: boolean, signal?: AbortSignal }} options
 * @returns {{ markdown: string, title: string | undefined, truncated: false }}
 * @throws Whatever the abort check or the pipeline steps throw.
 */
export function transformHtmlToMarkdownInProcess(html, url, options) {
    const totalStage = startTransformStage(url, 'transform:total');
    let completed = false;
    try {
        throwIfAborted(options.signal, url, 'transform:begin');
        const rawStage = startTransformStage(url, 'transform:raw');
        const raw = tryTransformRawContent({
            html,
            url,
            includeMetadata: options.includeMetadata,
        });
        endTransformStage(rawStage);
        if (raw) {
            completed = true;
            return raw;
        }
        const extractStage = startTransformStage(url, 'transform:extract');
        const sourceParams = { html, url, includeMetadata: options.includeMetadata };
        if (options.signal) {
            sourceParams.signal = options.signal;
        }
        const context = resolveContentSource(sourceParams);
        endTransformStage(extractStage);
        const markdownStage = startTransformStage(url, 'transform:markdown');
        const markdownOptions = { url };
        if (options.signal) {
            markdownOptions.signal = options.signal;
        }
        const markdown = htmlToMarkdown(context.sourceHtml, context.metadata, markdownOptions);
        endTransformStage(markdownStage);
        completed = true;
        return {
            markdown,
            title: context.title,
            truncated: false,
        };
    }
    finally {
        if (completed) {
            endTransformStage(totalStage, { truncated: false });
        }
    }
}
|
|
1196
|
+
// Shared worker pool; null until first use and again after shutdown.
let pool = null;
/**
 * Computes the default pool size: one less than the available parallelism,
 * clamped to the range 1..16.
 *
 * @returns {number} Number of workers to spawn.
 */
function resolveDefaultWorkerCount() {
    let parallelism;
    if (typeof os.availableParallelism === 'function') {
        parallelism = os.availableParallelism();
    }
    else {
        parallelism = os.cpus().length;
    }
    // Leave 1 core for the event loop; cap to avoid runaway memory.
    const spare = parallelism - 1;
    if (spare < 1) {
        return 1;
    }
    return spare > 16 ? 16 : spare;
}
|
|
1204
|
+
// Per-task timeout, in milliseconds, handed to the worker pool.
const DEFAULT_TIMEOUT_MS = 30000;
/**
 * Returns the shared transform worker pool, creating it on first use with the
 * default worker count and timeout.
 *
 * @returns {WorkerPool} The shared pool.
 */
function getOrCreateTransformWorkerPool() {
    if (pool === null || pool === undefined) {
        pool = new WorkerPool(resolveDefaultWorkerCount(), DEFAULT_TIMEOUT_MS);
    }
    return pool;
}
|
|
1209
|
+
/**
 * Closes the shared transform worker pool, if one exists, and clears the
 * reference so a later transform creates a fresh pool.
 *
 * @returns {Promise<void>} Resolves once the pool has closed.
 */
export async function shutdownTransformWorkerPool() {
    if (pool) {
        await pool.close();
        pool = null;
    }
}
|
|
1215
|
+
/**
 * Fixed-size pool of transform worker threads with a bounded FIFO queue.
 *
 * Each slot holds one worker and runs at most one task at a time. A task is
 * settled exactly once: by the worker's reply, by the per-task timeout, by its
 * abort signal, by the worker breaking, or by the pool closing. A worker whose
 * task timed out or was aborted, or that errored/exited, is terminated and
 * replaced in the same slot.
 */
class WorkerPool {
    // Worker slots: { worker, busy, currentTaskId }.
    workers = [];
    // Tasks waiting for a free slot.
    queue = [];
    // Dispatched tasks keyed by task id.
    inflight = new Map();
    timeoutMs;
    queueMax;
    closed = false;
    /**
     * @param {number} size - Requested worker count; at least one worker is created.
     * @param {number} timeoutMs - Per-task timeout in milliseconds.
     */
    constructor(size, timeoutMs) {
        const safeSize = Math.max(1, size);
        this.timeoutMs = timeoutMs;
        // The waiting queue holds at most two tasks per worker.
        this.queueMax = safeSize * 2;
        for (let index = 0; index < safeSize; index += 1) {
            this.workers.push(this.spawnWorker(index));
        }
    }
    /**
     * Starts a worker thread for the given slot index and wires its events.
     *
     * @param {number} workerIndex - Slot the worker will occupy.
     * @returns {{ worker: Worker, busy: boolean, currentTaskId: string | null }} The new slot.
     */
    spawnWorker(workerIndex) {
        const worker = new Worker(new URL('./workers/transform-worker.js', import.meta.url));
        // Workers must not keep the process alive by themselves.
        worker.unref();
        const slot = {
            worker,
            busy: false,
            currentTaskId: null,
        };
        // A worker that was terminated and replaced still emits 'exit' (and may
        // emit 'error'). Those late events must not be attributed to the
        // replacement now occupying the slot, otherwise every replacement would
        // be torn down by its predecessor's exit.
        const isCurrent = () => this.workers[workerIndex] === slot;
        worker.on('message', (raw) => {
            this.onWorkerMessage(workerIndex, raw);
        });
        worker.on('error', (error) => {
            if (!isCurrent())
                return;
            this.onWorkerBroken(workerIndex, `Transform worker error: ${getErrorMessage(error)}`);
        });
        worker.on('exit', (code) => {
            if (!isCurrent())
                return;
            this.onWorkerBroken(workerIndex, `Transform worker exited (code ${code})`);
        });
        return slot;
    }
    /**
     * Handles a worker that errored or exited: fails its running task, replaces
     * the worker and resumes queued work. Does nothing once the pool is closed.
     *
     * @param {number} workerIndex - Slot of the broken worker.
     * @param {string} message - Error message for the failed task.
     */
    onWorkerBroken(workerIndex, message) {
        if (this.closed)
            return;
        const slot = this.workers[workerIndex];
        if (!slot)
            return;
        if (slot.busy && slot.currentTaskId) {
            this.failTask(slot.currentTaskId, new Error(message));
        }
        void slot.worker.terminate();
        this.workers[workerIndex] = this.spawnWorker(workerIndex);
        this.drainQueue();
    }
    /**
     * Settles the in-flight task a worker reply refers to and frees the slot.
     * Replies that are malformed or refer to an unknown task id are ignored.
     *
     * @param {number} workerIndex - Slot of the replying worker.
     * @param {unknown} raw - Message received from the worker.
     */
    onWorkerMessage(workerIndex, raw) {
        if (!raw ||
            typeof raw !== 'object' ||
            !('type' in raw) ||
            !('id' in raw) ||
            typeof raw.id !== 'string' ||
            typeof raw.type !== 'string') {
            return;
        }
        const message = raw;
        const inflight = this.inflight.get(message.id);
        if (!inflight)
            return;
        clearTimeout(inflight.timer);
        if (inflight.signal && inflight.abortListener) {
            inflight.signal.removeEventListener('abort', inflight.abortListener);
        }
        this.inflight.delete(message.id);
        const slot = this.workers[workerIndex];
        if (slot) {
            slot.busy = false;
            slot.currentTaskId = null;
        }
        if (message.type === 'result') {
            inflight.resolve(message.result);
        }
        else {
            // The task has already been removed from `inflight`, so it must be
            // rejected here even when the error payload is missing; throwing in
            // this handler would leave the caller's promise pending forever.
            const error = message.error ?? {};
            if (error.name === 'FetchError') {
                inflight.reject(new FetchError(error.message, error.url, error.statusCode, error.details ?? {}));
            }
            else {
                inflight.reject(new Error(error.message ?? 'Transform worker failed'));
            }
        }
        this.drainQueue();
    }
    /**
     * Rejects an in-flight task, releases its timer and abort listener, and
     * marks the slot it ran on as idle.
     *
     * @param {string} id - Task id.
     * @param {Error} error - Rejection reason.
     */
    failTask(id, error) {
        const inflight = this.inflight.get(id);
        if (!inflight)
            return;
        clearTimeout(inflight.timer);
        if (inflight.signal && inflight.abortListener) {
            inflight.signal.removeEventListener('abort', inflight.abortListener);
        }
        this.inflight.delete(id);
        inflight.reject(error);
        const slot = this.workers[inflight.workerIndex];
        if (slot) {
            slot.busy = false;
            slot.currentTaskId = null;
        }
    }
    /**
     * Queues a transform and resolves with the worker's result.
     *
     * @param {string} html - Fetched body.
     * @param {string} url - URL the body was fetched from.
     * @param {{ includeMetadata: boolean, signal?: AbortSignal }} options
     * @returns {Promise<object>} The transform result posted by the worker.
     * @throws {Error} When the pool is closed or the queue is full.
     * @throws {FetchError} 499 when the signal is or becomes aborted; 504 on timeout.
     */
    async transform(html, url, options) {
        if (this.closed) {
            throw new Error('Transform worker pool closed');
        }
        if (options.signal?.aborted) {
            throw new FetchError('Request was canceled', url, 499, {
                reason: 'aborted',
                stage: 'transform:enqueue',
            });
        }
        if (this.queue.length >= this.queueMax) {
            throw new Error('Transform worker queue is full');
        }
        return new Promise((resolve, reject) => {
            const id = randomUUID();
            let abortListener;
            if (options.signal) {
                abortListener = () => {
                    if (this.closed) {
                        reject(new Error('Transform worker pool closed'));
                        return;
                    }
                    const inflight = this.inflight.get(id);
                    if (inflight) {
                        // Running task: ask the worker to cancel, fail the task,
                        // then replace the worker so the slot is usable again.
                        const { workerIndex } = inflight;
                        const slot = this.workers[workerIndex];
                        if (slot) {
                            try {
                                slot.worker.postMessage({ type: 'cancel', id });
                            }
                            catch {
                                // ignore
                            }
                        }
                        this.failTask(id, new FetchError('Request was canceled', url, 499, {
                            reason: 'aborted',
                            stage: 'transform:signal-abort',
                        }));
                        if (slot) {
                            void slot.worker.terminate();
                            this.workers[workerIndex] = this.spawnWorker(workerIndex);
                            this.drainQueue();
                        }
                        return;
                    }
                    // Still queued: drop it without touching any worker.
                    const queuedIndex = this.queue.findIndex((task) => task.id === id);
                    if (queuedIndex !== -1) {
                        this.queue.splice(queuedIndex, 1);
                        reject(new FetchError('Request was canceled', url, 499, {
                            reason: 'aborted',
                            stage: 'transform:queued-abort',
                        }));
                    }
                };
                options.signal.addEventListener('abort', abortListener, { once: true });
            }
            this.queue.push({
                id,
                html,
                url,
                includeMetadata: options.includeMetadata,
                signal: options.signal,
                abortListener,
                resolve,
                reject,
            });
            this.drainQueue();
        });
    }
    /**
     * Hands queued tasks to idle workers, in slot order, until either runs out.
     */
    drainQueue() {
        if (this.queue.length === 0)
            return;
        for (let workerIndex = 0; workerIndex < this.workers.length; workerIndex += 1) {
            const slot = this.workers[workerIndex];
            if (!slot || slot.busy)
                continue;
            const task = this.queue.shift();
            if (!task)
                return;
            this.dispatch(workerIndex, slot, task);
            if (this.queue.length === 0)
                return;
        }
    }
    /**
     * Starts a task on a worker: marks the slot busy, arms the timeout,
     * registers the task as in-flight and posts it to the worker. A task whose
     * signal is already aborted is rejected without being dispatched.
     *
     * @param {number} workerIndex - Slot index.
     * @param {object} slot - Slot to run on.
     * @param {object} task - Queued task.
     */
    dispatch(workerIndex, slot, task) {
        if (task.signal?.aborted) {
            if (task.abortListener) {
                task.signal.removeEventListener('abort', task.abortListener);
            }
            task.reject(new FetchError('Request was canceled', task.url, 499, {
                reason: 'aborted',
                stage: 'transform:dispatch',
            }));
            return;
        }
        slot.busy = true;
        slot.currentTaskId = task.id;
        const timer = setTimeout(() => {
            try {
                slot.worker.postMessage({ type: 'cancel', id: task.id });
            }
            catch {
                // ignore
            }
            const inflight = this.inflight.get(task.id);
            if (!inflight)
                return;
            clearTimeout(inflight.timer);
            if (inflight.signal && inflight.abortListener) {
                inflight.signal.removeEventListener('abort', inflight.abortListener);
            }
            this.inflight.delete(task.id);
            inflight.reject(new FetchError('Request timeout', task.url, 504, {
                reason: 'timeout',
                stage: 'transform:worker-timeout',
            }));
            if (!this.closed) {
                // Replace the worker that ran the timed-out task.
                void slot.worker.terminate();
                this.workers[workerIndex] = this.spawnWorker(workerIndex);
                this.drainQueue();
            }
        }, this.timeoutMs).unref();
        this.inflight.set(task.id, {
            resolve: task.resolve,
            reject: task.reject,
            timer,
            signal: task.signal,
            abortListener: task.abortListener,
            workerIndex,
        });
        slot.worker.postMessage({
            type: 'transform',
            id: task.id,
            html: task.html,
            url: task.url,
            includeMetadata: task.includeMetadata,
        });
    }
    /**
     * Shuts the pool down: terminates every worker and rejects all in-flight
     * and queued tasks. Safe to call more than once.
     *
     * @returns {Promise<void>} Resolves once every termination has settled.
     */
    async close() {
        if (this.closed)
            return;
        this.closed = true;
        const terminations = this.workers.map((slot) => slot.worker.terminate());
        this.workers.length = 0;
        for (const [id, inflight] of this.inflight.entries()) {
            clearTimeout(inflight.timer);
            if (inflight.signal && inflight.abortListener) {
                inflight.signal.removeEventListener('abort', inflight.abortListener);
            }
            inflight.reject(new Error('Transform worker pool closed'));
            this.inflight.delete(id);
        }
        for (const task of this.queue) {
            // Detach the abort listener so the caller's signal does not keep a
            // reference to this closed pool.
            if (task.signal && task.abortListener) {
                task.signal.removeEventListener('abort', task.abortListener);
            }
            task.reject(new Error('Transform worker pool closed'));
        }
        this.queue.length = 0;
        await Promise.allSettled(terminations);
    }
}
|
|
1475
|
+
/**
 * Transforms HTML to markdown on the worker pool, falling back to the
 * in-process transform when the worker path fails.
 *
 * A FetchError from the pool (abort, timeout, or an error reported by the
 * worker) is rethrown unchanged. Any other failure triggers the fallback,
 * unless the request has been aborted in the meantime. The `transform:total`
 * stage is only ended when a result is produced.
 *
 * @param {string} html - Fetched body.
 * @param {string} url - URL the body was fetched from.
 * @param {{ includeMetadata: boolean, signal?: AbortSignal }} options
 * @returns {Promise<object>} The transform result.
 * @throws {FetchError} On abort, timeout, or a worker-reported fetch error.
 */
export async function transformHtmlToMarkdown(html, url, options) {
    const totalStage = startTransformStage(url, 'transform:total');
    let completed = false;
    try {
        throwIfAborted(options.signal, url, 'transform:begin');
        const workerStage = startTransformStage(url, 'transform:worker');
        try {
            const poolRef = getOrCreateTransformWorkerPool();
            const workerOptions = { includeMetadata: options.includeMetadata };
            if (options.signal) {
                workerOptions.signal = options.signal;
            }
            const result = await poolRef.transform(html, url, workerOptions);
            completed = true;
            return result;
        }
        catch (error) {
            if (error instanceof FetchError) {
                throw error;
            }
            // Stability-first: if worker infrastructure fails, fall back to in-process.
            throwIfAborted(options.signal, url, 'transform:worker-fallback');
            const fallback = transformHtmlToMarkdownInProcess(html, url, options);
            completed = true;
            return fallback;
        }
        finally {
            endTransformStage(workerStage);
        }
    }
    finally {
        if (completed) {
            endTransformStage(totalStage, { truncated: false });
        }
    }
}
|