@j0hanz/superfetch 2.0.1 → 2.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +121 -38
- package/dist/cache.d.ts +42 -0
- package/dist/cache.js +674 -0
- package/dist/config/env-parsers.d.ts +1 -0
- package/dist/config/env-parsers.js +12 -0
- package/dist/config/index.d.ts +7 -0
- package/dist/config/index.js +10 -3
- package/dist/config/types/content.d.ts +1 -0
- package/dist/config.d.ts +82 -0
- package/dist/config.js +274 -0
- package/dist/crypto.d.ts +2 -0
- package/dist/crypto.js +32 -0
- package/dist/errors.d.ts +10 -0
- package/dist/errors.js +28 -0
- package/dist/fetch.d.ts +40 -0
- package/dist/fetch.js +930 -0
- package/dist/http/base-middleware.d.ts +7 -0
- package/dist/http/base-middleware.js +143 -0
- package/dist/http/cors.d.ts +0 -5
- package/dist/http/cors.js +0 -6
- package/dist/http/download-routes.js +6 -2
- package/dist/http/error-handler.d.ts +2 -0
- package/dist/http/error-handler.js +55 -0
- package/dist/http/mcp-routes.js +2 -2
- package/dist/http/mcp-sessions.d.ts +3 -5
- package/dist/http/mcp-sessions.js +8 -8
- package/dist/http/server-tuning.d.ts +9 -0
- package/dist/http/server-tuning.js +45 -0
- package/dist/http/server.d.ts +0 -10
- package/dist/http/server.js +33 -333
- package/dist/http.d.ts +86 -0
- package/dist/http.js +1507 -0
- package/dist/index.js +3 -3
- package/dist/instructions.md +96 -0
- package/dist/mcp.d.ts +3 -0
- package/dist/mcp.js +104 -0
- package/dist/observability.d.ts +16 -0
- package/dist/observability.js +78 -0
- package/dist/server.js +20 -5
- package/dist/services/cache.d.ts +1 -1
- package/dist/services/context.d.ts +2 -0
- package/dist/services/context.js +3 -0
- package/dist/services/extractor.d.ts +1 -0
- package/dist/services/extractor.js +28 -2
- package/dist/services/fetcher.d.ts +2 -0
- package/dist/services/fetcher.js +35 -14
- package/dist/services/logger.js +4 -1
- package/dist/services/telemetry.d.ts +19 -0
- package/dist/services/telemetry.js +43 -0
- package/dist/services/transform-worker-pool.d.ts +10 -3
- package/dist/services/transform-worker-pool.js +213 -184
- package/dist/tools/handlers/fetch-url.tool.js +8 -6
- package/dist/tools/index.d.ts +1 -0
- package/dist/tools/index.js +13 -1
- package/dist/tools/schemas.d.ts +2 -0
- package/dist/tools/schemas.js +8 -0
- package/dist/tools/utils/content-transform-core.d.ts +5 -0
- package/dist/tools/utils/content-transform-core.js +180 -0
- package/dist/tools/utils/content-transform-workers.d.ts +1 -0
- package/dist/tools/utils/content-transform-workers.js +1 -0
- package/dist/tools/utils/content-transform.d.ts +3 -5
- package/dist/tools/utils/content-transform.js +35 -148
- package/dist/tools/utils/raw-markdown.js +15 -1
- package/dist/tools.d.ts +109 -0
- package/dist/tools.js +434 -0
- package/dist/transform.d.ts +69 -0
- package/dist/transform.js +1814 -0
- package/dist/transformers/markdown.d.ts +4 -1
- package/dist/transformers/markdown.js +182 -53
- package/dist/utils/cancellation.d.ts +1 -0
- package/dist/utils/cancellation.js +18 -0
- package/dist/utils/code-language.d.ts +0 -9
- package/dist/utils/code-language.js +5 -5
- package/dist/utils/host-normalizer.d.ts +1 -0
- package/dist/utils/host-normalizer.js +37 -0
- package/dist/utils/url-redactor.d.ts +1 -0
- package/dist/utils/url-redactor.js +13 -0
- package/dist/utils/url-validator.js +8 -5
- package/dist/utils.d.ts +1 -0
- package/dist/utils.js +3 -0
- package/dist/workers/transform-worker.js +80 -38
- package/package.json +10 -9
|
@@ -0,0 +1,1814 @@
|
|
|
1
|
+
import { randomUUID } from 'node:crypto';
|
|
2
|
+
import diagnosticsChannel from 'node:diagnostics_channel';
|
|
3
|
+
import os from 'node:os';
|
|
4
|
+
import { performance } from 'node:perf_hooks';
|
|
5
|
+
import { Worker } from 'node:worker_threads';
|
|
6
|
+
import { parseHTML } from 'linkedom';
|
|
7
|
+
import { NodeHtmlMarkdown, } from 'node-html-markdown';
|
|
8
|
+
import { z } from 'zod';
|
|
9
|
+
import { isProbablyReaderable, Readability } from '@mozilla/readability';
|
|
10
|
+
import { config } from './config.js';
|
|
11
|
+
import { FetchError, getErrorMessage } from './errors.js';
|
|
12
|
+
import { isRawTextContentUrl } from './fetch.js';
|
|
13
|
+
import { getOperationId, getRequestId, logDebug, logError, logInfo, logWarn, redactUrl, } from './observability.js';
|
|
14
|
+
import { isRecord } from './utils.js';
|
|
15
|
+
/**
 * Reads the `reason` property from an abort-signal-like value.
 *
 * @param {unknown} signal - Value to inspect.
 * @returns {unknown} `signal.reason` when `signal` passes `isRecord` and has a
 *   `reason` property; otherwise `undefined`.
 */
function getAbortReason(signal) {
  if (isRecord(signal) && 'reason' in signal) {
    return signal.reason;
  }
  return undefined;
}
|
|
20
|
+
/**
 * Returns `document.body.innerHTML` when it is a non-empty string.
 *
 * @param {unknown} document - Document-like value.
 * @returns {string | undefined} The body markup, or `undefined` when the
 *   document or body fails `isRecord`, or the markup is missing or empty.
 */
function getBodyInnerHtml(document) {
  if (!isRecord(document)) {
    return undefined;
  }
  const bodyNode = document.body;
  if (!isRecord(bodyNode)) {
    return undefined;
  }
  const markup = bodyNode.innerHTML;
  if (typeof markup !== 'string' || markup.length === 0) {
    return undefined;
  }
  return markup;
}
|
|
31
|
+
/**
 * Returns the document's `toString` method bound to the document itself.
 *
 * @param {unknown} document - Document-like value.
 * @returns {Function | undefined} Bound `toString`, or `undefined` when the
 *   value fails `isRecord` or has no callable `toString`.
 */
function getDocumentToString(document) {
  if (isRecord(document) && typeof document.toString === 'function') {
    return document.toString.bind(document);
  }
  return undefined;
}
|
|
38
|
+
/**
 * Returns `document.documentElement.outerHTML` when it is a non-empty string.
 *
 * @param {unknown} document - Document-like value.
 * @returns {string | undefined} The root element markup, or `undefined` when
 *   the document or root element fails `isRecord`, or the markup is missing
 *   or empty.
 */
function getDocumentElementOuterHtml(document) {
  if (!isRecord(document)) {
    return undefined;
  }
  const rootNode = document.documentElement;
  if (!isRecord(rootNode)) {
    return undefined;
  }
  const markup = rootNode.outerHTML;
  if (typeof markup !== 'string' || markup.length === 0) {
    return undefined;
  }
  return markup;
}
|
|
49
|
+
/** Markdown fenced-code-block helpers. */
const CODE_BLOCK = {
  // Fence delimiter (three backticks).
  fence: '```',
  /**
   * Wraps `code` in a fenced block, tagging the opening fence with `language`.
   *
   * @param {string} code - Block contents.
   * @param {string} [language=''] - Info string placed after the opening fence.
   * @returns {string} The fenced block.
   */
  format: (code, language = '') => {
    const openingFence = `\`\`\`${language}`;
    return `${openingFence}\n${code}\n\`\`\``;
  },
};
|
|
55
|
+
// Diagnostics channel on which transform events are published.
const transformChannel = diagnosticsChannel.channel('superfetch.transform');

/**
 * Publishes `event` on the transform channel when it has subscribers.
 * Errors raised by `publish` are deliberately swallowed.
 *
 * @param {object} event - Event payload handed to subscribers.
 * @returns {void}
 */
function publishTransformEvent(event) {
  if (transformChannel.hasSubscribers) {
    try {
      transformChannel.publish(event);
    } catch {
      // Avoid crashing the publisher if a subscriber throws.
    }
  }
}
|
|
66
|
+
/**
 * Starts timing a transform stage.
 *
 * @param {string} url - URL being transformed; stored after `redactUrl`.
 * @param {string} stage - Stage label.
 * @returns {{stage: string, startTime: number, url: *} | null} A stage context,
 *   or `null` when the transform channel has no subscribers.
 */
export function startTransformStage(url, stage) {
  if (transformChannel.hasSubscribers) {
    return {
      stage,
      startTime: performance.now(),
      url: redactUrl(url),
    };
  }
  return null;
}
|
|
75
|
+
/**
 * Finishes a stage started by `startTransformStage` and publishes a `stage`
 * event carrying its duration.
 *
 * @param {object | null} context - Context from `startTransformStage`; a falsy
 *   value makes this a no-op.
 * @param {{truncated?: boolean}} [options] - `truncated` is copied onto the
 *   event when it is not `undefined`.
 * @returns {void}
 */
export function endTransformStage(context, options) {
  if (!context) {
    return;
  }
  const requestId = getRequestId();
  const operationId = getOperationId();
  const event = {
    v: 1,
    type: 'stage',
    stage: context.stage,
    durationMs: performance.now() - context.startTime,
    url: context.url,
  };
  // The ids are attached only when truthy.
  if (requestId) {
    event.requestId = requestId;
  }
  if (operationId) {
    event.operationId = operationId;
  }
  const truncated = options?.truncated;
  if (truncated !== undefined) {
    event.truncated = truncated;
  }
  publishTransformEvent(event);
}
|
|
94
|
+
/**
 * Runs `fn` synchronously as a named transform stage.
 *
 * The stage is ended in a `finally` block, so it is still closed (and its
 * event published) when `fn` throws; the error then propagates unchanged.
 *
 * @param {string} url - URL being transformed.
 * @param {string} stage - Stage label.
 * @param {Function} fn - Synchronous work to run.
 * @returns {*} Whatever `fn` returns.
 */
function runTransformStage(url, stage, fn) {
  const context = startTransformStage(url, stage);
  try {
    return fn();
  } finally {
    endTransformStage(context);
  }
}
|
|
100
|
+
/**
 * Tells whether an abort reason is an `Error` named `TimeoutError`.
 *
 * @param {unknown} reason - Abort reason to classify.
 * @returns {boolean} `true` only for `Error` instances whose `name` is
 *   `'TimeoutError'`.
 */
function isTimeoutReason(reason) {
  if (!(reason instanceof Error)) {
    return false;
  }
  return reason.name === 'TimeoutError';
}
|
|
103
|
+
/**
 * Throws a `FetchError` when `signal` is aborted; otherwise does nothing.
 *
 * @param {{aborted?: boolean} | undefined} signal - Optional abort signal.
 * @param {string} url - URL forwarded to the error.
 * @param {string} stage - Stage label recorded in the error details.
 * @throws {FetchError} `'Request timeout'` (504) when the abort reason is a
 *   timeout, otherwise `'Request was canceled'` (499).
 */
function throwIfAborted(signal, url, stage) {
  if (!signal?.aborted) {
    return;
  }
  const timedOut = isTimeoutReason(getAbortReason(signal));
  const [message, code, reason] = timedOut
    ? ['Request timeout', 504, 'timeout']
    : ['Request was canceled', 499, 'aborted'];
  throw new FetchError(message, url, code, { reason, stage });
}
|
|
121
|
+
/**
 * Caps `html` at `config.constants.maxHtmlSize` UTF-16 code units, logging a
 * warning when truncation happens.
 *
 * @param {string} html - Raw HTML.
 * @returns {string} `html` unchanged when within the limit, otherwise its
 *   leading `maxHtmlSize` code units.
 */
function truncateHtml(html) {
  const { maxHtmlSize: maxSize } = config.constants;
  if (html.length > maxSize) {
    logWarn('HTML content exceeds maximum size, truncating', {
      size: html.length,
      maxSize,
    });
    return html.substring(0, maxSize);
  }
  return html;
}
|
|
132
|
+
/**
 * Creates an empty collector with one source bucket per metadata field.
 *
 * @returns {{title: object, description: object, author: object}} Fresh state.
 */
function createMetaCollectorState() {
  const state = {};
  for (const field of ['title', 'description', 'author']) {
    state[field] = {};
  }
  return state;
}
|
|
139
|
+
/**
 * Picks the value for `field`, preferring the `og` source, then `twitter`,
 * then `standard`.
 *
 * @param {object} state - Collector state keyed by field name.
 * @param {string} field - Field to resolve.
 * @returns {string | undefined} First non-nullish source value, if any.
 */
function resolveMetaField(state, field) {
  const { og, twitter, standard } = state[field];
  return og ?? twitter ?? standard;
}
|
|
143
|
+
/**
 * Maps an Open Graph `property` attribute to a metadata field name.
 *
 * @param {string | null | undefined} property - Value of the `property` attribute.
 * @returns {'title' | 'description' | null} The field for `og:title` or
 *   `og:description`; `null` for anything else.
 */
function parseOpenGraphKey(property) {
  switch (property) {
    case 'og:title':
      return 'title';
    case 'og:description':
      return 'description';
    default:
      return null;
  }
}
|
|
149
|
+
/**
 * Maps a Twitter-card `name` attribute to a metadata field name.
 *
 * @param {string | null | undefined} name - Value of the `name` attribute.
 * @returns {'title' | 'description' | null} The field for `twitter:title` or
 *   `twitter:description`; `null` for anything else.
 */
function parseTwitterKey(name) {
  switch (name) {
    case 'twitter:title':
      return 'title';
    case 'twitter:description':
      return 'description';
    default:
      return null;
  }
}
|
|
155
|
+
/**
 * Maps a plain `<meta name>` value to a metadata field name.
 *
 * @param {string | null | undefined} name - Value of the `name` attribute.
 * @returns {'description' | 'author' | null} The matching field, or `null`.
 */
function parseStandardKey(name) {
  return name === 'description' || name === 'author' ? name : null;
}
|
|
162
|
+
/**
 * Records one `<meta>` tag's trimmed `content` into the collector state.
 *
 * Sources are tried in order: Open Graph `property`, then Twitter `name`,
 * then standard `name`. Only the first match is stored; tags with empty
 * content or no recognised key are ignored.
 *
 * @param {object} state - Collector state (mutated).
 * @param {{getAttribute: Function}} tag - Meta element.
 * @returns {void}
 */
function collectMetaTag(state, tag) {
  const value = tag.getAttribute('content')?.trim();
  if (!value) {
    return;
  }
  const ogField = parseOpenGraphKey(tag.getAttribute('property'));
  if (ogField) {
    state[ogField].og = value;
    return;
  }
  const nameAttr = tag.getAttribute('name');
  const twitterField = parseTwitterKey(nameAttr);
  if (twitterField) {
    state[twitterField].twitter = value;
    return;
  }
  const standardField = parseStandardKey(nameAttr);
  if (standardField) {
    state[standardField].standard = value;
  }
}
|
|
182
|
+
/**
 * Feeds every `<meta>` element of `document` through `collectMetaTag`.
 *
 * @param {{querySelectorAll: Function}} document - Parsed document.
 * @param {object} state - Collector state (mutated).
 * @returns {void}
 */
function scanMetaTags(document, state) {
  for (const metaTag of document.querySelectorAll('meta')) {
    collectMetaTag(state, metaTag);
  }
}
|
|
188
|
+
/**
 * Fills `state.title.standard` from the document's `<title>` element when no
 * standard title has been collected yet.
 *
 * The text is trimmed before the emptiness check, so a whitespace-only
 * `<title>` is treated as missing and never stored as an empty string.
 *
 * @param {{querySelector: Function}} document - Parsed document.
 * @param {{title: {standard?: string}}} state - Collector state (mutated).
 * @returns {void}
 */
function ensureTitleFallback(document, state) {
  if (state.title.standard) {
    return;
  }
  const titleText = document.querySelector('title')?.textContent?.trim();
  if (titleText) {
    state.title.standard = titleText;
  }
}
|
|
196
|
+
/**
 * Collects title, description and author metadata from `document`.
 *
 * @param {object} document - Parsed document.
 * @returns {{title?: string, description?: string, author?: string}} Only the
 *   fields that resolved to a value other than `undefined`.
 */
function extractMetadata(document) {
  const state = createMetaCollectorState();
  scanMetaTags(document, state);
  ensureTitleFallback(document, state);
  const metadata = {};
  for (const field of ['title', 'description', 'author']) {
    const resolved = resolveMetaField(state, field);
    if (resolved !== undefined) {
      metadata[field] = resolved;
    }
  }
  return metadata;
}
|
|
212
|
+
/**
 * Checks that `doc` exposes the members this module requires before handing
 * it to Readability: a `documentElement` property and both query-selector
 * methods.
 *
 * @param {unknown} doc - Candidate document.
 * @returns {boolean} Whether the shape check passes.
 */
function isReadabilityCompatible(doc) {
  if (isRecord(doc)) {
    return hasDocumentElement(doc) && hasQuerySelectors(doc);
  }
  return false;
}
|
|
217
|
+
/**
 * Tells whether `record` has a `documentElement` property (own or inherited).
 *
 * @param {object} record - Object to inspect.
 * @returns {boolean} Result of the `in` check.
 */
function hasDocumentElement(record) {
  return Reflect.has(record, 'documentElement');
}
|
|
220
|
+
/**
 * Tells whether `record` has callable `querySelectorAll` and `querySelector`.
 *
 * @param {object} record - Object to inspect.
 * @returns {boolean} `true` when both members are functions.
 */
function hasQuerySelectors(record) {
  return ['querySelectorAll', 'querySelector'].every(
    (method) => typeof record[method] === 'function',
  );
}
|
|
224
|
+
/**
 * Extracts the main article from `document` via Readability.
 *
 * @param {unknown} document - Parsed document.
 * @returns {object | null} The mapped article, or `null` when the document
 *   fails the compatibility check (a warning is logged) or parsing yields
 *   nothing.
 */
function extractArticle(document) {
  if (isReadabilityCompatible(document)) {
    return mapParsedArticle(parseReadabilityArticle(document));
  }
  logWarn('Document not compatible with Readability');
  return null;
}
|
|
231
|
+
/**
 * Runs Readability on a clone of `document`.
 *
 * The text-length probe reads `body` and `documentElement` null-safely, so a
 * clone without a `body` falls back to the root element's text (and finally
 * to an empty string) instead of throwing into the catch block.
 *
 * @param {object} document - Readability-compatible document.
 * @returns {object | null} Readability's parse result, or `null` when the
 *   page is judged not readerable or any step throws (the error is logged).
 */
function parseReadabilityArticle(document) {
  try {
    // Readability mutates the document; operate on a clone.
    const documentClone = document.cloneNode(true);
    // Avoid the more expensive parse() when the page is unlikely to be
    // readable, but don't penalize small documents where the heuristic is
    // often too strict.
    const rawText =
      documentClone.body?.textContent ||
      documentClone.documentElement?.textContent ||
      '';
    const textLength = rawText.replace(/\s+/g, ' ').trim().length;
    if (textLength >= 400 && !isProbablyReaderable(documentClone)) {
      return null;
    }
    // Guard against pathological DOM sizes.
    const reader = new Readability(documentClone, { maxElemsToParse: 20_000 });
    return reader.parse();
  } catch (error) {
    logError('Failed to extract article with Readability', asError(error));
    return null;
  }
}
|
|
252
|
+
/**
 * Narrows a caught value to an `Error`.
 *
 * @param {unknown} error - Caught value.
 * @returns {Error | undefined} The value itself when it is an `Error`
 *   instance; otherwise `undefined`.
 */
function asError(error) {
  return error instanceof Error ? error : undefined;
}
|
|
258
|
+
/**
 * Maps a Readability parse result, passing a falsy result through as `null`.
 *
 * @param {object | null | undefined} parsed - Readability output.
 * @returns {object | null} Mapped article or `null`.
 */
function mapParsedArticle(parsed) {
  if (!parsed) {
    return null;
  }
  return mapReadabilityResult(parsed);
}
|
|
261
|
+
/**
 * Converts a Readability result into the article shape used by this module.
 *
 * @param {object} parsed - Readability output.
 * @returns {object} `content` and `textContent` (defaulting to `''` when
 *   nullish) plus whichever optional fields `buildOptionalArticleFields`
 *   produces.
 */
function mapReadabilityResult(parsed) {
  const required = {
    content: parsed.content ?? '',
    textContent: parsed.textContent ?? '',
  };
  return Object.assign(required, buildOptionalArticleFields(parsed));
}
|
|
268
|
+
/**
 * Copies the optional article fields (`title`, `byline`, `excerpt`,
 * `siteName`) that are neither `null` nor `undefined`.
 *
 * @param {object} parsed - Readability output.
 * @returns {object} Object holding only the present optional fields.
 */
function buildOptionalArticleFields(parsed) {
  const optional = {};
  for (const key of ['title', 'byline', 'excerpt', 'siteName']) {
    const value = parsed[key];
    if (value != null) {
      optional[key] = value;
    }
  }
  return optional;
}
|
|
276
|
+
/**
 * Assigns `target[key] = value` unless `value` is `null` or `undefined`.
 *
 * @param {object} target - Object to mutate.
 * @param {string} key - Property name.
 * @param {unknown} value - Candidate value.
 * @returns {void}
 */
function addOptionalField(target, key, value) {
  if (value !== null && value !== undefined) {
    target[key] = value;
  }
}
|
|
281
|
+
/**
 * Extracts metadata and, optionally, the main article from an HTML string.
 *
 * @param {string} html - HTML source; must be a non-empty string.
 * @param {string} url - Page URL; must be a non-empty string.
 * @param {{extractArticle?: boolean, signal?: object}} [options] - Extraction
 *   options. NOTE(review): a caller-supplied object replaces the default
 *   entirely, so omitting `extractArticle` leaves it `undefined` (falsy) and
 *   the article step is skipped — confirm this is intended.
 * @returns {{article: object | null, metadata: object}} Extraction result; an
 *   empty result when either input is invalid.
 */
export function extractContent(html, url, options = {
  extractArticle: true,
}) {
  return isValidInput(html, url)
    ? tryExtractContent(html, url, options)
    : createEmptyExtractionResult();
}
|
|
290
|
+
/**
 * Builds a fresh "nothing extracted" result.
 *
 * @returns {{article: null, metadata: object}} New object on every call.
 */
function createEmptyExtractionResult() {
  const emptyResult = { article: null, metadata: {} };
  return emptyResult;
}
|
|
293
|
+
/**
 * Runs article extraction as the `extract:article` stage when requested.
 *
 * @param {object} document - Parsed document.
 * @param {string} url - Page URL.
 * @param {boolean} shouldExtract - When falsy, returns `null` without
 *   starting a stage.
 * @returns {object | null} The extracted article or `null`.
 */
function extractArticleWithStage(document, url, shouldExtract) {
  return shouldExtract
    ? runTransformStage(url, 'extract:article', () => resolveArticleExtraction(document, shouldExtract))
    : null;
}
|
|
298
|
+
/**
 * Decides what an extraction failure turns into.
 *
 * `FetchError`s are rethrown as-is. For any other error, an aborted signal
 * takes precedence (via `throwIfAborted`); otherwise the failure is logged
 * and an empty result returned.
 *
 * @param {unknown} error - Caught failure.
 * @param {string} url - Page URL.
 * @param {object | undefined} signal - Optional abort signal.
 * @returns {{article: null, metadata: object}} Empty extraction result.
 * @throws {FetchError} The original error, or an abort/timeout error.
 */
function handleExtractionFailure(error, url, signal) {
  if (!(error instanceof FetchError)) {
    throwIfAborted(signal, url, 'extract:error');
    const loggable = error instanceof Error ? error : undefined;
    logError('Failed to extract content', loggable);
    return createEmptyExtractionResult();
  }
  throw error;
}
|
|
306
|
+
/**
 * Runs the extraction pipeline: parse, apply base URI, collect metadata, and
 * extract the article. The abort signal is checked before the first stage
 * and after each of the parse, metadata and article stages.
 *
 * @param {string} html - HTML source.
 * @param {string} url - Page URL.
 * @param {{extractArticle?: boolean, signal?: object}} options - Options.
 * @returns {{article: object | null, metadata: object}} Extraction result.
 * @throws {FetchError} When the signal is aborted at a checkpoint.
 */
function extractContentStages(html, url, options) {
  const { signal } = options;
  throwIfAborted(signal, url, 'extract:begin');
  const parsed = runTransformStage(url, 'extract:parse', () => parseHTML(truncateHtml(html)));
  const { document } = parsed;
  throwIfAborted(signal, url, 'extract:parsed');
  applyBaseUri(document, url);
  const metadata = runTransformStage(url, 'extract:metadata', () => extractMetadata(document));
  throwIfAborted(signal, url, 'extract:metadata');
  const article = extractArticleWithStage(document, url, options.extractArticle);
  throwIfAborted(signal, url, 'extract:article');
  return { article, metadata };
}
|
|
320
|
+
/**
 * Runs the extraction pipeline and routes any failure through
 * `handleExtractionFailure`.
 *
 * @param {string} html - HTML source.
 * @param {string} url - Page URL.
 * @param {{extractArticle?: boolean, signal?: object}} options - Options.
 * @returns {{article: object | null, metadata: object}} Extraction result.
 */
function tryExtractContent(html, url, options) {
  let outcome;
  try {
    outcome = extractContentStages(html, url, options);
  } catch (error) {
    outcome = handleExtractionFailure(error, url, options.signal);
  }
  return outcome;
}
|
|
328
|
+
/**
 * Validates the `extractContent` inputs, logging a warning for the first
 * invalid one. The URL is not checked when the HTML is already invalid.
 *
 * @param {unknown} html - Candidate HTML.
 * @param {unknown} url - Candidate URL.
 * @returns {boolean} `true` when both are non-empty strings.
 */
function isValidInput(html, url) {
  if (!validateRequiredString(html, 'extractContent called with invalid HTML input')) {
    return false;
  }
  return validateRequiredString(url, 'extractContent called with invalid URL');
}
|
|
331
|
+
/**
 * Checks that `value` is a non-empty string, logging `message` as a warning
 * when it is not.
 *
 * @param {unknown} value - Value to check.
 * @param {string} message - Warning to log on failure.
 * @returns {boolean} Whether the value is valid.
 */
function validateRequiredString(value, message) {
  const valid = isNonEmptyString(value);
  if (!valid) {
    logWarn(message);
  }
  return valid;
}
|
|
337
|
+
/**
 * Tells whether `value` is a string with at least one character.
 *
 * @param {unknown} value - Value to check.
 * @returns {boolean} `true` for non-empty strings only.
 */
function isNonEmptyString(value) {
  if (typeof value !== 'string') {
    return false;
  }
  return value !== '';
}
|
|
340
|
+
/**
 * Extracts the article only when `shouldExtract` is truthy.
 *
 * @param {object} document - Parsed document.
 * @param {boolean} shouldExtract - Whether to run extraction.
 * @returns {object | null} The article, or `null` when skipped or not found.
 */
function resolveArticleExtraction(document, shouldExtract) {
  if (!shouldExtract) {
    return null;
  }
  return extractArticle(document);
}
|
|
343
|
+
/**
 * Defines `document.baseURI` as a writable property holding `url`.
 * A failure is logged at info level and otherwise ignored.
 *
 * NOTE(review): the log entry carries the first 100 characters of the raw
 * URL, whereas stage events in this file pass URLs through `redactUrl` —
 * confirm whether this log should be redacted too.
 *
 * @param {object} document - Parsed document (mutated).
 * @param {string} url - Page URL.
 * @returns {void}
 */
function applyBaseUri(document, url) {
  const descriptor = {
    value: url,
    writable: true,
  };
  try {
    Object.defineProperty(document, 'baseURI', descriptor);
  } catch (error) {
    logInfo('Failed to set baseURI (non-critical)', {
      url: url.substring(0, 100),
      error: getErrorMessage(error),
    });
  }
}
|
|
357
|
+
/**
 * Tells whether `code` contains a `<` immediately followed by an ASCII
 * uppercase letter (for example `<App`), which is treated as a JSX component
 * tag.
 *
 * @param {string} code - Source snippet.
 * @returns {boolean} `true` when such a tag opener is present.
 */
function containsJsxTag(code) {
  return /<[A-Z]/.test(code);
}
|
|
369
|
+
/**
 * Tells whether `word` occurs in `source` as a whole word, meaning it is not
 * directly preceded or followed by an ASCII letter, digit or underscore.
 *
 * An empty `word` yields `false`; without that guard the search position
 * would never advance and the loop could run forever.
 *
 * @param {string} source - Text to search.
 * @param {string} word - Word to look for.
 * @returns {boolean} `true` on the first whole-word occurrence.
 */
function containsWord(source, word) {
  if (word.length === 0) {
    return false;
  }
  // `\w` is exactly [A-Za-z0-9_]; an empty neighbour (string edge) never matches.
  const wordUnit = /\w/;
  for (
    let at = source.indexOf(word);
    at !== -1;
    at = source.indexOf(word, at + word.length)
  ) {
    const end = at + word.length;
    const before = at === 0 ? '' : source[at - 1];
    const after = end >= source.length ? '' : source[end];
    if (!wordUnit.test(before) && !wordUnit.test(after)) {
      return true;
    }
  }
  return false;
}
|
|
381
|
+
/**
 * Splits text on `\n`.
 *
 * @param {string} content - Text to split.
 * @returns {string[]} The lines; a trailing `\r` is not removed.
 */
function splitLines(content) {
  const lines = content.split('\n');
  return lines;
}
|
|
384
|
+
/**
 * Extracts a language name from a `class` attribute value.
 *
 * Tokens are scanned left to right; the first token starting with
 * `language-`, `lang-` or `highlight-` (matched case-insensitively) that has
 * a non-empty remainder wins. A bare prefix such as `language-` is skipped,
 * so the result is never an empty string and callers' `??` fallbacks still
 * apply.
 *
 * @param {string} className - Raw class attribute value.
 * @returns {string | undefined} The remainder after the prefix, in its
 *   original casing, or `undefined` when no token qualifies.
 */
function extractLanguageFromClassName(className) {
  const prefixes = ['language-', 'lang-', 'highlight-'];
  for (const token of className.match(/\S+/g) ?? []) {
    const lower = token.toLowerCase();
    const prefix = prefixes.find((candidate) => lower.startsWith(candidate));
    if (prefix === undefined) {
      continue;
    }
    const language = token.slice(prefix.length);
    if (language) {
      return language;
    }
  }
  return undefined;
}
|
|
400
|
+
/**
 * Validates a `data-lang`-style attribute value.
 *
 * @param {string} dataLang - Raw attribute value.
 * @returns {string | undefined} The trimmed value when it is non-empty and
 *   made up only of ASCII letters, digits and underscores; otherwise
 *   `undefined`.
 */
function resolveLanguageFromDataAttribute(dataLang) {
  const candidate = dataLang.trim();
  return /^\w+$/.test(candidate) ? candidate : undefined;
}
|
|
410
|
+
/**
 * Tells whether `char` counts as a word character: its first UTF-16 unit is
 * an ASCII letter or digit, or the whole string is exactly `_`.
 *
 * @param {string | undefined} char - Character to classify; falsy values
 *   yield `false`.
 * @returns {boolean} Classification result.
 */
function isWordChar(char) {
  if (!char) {
    return false;
  }
  return /^[0-9A-Za-z]/.test(char) || char === '_';
}
|
|
419
|
+
// Tool names that mark a shell line when followed by one of BASH_VERBS.
const BASH_PACKAGE_MANAGERS = [
  'npm', 'yarn', 'pnpm', 'npx',
  'brew', 'apt', 'pip', 'cargo', 'go',
];
// Sub-commands accepted after a package manager name.
const BASH_VERBS = [
  'install',
  'add',
  'run',
  'build',
  'start',
];
// Commands that mark a shell line on their own.
const BASH_COMMANDS = [
  'sudo',
  'chmod',
  'mkdir',
  'cd',
  'ls',
  'cat',
  'echo',
];
|
|
432
|
+
/**
 * Tells whether any non-blank line of `code`, ignoring leading whitespace,
 * looks like a shell command.
 *
 * @param {string} code - Source snippet.
 * @returns {boolean} `true` when at least one line is a bash indicator.
 */
function detectBash(code) {
  return splitLines(code).some((line) => {
    const candidate = line.trimStart();
    return candidate !== '' && isBashIndicator(candidate);
  });
}
|
|
443
|
+
/**
 * Tells whether `line` is exactly one of `commands`, or starts with one
 * followed by a space.
 *
 * @param {string} line - Line to test.
 * @param {string[]} commands - Candidate command names.
 * @returns {boolean} `true` on the first match.
 */
function startsWithCommand(line, commands) {
  for (const command of commands) {
    if (line === command || line.startsWith(`${command} `)) {
      return true;
    }
  }
  return false;
}
|
|
446
|
+
/**
 * Tells whether a line (already stripped of leading whitespace) looks like
 * shell: a shebang, a prompt line, a known command, or a package-manager
 * invocation.
 *
 * @param {string} line - Line to test.
 * @returns {boolean} `true` when any indicator matches.
 */
function isBashIndicator(line) {
  if (isShebang(line) || isPromptLine(line)) {
    return true;
  }
  return startsWithCommand(line, BASH_COMMANDS) || startsWithPackageManagerCommand(line);
}
|
|
452
|
+
/**
 * Tells whether `line` begins with `#!`.
 *
 * @param {string} line - Line to test.
 * @returns {boolean} Shebang check result.
 */
function isShebang(line) {
  return line.slice(0, 2) === '#!';
}
|
|
455
|
+
/**
 * Tells whether `line` begins with a `$ ` or `# ` prompt prefix.
 *
 * @param {string} line - Line to test.
 * @returns {boolean} Prompt check result.
 */
function isPromptLine(line) {
  const head = line.slice(0, 2);
  return head === '$ ' || head === '# ';
}
|
|
458
|
+
/**
 * Tells whether `line` starts with `<manager> <verb>`, using the names in
 * BASH_PACKAGE_MANAGERS and BASH_VERBS. The verb must end the line or be
 * followed by a space.
 *
 * @param {string} line - Line to test.
 * @returns {boolean} `true` when a manager/verb pair matches.
 */
function startsWithPackageManagerCommand(line) {
  for (const manager of BASH_PACKAGE_MANAGERS) {
    const prefix = `${manager} `;
    if (!line.startsWith(prefix)) {
      continue;
    }
    const rest = line.slice(prefix.length);
    if (BASH_VERBS.some((verb) => rest === verb || rest.startsWith(`${verb} `))) {
      return true;
    }
  }
  return false;
}
|
|
466
|
+
// Type names looked for after a colon by the TypeScript detector.
const TYPE_HINTS = ['string', 'number', 'boolean', 'void', 'any', 'unknown', 'never'];
// Lower-case tag openers used by the HTML detector.
const HTML_TAGS = [
  '<!doctype', '<html', '<head', '<body',
  '<div', '<span', '<p', '<a',
  '<script', '<style',
];
// Lower-case statement keywords used by the SQL detector.
const SQL_KEYWORDS = ['select', 'insert', 'update', 'delete', 'create', 'alter', 'drop'];
// Keyword patterns; the detectors apply them to lower-cased code.
const JS_WORD_REGEX = /\b(?:const|let|var|function|class|async|await|export|import)\b/;
const PYTHON_WORD_REGEX = /\b(?:def|class|import|from)\b/;
const RUST_WORD_REGEX = /\b(?:fn|impl|struct|enum)\b/;
const CSS_DIRECTIVE_REGEX = /@media|@import|@keyframes/;
|
|
500
|
+
// Language detectors in priority order: detectLanguageFromCode returns the
// language of the first entry whose `detect` accepts the code.
const CODE_DETECTORS = [
  ['jsx', detectJsx],
  ['typescript', detectTypescript],
  ['rust', detectRust],
  ['javascript', detectJavascript],
  ['python', detectPython],
  ['bash', detectBash],
  ['css', detectCss],
  ['html', detectHtml],
  ['json', detectJson],
  ['yaml', detectYaml],
  ['sql', detectSql],
  ['go', detectGo],
].map(([language, detect]) => ({ language, detect }));
|
|
514
|
+
/**
 * Tells whether `code` looks like JSX: it mentions `className=`, `jsx:` or a
 * `from 'react'` / `from "react"` import (all matched case-insensitively), or
 * it contains a capitalised tag opener.
 *
 * @param {string} code - Source snippet.
 * @returns {boolean} Detection result.
 */
function detectJsx(code) {
  const lower = code.toLowerCase();
  const markers = ['classname=', 'jsx:', "from 'react'", 'from "react"'];
  if (markers.some((marker) => lower.includes(marker))) {
    return true;
  }
  return containsJsxTag(code);
}
|
|
525
|
+
/**
 * Tells whether `code` looks like TypeScript: it contains the whole word
 * `interface` or `type`, or a colon (optionally followed by one space) and
 * then one of TYPE_HINTS. Matching is case-insensitive.
 *
 * @param {string} code - Source snippet.
 * @returns {boolean} Detection result.
 */
function detectTypescript(code) {
  const lower = code.toLowerCase();
  if (['interface', 'type'].some((keyword) => containsWord(lower, keyword))) {
    return true;
  }
  for (const hint of TYPE_HINTS) {
    if (lower.includes(`: ${hint}`) || lower.includes(`:${hint}`)) {
      return true;
    }
  }
  return false;
}
|
|
533
|
+
/**
 * Tells whether `code` looks like Rust: it matches RUST_WORD_REGEX, contains
 * `let mut`, or contains both `use ` and `::`. Matching is case-insensitive.
 *
 * @param {string} code - Source snippet.
 * @returns {boolean} Detection result.
 */
function detectRust(code) {
  const lower = code.toLowerCase();
  if (RUST_WORD_REGEX.test(lower)) {
    return true;
  }
  if (lower.includes('let mut')) {
    return true;
  }
  return lower.includes('use ') && lower.includes('::');
}
|
|
539
|
+
/**
 * Tells whether lower-cased `code` matches JS_WORD_REGEX.
 *
 * @param {string} code - Source snippet.
 * @returns {boolean} Detection result.
 */
function detectJavascript(code) {
  return JS_WORD_REGEX.test(code.toLowerCase());
}
|
|
543
|
+
/**
 * Tells whether `code` looks like Python: it matches PYTHON_WORD_REGEX, or
 * contains `print(` or `__name__`. Matching is case-insensitive.
 *
 * @param {string} code - Source snippet.
 * @returns {boolean} Detection result.
 */
function detectPython(code) {
  const lower = code.toLowerCase();
  if (PYTHON_WORD_REGEX.test(lower)) {
    return true;
  }
  return lower.includes('print(') || lower.includes('__name__');
}
|
|
549
|
+
/**
 * Tells whether `code` looks like CSS: it contains an `@media`, `@import` or
 * `@keyframes` directive (case-insensitive), or any non-blank line that is a
 * selector line or a property line.
 *
 * @param {string} code - Source snippet.
 * @returns {boolean} Detection result.
 */
function detectCss(code) {
  if (CSS_DIRECTIVE_REGEX.test(code.toLowerCase())) {
    return true;
  }
  return splitLines(code).some((line) => {
    const candidate = line.trimStart();
    if (candidate === '') {
      return false;
    }
    return isCssSelectorLine(candidate) || isCssPropertyLine(candidate);
  });
}
|
|
563
|
+
/**
 * Tells whether lower-cased `code` contains any of the HTML_TAGS openers.
 *
 * @param {string} code - Source snippet.
 * @returns {boolean} Detection result.
 */
function detectHtml(code) {
  const lower = code.toLowerCase();
  for (const tag of HTML_TAGS) {
    if (lower.includes(tag)) {
      return true;
    }
  }
  return false;
}
|
|
567
|
+
/**
 * Tells whether the first non-whitespace character of `code` is `{` or `[`.
 *
 * @param {string} code - Source snippet.
 * @returns {boolean} Detection result; `false` for blank input.
 */
function detectJson(code) {
  const firstChar = code.trimStart()[0];
  return firstChar === '{' || firstChar === '[';
}
|
|
573
|
+
/**
 * Tells whether any line of `code` looks like a YAML `key: value` pair: after
 * trimming, the first colon is not the first character and is followed by a
 * space or a tab.
 *
 * @param {string} code - Source snippet.
 * @returns {boolean} Detection result.
 */
function detectYaml(code) {
  return code.split('\n').some((line) => {
    const entry = line.trim();
    const colonAt = entry.indexOf(':');
    if (colonAt <= 0) {
      return false;
    }
    const next = entry[colonAt + 1];
    return next === ' ' || next === '\t';
  });
}
|
|
588
|
+
/**
 * Tells whether lower-cased `code` contains any SQL_KEYWORDS entry as a
 * whole word.
 *
 * @param {string} code - Source snippet.
 * @returns {boolean} Detection result.
 */
function detectSql(code) {
  const lower = code.toLowerCase();
  for (const keyword of SQL_KEYWORDS) {
    if (containsWord(lower, keyword)) {
      return true;
    }
  }
  return false;
}
|
|
592
|
+
/**
 * Tells whether `code` looks like Go: it contains the whole word `package`
 * or `func`, or the text `import "`. Matching is case-insensitive.
 *
 * @param {string} code - Source snippet.
 * @returns {boolean} Detection result.
 */
function detectGo(code) {
  const lower = code.toLowerCase();
  if (containsWord(lower, 'package')) {
    return true;
  }
  if (containsWord(lower, 'func')) {
    return true;
  }
  return lower.includes('import "');
}
|
|
598
|
+
/**
 * Tells whether `line` starts with `.` or `#` and contains `{`.
 *
 * @param {string} line - Line with leading whitespace already removed.
 * @returns {boolean} Selector-line check result.
 */
function isCssSelectorLine(line) {
  const firstChar = line[0];
  const startsLikeSelector = firstChar === '.' || firstChar === '#';
  return startsLikeSelector && line.includes('{');
}
|
|
603
|
+
/**
 * Tells whether `line` contains both `:` and `;`, in any order.
 *
 * @param {string} line - Line to test.
 * @returns {boolean} Property-line check result.
 */
function isCssPropertyLine(line) {
  return [':', ';'].every((symbol) => line.includes(symbol));
}
|
|
606
|
+
/**
 * Guesses the language of a code snippet by running CODE_DETECTORS in order.
 *
 * @param {string} code - Source snippet.
 * @returns {string | undefined} Language of the first detector that accepts
 *   the code, or `undefined` when none does.
 */
export function detectLanguageFromCode(code) {
  const match = CODE_DETECTORS.find(({ detect }) => detect(code));
  return match?.language;
}
|
|
613
|
+
/**
 * Resolves a code language from element attributes, preferring the class
 * name and falling back to the data attribute.
 *
 * @param {string} className - Raw `class` attribute value.
 * @param {string} dataLang - Raw language data-attribute value.
 * @returns {string | undefined} The resolved language, if any.
 */
export function resolveLanguageFromAttributes(className, dataLang) {
  const fromClass = extractLanguageFromClassName(className);
  if (fromClass !== undefined && fromClass !== null) {
    return fromClass;
  }
  return resolveLanguageFromDataAttribute(dataLang);
}
|
|
617
|
+
/**
 * Duck-type check for an element: passes `isRecord` and exposes a callable
 * `getAttribute`.
 *
 * @param {unknown} node - Candidate node.
 * @returns {boolean} Whether the node looks like an element.
 */
function isElement(node) {
  if (!isRecord(node)) {
    return false;
  }
  return 'getAttribute' in node && typeof node.getAttribute === 'function';
}
|
|
622
|
+
// Tags treated as noise whenever they appear (svg and canvas are added in
// isStructuralNoiseTag).
const STRUCTURAL_TAGS = new Set([
  'script', 'style', 'noscript', 'iframe',
  'form', 'button', 'input', 'select', 'textarea',
]);
// Landmark tags that are always treated as noise.
const ALWAYS_NOISE_TAGS = new Set(['nav', 'footer', 'aside']);
// ARIA roles that mark an element as noise.
const NAVIGATION_ROLES = new Set([
  'navigation', 'banner', 'complementary', 'contentinfo',
  'tree', 'menubar', 'menu',
  'dialog', 'alertdialog',
]);
// Class/id tokens that mark an element as noise.
const PROMO_TOKENS = new Set([
  'banner', 'promo', 'announcement', 'cta', 'callout',
  'advert', 'ad', 'ads', 'sponsor',
  'newsletter', 'subscribe',
  'cookie', 'consent',
  'popup', 'modal', 'overlay', 'toast',
  'share', 'social', 'related', 'recommend', 'comment',
  'breadcrumb', 'pagination', 'pager',
]);
// Class/id fragments identifying a boilerplate <header>.
const HEADER_NOISE_PATTERN = /\b(site-header|masthead|topbar|navbar|nav(?:bar)?|menu|header-nav)\b/i;
// Class-name patterns: `fixed`/`sticky`, `z-40`..`z-50`, and `isolate`.
const FIXED_PATTERN = /\b(fixed|sticky)\b/;
const HIGH_Z_PATTERN = /\bz-(?:4\d|50)\b/;
const ISOLATE_PATTERN = /\bisolate\b/;
// Matches an opening doctype/html/head/body tag.
const HTML_DOCUMENT_MARKERS = /<\s*(?:!doctype|html|head|body)\b/i;
|
|
677
|
+
// Lower-case substrings whose presence in the raw HTML suggests that noise
// elements may exist (see mayContainNoise). Every role in NAVIGATION_ROLES has
// a double- and single-quoted marker here, including `dialog` and
// `alertdialog`, so the pre-check agrees with the per-element role check.
const NOISE_MARKERS = [
  // Tags.
  '<script',
  '<style',
  '<noscript',
  '<iframe',
  '<nav',
  '<footer',
  '<aside',
  '<header',
  '<form',
  '<button',
  '<input',
  '<select',
  '<textarea',
  '<svg',
  '<canvas',
  // Hidden attributes.
  ' aria-hidden="true"',
  " aria-hidden='true'",
  ' hidden',
  // Roles.
  ' role="navigation"',
  " role='navigation'",
  ' role="banner"',
  " role='banner'",
  ' role="complementary"',
  " role='complementary'",
  ' role="contentinfo"',
  " role='contentinfo'",
  ' role="tree"',
  " role='tree'",
  ' role="menubar"',
  " role='menubar'",
  ' role="menu"',
  " role='menu'",
  ' role="dialog"',
  " role='dialog'",
  ' role="alertdialog"',
  " role='alertdialog'",
  // Promo-style class/id tokens.
  ' banner',
  ' promo',
  ' announcement',
  ' cta',
  ' callout',
  ' advert',
  ' newsletter',
  ' subscribe',
  ' cookie',
  ' consent',
  ' popup',
  ' modal',
  ' overlay',
  ' toast',
  // Positioning classes.
  ' fixed',
  ' sticky',
  ' z-50',
  ' z-4',
  ' isolate',
];
|
|
730
|
+
/**
 * Substring pre-check: tells whether lower-cased `html` contains any
 * NOISE_MARKERS entry.
 *
 * @param {string} html - Raw HTML.
 * @returns {boolean} `true` when at least one marker is found.
 */
function mayContainNoise(html) {
  const haystack = html.toLowerCase();
  for (const marker of NOISE_MARKERS) {
    if (haystack.includes(marker)) {
      return true;
    }
  }
  return false;
}
|
|
734
|
+
/**
 * Tells whether `html` contains a doctype, `<html>`, `<head>` or `<body>`
 * opener, per HTML_DOCUMENT_MARKERS.
 *
 * @param {string} html - Raw HTML.
 * @returns {boolean} Whether a document-level marker is present.
 */
function isFullDocumentHtml(html) {
  const hasDocumentMarker = HTML_DOCUMENT_MARKERS.test(html);
  return hasDocumentMarker;
}
|
|
737
|
+
/**
 * Tells whether `tagName` is `svg`, `canvas` or a STRUCTURAL_TAGS member.
 *
 * @param {string} tagName - Lower-case tag name.
 * @returns {boolean} Whether the tag is structural noise.
 */
function isStructuralNoiseTag(tagName) {
  if (tagName === 'svg' || tagName === 'canvas') {
    return true;
  }
  return STRUCTURAL_TAGS.has(tagName);
}
|
|
740
|
+
/**
 * Tells whether an element is marked hidden: it has a `hidden` attribute,
 * `aria-hidden="true"`, or an inline style with `display: none` or
 * `visibility: hidden`.
 *
 * @param {{getAttribute: Function}} element - Element to inspect.
 * @returns {boolean} Whether the element is hidden.
 */
function isElementHidden(element) {
  const inlineStyle = element.getAttribute('style') ?? '';
  if (element.getAttribute('hidden') !== null) {
    return true;
  }
  if (element.getAttribute('aria-hidden') === 'true') {
    return true;
  }
  return (
    /\bdisplay\s*:\s*none\b/i.test(inlineStyle) ||
    /\bvisibility\s*:\s*hidden\b/i.test(inlineStyle)
  );
}
|
|
747
|
+
/**
 * Tells whether `role` is one of NAVIGATION_ROLES.
 *
 * @param {string | null} role - Value of the `role` attribute.
 * @returns {boolean} `false` for `null` or an unlisted role.
 */
function hasNoiseRole(role) {
  if (role === null) {
    return false;
  }
  return NAVIGATION_ROLES.has(role);
}
|
|
750
|
+
/**
 * Break a class/id style string into lowercase alphanumeric tokens.
 *
 * The input is lowercased first; every run of characters outside `[a-z0-9]`
 * then acts as a separator, so `"Promo-Banner cta_box"` becomes
 * `["promo", "banner", "cta", "box"]`.
 *
 * @param {string} value
 * @returns {string[]} Tokens in order of appearance (empty when none exist).
 */
function tokenizeIdentifierLikeText(value) {
    const words = value.toLowerCase().match(/[a-z0-9]+/g);
    return words === null ? [] : Array.from(words);
}
|
|
758
|
+
function matchesPromoIdOrClass(className, id) {
|
|
759
|
+
const tokens = tokenizeIdentifierLikeText(`${className} ${id}`);
|
|
760
|
+
return tokens.some((token) => PROMO_TOKENS.has(token));
|
|
761
|
+
}
|
|
762
|
+
function matchesHighZIsolate(className) {
|
|
763
|
+
return HIGH_Z_PATTERN.test(className) && ISOLATE_PATTERN.test(className);
|
|
764
|
+
}
|
|
765
|
+
function matchesFixedOrHighZIsolate(className) {
|
|
766
|
+
return FIXED_PATTERN.test(className) || matchesHighZIsolate(className);
|
|
767
|
+
}
|
|
768
|
+
function readElementMetadata(element) {
|
|
769
|
+
return {
|
|
770
|
+
tagName: element.tagName.toLowerCase(),
|
|
771
|
+
className: element.getAttribute('class') ?? '',
|
|
772
|
+
id: element.getAttribute('id') ?? '',
|
|
773
|
+
role: element.getAttribute('role'),
|
|
774
|
+
isHidden: isElementHidden(element),
|
|
775
|
+
};
|
|
776
|
+
}
|
|
777
|
+
function isBoilerplateHeader({ className, id, role, }) {
|
|
778
|
+
if (hasNoiseRole(role))
|
|
779
|
+
return true;
|
|
780
|
+
const combined = `${className} ${id}`.toLowerCase();
|
|
781
|
+
return HEADER_NOISE_PATTERN.test(combined);
|
|
782
|
+
}
|
|
783
|
+
function isNoiseElement(node) {
|
|
784
|
+
const metadata = readElementMetadata(node);
|
|
785
|
+
return (isStructuralNoiseTag(metadata.tagName) ||
|
|
786
|
+
ALWAYS_NOISE_TAGS.has(metadata.tagName) ||
|
|
787
|
+
(metadata.tagName === 'header' && isBoilerplateHeader(metadata)) ||
|
|
788
|
+
metadata.isHidden ||
|
|
789
|
+
hasNoiseRole(metadata.role) ||
|
|
790
|
+
matchesFixedOrHighZIsolate(metadata.className) ||
|
|
791
|
+
matchesPromoIdOrClass(metadata.className, metadata.id));
|
|
792
|
+
}
|
|
793
|
+
function stripNoiseNodes(document) {
|
|
794
|
+
const nodes = document.querySelectorAll('*');
|
|
795
|
+
for (let index = nodes.length - 1; index >= 0; index -= 1) {
|
|
796
|
+
const node = typeof nodes.item === 'function' ? nodes.item(index) : nodes[index];
|
|
797
|
+
if (!node)
|
|
798
|
+
continue;
|
|
799
|
+
if (isElement(node) && isNoiseElement(node)) {
|
|
800
|
+
node.remove();
|
|
801
|
+
}
|
|
802
|
+
}
|
|
803
|
+
}
|
|
804
|
+
function removeNoiseFromHtml(html) {
|
|
805
|
+
const shouldParse = isFullDocumentHtml(html) || mayContainNoise(html);
|
|
806
|
+
if (!shouldParse)
|
|
807
|
+
return html;
|
|
808
|
+
try {
|
|
809
|
+
const { document } = parseHTML(html);
|
|
810
|
+
stripNoiseNodes(document);
|
|
811
|
+
const bodyInnerHtml = getBodyInnerHtml(document);
|
|
812
|
+
if (bodyInnerHtml)
|
|
813
|
+
return bodyInnerHtml;
|
|
814
|
+
const docToString = getDocumentToString(document);
|
|
815
|
+
if (docToString)
|
|
816
|
+
return docToString();
|
|
817
|
+
const documentElementOuterHtml = getDocumentElementOuterHtml(document);
|
|
818
|
+
if (documentElementOuterHtml)
|
|
819
|
+
return documentElementOuterHtml;
|
|
820
|
+
return html;
|
|
821
|
+
}
|
|
822
|
+
catch {
|
|
823
|
+
return html;
|
|
824
|
+
}
|
|
825
|
+
}
|
|
826
|
+
/**
 * Wrap text in a Markdown inline-code span.
 *
 * The delimiter is one backtick longer than the longest backtick run found
 * in `content`, so the content can never terminate the span early. When the
 * delimiter is longer than a single backtick, the content is padded with one
 * space on each side.
 *
 * @param {string} content Raw code text.
 * @returns {string} The delimited inline-code span.
 */
function buildInlineCode(content) {
    const backtickRuns = content.match(/`+/g) ?? [];
    const longestRun = backtickRuns.reduce((longest, run) => Math.max(longest, run.length), 0);
    const fence = '`'.repeat(longestRun + 1);
    const pad = longestRun > 0 ? ' ' : '';
    return `${fence}${pad}${content}${pad}${fence}`;
}
|
|
833
|
+
/**
 * Derive alt text from an image URL by extracting and humanizing the filename.
 * Used as a fallback when the image has no alt attribute.
 *
 * - Absolute `http(s)://` URLs are parsed with `URL` so only the path is used.
 * - Any other reference (relative, root-relative, protocol-relative) has its
 *   query string and fragment stripped before the last path segment is taken.
 * - `data:` and `blob:` URIs carry no filename, so they yield an empty string
 *   instead of a fragment of their payload.
 *
 * @param {string} src Value of the image `src` attribute.
 * @returns {string} Humanized filename without extension, or `''` when no
 *   usable filename can be derived.
 */
function deriveAltFromImageUrl(src) {
    if (!src)
        return '';
    // Inline/opaque payloads have no meaningful filename to humanize.
    if (/^(?:data|blob):/i.test(src))
        return '';
    try {
        // Only treat the value as absolute when it really has an http(s) scheme;
        // a relative name such as "httpd-logo.png" must not be fed to `new URL`.
        const pathname = /^https?:\/\//i.test(src)
            ? new URL(src).pathname
            : (src.split(/[?#]/)[0] ?? '');
        // Extract filename from path.
        const filename = pathname.split('/').pop() ?? '';
        if (!filename)
            return '';
        // Remove file extension (a leading dot is part of the name, not an extension).
        const dotIndex = filename.lastIndexOf('.');
        const name = dotIndex > 0 ? filename.slice(0, dotIndex) : filename;
        // Humanize: replace separators with spaces.
        return name.replace(/[_-]+/g, ' ').trim();
    }
    catch {
        // Malformed absolute URL: fall back to "no alt text".
        return '';
    }
}
|
|
860
|
+
function isCodeBlock(parent) {
|
|
861
|
+
if (!isRecord(parent))
|
|
862
|
+
return false;
|
|
863
|
+
const tagName = typeof parent.tagName === 'string' ? parent.tagName.toUpperCase() : '';
|
|
864
|
+
return ['PRE', 'WRAPPED-PRE'].includes(tagName);
|
|
865
|
+
}
|
|
866
|
+
function hasGetAttribute(value) {
|
|
867
|
+
return isRecord(value) && typeof value.getAttribute === 'function';
|
|
868
|
+
}
|
|
869
|
+
function hasCodeBlockTranslators(value) {
|
|
870
|
+
return isRecord(value) && isRecord(value.codeBlockTranslators);
|
|
871
|
+
}
|
|
872
|
+
function buildInlineCodeTranslator() {
|
|
873
|
+
return {
|
|
874
|
+
spaceIfRepeatingChar: true,
|
|
875
|
+
noEscape: true,
|
|
876
|
+
postprocess: ({ content }) => buildInlineCode(content),
|
|
877
|
+
};
|
|
878
|
+
}
|
|
879
|
+
function resolveAttributeLanguage(node) {
|
|
880
|
+
const getAttribute = hasGetAttribute(node)
|
|
881
|
+
? node.getAttribute.bind(node)
|
|
882
|
+
: undefined;
|
|
883
|
+
const className = getAttribute?.('class') ?? '';
|
|
884
|
+
const dataLanguage = getAttribute?.('data-language') ?? '';
|
|
885
|
+
return resolveLanguageFromAttributes(className, dataLanguage);
|
|
886
|
+
}
|
|
887
|
+
function resolveCodeBlockTranslators(visitor) {
|
|
888
|
+
const childTranslators = isRecord(visitor) ? visitor.instance : null;
|
|
889
|
+
return hasCodeBlockTranslators(childTranslators)
|
|
890
|
+
? childTranslators.codeBlockTranslators
|
|
891
|
+
: null;
|
|
892
|
+
}
|
|
893
|
+
function buildCodeBlockTranslator(attributeLanguage, codeBlockTranslators) {
|
|
894
|
+
return {
|
|
895
|
+
noEscape: true,
|
|
896
|
+
preserveWhitespace: true,
|
|
897
|
+
...(codeBlockTranslators
|
|
898
|
+
? { childTranslators: codeBlockTranslators }
|
|
899
|
+
: null),
|
|
900
|
+
postprocess: ({ content }) => {
|
|
901
|
+
const language = attributeLanguage ?? detectLanguageFromCode(content) ?? '';
|
|
902
|
+
return CODE_BLOCK.format(content, language);
|
|
903
|
+
},
|
|
904
|
+
};
|
|
905
|
+
}
|
|
906
|
+
function buildCodeTranslator(ctx) {
|
|
907
|
+
if (!isRecord(ctx))
|
|
908
|
+
return buildInlineCodeTranslator();
|
|
909
|
+
const { node, parent, visitor } = ctx;
|
|
910
|
+
if (!isCodeBlock(parent))
|
|
911
|
+
return buildInlineCodeTranslator();
|
|
912
|
+
const attributeLanguage = resolveAttributeLanguage(node);
|
|
913
|
+
const codeBlockTranslators = resolveCodeBlockTranslators(visitor);
|
|
914
|
+
return buildCodeBlockTranslator(attributeLanguage, codeBlockTranslators);
|
|
915
|
+
}
|
|
916
|
+
/**
 * Translator for `<img>` elements.
 *
 * Emits a Markdown image using the element's `src`. The alt text is the
 * element's own `alt` attribute when it is non-blank; otherwise it is derived
 * from the image filename via `deriveAltFromImageUrl`.
 *
 * @param {unknown} ctx Translator context; expected to expose the element as
 *   `ctx.node`. Non-record contexts produce empty content.
 * @returns {{ content: string }}
 */
function buildImageTranslator(ctx) {
    if (!isRecord(ctx))
        return { content: '' };
    const { node } = ctx;
    const getAttribute = hasGetAttribute(node)
        ? node.getAttribute.bind(node)
        : undefined;
    const src = getAttribute?.('src') ?? '';
    const existingAlt = getAttribute?.('alt') ?? '';
    // Use existing alt text if present, otherwise derive from filename.
    const alt = existingAlt.trim() || deriveAltFromImageUrl(src);
    return {
        // Previously an empty template literal: `alt` and `src` were computed
        // but never emitted, so every image vanished from the output.
        content: `![${alt}](${src})`,
    };
}
|
|
931
|
+
function createCustomTranslators() {
|
|
932
|
+
return {
|
|
933
|
+
code: (ctx) => buildCodeTranslator(ctx),
|
|
934
|
+
img: (ctx) => buildImageTranslator(ctx),
|
|
935
|
+
};
|
|
936
|
+
}
|
|
937
|
+
let markdownInstance = null;
|
|
938
|
+
function createMarkdownInstance() {
|
|
939
|
+
return new NodeHtmlMarkdown({
|
|
940
|
+
codeFence: CODE_BLOCK.fence,
|
|
941
|
+
codeBlockStyle: 'fenced',
|
|
942
|
+
emDelimiter: '_',
|
|
943
|
+
bulletMarker: '-',
|
|
944
|
+
}, createCustomTranslators());
|
|
945
|
+
}
|
|
946
|
+
function getMarkdownConverter() {
|
|
947
|
+
markdownInstance ??= createMarkdownInstance();
|
|
948
|
+
return markdownInstance;
|
|
949
|
+
}
|
|
950
|
+
function translateHtmlToMarkdown(html, url, signal) {
|
|
951
|
+
throwIfAborted(signal, url, 'markdown:begin');
|
|
952
|
+
const cleanedHtml = runTransformStage(url, 'markdown:noise', () => removeNoiseFromHtml(html));
|
|
953
|
+
throwIfAborted(signal, url, 'markdown:cleaned');
|
|
954
|
+
const content = runTransformStage(url, 'markdown:translate', () => getMarkdownConverter().translate(cleanedHtml).trim());
|
|
955
|
+
throwIfAborted(signal, url, 'markdown:translated');
|
|
956
|
+
// Post-process the markdown to clean up common conversion artifacts.
|
|
957
|
+
return cleanupMarkdownArtifacts(content);
|
|
958
|
+
}
|
|
959
|
+
function appendMetadataFooter(content, metadata, url) {
|
|
960
|
+
// Metadata is placed as a footer to avoid duplicating titles when the
|
|
961
|
+
// article content already contains an H1 heading at the top.
|
|
962
|
+
const footer = buildMetadataFooter(metadata, url);
|
|
963
|
+
return footer ? `${content}\n\n${footer}` : content;
|
|
964
|
+
}
|
|
965
|
+
export function htmlToMarkdown(html, metadata, options) {
|
|
966
|
+
const url = options?.url ?? metadata?.url ?? '';
|
|
967
|
+
if (!html)
|
|
968
|
+
return buildMetadataFooter(metadata, url);
|
|
969
|
+
try {
|
|
970
|
+
const content = translateHtmlToMarkdown(html, url, options?.signal);
|
|
971
|
+
return appendMetadataFooter(content, metadata, url);
|
|
972
|
+
}
|
|
973
|
+
catch (error) {
|
|
974
|
+
if (error instanceof FetchError) {
|
|
975
|
+
throw error;
|
|
976
|
+
}
|
|
977
|
+
return buildMetadataFooter(metadata, url);
|
|
978
|
+
}
|
|
979
|
+
}
|
|
980
|
+
/**
 * Clean up common markdown conversion artifacts:
 * - Empty headings (e.g., "## " with no text)
 * - Anchor-only links like [ ](#section-id) used for navigation
 * - Concatenated links without spacing
 * - Boilerplate phrases like "Was this page helpful?"
 * - Runs of three or more newlines (collapsed to two)
 *
 * Fenced code blocks (``` or ~~~) are left untouched: their lines are swapped
 * for placeholders while the cleanup runs and restored afterwards, so code
 * such as a bare `#` shell comment or intentional blank lines is preserved.
 * An unclosed fence is treated as running to the end of the document.
 *
 * @param {string} content Markdown produced by the HTML translator.
 * @returns {string} Cleaned, trimmed Markdown.
 */
function cleanupMarkdownArtifacts(content) {
    const cleanProse = (text) => text
        // Remove empty Markdown headings like "## " produced by placeholder nodes.
        .replace(/^#{1,6}[ \t\u00A0]*$\r?\n?/gm, '')
        // Remove anchor-only links like [\u200B](#section-id) or [ ](#anchor).
        .replace(/\[(?:\s|\u200B)*\]\(#[^)]*\)\s*/g, '')
        // Add line breaks between concatenated links: ](url)[text] -> ](url)\n\n[text]
        .replace(/\]\(([^)]+)\)\[/g, ']($1)\n\n[')
        // Remove common boilerplate phrases.
        .replace(/^Was this page helpful\??\s*$/gim, '')
        // Collapse multiple blank lines into at most two newlines.
        .replace(/\n{3,}/g, '\n\n');
    // NUL delimits the placeholders; if the input already contains it we cannot
    // shield code blocks unambiguously, so fall back to cleaning everything.
    if (content.includes('\u0000')) {
        return cleanProse(content).trim();
    }
    const protectedBlocks = [];
    const outline = [];
    let openFence = null;
    let codeLines = [];
    const stashCodeBlock = () => {
        outline.push(`\u0000${protectedBlocks.length}\u0000`);
        protectedBlocks.push(codeLines.join('\n'));
        codeLines = [];
        openFence = null;
    };
    for (const line of content.split('\n')) {
        const marker = /^ {0,3}(`{3,}|~{3,})/.exec(line)?.[1];
        if (openFence === null) {
            if (marker === undefined) {
                outline.push(line);
            }
            else {
                openFence = marker;
                codeLines = [line];
            }
            continue;
        }
        codeLines.push(line);
        // A closing fence uses the same character, is at least as long as the
        // opening fence, and has nothing else on its line.
        const closesFence = marker !== undefined &&
            marker[0] === openFence[0] &&
            marker.length >= openFence.length &&
            line.trim() === marker;
        if (closesFence) {
            stashCodeBlock();
        }
    }
    if (openFence !== null) {
        stashCodeBlock();
    }
    const cleaned = cleanProse(outline.join('\n'));
    const restored = cleaned.replace(/\u0000(\d+)\u0000/g, (_match, index) => protectedBlocks[Number(index)]);
    return restored.trim();
}
|
|
1004
|
+
/**
 * Render page metadata as a Markdown footer.
 *
 * The footer opens with a horizontal rule followed by one bold-labelled line
 * per populated field, in the order Title, Description, Author, Source,
 * Fetched. The Source line prefers `metadata.url` and falls back to
 * `fallbackUrl`; it is omitted when both are empty.
 *
 * @param {object | null | undefined} metadata Metadata block, if any.
 * @param {string} fallbackUrl URL used when `metadata.url` is missing.
 * @returns {string} The footer text, or `''` when there is no metadata.
 */
function buildMetadataFooter(metadata, fallbackUrl) {
    if (!metadata)
        return '';
    const labelledFields = [
        ['Title', metadata.title],
        ['Description', metadata.description],
        ['Author', metadata.author],
        ['Source', metadata.url || fallbackUrl],
        ['Fetched', metadata.fetchedAt],
    ];
    const rows = labelledFields
        .filter(([, fieldValue]) => Boolean(fieldValue))
        .map(([label, fieldValue]) => `**${label}:** ${fieldValue}`);
    // Horizontal rule as a clear footer separator.
    return ['---', ...rows].join('\n');
}
|
|
1024
|
+
const HEADING_PATTERN = /^#{1,6}\s/m;
|
|
1025
|
+
const LIST_PATTERN = /^(?:[-*+])\s/m;
|
|
1026
|
+
const HTML_DOCUMENT_PATTERN = /^(<!doctype|<html)/i;
|
|
1027
|
+
function containsMarkdownHeading(content) {
|
|
1028
|
+
return HEADING_PATTERN.test(content);
|
|
1029
|
+
}
|
|
1030
|
+
function containsMarkdownList(content) {
|
|
1031
|
+
return LIST_PATTERN.test(content);
|
|
1032
|
+
}
|
|
1033
|
+
/**
 * Check whether the text contains at least two non-overlapping occurrences of
 * a triple-backtick fence, i.e. something that looks like an opened and
 * closed fenced code block.
 *
 * @param {string} content
 * @returns {boolean}
 */
function containsFencedCodeBlock(content) {
    const fence = '```';
    const openingAt = content.indexOf(fence);
    return openingAt !== -1 && content.indexOf(fence, openingAt + fence.length) !== -1;
}
|
|
1039
|
+
function looksLikeMarkdown(content) {
|
|
1040
|
+
return (containsMarkdownHeading(content) ||
|
|
1041
|
+
containsMarkdownList(content) ||
|
|
1042
|
+
containsFencedCodeBlock(content));
|
|
1043
|
+
}
|
|
1044
|
+
/**
 * Pick the line terminator used to split/join the text: CRLF when the text
 * contains at least one `\r\n` sequence, otherwise LF.
 *
 * @param {string} content
 * @returns {'\r\n' | '\n'}
 */
function detectLineEnding(content) {
    if (content.indexOf('\r\n') !== -1) {
        return '\r\n';
    }
    return '\n';
}
|
|
1047
|
+
const FRONTMATTER_DELIMITER = '---';
|
|
1048
|
+
function findFrontmatterLines(content) {
|
|
1049
|
+
const lineEnding = detectLineEnding(content);
|
|
1050
|
+
const lines = content.split(lineEnding);
|
|
1051
|
+
if (lines[0] !== FRONTMATTER_DELIMITER)
|
|
1052
|
+
return null;
|
|
1053
|
+
const endIndex = lines.indexOf(FRONTMATTER_DELIMITER, 1);
|
|
1054
|
+
if (endIndex === -1)
|
|
1055
|
+
return null;
|
|
1056
|
+
return { lineEnding, lines, endIndex };
|
|
1057
|
+
}
|
|
1058
|
+
/**
 * Trim a value and, when it is wrapped in a matching pair of single or double
 * quotes, remove the quotes and trim the inner text as well.
 *
 * @param {string} value
 * @returns {string} The unquoted, trimmed text; values that are not wrapped
 *   in a matching quote pair are returned trimmed but otherwise unchanged.
 */
function stripOptionalQuotes(value) {
    const text = value.trim();
    const quote = text.charAt(0);
    const isQuoted = text.length >= 2 &&
        (quote === '"' || quote === "'") &&
        text.endsWith(quote);
    return isQuoted ? text.slice(1, -1).trim() : text;
}
|
|
1069
|
+
/**
 * Parse a single `key: value` frontmatter line.
 *
 * The line is trimmed and split at the first colon. The key is trimmed and
 * lowercased; the value is everything after the colon, left as-is.
 *
 * @param {string} line
 * @returns {{ key: string, value: string } | null} `null` for blank lines,
 *   lines without a colon, and lines that start with a colon.
 */
function parseFrontmatterEntry(line) {
    const entry = line.trim();
    const colonAt = entry.indexOf(':');
    // -1 covers blank lines and lines without a colon; 0 means an empty key.
    if (colonAt < 1) {
        return null;
    }
    return {
        key: entry.slice(0, colonAt).trim().toLowerCase(),
        value: entry.slice(colonAt + 1),
    };
}
|
|
1080
|
+
/**
 * Tell whether a frontmatter key names the document title.
 *
 * @param {string} key Key to test; compared exactly against `title`/`name`.
 * @returns {boolean}
 */
function isTitleKey(key) {
    switch (key) {
        case 'title':
        case 'name':
            return true;
        default:
            return false;
    }
}
|
|
1083
|
+
/**
 * Extract a title from the first non-blank line of the text, provided that
 * line is an ATX heading (one to six `#` characters followed by a space or
 * tab). Only that first non-blank line is considered; later headings are
 * ignored.
 *
 * @param {string} content Raw Markdown text.
 * @returns {string | undefined} The heading text, or `undefined` when the
 *   first non-blank line is not a heading (or the text is blank).
 */
function extractTitleFromHeading(content) {
    const firstLine = content
        .split(detectLineEnding(content))
        .find((line) => line.trim().length > 0)
        ?.trim();
    if (firstLine === undefined) {
        return undefined;
    }
    // Count the leading run of '#' characters.
    const hashCount = /^#*/.exec(firstLine)[0].length;
    if (hashCount === 0 || hashCount > 6) {
        return undefined;
    }
    const separator = firstLine[hashCount];
    if (separator !== ' ' && separator !== '\t') {
        return undefined;
    }
    const heading = firstLine.slice(hashCount).trim();
    return heading.length > 0 ? heading : undefined;
}
|
|
1104
|
+
function extractTitleFromRawMarkdown(content) {
|
|
1105
|
+
const frontmatter = findFrontmatterLines(content);
|
|
1106
|
+
if (!frontmatter) {
|
|
1107
|
+
return extractTitleFromHeading(content);
|
|
1108
|
+
}
|
|
1109
|
+
const { lines, endIndex } = frontmatter;
|
|
1110
|
+
const entry = lines
|
|
1111
|
+
.slice(1, endIndex)
|
|
1112
|
+
.map((line) => parseFrontmatterEntry(line))
|
|
1113
|
+
.find((parsed) => parsed !== null && isTitleKey(parsed.key));
|
|
1114
|
+
if (!entry)
|
|
1115
|
+
return undefined;
|
|
1116
|
+
const value = stripOptionalQuotes(entry.value);
|
|
1117
|
+
return value || undefined;
|
|
1118
|
+
}
|
|
1119
|
+
/**
 * Check whether one of the first 50 lines of the text starts (after leading
 * whitespace, case-insensitively) with `source:`.
 *
 * @param {string} content Markdown text.
 * @returns {boolean}
 */
function hasMarkdownSourceLine(content) {
    // Only scan a small prefix to avoid wasting time on huge docs. Passing the
    // limit to `split` keeps the split itself bounded as well, instead of
    // materializing every line of the document before looking at the first 50.
    const prefixLines = content.split(detectLineEnding(content), 50);
    return prefixLines.some((line) => line.trimStart().toLowerCase().startsWith('source:'));
}
|
|
1134
|
+
function addSourceToMarkdownMarkdownFormat(content, url) {
|
|
1135
|
+
if (hasMarkdownSourceLine(content))
|
|
1136
|
+
return content;
|
|
1137
|
+
const lineEnding = detectLineEnding(content);
|
|
1138
|
+
const lines = content.split(lineEnding);
|
|
1139
|
+
const firstNonEmptyIndex = lines.findIndex((line) => line.trim().length > 0);
|
|
1140
|
+
if (firstNonEmptyIndex !== -1) {
|
|
1141
|
+
const firstLine = lines[firstNonEmptyIndex];
|
|
1142
|
+
if (firstLine && /^#{1,6}\s+/.test(firstLine.trim())) {
|
|
1143
|
+
const insertAt = firstNonEmptyIndex + 1;
|
|
1144
|
+
const updated = [
|
|
1145
|
+
...lines.slice(0, insertAt),
|
|
1146
|
+
'',
|
|
1147
|
+
`Source: ${url}`,
|
|
1148
|
+
'',
|
|
1149
|
+
...lines.slice(insertAt),
|
|
1150
|
+
];
|
|
1151
|
+
return updated.join(lineEnding);
|
|
1152
|
+
}
|
|
1153
|
+
}
|
|
1154
|
+
return [`Source: ${url}`, '', content].join(lineEnding);
|
|
1155
|
+
}
|
|
1156
|
+
function addSourceToMarkdown(content, url) {
|
|
1157
|
+
const frontmatter = findFrontmatterLines(content);
|
|
1158
|
+
if (config.transform.metadataFormat === 'markdown' && !frontmatter) {
|
|
1159
|
+
return addSourceToMarkdownMarkdownFormat(content, url);
|
|
1160
|
+
}
|
|
1161
|
+
if (!frontmatter) {
|
|
1162
|
+
return `---\nsource: "${url}"\n---\n\n${content}`;
|
|
1163
|
+
}
|
|
1164
|
+
const { lineEnding, lines, endIndex } = frontmatter;
|
|
1165
|
+
const bodyLines = lines.slice(1, endIndex);
|
|
1166
|
+
const hasSource = bodyLines.some((line) => line.trimStart().toLowerCase().startsWith('source:'));
|
|
1167
|
+
if (hasSource)
|
|
1168
|
+
return content;
|
|
1169
|
+
const updatedLines = [
|
|
1170
|
+
lines[0],
|
|
1171
|
+
...bodyLines,
|
|
1172
|
+
`source: "${url}"`,
|
|
1173
|
+
...lines.slice(endIndex),
|
|
1174
|
+
];
|
|
1175
|
+
return updatedLines.join(lineEnding);
|
|
1176
|
+
}
|
|
1177
|
+
/**
 * Check whether already-trimmed text opens with a `---` delimiter line
 * (terminated by LF or CRLF).
 *
 * @param {string} trimmed Text with leading whitespace already removed.
 * @returns {boolean}
 */
function hasFrontmatter(trimmed) {
    const openers = ['---\n', '---\r\n'];
    return openers.some((opener) => trimmed.startsWith(opener));
}
|
|
1180
|
+
function looksLikeHtmlDocument(trimmed) {
|
|
1181
|
+
return HTML_DOCUMENT_PATTERN.test(trimmed);
|
|
1182
|
+
}
|
|
1183
|
+
/**
 * Count opening occurrences of a fixed set of common HTML tags
 * (html, head, body, div, span, script, style, meta, link), case-insensitively.
 * Closing tags are not counted because the tag name must directly follow `<`.
 *
 * @param {string} content
 * @returns {number}
 */
function countCommonHtmlTags(content) {
    const commonTagPattern = /<(html|head|body|div|span|script|style|meta|link)\b/gi;
    return [...content.matchAll(commonTagPattern)].length;
}
|
|
1188
|
+
function isRawTextContent(content) {
|
|
1189
|
+
const trimmed = content.trim();
|
|
1190
|
+
const isHtmlDocument = looksLikeHtmlDocument(trimmed);
|
|
1191
|
+
const hasMarkdownFrontmatter = hasFrontmatter(trimmed);
|
|
1192
|
+
const hasTooManyHtmlTags = countCommonHtmlTags(content) > 2;
|
|
1193
|
+
const isMarkdown = looksLikeMarkdown(content);
|
|
1194
|
+
return (!isHtmlDocument &&
|
|
1195
|
+
(hasMarkdownFrontmatter || (!hasTooManyHtmlTags && isMarkdown)));
|
|
1196
|
+
}
|
|
1197
|
+
function isLikelyHtmlContent(content) {
|
|
1198
|
+
const trimmed = content.trim();
|
|
1199
|
+
if (!trimmed)
|
|
1200
|
+
return false;
|
|
1201
|
+
if (looksLikeHtmlDocument(trimmed))
|
|
1202
|
+
return true;
|
|
1203
|
+
return countCommonHtmlTags(content) > 2;
|
|
1204
|
+
}
|
|
1205
|
+
function shouldPreserveRawContent(url, content) {
|
|
1206
|
+
if (isRawTextContentUrl(url)) {
|
|
1207
|
+
return !isLikelyHtmlContent(content);
|
|
1208
|
+
}
|
|
1209
|
+
return isRawTextContent(content);
|
|
1210
|
+
}
|
|
1211
|
+
function buildRawMarkdownPayload({ rawContent, url, includeMetadata, }) {
|
|
1212
|
+
const title = extractTitleFromRawMarkdown(rawContent);
|
|
1213
|
+
const content = includeMetadata
|
|
1214
|
+
? addSourceToMarkdown(rawContent, url)
|
|
1215
|
+
: rawContent;
|
|
1216
|
+
return { content, title };
|
|
1217
|
+
}
|
|
1218
|
+
function tryTransformRawContent({ html, url, includeMetadata, }) {
|
|
1219
|
+
if (!shouldPreserveRawContent(url, html)) {
|
|
1220
|
+
return null;
|
|
1221
|
+
}
|
|
1222
|
+
logDebug('Preserving raw markdown content', { url: url.substring(0, 80) });
|
|
1223
|
+
const { content, title } = buildRawMarkdownPayload({
|
|
1224
|
+
rawContent: html,
|
|
1225
|
+
url,
|
|
1226
|
+
includeMetadata,
|
|
1227
|
+
});
|
|
1228
|
+
return {
|
|
1229
|
+
markdown: content,
|
|
1230
|
+
title,
|
|
1231
|
+
truncated: false,
|
|
1232
|
+
};
|
|
1233
|
+
}
|
|
1234
|
+
const MIN_CONTENT_RATIO = 0.3;
|
|
1235
|
+
const MIN_HTML_LENGTH_FOR_GATE = 100;
|
|
1236
|
+
/**
 * Remove markup with a minimal two-state scanner.
 *
 * `<` switches into "inside tag" mode and `>` switches out of it; neither
 * character is ever emitted. Every other character is kept only while outside
 * a tag, so text after an unclosed `<` is dropped and a stray `>` in text
 * simply disappears.
 *
 * @param {string} html
 * @returns {string} The text found outside of tags.
 */
function stripHtmlTags(html) {
    let text = '';
    let insideTag = false;
    for (const ch of html) {
        switch (ch) {
            case '<':
                insideTag = true;
                break;
            case '>':
                insideTag = false;
                break;
            default:
                if (!insideTag) {
                    text += ch;
                }
        }
    }
    return text;
}
|
|
1254
|
+
function estimateTextLength(html) {
|
|
1255
|
+
return stripHtmlTags(html).replace(/\s+/g, ' ').trim().length;
|
|
1256
|
+
}
|
|
1257
|
+
export function isExtractionSufficient(article, originalHtml) {
|
|
1258
|
+
if (!article)
|
|
1259
|
+
return false;
|
|
1260
|
+
const articleLength = article.textContent.length;
|
|
1261
|
+
const originalLength = estimateTextLength(originalHtml);
|
|
1262
|
+
if (originalLength < MIN_HTML_LENGTH_FOR_GATE)
|
|
1263
|
+
return true;
|
|
1264
|
+
return articleLength / originalLength >= MIN_CONTENT_RATIO;
|
|
1265
|
+
}
|
|
1266
|
+
export function determineContentExtractionSource(article) {
|
|
1267
|
+
return !!article;
|
|
1268
|
+
}
|
|
1269
|
+
/**
 * Copy article-derived fields onto the metadata block in place.
 *
 * `article.title` is written to `metadata.title` and `article.byline` to
 * `metadata.author`; a field is only written when it is not `undefined`.
 *
 * @param {object} metadata Metadata block to mutate.
 * @param {{ title?: unknown, byline?: unknown }} article Extracted article.
 * @returns {void}
 */
function applyArticleMetadata(metadata, article) {
    const { title, byline } = article;
    if (title !== undefined) {
        metadata.title = title;
    }
    if (byline !== undefined) {
        metadata.author = byline;
    }
}
|
|
1275
|
+
/**
 * Copy `title`, `description` and `author` from the extracted page metadata
 * onto the metadata block in place. A field is only written when its source
 * value is not `undefined`.
 *
 * @param {object} metadata Metadata block to mutate.
 * @param {{ title?: unknown, description?: unknown, author?: unknown }} extractedMeta
 * @returns {void}
 */
function applyExtractedMetadata(metadata, extractedMeta) {
    for (const field of ['title', 'description', 'author']) {
        const extracted = extractedMeta[field];
        if (extracted !== undefined) {
            metadata[field] = extracted;
        }
    }
}
|
|
1285
|
+
export function createContentMetadataBlock(url, article, extractedMeta, shouldExtractFromArticle, includeMetadata) {
|
|
1286
|
+
if (!includeMetadata)
|
|
1287
|
+
return undefined;
|
|
1288
|
+
const now = new Date().toISOString();
|
|
1289
|
+
const metadata = {
|
|
1290
|
+
type: 'metadata',
|
|
1291
|
+
url,
|
|
1292
|
+
fetchedAt: now,
|
|
1293
|
+
};
|
|
1294
|
+
if (shouldExtractFromArticle && article) {
|
|
1295
|
+
applyArticleMetadata(metadata, article);
|
|
1296
|
+
return metadata;
|
|
1297
|
+
}
|
|
1298
|
+
applyExtractedMetadata(metadata, extractedMeta);
|
|
1299
|
+
return metadata;
|
|
1300
|
+
}
|
|
1301
|
+
function buildContentSource({ html, url, article, extractedMeta, includeMetadata, useArticleContent, }) {
|
|
1302
|
+
const metadata = createContentMetadataBlock(url, article, extractedMeta, useArticleContent, includeMetadata);
|
|
1303
|
+
return {
|
|
1304
|
+
sourceHtml: useArticleContent && article ? article.content : html,
|
|
1305
|
+
title: useArticleContent && article ? article.title : extractedMeta.title,
|
|
1306
|
+
metadata,
|
|
1307
|
+
};
|
|
1308
|
+
}
|
|
1309
|
+
function logQualityGateFallback({ url, articleLength, }) {
|
|
1310
|
+
logDebug('Quality gate: Readability extraction below threshold, using full HTML', {
|
|
1311
|
+
url: url.substring(0, 80),
|
|
1312
|
+
articleLength,
|
|
1313
|
+
});
|
|
1314
|
+
}
|
|
1315
|
+
function shouldUseArticleContent(article, html, url) {
|
|
1316
|
+
const shouldExtractFromArticle = determineContentExtractionSource(article);
|
|
1317
|
+
if (!shouldExtractFromArticle)
|
|
1318
|
+
return false;
|
|
1319
|
+
if (isExtractionSufficient(article, html)) {
|
|
1320
|
+
return true;
|
|
1321
|
+
}
|
|
1322
|
+
logQualityGateFallback({
|
|
1323
|
+
url,
|
|
1324
|
+
articleLength: article.textContent.length,
|
|
1325
|
+
});
|
|
1326
|
+
return false;
|
|
1327
|
+
}
|
|
1328
|
+
function resolveContentSource({ html, url, includeMetadata, signal, }) {
|
|
1329
|
+
const { article, metadata: extractedMeta } = extractContent(html, url, {
|
|
1330
|
+
extractArticle: true,
|
|
1331
|
+
...(signal ? { signal } : {}),
|
|
1332
|
+
});
|
|
1333
|
+
const useArticleContent = article
|
|
1334
|
+
? shouldUseArticleContent(article, html, url)
|
|
1335
|
+
: false;
|
|
1336
|
+
return buildContentSource({
|
|
1337
|
+
html,
|
|
1338
|
+
url,
|
|
1339
|
+
article,
|
|
1340
|
+
extractedMeta,
|
|
1341
|
+
includeMetadata,
|
|
1342
|
+
useArticleContent,
|
|
1343
|
+
});
|
|
1344
|
+
}
|
|
1345
|
+
function tryTransformRawStage(html, url, includeMetadata) {
|
|
1346
|
+
return runTransformStage(url, 'transform:raw', () => tryTransformRawContent({
|
|
1347
|
+
html,
|
|
1348
|
+
url,
|
|
1349
|
+
includeMetadata,
|
|
1350
|
+
}));
|
|
1351
|
+
}
|
|
1352
|
+
function resolveContentSourceStage(html, url, includeMetadata, signal) {
|
|
1353
|
+
return runTransformStage(url, 'transform:extract', () => resolveContentSource({
|
|
1354
|
+
html,
|
|
1355
|
+
url,
|
|
1356
|
+
includeMetadata,
|
|
1357
|
+
...(signal ? { signal } : {}),
|
|
1358
|
+
}));
|
|
1359
|
+
}
|
|
1360
|
+
function buildMarkdownFromContext(context, url, signal) {
|
|
1361
|
+
const content = runTransformStage(url, 'transform:markdown', () => htmlToMarkdown(context.sourceHtml, context.metadata, {
|
|
1362
|
+
url,
|
|
1363
|
+
...(signal ? { signal } : {}),
|
|
1364
|
+
}));
|
|
1365
|
+
return {
|
|
1366
|
+
markdown: content,
|
|
1367
|
+
title: context.title,
|
|
1368
|
+
truncated: false,
|
|
1369
|
+
};
|
|
1370
|
+
}
|
|
1371
|
+
function runTotalTransformStage(url, fn) {
|
|
1372
|
+
const totalStage = startTransformStage(url, 'transform:total');
|
|
1373
|
+
let success = false;
|
|
1374
|
+
try {
|
|
1375
|
+
const result = fn();
|
|
1376
|
+
success = true;
|
|
1377
|
+
return result;
|
|
1378
|
+
}
|
|
1379
|
+
finally {
|
|
1380
|
+
if (success) {
|
|
1381
|
+
endTransformStage(totalStage, { truncated: false });
|
|
1382
|
+
}
|
|
1383
|
+
}
|
|
1384
|
+
}
|
|
1385
|
+
async function runTotalTransformStageAsync(url, fn) {
|
|
1386
|
+
const totalStage = startTransformStage(url, 'transform:total');
|
|
1387
|
+
let success = false;
|
|
1388
|
+
try {
|
|
1389
|
+
const result = await fn();
|
|
1390
|
+
success = true;
|
|
1391
|
+
return result;
|
|
1392
|
+
}
|
|
1393
|
+
finally {
|
|
1394
|
+
if (success) {
|
|
1395
|
+
endTransformStage(totalStage, { truncated: false });
|
|
1396
|
+
}
|
|
1397
|
+
}
|
|
1398
|
+
}
|
|
1399
|
+
export function transformHtmlToMarkdownInProcess(html, url, options) {
|
|
1400
|
+
return runTotalTransformStage(url, () => {
|
|
1401
|
+
throwIfAborted(options.signal, url, 'transform:begin');
|
|
1402
|
+
const raw = tryTransformRawStage(html, url, options.includeMetadata);
|
|
1403
|
+
if (raw) {
|
|
1404
|
+
return raw;
|
|
1405
|
+
}
|
|
1406
|
+
const context = resolveContentSourceStage(html, url, options.includeMetadata, options.signal);
|
|
1407
|
+
return buildMarkdownFromContext(context, url, options.signal);
|
|
1408
|
+
});
|
|
1409
|
+
}
|
|
1410
|
+
const workerMessageSchema = z.discriminatedUnion('type', [
|
|
1411
|
+
z.object({
|
|
1412
|
+
type: z.literal('result'),
|
|
1413
|
+
id: z.string(),
|
|
1414
|
+
result: z.object({
|
|
1415
|
+
markdown: z.string(),
|
|
1416
|
+
title: z.string().optional(),
|
|
1417
|
+
truncated: z.boolean(),
|
|
1418
|
+
}),
|
|
1419
|
+
}),
|
|
1420
|
+
z.object({
|
|
1421
|
+
type: z.literal('error'),
|
|
1422
|
+
id: z.string(),
|
|
1423
|
+
error: z.object({
|
|
1424
|
+
name: z.string(),
|
|
1425
|
+
message: z.string(),
|
|
1426
|
+
url: z.string(),
|
|
1427
|
+
statusCode: z.number().optional(),
|
|
1428
|
+
details: z.record(z.string(), z.unknown()).optional(),
|
|
1429
|
+
}),
|
|
1430
|
+
}),
|
|
1431
|
+
]);
|
|
1432
|
+
let pool = null;
|
|
1433
|
+
/**
 * Compute the default worker count: available parallelism minus one,
 * clamped to the range 1..16.
 *
 * Uses `os.availableParallelism()` when the runtime provides it and falls
 * back to the number of entries in `os.cpus()` otherwise.
 *
 * @returns {number}
 */
function resolveDefaultWorkerCount() {
    const hasAvailableParallelism = typeof os.availableParallelism === 'function';
    const cores = hasAvailableParallelism
        ? os.availableParallelism()
        : os.cpus().length;
    const atLeastOne = Math.max(1, cores - 1);
    return Math.min(16, atLeastOne);
}
|
|
1439
|
+
const DEFAULT_TIMEOUT_MS = config.transform.timeoutMs;
|
|
1440
|
+
function getOrCreateTransformWorkerPool() {
|
|
1441
|
+
pool ??= new WorkerPool(resolveDefaultWorkerCount(), DEFAULT_TIMEOUT_MS);
|
|
1442
|
+
return pool;
|
|
1443
|
+
}
|
|
1444
|
+
export async function shutdownTransformWorkerPool() {
|
|
1445
|
+
if (!pool)
|
|
1446
|
+
return;
|
|
1447
|
+
await pool.close();
|
|
1448
|
+
pool = null;
|
|
1449
|
+
}
|
|
1450
|
+
class WorkerPool {
|
|
1451
|
+
workers = [];
|
|
1452
|
+
queue = [];
|
|
1453
|
+
inflight = new Map();
|
|
1454
|
+
timeoutMs;
|
|
1455
|
+
queueMax;
|
|
1456
|
+
closed = false;
|
|
1457
|
+
ensureOpen() {
|
|
1458
|
+
if (this.closed) {
|
|
1459
|
+
throw new Error('Transform worker pool closed');
|
|
1460
|
+
}
|
|
1461
|
+
}
|
|
1462
|
+
ensureNotAborted(signal, url, stage) {
|
|
1463
|
+
if (!signal?.aborted)
|
|
1464
|
+
return;
|
|
1465
|
+
throw new FetchError('Request was canceled', url, 499, {
|
|
1466
|
+
reason: 'aborted',
|
|
1467
|
+
stage,
|
|
1468
|
+
});
|
|
1469
|
+
}
|
|
1470
|
+
ensureQueueCapacity(url) {
|
|
1471
|
+
if (this.queue.length < this.queueMax)
|
|
1472
|
+
return;
|
|
1473
|
+
throw new FetchError('Transform worker queue is full', url, 503, {
|
|
1474
|
+
reason: 'queue_full',
|
|
1475
|
+
stage: 'transform:enqueue',
|
|
1476
|
+
});
|
|
1477
|
+
}
|
|
1478
|
+
clearAbortListener(signal, listener) {
|
|
1479
|
+
if (!signal || !listener)
|
|
1480
|
+
return;
|
|
1481
|
+
try {
|
|
1482
|
+
signal.removeEventListener('abort', listener);
|
|
1483
|
+
}
|
|
1484
|
+
catch {
|
|
1485
|
+
// ignore
|
|
1486
|
+
}
|
|
1487
|
+
}
|
|
1488
|
+
markSlotIdle(workerIndex) {
|
|
1489
|
+
const slot = this.workers[workerIndex];
|
|
1490
|
+
if (!slot)
|
|
1491
|
+
return;
|
|
1492
|
+
slot.busy = false;
|
|
1493
|
+
slot.currentTaskId = null;
|
|
1494
|
+
}
|
|
1495
|
+
takeInflight(id) {
|
|
1496
|
+
const inflight = this.inflight.get(id);
|
|
1497
|
+
if (!inflight)
|
|
1498
|
+
return null;
|
|
1499
|
+
clearTimeout(inflight.timer);
|
|
1500
|
+
this.clearAbortListener(inflight.signal, inflight.abortListener);
|
|
1501
|
+
this.inflight.delete(id);
|
|
1502
|
+
return inflight;
|
|
1503
|
+
}
|
|
1504
|
+
cancelWorkerTask(slot, id) {
|
|
1505
|
+
if (!slot)
|
|
1506
|
+
return;
|
|
1507
|
+
try {
|
|
1508
|
+
slot.worker.postMessage({ type: 'cancel', id });
|
|
1509
|
+
}
|
|
1510
|
+
catch {
|
|
1511
|
+
// ignore
|
|
1512
|
+
}
|
|
1513
|
+
}
|
|
1514
|
+
restartWorker(workerIndex, slot) {
|
|
1515
|
+
if (this.closed)
|
|
1516
|
+
return;
|
|
1517
|
+
const target = slot ?? this.workers[workerIndex];
|
|
1518
|
+
if (target) {
|
|
1519
|
+
void target.worker.terminate();
|
|
1520
|
+
}
|
|
1521
|
+
this.workers[workerIndex] = this.spawnWorker(workerIndex);
|
|
1522
|
+
this.drainQueue();
|
|
1523
|
+
}
|
|
1524
|
+
rejectIfClosed(reject) {
|
|
1525
|
+
if (!this.closed)
|
|
1526
|
+
return false;
|
|
1527
|
+
reject(new Error('Transform worker pool closed'));
|
|
1528
|
+
return true;
|
|
1529
|
+
}
|
|
1530
|
+
abortInflightTask(id, url, workerIndex) {
|
|
1531
|
+
const slot = this.workers[workerIndex];
|
|
1532
|
+
this.cancelWorkerTask(slot, id);
|
|
1533
|
+
this.failTask(id, new FetchError('Request was canceled', url, 499, {
|
|
1534
|
+
reason: 'aborted',
|
|
1535
|
+
stage: 'transform:signal-abort',
|
|
1536
|
+
}));
|
|
1537
|
+
if (slot) {
|
|
1538
|
+
this.restartWorker(workerIndex, slot);
|
|
1539
|
+
}
|
|
1540
|
+
}
|
|
1541
|
+
abortQueuedTask(id, url, reject) {
|
|
1542
|
+
const queuedIndex = this.queue.findIndex((task) => task.id === id);
|
|
1543
|
+
if (queuedIndex === -1)
|
|
1544
|
+
return;
|
|
1545
|
+
this.queue.splice(queuedIndex, 1);
|
|
1546
|
+
reject(new FetchError('Request was canceled', url, 499, {
|
|
1547
|
+
reason: 'aborted',
|
|
1548
|
+
stage: 'transform:queued-abort',
|
|
1549
|
+
}));
|
|
1550
|
+
}
|
|
1551
|
+
createWorkerSlot(worker) {
|
|
1552
|
+
return {
|
|
1553
|
+
worker,
|
|
1554
|
+
busy: false,
|
|
1555
|
+
currentTaskId: null,
|
|
1556
|
+
};
|
|
1557
|
+
}
|
|
1558
|
+
registerWorkerHandlers(workerIndex, worker) {
|
|
1559
|
+
worker.on('message', (raw) => {
|
|
1560
|
+
this.onWorkerMessage(workerIndex, raw);
|
|
1561
|
+
});
|
|
1562
|
+
worker.on('error', (error) => {
|
|
1563
|
+
this.onWorkerBroken(workerIndex, `Transform worker error: ${getErrorMessage(error)}`);
|
|
1564
|
+
});
|
|
1565
|
+
worker.on('exit', (code) => {
|
|
1566
|
+
this.onWorkerBroken(workerIndex, `Transform worker exited (code ${code})`);
|
|
1567
|
+
});
|
|
1568
|
+
}
|
|
1569
|
+
constructor(size, timeoutMs) {
|
|
1570
|
+
const safeSize = Math.max(1, size);
|
|
1571
|
+
this.timeoutMs = timeoutMs;
|
|
1572
|
+
this.queueMax = safeSize * 2;
|
|
1573
|
+
for (let index = 0; index < safeSize; index += 1) {
|
|
1574
|
+
this.workers.push(this.spawnWorker(index));
|
|
1575
|
+
}
|
|
1576
|
+
}
|
|
1577
|
+
spawnWorker(workerIndex) {
|
|
1578
|
+
const worker = new Worker(new URL('./workers/transform-worker.js', import.meta.url));
|
|
1579
|
+
// Workers must not keep the process alive by themselves.
|
|
1580
|
+
worker.unref();
|
|
1581
|
+
const slot = this.createWorkerSlot(worker);
|
|
1582
|
+
this.registerWorkerHandlers(workerIndex, worker);
|
|
1583
|
+
return slot;
|
|
1584
|
+
}
|
|
1585
|
+
onWorkerBroken(workerIndex, message) {
|
|
1586
|
+
if (this.closed)
|
|
1587
|
+
return;
|
|
1588
|
+
const slot = this.workers[workerIndex];
|
|
1589
|
+
if (!slot)
|
|
1590
|
+
return;
|
|
1591
|
+
if (slot.busy && slot.currentTaskId) {
|
|
1592
|
+
this.failTask(slot.currentTaskId, new Error(message));
|
|
1593
|
+
}
|
|
1594
|
+
this.restartWorker(workerIndex, slot);
|
|
1595
|
+
}
|
|
1596
|
+
resolveWorkerResult(inflight, result) {
|
|
1597
|
+
inflight.resolve({
|
|
1598
|
+
markdown: result.markdown,
|
|
1599
|
+
truncated: result.truncated,
|
|
1600
|
+
title: result.title,
|
|
1601
|
+
});
|
|
1602
|
+
}
|
|
1603
|
+
/**
 * Rejects the task with an error rebuilt from the worker's error payload.
 *
 * A payload named 'FetchError' becomes a real `FetchError` (message, url,
 * statusCode, details defaulting to `{}`); anything else becomes a plain
 * `Error` carrying only the message.
 *
 * @param {{ reject: Function }} inflight - Inflight record of the task.
 * @param {object} error - Error description received from the worker.
 */
rejectWorkerError(inflight, error) {
    const isFetchError = error.name === 'FetchError';
    const rebuilt = isFetchError
        ? new FetchError(error.message, error.url, error.statusCode, error.details ?? {})
        : new Error(error.message);
    inflight.reject(rebuilt);
}
|
|
1610
|
+
onWorkerMessage(workerIndex, raw) {
|
|
1611
|
+
const parsed = workerMessageSchema.safeParse(raw);
|
|
1612
|
+
if (!parsed.success)
|
|
1613
|
+
return;
|
|
1614
|
+
const message = parsed.data;
|
|
1615
|
+
const inflight = this.takeInflight(message.id);
|
|
1616
|
+
if (!inflight)
|
|
1617
|
+
return;
|
|
1618
|
+
this.markSlotIdle(workerIndex);
|
|
1619
|
+
if (message.type === 'result') {
|
|
1620
|
+
this.resolveWorkerResult(inflight, message.result);
|
|
1621
|
+
}
|
|
1622
|
+
else {
|
|
1623
|
+
this.rejectWorkerError(inflight, message.error);
|
|
1624
|
+
}
|
|
1625
|
+
this.drainQueue();
|
|
1626
|
+
}
|
|
1627
|
+
/**
 * Fails an inflight task: removes its record, rejects it with `error`, and
 * marks the worker slot it was running on as idle. Unknown ids are ignored.
 *
 * @param {string} id - Task id.
 * @param {Error} error - Rejection reason.
 */
failTask(id, error) {
    const pending = this.takeInflight(id);
    if (pending) {
        pending.reject(error);
        this.markSlotIdle(pending.workerIndex);
    }
}
|
|
1634
|
+
handleAbortSignal(id, url, reject) {
|
|
1635
|
+
if (this.rejectIfClosed(reject))
|
|
1636
|
+
return;
|
|
1637
|
+
const inflight = this.inflight.get(id);
|
|
1638
|
+
if (inflight) {
|
|
1639
|
+
this.abortInflightTask(id, url, inflight.workerIndex);
|
|
1640
|
+
return;
|
|
1641
|
+
}
|
|
1642
|
+
this.abortQueuedTask(id, url, reject);
|
|
1643
|
+
}
|
|
1644
|
+
createPendingTask(html, url, options, resolve, reject) {
|
|
1645
|
+
const id = randomUUID();
|
|
1646
|
+
let abortListener;
|
|
1647
|
+
if (options.signal) {
|
|
1648
|
+
abortListener = () => {
|
|
1649
|
+
this.handleAbortSignal(id, url, reject);
|
|
1650
|
+
};
|
|
1651
|
+
options.signal.addEventListener('abort', abortListener, { once: true });
|
|
1652
|
+
}
|
|
1653
|
+
return {
|
|
1654
|
+
id,
|
|
1655
|
+
html,
|
|
1656
|
+
url,
|
|
1657
|
+
includeMetadata: options.includeMetadata,
|
|
1658
|
+
signal: options.signal,
|
|
1659
|
+
abortListener,
|
|
1660
|
+
resolve,
|
|
1661
|
+
reject,
|
|
1662
|
+
};
|
|
1663
|
+
}
|
|
1664
|
+
async transform(html, url, options) {
|
|
1665
|
+
this.ensureOpen();
|
|
1666
|
+
this.ensureNotAborted(options.signal, url, 'transform:enqueue');
|
|
1667
|
+
this.ensureQueueCapacity(url);
|
|
1668
|
+
return new Promise((resolve, reject) => {
|
|
1669
|
+
const task = this.createPendingTask(html, url, options, resolve, reject);
|
|
1670
|
+
this.queue.push(task);
|
|
1671
|
+
this.drainQueue();
|
|
1672
|
+
});
|
|
1673
|
+
}
|
|
1674
|
+
drainQueue() {
|
|
1675
|
+
if (this.queue.length === 0)
|
|
1676
|
+
return;
|
|
1677
|
+
for (let workerIndex = 0; workerIndex < this.workers.length; workerIndex += 1) {
|
|
1678
|
+
const slot = this.workers[workerIndex];
|
|
1679
|
+
if (!slot || slot.busy)
|
|
1680
|
+
continue;
|
|
1681
|
+
const task = this.queue.shift();
|
|
1682
|
+
if (!task)
|
|
1683
|
+
return;
|
|
1684
|
+
this.dispatch(workerIndex, slot, task);
|
|
1685
|
+
if (this.queue.length === 0)
|
|
1686
|
+
return;
|
|
1687
|
+
}
|
|
1688
|
+
}
|
|
1689
|
+
dispatch(workerIndex, slot, task) {
|
|
1690
|
+
if (this.rejectIfAborted(task))
|
|
1691
|
+
return;
|
|
1692
|
+
this.markSlotBusy(slot, task);
|
|
1693
|
+
const timer = this.startTaskTimer(workerIndex, slot, task);
|
|
1694
|
+
this.registerInflightTask(task, timer, workerIndex);
|
|
1695
|
+
try {
|
|
1696
|
+
this.sendTransformMessage(slot, task);
|
|
1697
|
+
}
|
|
1698
|
+
catch (error) {
|
|
1699
|
+
this.handleDispatchFailure(workerIndex, slot, task, timer, error);
|
|
1700
|
+
}
|
|
1701
|
+
}
|
|
1702
|
+
rejectIfAborted(task) {
|
|
1703
|
+
if (!task.signal?.aborted)
|
|
1704
|
+
return false;
|
|
1705
|
+
this.clearAbortListener(task.signal, task.abortListener);
|
|
1706
|
+
task.reject(new FetchError('Request was canceled', task.url, 499, {
|
|
1707
|
+
reason: 'aborted',
|
|
1708
|
+
stage: 'transform:dispatch',
|
|
1709
|
+
}));
|
|
1710
|
+
return true;
|
|
1711
|
+
}
|
|
1712
|
+
markSlotBusy(slot, task) {
|
|
1713
|
+
slot.busy = true;
|
|
1714
|
+
slot.currentTaskId = task.id;
|
|
1715
|
+
}
|
|
1716
|
+
startTaskTimer(workerIndex, slot, task) {
|
|
1717
|
+
const timer = setTimeout(() => {
|
|
1718
|
+
this.cancelWorkerTask(slot, task.id);
|
|
1719
|
+
const inflight = this.takeInflight(task.id);
|
|
1720
|
+
if (!inflight)
|
|
1721
|
+
return;
|
|
1722
|
+
inflight.reject(new FetchError('Request timeout', task.url, 504, {
|
|
1723
|
+
reason: 'timeout',
|
|
1724
|
+
stage: 'transform:worker-timeout',
|
|
1725
|
+
}));
|
|
1726
|
+
this.restartWorker(workerIndex, slot);
|
|
1727
|
+
}, this.timeoutMs);
|
|
1728
|
+
timer.unref();
|
|
1729
|
+
return timer;
|
|
1730
|
+
}
|
|
1731
|
+
registerInflightTask(task, timer, workerIndex) {
|
|
1732
|
+
this.inflight.set(task.id, {
|
|
1733
|
+
resolve: task.resolve,
|
|
1734
|
+
reject: task.reject,
|
|
1735
|
+
timer,
|
|
1736
|
+
signal: task.signal,
|
|
1737
|
+
abortListener: task.abortListener,
|
|
1738
|
+
workerIndex,
|
|
1739
|
+
});
|
|
1740
|
+
}
|
|
1741
|
+
sendTransformMessage(slot, task) {
|
|
1742
|
+
slot.worker.postMessage({
|
|
1743
|
+
type: 'transform',
|
|
1744
|
+
id: task.id,
|
|
1745
|
+
html: task.html,
|
|
1746
|
+
url: task.url,
|
|
1747
|
+
includeMetadata: task.includeMetadata,
|
|
1748
|
+
});
|
|
1749
|
+
}
|
|
1750
|
+
/**
 * Unwinds a dispatch whose `postMessage` threw.
 *
 * Clears the timer and abort listener, drops the inflight record, frees the
 * slot, rejects the task, and restarts the worker.
 *
 * @param {number} workerIndex - Index of the slot that failed.
 * @param {object} slot - Slot that failed.
 * @param {object} task - Task that could not be dispatched.
 * @param {NodeJS.Timeout} timer - Timeout timer started for the task.
 * @param {unknown} error - Value thrown while posting; used as the rejection
 *   reason when it is an Error, otherwise replaced by a generic Error.
 */
handleDispatchFailure(workerIndex, slot, task, timer, error) {
    clearTimeout(timer);
    this.clearAbortListener(task.signal, task.abortListener);
    this.inflight.delete(task.id);
    this.markSlotIdle(workerIndex);
    let failure = error;
    if (!(failure instanceof Error)) {
        failure = new Error('Failed to dispatch transform worker message');
    }
    task.reject(failure);
    this.restartWorker(workerIndex, slot);
}
|
|
1761
|
+
async close() {
|
|
1762
|
+
if (this.closed)
|
|
1763
|
+
return;
|
|
1764
|
+
this.closed = true;
|
|
1765
|
+
const terminations = this.workers.map((slot) => slot.worker.terminate());
|
|
1766
|
+
this.workers.length = 0;
|
|
1767
|
+
for (const [id, inflight] of this.inflight.entries()) {
|
|
1768
|
+
clearTimeout(inflight.timer);
|
|
1769
|
+
this.clearAbortListener(inflight.signal, inflight.abortListener);
|
|
1770
|
+
inflight.reject(new Error('Transform worker pool closed'));
|
|
1771
|
+
this.inflight.delete(id);
|
|
1772
|
+
}
|
|
1773
|
+
for (const task of this.queue) {
|
|
1774
|
+
task.reject(new Error('Transform worker pool closed'));
|
|
1775
|
+
}
|
|
1776
|
+
this.queue.length = 0;
|
|
1777
|
+
await Promise.allSettled(terminations);
|
|
1778
|
+
}
|
|
1779
|
+
}
|
|
1780
|
+
/**
 * Narrows caller options to what the worker pool consumes.
 *
 * @param {{ includeMetadata: *, signal?: AbortSignal }} options
 * @returns {{ includeMetadata: *, signal?: AbortSignal }} Object that always has
 *   `includeMetadata` and has `signal` only when a truthy signal was given.
 */
function buildWorkerTransformOptions(options) {
    const { includeMetadata, signal } = options;
    const workerOptions = { includeMetadata };
    if (signal) {
        workerOptions.signal = signal;
    }
    return workerOptions;
}
|
|
1786
|
+
/**
 * Runs the transform on the shared worker pool.
 *
 * @param {string} html - HTML to transform.
 * @param {string} url - Source URL of the HTML.
 * @param {{ includeMetadata: *, signal?: AbortSignal }} options
 * @returns {Promise<object>} Result of the pool's `transform()` call.
 */
async function transformWithWorkerPool(html, url, options) {
    const pool = getOrCreateTransformWorkerPool();
    const workerOptions = buildWorkerTransformOptions(options);
    return pool.transform(html, url, workerOptions);
}
|
|
1790
|
+
/**
 * Decides what to do after the worker path failed.
 *
 * `FetchError`s are rethrown unchanged. Any other error falls back to the
 * in-process transform, unless the signal has been aborted in the meantime.
 *
 * @param {unknown} error - Error raised by the worker path.
 * @param {string} html - HTML to transform.
 * @param {string} url - Source URL of the HTML.
 * @param {{ signal?: AbortSignal }} options
 * @returns {*} Whatever `transformHtmlToMarkdownInProcess` returns.
 * @throws {FetchError} The original error when it is a `FetchError`.
 */
function resolveWorkerFallback(error, html, url, options) {
    const isFetchError = error instanceof FetchError;
    if (!isFetchError) {
        // Stability-first: if worker infrastructure fails, fall back to in-process.
        throwIfAborted(options.signal, url, 'transform:worker-fallback');
        return transformHtmlToMarkdownInProcess(html, url, options);
    }
    throw error;
}
|
|
1798
|
+
/**
 * Public entry point: converts HTML to markdown, preferring the worker pool.
 *
 * The work runs inside `runTotalTransformStageAsync`, with the worker attempt
 * bracketed by a 'transform:worker' stage. If the worker path throws, the
 * outcome is decided by resolveWorkerFallback().
 *
 * @param {string} html - HTML to transform.
 * @param {string} url - Source URL of the HTML.
 * @param {{ includeMetadata: *, signal?: AbortSignal }} options
 * @returns {Promise<object>} The transform result.
 */
export async function transformHtmlToMarkdown(html, url, options) {
    const runWorkerStage = async () => {
        throwIfAborted(options.signal, url, 'transform:begin');
        const stage = startTransformStage(url, 'transform:worker');
        try {
            return await transformWithWorkerPool(html, url, options);
        }
        catch (workerError) {
            return resolveWorkerFallback(workerError, html, url, options);
        }
        finally {
            endTransformStage(stage);
        }
    };
    return runTotalTransformStageAsync(url, runWorkerStage);
}
|