@j0hanz/fetch-url-mcp 1.12.7 → 1.12.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/http/auth.d.ts +2 -2
- package/dist/http/auth.d.ts.map +1 -1
- package/dist/http/auth.js +4 -5
- package/dist/http/index.d.ts +6 -0
- package/dist/http/index.d.ts.map +1 -0
- package/dist/http/index.js +5 -0
- package/dist/http/native.d.ts +73 -0
- package/dist/http/native.d.ts.map +1 -1
- package/dist/http/native.js +554 -10
- package/dist/http/rate-limit.d.ts +1 -1
- package/dist/http/rate-limit.d.ts.map +1 -1
- package/dist/http/rate-limit.js +3 -4
- package/dist/index.d.ts +17 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +67 -6
- package/dist/lib/config.js +2 -2
- package/dist/lib/core.d.ts +56 -4
- package/dist/lib/core.d.ts.map +1 -1
- package/dist/lib/core.js +155 -4
- package/dist/lib/error/classes.d.ts +19 -0
- package/dist/lib/error/classes.d.ts.map +1 -0
- package/dist/lib/error/classes.js +107 -0
- package/dist/lib/error/classify.d.ts +4 -0
- package/dist/lib/error/classify.d.ts.map +1 -0
- package/dist/lib/error/classify.js +154 -0
- package/dist/lib/error/codes.d.ts +23 -0
- package/dist/lib/error/codes.d.ts.map +1 -0
- package/dist/lib/error/codes.js +22 -0
- package/dist/lib/error/index.d.ts +6 -0
- package/dist/lib/error/index.d.ts.map +1 -0
- package/dist/lib/error/index.js +5 -0
- package/dist/lib/{error-messages.d.ts → error/messages.d.ts} +2 -2
- package/dist/lib/error/messages.d.ts.map +1 -0
- package/dist/lib/{error-messages.js → error/messages.js} +2 -2
- package/dist/lib/{tool-errors.d.ts → error/payload.d.ts} +7 -13
- package/dist/lib/error/payload.d.ts.map +1 -0
- package/dist/lib/error/payload.js +108 -0
- package/dist/lib/mcp-interop.d.ts.map +1 -1
- package/dist/lib/mcp-interop.js +4 -6
- package/dist/lib/net/http.d.ts.map +1 -0
- package/dist/lib/{http.js → net/http.js} +4 -7
- package/dist/lib/net/index.d.ts +4 -0
- package/dist/lib/net/index.d.ts.map +1 -0
- package/dist/lib/net/index.js +3 -0
- package/dist/lib/{fetch-pipeline.d.ts → net/pipeline.d.ts} +3 -3
- package/dist/lib/net/pipeline.d.ts.map +1 -0
- package/dist/lib/{fetch-pipeline.js → net/pipeline.js} +3 -5
- package/dist/lib/{url.d.ts → net/url.d.ts} +1 -1
- package/dist/lib/net/url.d.ts.map +1 -0
- package/dist/lib/{url.js → net/url.js} +3 -5
- package/dist/lib/utils.d.ts +2 -18
- package/dist/lib/utils.d.ts.map +1 -1
- package/dist/lib/utils.js +29 -104
- package/dist/resources/index.d.ts.map +1 -1
- package/dist/resources/index.js +8 -5
- package/dist/schemas.d.ts +1 -1
- package/dist/server.d.ts.map +1 -1
- package/dist/server.js +7 -9
- package/dist/tasks/index.d.ts +2 -0
- package/dist/tasks/index.d.ts.map +1 -0
- package/dist/tasks/index.js +1 -0
- package/dist/tasks/manager.d.ts +123 -1
- package/dist/tasks/manager.d.ts.map +1 -1
- package/dist/tasks/manager.js +745 -10
- package/dist/tools/{fetch-url.d.ts → index.d.ts} +4 -5
- package/dist/tools/index.d.ts.map +1 -0
- package/dist/tools/{fetch-url.js → index.js} +6 -8
- package/dist/transform/index.d.ts +279 -0
- package/dist/transform/index.d.ts.map +1 -0
- package/dist/transform/index.js +5234 -0
- package/package.json +2 -2
- package/dist/cli.d.ts +0 -19
- package/dist/cli.d.ts.map +0 -1
- package/dist/cli.js +0 -65
- package/dist/http/health.d.ts +0 -8
- package/dist/http/health.d.ts.map +0 -1
- package/dist/http/health.js +0 -152
- package/dist/http/helpers.d.ts +0 -68
- package/dist/http/helpers.d.ts.map +0 -1
- package/dist/http/helpers.js +0 -402
- package/dist/lib/error-codes.d.ts +0 -13
- package/dist/lib/error-codes.d.ts.map +0 -1
- package/dist/lib/error-codes.js +0 -12
- package/dist/lib/error-messages.d.ts.map +0 -1
- package/dist/lib/fetch-pipeline.d.ts.map +0 -1
- package/dist/lib/http.d.ts.map +0 -1
- package/dist/lib/logger-names.d.ts +0 -16
- package/dist/lib/logger-names.d.ts.map +0 -1
- package/dist/lib/logger-names.js +0 -15
- package/dist/lib/session.d.ts +0 -44
- package/dist/lib/session.d.ts.map +0 -1
- package/dist/lib/session.js +0 -137
- package/dist/lib/tool-errors.d.ts.map +0 -1
- package/dist/lib/tool-errors.js +0 -253
- package/dist/lib/url.d.ts.map +0 -1
- package/dist/lib/zod.d.ts +0 -3
- package/dist/lib/zod.d.ts.map +0 -1
- package/dist/lib/zod.js +0 -27
- package/dist/tasks/call-contract.d.ts +0 -25
- package/dist/tasks/call-contract.d.ts.map +0 -1
- package/dist/tasks/call-contract.js +0 -59
- package/dist/tasks/execution.d.ts +0 -16
- package/dist/tasks/execution.d.ts.map +0 -1
- package/dist/tasks/execution.js +0 -241
- package/dist/tasks/handlers.d.ts +0 -11
- package/dist/tasks/handlers.d.ts.map +0 -1
- package/dist/tasks/handlers.js +0 -157
- package/dist/tasks/owner.d.ts +0 -43
- package/dist/tasks/owner.d.ts.map +0 -1
- package/dist/tasks/owner.js +0 -144
- package/dist/tasks/registry.d.ts +0 -20
- package/dist/tasks/registry.d.ts.map +0 -1
- package/dist/tasks/registry.js +0 -40
- package/dist/tasks/waiters.d.ts +0 -27
- package/dist/tasks/waiters.d.ts.map +0 -1
- package/dist/tasks/waiters.js +0 -114
- package/dist/tools/fetch-url.d.ts.map +0 -1
- package/dist/transform/dom-prep.d.ts +0 -16
- package/dist/transform/dom-prep.d.ts.map +0 -1
- package/dist/transform/dom-prep.js +0 -1287
- package/dist/transform/html-translators.d.ts +0 -5
- package/dist/transform/html-translators.d.ts.map +0 -1
- package/dist/transform/html-translators.js +0 -697
- package/dist/transform/markdown-cleanup.d.ts +0 -10
- package/dist/transform/markdown-cleanup.d.ts.map +0 -1
- package/dist/transform/markdown-cleanup.js +0 -542
- package/dist/transform/metadata.d.ts +0 -18
- package/dist/transform/metadata.d.ts.map +0 -1
- package/dist/transform/metadata.js +0 -462
- package/dist/transform/next-flight.d.ts +0 -2
- package/dist/transform/next-flight.d.ts.map +0 -1
- package/dist/transform/next-flight.js +0 -374
- package/dist/transform/shared.d.ts +0 -8
- package/dist/transform/shared.d.ts.map +0 -1
- package/dist/transform/shared.js +0 -137
- package/dist/transform/transform.d.ts +0 -38
- package/dist/transform/transform.d.ts.map +0 -1
- package/dist/transform/transform.js +0 -1042
- package/dist/transform/types.d.ts +0 -124
- package/dist/transform/types.d.ts.map +0 -1
- package/dist/transform/types.js +0 -5
- package/dist/transform/worker-pool.d.ts +0 -76
- package/dist/transform/worker-pool.d.ts.map +0 -1
- package/dist/transform/worker-pool.js +0 -725
- /package/dist/lib/{http.d.ts → net/http.d.ts} +0 -0
|
@@ -1,1042 +0,0 @@
|
|
|
1
|
-
import diagnosticsChannel from 'node:diagnostics_channel';
|
|
2
|
-
import { isProbablyReaderable, Readability } from '@mozilla/readability';
|
|
3
|
-
import { parseHTML } from 'linkedom';
|
|
4
|
-
import { config, getOperationId, getRequestId, logDebug, logError, logInfo, logWarn, redactUrl, } from '../lib/core.js';
|
|
5
|
-
import { SystemErrors } from '../lib/error-codes.js';
|
|
6
|
-
import { isRawTextContentUrl } from '../lib/http.js';
|
|
7
|
-
import { Loggers } from '../lib/logger-names.js';
|
|
8
|
-
import { composeAbortSignal, FetchError, getErrorMessage, getUtf8ByteLength, isAsciiOnly, isObject, throwIfAborted, toError, trimDanglingTagFragment, truncateToUtf8Boundary, } from '../lib/utils.js';
|
|
9
|
-
import { evaluateArticleContent, extractNoscriptImages, getVisibleTextLength, normalizeTabContent, prepareDocumentForMarkdown, removeNoiseFromHtml, serializeDocumentForMarkdown, stripDocsControls, stripScreenReaderText, surfaceCodeEditorContent, } from './dom-prep.js';
|
|
10
|
-
import { extractLanguageFromClassName } from './html-translators.js';
|
|
11
|
-
import { translateHtmlFragmentToMarkdown } from './html-translators.js';
|
|
12
|
-
import { cleanupMarkdownArtifacts, finalizeMarkdownSections, processFencedContent, } from './markdown-cleanup.js';
|
|
13
|
-
import { addSourceToMarkdown, buildMetadataFooter, extractTitleFromRawMarkdown, isRawTextContent, } from './metadata.js';
|
|
14
|
-
import { extractMetadata, extractMetadataFromHead, isGithubRepositoryRootUrl, maybePrependSyntheticTitle, maybeStripGithubPrimaryHeading, mergeMetadata, normalizeDocumentTitle, shouldPreferPrimaryHeadingTitle, } from './metadata.js';
|
|
15
|
-
import { supplementMarkdownFromNextFlight } from './next-flight.js';
|
|
16
|
-
import { getOrCreateWorkerPool, getWorkerPoolStats, shutdownWorkerPool, } from './worker-pool.js';
|
|
17
|
-
/**
 * Turn fetched input into a string.
 *
 * Strings are returned unchanged. Anything else is handed to TextDecoder:
 * strictly (fatal) with the declared encoding when one other than UTF-8 is
 * given, and leniently as UTF-8 when no encoding is given, when the encoding
 * is UTF-8, or when the strict decode throws.
 *
 * @param {string | BufferSource} input - Raw content.
 * @param {string} [encoding] - Encoding label; trimmed and lower-cased before use.
 * @returns {string} The decoded text.
 */
function decodeInput(input, encoding) {
    if (typeof input === 'string') {
        return input;
    }
    const decodeAsUtf8 = () => new TextDecoder('utf-8').decode(input);
    const label = encoding?.trim().toLowerCase();
    const isUtf8Label = label === 'utf-8' || label === 'utf8';
    if (!label || isUtf8Label) {
        return decodeAsUtf8();
    }
    try {
        // fatal: true makes bad byte sequences throw so the fallback below runs.
        return new TextDecoder(label, { fatal: true }).decode(input);
    }
    catch {
        return decodeAsUtf8();
    }
}
|
|
36
|
-
/**
 * Combine the caller's abort signal with the configured transform timeout.
 *
 * @param {AbortSignal} [signal] - Caller-supplied signal, if any.
 * @returns Whatever `composeAbortSignal` returns for the signal and `config.transform.timeoutMs`.
 */
function buildTransformSignal(signal) {
    const { timeoutMs } = config.transform;
    return composeAbortSignal(signal, timeoutMs);
}
|
|
39
|
-
/**
 * Times transform stages and publishes one event per finished stage on the
 * `fetch-url-mcp.transform` diagnostics channel.
 *
 * Tracking is skipped entirely when the channel has no subscribers and no
 * budget was supplied, so the untracked path does no timing work.
 */
class StageTracker {
    channel = diagnosticsChannel.channel('fetch-url-mcp.transform');
    /**
     * Open a stage context.
     * @returns {object | null} The context, or null when tracking is skipped.
     */
    start(url, stage, budget) {
        if (this.shouldSkipTracking(budget)) {
            return null;
        }
        const timing = {
            stage,
            startTime: performance.now(),
            url: redactUrl(url),
        };
        if (!budget) {
            return timing;
        }
        const { totalBudgetMs, elapsedMs } = budget;
        return {
            ...timing,
            budgetMs: totalBudgetMs - elapsedMs,
            totalBudgetMs,
        };
    }
    /**
     * Close a stage context: warn when the stage used more than the configured
     * share of the total budget, publish the stage event, return the duration.
     * @returns {number} Duration in milliseconds (0 for a null context).
     */
    end(context, options) {
        if (!context) {
            return 0;
        }
        const durationMs = performance.now() - context.startTime;
        const requestId = getRequestId();
        const operationId = getOperationId();
        if (context.totalBudgetMs !== undefined) {
            const warnThresholdMs = context.totalBudgetMs * config.transform.stageWarnRatio;
            if (durationMs > warnThresholdMs) {
                const details = {
                    stage: context.stage,
                    durationMs: Math.round(durationMs),
                    thresholdMs: Math.round(warnThresholdMs),
                    url: context.url,
                };
                logWarn('Transform stage exceeded warning threshold', details, Loggers.LOG_TRANSFORM);
            }
        }
        const event = {
            v: 1,
            type: 'stage',
            stage: context.stage,
            durationMs,
            url: context.url,
        };
        // Optional fields are only attached when present, in this fixed order.
        if (requestId) {
            event.requestId = requestId;
        }
        if (operationId) {
            event.operationId = operationId;
        }
        if (options?.truncated !== undefined) {
            event.truncated = options.truncated;
        }
        this.publish(event);
        return durationMs;
    }
    /** Throw a 504 FetchError when the budget's elapsed time has reached its total. */
    checkBudget(url, stage, budget) {
        const exhausted = Boolean(budget) && budget.elapsedMs >= budget.totalBudgetMs;
        if (!exhausted) {
            return;
        }
        throw new FetchError('Transform budget exhausted', url, 504, {
            reason: 'timeout',
            stage: `${stage}:budget_exhausted`,
            elapsedMs: budget.elapsedMs,
            totalBudgetMs: budget.totalBudgetMs,
        });
    }
    /** Run a synchronous stage; the stage is always closed, even when `fn` throws. */
    run(url, stage, fn, budget) {
        if (this.shouldSkipTracking(budget)) {
            return fn();
        }
        this.checkBudget(url, stage, budget);
        const stageContext = this.start(url, stage, budget);
        try {
            return fn();
        }
        finally {
            this.end(stageContext);
        }
    }
    /** Asynchronous counterpart of `run`; the stage closes once `fn()` settles. */
    async runAsync(url, stage, fn, budget) {
        if (this.shouldSkipTracking(budget)) {
            return fn();
        }
        this.checkBudget(url, stage, budget);
        const stageContext = this.start(url, stage, budget);
        try {
            return await fn();
        }
        finally {
            this.end(stageContext);
        }
    }
    /** True when nobody listens on the channel and no budget needs enforcing. */
    shouldSkipTracking(budget) {
        return !(this.channel.hasSubscribers || budget);
    }
    /** Publish an event; a failing publish is logged at debug level and not rethrown. */
    publish(event) {
        if (!this.channel.hasSubscribers) {
            return;
        }
        try {
            this.channel.publish(event);
        }
        catch (error) {
            const details = {
                stage: event.stage,
                error: getErrorMessage(error),
            };
            logDebug('Diagnostic channel publish failed', details, Loggers.LOG_TRANSFORM);
        }
    }
    /** Wrap a whole synchronous transform in a `transform:total` stage. */
    runTrackedSync(url, signal, fn) {
        const totalStage = this.start(url, 'transform:total');
        try {
            throwIfAborted(signal, url, 'transform:begin');
            const outcome = fn();
            const endOptions = outcome.truncated !== undefined
                ? { truncated: outcome.truncated }
                : undefined;
            this.end(totalStage, endOptions);
            return outcome;
        }
        catch (error) {
            this.end(totalStage);
            throw error;
        }
    }
    /** Wrap a whole asynchronous transform in a `transform:total` stage. */
    async runTrackedAsync(url, signal, fn) {
        const totalStage = this.start(url, 'transform:total');
        try {
            throwIfAborted(signal, url, 'transform:begin');
            const outcome = await fn();
            const endOptions = outcome.truncated !== undefined
                ? { truncated: outcome.truncated }
                : undefined;
            this.end(totalStage, endOptions);
            return outcome;
        }
        catch (error) {
            this.end(totalStage);
            throw error;
        }
    }
}
|
|
177
|
-
// Module-wide tracker instance; the stage helpers in this file all go through it.
const stageTracker = new StageTracker();
|
|
178
|
-
/**
 * Open a transform stage on the module's shared tracker.
 *
 * @param {string} url - URL being transformed.
 * @param {string} stage - Stage name.
 * @param {{ totalBudgetMs: number, elapsedMs: number }} [budget] - Optional time budget.
 * @returns The value of `stageTracker.start(url, stage, budget)`.
 */
export function startTransformStage(url, stage, budget) {
    const context = stageTracker.start(url, stage, budget);
    return context;
}
|
|
181
|
-
/**
 * Close a transform stage on the module's shared tracker.
 *
 * @param context - Context previously returned by `startTransformStage`.
 * @param {{ truncated?: boolean }} [options] - Extra data forwarded to the tracker.
 * @returns The value of `stageTracker.end(context, options)`.
 */
export function endTransformStage(context, options) {
    const durationMs = stageTracker.end(context, options);
    return durationMs;
}
|
|
184
|
-
/**
 * Limit HTML to `config.constants.maxHtmlBytes` UTF-8 bytes.
 *
 * A non-positive limit disables truncation. Content that fits is returned
 * untouched unless the caller reports that the input was already truncated
 * upstream; in that case the result is still marked truncated and any
 * dangling tag fragment at the end is trimmed.
 *
 * Both truncation paths now run `trimDanglingTagFragment`, so a cut that
 * lands inside a tag no longer leaves a half-open tag behind when the
 * content is multi-byte.
 *
 * @param {string} html - HTML source.
 * @param {boolean} [inputTruncated=false] - True when `html` was already cut short upstream.
 * @returns {{ html: string, truncated: boolean }}
 */
function truncateHtml(html, inputTruncated = false) {
    const maxSize = config.constants.maxHtmlBytes;
    if (maxSize <= 0) {
        return { html, truncated: false };
    }
    // UTF-16 length never exceeds UTF-8 byte length, so the cheap length test
    // goes first; ASCII-only text needs no byte count at all.
    const fitsWithinLimit = html.length <= maxSize &&
        (isAsciiOnly(html) || getUtf8ByteLength(html) <= maxSize);
    if (fitsWithinLimit && !inputTruncated) {
        return { html, truncated: false };
    }
    const sliced = html.slice(0, maxSize);
    if (getUtf8ByteLength(sliced) <= maxSize) {
        return { html: trimDanglingTagFragment(sliced), truncated: true };
    }
    // Multi-byte content: cut on a UTF-8 boundary, then drop any half-open tag
    // exactly as the branch above does.
    const content = trimDanglingTagFragment(truncateToUtf8Boundary(sliced, maxSize));
    logWarn('HTML content exceeds maximum size, truncating', {
        size: getUtf8ByteLength(html),
        maxSize,
        truncatedSize: getUtf8ByteLength(content),
    }, Loggers.LOG_TRANSFORM);
    return { html: content, truncated: true };
}
|
|
207
|
-
// Normalized body-text length below which validateReaderability logs a possible-SPA warning.
const MIN_SPA_CONTENT_LENGTH = 100;
|
|
208
|
-
// Normalized body-text length from which validateReaderability consults isProbablyReaderable.
const MIN_READERABLE_TEXT_LENGTH = 400;
|
|
209
|
-
// Passed to Readability as its `maxElemsToParse` option.
const MAX_READABILITY_ELEMENTS = 20_000;
|
|
210
|
-
/**
 * Duck-type check for a document Readability can work with: an object that
 * exposes `documentElement` plus callable `querySelectorAll` and `querySelector`.
 *
 * @param {unknown} doc - Candidate document.
 * @returns {boolean}
 */
function isReadabilityCompatible(doc) {
    if (!isObject(doc)) {
        return false;
    }
    const canQueryAll = typeof doc.querySelectorAll === 'function';
    const canQueryOne = typeof doc.querySelector === 'function';
    const hasDocumentElement = 'documentElement' in doc;
    return hasDocumentElement && canQueryAll && canQueryOne;
}
|
|
218
|
-
/**
 * Length of `text` after whitespace normalization, counted only up to `max`.
 *
 * Leading and trailing whitespace is ignored and every inner whitespace run
 * counts as one character. Whitespace means any UTF-16 code unit <= 0x20.
 * Counting stops as soon as the running total reaches `max`, so long inputs
 * are not scanned to the end.
 *
 * @param {string} text - Text to measure.
 * @param {number} max - Upper bound; values <= 0 yield 0.
 * @returns {number} The normalized length, capped at the point `max` was reached.
 */
function getNormalizedTextLengthUpTo(text, max) {
    if (max <= 0) {
        return 0;
    }
    let count = 0;
    let started = false;
    let gapPending = false;
    for (let index = 0; index < text.length; index += 1) {
        if (text.charCodeAt(index) <= 0x20) {
            // A gap only matters once visible text has been seen.
            gapPending = started;
            continue;
        }
        if (gapPending) {
            // Charge the collapsed whitespace run before the character itself.
            gapPending = false;
            count += 1;
            if (count >= max) {
                return count;
            }
        }
        started = true;
        count += 1;
        if (count >= max) {
            return count;
        }
    }
    return count;
}
|
|
247
|
-
/**
 * Replace every gallery/slideshow/carousel container that holds images with
 * a flat run of `<figure>` elements, each wrapping a deep clone of one image.
 * Containers without images are left alone.
 *
 * @param {Document} doc - Document to modify in place.
 */
function preserveGalleryImages(doc) {
    const gallerySelector = '[class*="gallery"],[class*="slideshow"],[class*="carousel"]';
    for (const container of doc.querySelectorAll(gallerySelector)) {
        const pictures = container.querySelectorAll('img');
        if (pictures.length !== 0) {
            const figures = doc.createDocumentFragment();
            for (const picture of pictures) {
                const wrapper = doc.createElement('figure');
                wrapper.appendChild(picture.cloneNode(true));
                figures.appendChild(wrapper);
            }
            container.replaceWith(figures);
        }
    }
}
|
|
262
|
-
/**
 * Swap alert-like elements (`role="alert"`, `.admonition`, class containing
 * "callout") for `<blockquote>` elements carrying the same inner HTML.
 *
 * @param {Document} doc - Document to modify in place.
 */
function preserveAlertElements(doc) {
    const alertSelector = '[role="alert"], .admonition, [class*="callout"]';
    for (const alertNode of doc.querySelectorAll(alertSelector)) {
        const quote = Object.assign(doc.createElement('blockquote'), {
            innerHTML: alertNode.innerHTML,
        });
        alertNode.replaceWith(quote);
    }
}
|
|
270
|
-
/**
 * Shield headings and sibling sections from Readability's pruning.
 *
 * Pass 1: for every h1/h2, rewrite "header" (any case) to "hdr-preserved" in
 * the class and id of each ancestor below BODY/HTML.
 * Pass 2: unwrap DIV/HEADER/SECTION direct children of main/article
 * containers, except wrappers whose class mentions "mermaid".
 *
 * @param {Document} doc - Document to modify in place.
 */
function preserveHeadingLayouts(doc) {
    // Readability drops elements whose class/id matches /header/i. Docs sites
    // often wrap the title and intro in such an element (e.g. "layout__header"),
    // which would take the H1 and the whole intro with it.
    const neutralizeAttribute = (node, attribute) => {
        const current = node.getAttribute(attribute);
        if (current && /header/i.test(current)) {
            node.setAttribute(attribute, current.replace(/header/gi, 'hdr-preserved'));
        }
    };
    for (const heading of doc.querySelectorAll('h1, h2')) {
        for (let ancestor = heading.parentNode; ancestor && ancestor.tagName !== 'BODY' && ancestor.tagName !== 'HTML'; ancestor = ancestor.parentNode) {
            neutralizeAttribute(ancestor, 'class');
            neutralizeAttribute(ancestor, 'id');
        }
    }
    // Flatten structural wrappers inside main regions so Readability does not
    // score sibling sections (intro vs. reference tables) against each other
    // and keep only one of them.
    const unwrappableTags = new Set(['DIV', 'HEADER', 'SECTION']);
    for (const region of doc.querySelectorAll('main, [role="main"], article')) {
        for (const wrapper of Array.from(region.children)) {
            // nav, aside and blockquote (converted alerts) are never unwrapped.
            if (!unwrappableTags.has(wrapper.tagName)) {
                continue;
            }
            // Mermaid containers are kept intact for later handling.
            const className = wrapper.getAttribute('class') ?? '';
            if (className.includes('mermaid')) {
                continue;
            }
            const contents = doc.createDocumentFragment();
            while (wrapper.firstChild) {
                contents.appendChild(wrapper.firstChild);
            }
            wrapper.replaceWith(contents);
        }
    }
}
|
|
309
|
-
/**
 * Record the code language of `<pre>`/`<code>` elements in a `data-language`
 * attribute, derived from the class name, unless one is already set.
 *
 * @param {Document} doc - Document to modify in place.
 */
function preserveCodeLanguageAttributes(doc) {
    for (const codeNode of doc.querySelectorAll('pre, code')) {
        if (!codeNode.getAttribute('data-language')) {
            const className = codeNode.getAttribute('class') ?? '';
            const detected = extractLanguageFromClassName(className);
            if (detected) {
                codeNode.setAttribute('data-language', detected);
            }
        }
    }
}
|
|
318
|
-
/**
 * Run every pre-Readability cleanup pass over the (cloned) document, in a
 * fixed order, then remove breadcrumb and pagination elements.
 *
 * @param {Document} readabilityDoc - Document to modify in place.
 */
function prepareReadabilityDocument(readabilityDoc) {
    // Order matters: later passes see the output of earlier ones.
    const passes = [
        extractNoscriptImages,
        preserveGalleryImages,
        preserveAlertElements,
        preserveHeadingLayouts,
        preserveCodeLanguageAttributes,
        normalizeTabContent,
        surfaceCodeEditorContent,
        stripDocsControls,
        stripScreenReaderText,
    ];
    for (const pass of passes) {
        pass(readabilityDoc);
    }
    const chrome = readabilityDoc.querySelectorAll('[class*="breadcrumb"],[class*="pagination"]');
    for (const node of chrome) {
        // Never remove the document root or body, even if their class matches.
        const isRootOrBody = node.tagName === 'HTML' || node.tagName === 'BODY';
        if (!isRootOrBody) {
            node.remove();
        }
    }
}
|
|
334
|
-
/**
 * Decide whether Readability extraction should be attempted.
 *
 * Logs a warning when the page has under MIN_SPA_CONTENT_LENGTH characters of
 * normalized text. Pages with at least MIN_READERABLE_TEXT_LENGTH characters
 * must also pass `isProbablyReaderable`; shorter pages are always accepted.
 *
 * @param {Document} doc - Parsed document.
 * @param {string} url - Source URL, forwarded to the abort checks.
 * @param {AbortSignal} [signal] - Optional abort signal.
 * @returns {boolean} False only when a long-enough page fails `isProbablyReaderable`.
 */
function validateReaderability(doc, url, signal) {
    throwIfAborted(signal, url, 'extract:article:textCheck');
    const bodyText = doc.querySelector('body')?.textContent;
    const rawText = bodyText ?? doc.documentElement.textContent ?? '';
    // Only lengths up to the threshold matter, so counting stops just past it.
    const textLength = getNormalizedTextLengthUpTo(rawText, MIN_READERABLE_TEXT_LENGTH + 1);
    if (textLength < MIN_SPA_CONTENT_LENGTH) {
        const message = [
            'Very minimal server-rendered content detected (< 100 chars).',
            'This might be a client-side rendered (SPA) application.',
            'Content extraction may be incomplete.',
        ].join(' ');
        logWarn(message, { textLength }, Loggers.LOG_TRANSFORM);
    }
    throwIfAborted(signal, url, 'extract:article:readabilityCheck');
    const longEnoughToJudge = textLength >= MIN_READERABLE_TEXT_LENGTH;
    if (!longEnoughToJudge) {
        return true;
    }
    return Boolean(isProbablyReaderable(doc));
}
|
|
351
|
-
/**
 * Run Readability on a prepared deep clone of the document. The document
 * itself is used when it has no `cloneNode` method.
 *
 * @param {Document} doc - Parsed document.
 * @param {string} url - Source URL, forwarded to the abort checks.
 * @param {AbortSignal} [signal] - Optional abort signal.
 * @returns The value of `Readability#parse()`.
 */
function invokeReadability(doc, url, signal) {
    throwIfAborted(signal, url, 'extract:article:clone');
    const canClone = typeof doc.cloneNode === 'function';
    const workingDoc = canClone ? doc.cloneNode(true) : doc;
    prepareReadabilityDocument(workingDoc);
    throwIfAborted(signal, url, 'extract:article:parse');
    // Class names Readability must keep on the output.
    const preservedClasses = [
        'admonition',
        'callout',
        'custom-block',
        'alert',
        'note',
        'tip',
        'info',
        'warning',
        'danger',
        'caution',
        'important',
        'mermaid',
    ];
    const readabilityOptions = {
        charThreshold: 140,
        maxElemsToParse: MAX_READABILITY_ELEMENTS,
        classesToPreserve: preservedClasses,
    };
    return new Readability(workingDoc, readabilityOptions).parse();
}
|
|
378
|
-
/**
 * Convert a Readability parse result into the article shape used here.
 * `content` and `textContent` default to ''; `title`, `byline`, `excerpt`
 * and `siteName` are copied only when they are neither null nor undefined.
 *
 * @param {object} parsed - Result of `Readability#parse()`.
 * @returns {object} The mapped article.
 */
function mapReadabilityResult(parsed) {
    const article = {
        content: parsed.content ?? '',
        textContent: parsed.textContent ?? '',
    };
    for (const field of ['title', 'byline', 'excerpt', 'siteName']) {
        const value = parsed[field];
        if (value != null) {
            article[field] = value;
        }
    }
    return article;
}
|
|
388
|
-
// Pre-Readability cleanup on a cloned document.
|
|
389
|
-
// Must strip tabs/breadcrumbs before Readability mangles role attributes.
|
|
390
|
-
// The original document is NOT yet prepared (prepareDocumentForMarkdown
|
|
391
|
-
// runs later in buildContentSource), so this clone starts from raw HTML.
|
|
392
|
-
/**
 * Extract the main article from a document with Readability.
 *
 * Returns null when the document is not Readability-compatible, when it fails
 * the readerability check, when Readability yields nothing, or when an
 * unexpected error occurs (which is logged).
 *
 * A FetchError is rethrown instead of being logged as a Readability failure,
 * matching the handlers in `extractContentContext` and `htmlToMarkdown`.
 * NOTE(review): this assumes the `throwIfAborted` checkpoints in the callees
 * signal aborts with FetchError — confirm against ../lib/utils.js.
 *
 * @param {Document} document - Parsed document; extraction runs on a clone.
 * @param {string} url - Source URL.
 * @param {AbortSignal} [signal] - Optional abort signal.
 * @returns {object | null} The mapped article, or null.
 * @throws {FetchError} When a callee raises one.
 */
function extractArticle(document, url, signal) {
    if (!isReadabilityCompatible(document)) {
        logWarn('Document not compatible with Readability', undefined, Loggers.LOG_TRANSFORM);
        return null;
    }
    try {
        if (!validateReaderability(document, url, signal)) {
            return null;
        }
        const parsed = invokeReadability(document, url, signal);
        return parsed ? mapReadabilityResult(parsed) : null;
    }
    catch (error) {
        // Not a Readability failure: let the caller see it unchanged.
        if (error instanceof FetchError) {
            throw error;
        }
        logError('Failed to extract article with Readability', error instanceof Error ? error : undefined, Loggers.LOG_TRANSFORM);
        return null;
    }
}
|
|
411
|
-
/**
 * Check that both the HTML and the URL are non-empty strings, logging a
 * warning for the first one that is not.
 *
 * @param {unknown} html - Candidate HTML.
 * @param {unknown} url - Candidate URL.
 * @returns {boolean} True when both inputs are usable.
 */
function isValidInput(html, url) {
    const isNonEmptyString = (value) => typeof value === 'string' && value.length !== 0;
    if (!isNonEmptyString(html)) {
        logWarn('extractContent called with invalid HTML input', undefined, Loggers.LOG_TRANSFORM);
        return false;
    }
    if (!isNonEmptyString(url)) {
        logWarn('extractContent called with invalid URL', undefined, Loggers.LOG_TRANSFORM);
        return false;
    }
    return true;
}
|
|
422
|
-
/**
 * Define `baseURI` on the document as an own, writable data property set to
 * the page URL. A failure is not fatal: it is logged at info level with the
 * URL cut to 100 characters.
 *
 * @param {object} document - Parsed document.
 * @param {string} url - URL to expose as `baseURI`.
 */
function applyBaseUri(document, url) {
    const descriptor = { value: url, writable: true };
    try {
        Object.defineProperty(document, 'baseURI', descriptor);
    }
    catch (error) {
        const details = {
            url: url.substring(0, 100),
            error: getErrorMessage(error),
        };
        logInfo('Failed to set baseURI (non-critical)', details, Loggers.LOG_TRANSFORM);
    }
}
|
|
433
|
-
/**
 * Build the fallback extraction context: no article, empty metadata and a
 * document parsed from a bare `<html></html>` string.
 *
 * @returns {{ article: null, metadata: object, document: Document }}
 */
function createEmptyExtractionContext() {
    const parsed = parseHTML('<html></html>');
    return { article: null, metadata: {}, document: parsed.document };
}
|
|
437
|
-
/**
 * Read metadata from the raw HTML head, but only when the HTML exceeds
 * `config.constants.maxHtmlBytes`. Returns null when the limit is
 * non-positive or the HTML fits.
 *
 * @param {string} html - Full HTML source.
 * @param {string} url - Source URL.
 * @returns Head metadata from `extractMetadataFromHead`, or null.
 */
function extractEarlyMetadataIfNeeded(html, url) {
    const maxSize = config.constants.maxHtmlBytes;
    if (maxSize <= 0) {
        return null;
    }
    // Cheap UTF-16 length test first; byte counting only for non-ASCII text.
    const withinLimit = html.length <= maxSize &&
        (isAsciiOnly(html) || getUtf8ByteLength(html) <= maxSize);
    return withinLimit
        ? null
        : stageTracker.run(url, 'extract:early-metadata', () => extractMetadataFromHead(html, url));
}
|
|
447
|
-
/**
 * Apply the HTML size limit, then parse the result inside the
 * `extract:parse` stage.
 *
 * @param {string} html - HTML source.
 * @param {string} url - Source URL (used for stage tracking).
 * @param {boolean} [inputTruncated] - Forwarded to `truncateHtml`.
 * @returns {{ document: Document, truncated: boolean }}
 */
function parseExtractionDocument(html, url, inputTruncated) {
    const limited = truncateHtml(html, inputTruncated);
    const parsed = stageTracker.run(url, 'extract:parse', () => parseHTML(limited.html));
    return { document: parsed.document, truncated: limited.truncated };
}
|
|
452
|
-
/**
 * Collect metadata from the raw HTML head (only for oversized HTML) and from
 * the parsed document, then merge the two with `mergeMetadata`.
 *
 * @param {string} html - Full HTML source.
 * @param {string} url - Source URL.
 * @param {Document} document - Parsed document.
 * @returns The result of `mergeMetadata(early, late)`.
 */
function extractMergedMetadata(html, url, document) {
    const fromHead = extractEarlyMetadataIfNeeded(html, url);
    const readDocumentMetadata = () => extractMetadata(document, url);
    const fromDocument = stageTracker.run(url, 'extract:metadata', readDocumentMetadata);
    return mergeMetadata(fromHead, fromDocument);
}
|
|
457
|
-
/**
 * Run article extraction inside the `extract:article` stage when
 * `options.extractArticle` is truthy; otherwise return null.
 *
 * @param {Document} document - Parsed document.
 * @param {string} url - Source URL.
 * @param {{ extractArticle?: boolean, signal?: AbortSignal }} options - Extraction options.
 * @returns The extracted article, or null.
 */
function extractArticleIfRequested(document, url, options) {
    return options.extractArticle
        ? stageTracker.run(url, 'extract:article', () => extractArticle(document, url, options.signal))
        : null;
}
|
|
462
|
-
/**
 * Full extraction pipeline: validate input, parse (with size limit), set the
 * base URI, gather metadata, optionally extract the article. The abort signal
 * is checked between steps.
 *
 * Invalid input and unexpected errors yield an empty context. FetchErrors are
 * rethrown, and an abort detected while handling an error is raised by
 * `throwIfAborted` before anything is logged.
 *
 * @param {string} html - HTML source.
 * @param {string} url - Source URL.
 * @param {{ extractArticle?: boolean, signal?: AbortSignal, inputTruncated?: boolean }} options
 * @returns {{ article: object | null, metadata: object, document: Document, truncated?: true }}
 */
function extractContentContext(html, url, options) {
    if (!isValidInput(html, url)) {
        return createEmptyExtractionContext();
    }
    try {
        throwIfAborted(options.signal, url, 'extract:begin');
        const parsed = parseExtractionDocument(html, url, options.inputTruncated);
        throwIfAborted(options.signal, url, 'extract:parsed');
        applyBaseUri(parsed.document, url);
        const metadata = extractMergedMetadata(html, url, parsed.document);
        throwIfAborted(options.signal, url, 'extract:metadata');
        const article = extractArticleIfRequested(parsed.document, url, options);
        throwIfAborted(options.signal, url, 'extract:article');
        const context = { article, metadata, document: parsed.document };
        // `truncated` is only present when truncation actually happened.
        if (parsed.truncated) {
            context.truncated = true;
        }
        return context;
    }
    catch (error) {
        if (error instanceof FetchError) {
            throw error;
        }
        throwIfAborted(options.signal, url, 'extract:error');
        const cause = error instanceof Error ? error : undefined;
        logError('Failed to extract content', cause, Loggers.LOG_TRANSFORM);
        return createEmptyExtractionContext();
    }
}
|
|
490
|
-
/**
 * Public entry point: extract the article and metadata from HTML.
 *
 * @param {string} html - HTML source.
 * @param {string} url - Source URL.
 * @param {{ extractArticle?: boolean, signal?: AbortSignal, inputTruncated?: boolean }} [options]
 *   Defaults to `{ extractArticle: true }`.
 * @returns {{ article: object | null, metadata: object }}
 */
export function extractContent(html, url, options = {
    extractArticle: true,
}) {
    const { article, metadata } = extractContentContext(html, url, options);
    return { article, metadata };
}
|
|
496
|
-
function resolveRelativeHref(href, baseUrl, origin) {
|
|
497
|
-
const trimmedHref = href.trim();
|
|
498
|
-
if (!trimmedHref || /[\t\n\f\r ]/.test(trimmedHref))
|
|
499
|
-
return href;
|
|
500
|
-
if (isAbsoluteOrSpecialUrl(trimmedHref))
|
|
501
|
-
return trimmedHref;
|
|
502
|
-
const resolved = URL.parse(trimmedHref, baseUrl);
|
|
503
|
-
if (resolved)
|
|
504
|
-
return resolved.href;
|
|
505
|
-
if (trimmedHref.startsWith('/'))
|
|
506
|
-
return `${origin}${trimmedHref}`;
|
|
507
|
-
return trimmedHref;
|
|
508
|
-
}
|
|
509
|
-
/**
 * Find the `)` that closes a parenthesis opened just before `start`,
 * skipping over nested balanced pairs.
 *
 * @param {string} text - Text to scan.
 * @param {number} start - Index of the first character after the opening `(`.
 * @returns {number} Index of the matching `)`, or -1 when there is none.
 */
function findBalancedCloseParen(text, start) {
    // One parenthesis is already open when scanning begins.
    let openCount = 1;
    let index = start;
    while (index < text.length) {
        switch (text[index]) {
            case '(':
                openCount += 1;
                break;
            case ')':
                openCount -= 1;
                if (openCount === 0) {
                    return index;
                }
                break;
            default:
                break;
        }
        index += 1;
    }
    return -1;
}
|
|
524
|
-
/**
 * Locate the next inline Markdown link or image (`[text](href)` or
 * `![alt](src)`) at or after `start`.
 *
 * Each `[` is paired with the first `]` after it. A pair not directly
 * followed by `(` is skipped and the search resumes after that `]`. The
 * search gives up (returns null) when a `[` has no `]`, or when the
 * parenthesis of a candidate link is never closed.
 *
 * @param {string} markdown - Markdown text.
 * @param {number} start - Index to start searching from.
 * @returns {{ prefixStart: number, closeParen: number, prefix: string, href: string } | null}
 *   `prefix` is the bracket part (including a leading `!` for images) and
 *   `href` is the raw text between the parentheses.
 */
function findInlineLink(markdown, start) {
    let open = markdown.indexOf('[', start);
    while (open !== -1) {
        const close = markdown.indexOf(']', open + 1);
        if (close === -1) {
            return null;
        }
        if (markdown[close + 1] === '(') {
            const closeParen = findBalancedCloseParen(markdown, close + 2);
            if (closeParen === -1) {
                return null;
            }
            // An immediately preceding "!" makes this an image; keep it in the prefix.
            const hasBang = open > 0 && markdown[open - 1] === '!';
            const prefixStart = hasBang ? open - 1 : open;
            return {
                prefixStart,
                closeParen,
                prefix: markdown.slice(prefixStart, close + 1),
                href: markdown.slice(close + 2, closeParen),
            };
        }
        open = markdown.indexOf('[', close + 1);
    }
    return null;
}
|
|
548
|
-
/**
 * Tell whether a link target needs no resolution: a `#fragment` reference or
 * anything `URL.canParse` accepts on its own (i.e. it carries a scheme).
 * Blank input is not considered absolute.
 *
 * @param {string} href - Link target; surrounding whitespace is ignored.
 * @returns {boolean}
 */
function isAbsoluteOrSpecialUrl(href) {
    const candidate = href.trim();
    if (candidate === '') {
        return false;
    }
    return candidate.startsWith('#') || URL.canParse(candidate);
}
|
|
556
|
-
/**
 * Rewrite every inline link/image target in a Markdown segment through
 * `resolveRelativeHref`, leaving all other text untouched.
 *
 * @param {string} markdown - Markdown segment.
 * @param {string} baseUrl - Absolute URL of the page.
 * @param {string} origin - Origin of `baseUrl`.
 * @returns {string} The segment with resolved link targets.
 */
function resolveRelativeUrlsInSegment(markdown, baseUrl, origin) {
    let output = '';
    let position = 0;
    while (position < markdown.length) {
        const link = findInlineLink(markdown, position);
        if (!link) {
            // No further links: copy the remainder verbatim.
            output += markdown.slice(position);
            break;
        }
        const target = resolveRelativeHref(link.href, baseUrl, origin);
        output += markdown.slice(position, link.prefixStart);
        output += `${link.prefix}(${target})`;
        position = link.closeParen + 1;
    }
    return output;
}
|
|
571
|
-
/**
 * Resolve relative link targets throughout a Markdown document. The text is
 * passed through `processFencedContent`, and the abort signal is checked for
 * every segment handed to the callback.
 *
 * The Markdown is returned unchanged when the base URL cannot be parsed or
 * the Markdown is empty.
 *
 * @param {string} markdown - Markdown document.
 * @param {string} baseUrl - Absolute URL of the page.
 * @param {AbortSignal} [signal] - Optional abort signal.
 * @returns {string}
 */
function resolveRelativeUrls(markdown, baseUrl, signal) {
    const base = URL.parse(baseUrl);
    if (!base || !markdown) {
        return markdown;
    }
    const rewriteSegment = (segment) => {
        throwIfAborted(signal, baseUrl, 'markdown:resolve-urls');
        return resolveRelativeUrlsInSegment(segment, baseUrl, base.origin);
    };
    return processFencedContent(markdown, rewriteSegment);
}
|
|
583
|
-
/**
 * Convert HTML to cleaned Markdown: optional noise removal, translation,
 * artifact cleanup, then relative-URL resolution when a URL is known. The
 * abort signal is checked between steps.
 *
 * @param {{ html: string, url: string, signal?: AbortSignal, document?: Document, skipNoiseRemoval?: boolean }} params
 * @returns {string} The Markdown.
 */
function translateHtmlToMarkdown(params) {
    const { html, url, signal, document, skipNoiseRemoval } = params;
    throwIfAborted(signal, url, 'markdown:begin');
    let sourceHtml = html;
    if (!skipNoiseRemoval) {
        sourceHtml = stageTracker.run(url, 'markdown:noise', () => removeNoiseFromHtml(html, document, url, signal));
    }
    throwIfAborted(signal, url, 'markdown:cleaned');
    const rawMarkdown = stageTracker.run(url, 'markdown:translate', () => translateHtmlFragmentToMarkdown(sourceHtml));
    throwIfAborted(signal, url, 'markdown:translated');
    // The `signal` key is only present when a signal was supplied.
    const cleanupOptions = {
        preserveEmptyHeadings: true,
        ...(signal ? { signal } : {}),
        url,
    };
    const cleaned = cleanupMarkdownArtifacts(rawMarkdown, cleanupOptions);
    if (!url) {
        return cleaned;
    }
    return resolveRelativeUrls(cleaned, url, signal);
}
|
|
597
|
-
/**
 * Appends the metadata footer built by `buildMetadataFooter` to `content`.
 *
 * @param {string} content - Markdown body.
 * @param {object} metadata - Forwarded to `buildMetadataFooter`.
 * @param {string} url - Forwarded to `buildMetadataFooter`.
 * @returns {string} `content` unchanged when there is no footer; a fixed
 *   "no readable content" note followed by the footer when `content` is blank;
 *   otherwise `content`, a blank line, and the footer.
 */
function appendMetadataFooter(content, metadata, url) {
    const footer = buildMetadataFooter(metadata, url);
    const hasBody = content.trim().length > 0;
    if (!footer) {
        return content;
    }
    if (hasBody) {
        return `${content}\n\n${footer}`;
    }
    const emptyPageNote = '> **Note:** This page contains no readable content. It may require JavaScript to render.\n\n';
    return `${emptyPageNote}${footer}`;
}
|
|
605
|
-
/**
 * Converts HTML to markdown and appends the metadata footer.
 *
 * The URL is taken from `options.url`, then `metadata.url`, then `''`.
 * Falsy `html` yields just the metadata footer.
 *
 * @param {string} html - HTML to convert.
 * @param {object} [metadata] - Metadata used for the footer.
 * @param {object} [options] - Optional `url`, `signal`, `document`, `skipNoiseRemoval`.
 * @returns {string} Markdown with the footer appended.
 * @throws {FetchError} Re-thrown as-is when conversion raises a `FetchError`;
 *   any other failure is logged and replaced by a 500 `FetchError` with
 *   reason `markdown_convert_failed`.
 */
export function htmlToMarkdown(html, metadata, options) {
    const url = options?.url ?? metadata?.url ?? '';
    if (!html) {
        return buildMetadataFooter(metadata, url);
    }
    try {
        const { signal, document, skipNoiseRemoval } = options ?? {};
        const body = translateHtmlToMarkdown({
            html,
            url,
            signal,
            document,
            skipNoiseRemoval,
        });
        return appendMetadataFooter(body, metadata, url);
    }
    catch (error) {
        if (!(error instanceof FetchError)) {
            const loggable = error instanceof Error ? error : undefined;
            logError('Failed to convert HTML to markdown', loggable, Loggers.LOG_TRANSFORM);
            throw new FetchError('Failed to convert HTML to markdown', url, 500, {
                reason: 'markdown_convert_failed',
            });
        }
        throw error;
    }
}
|
|
629
|
-
/** Matches content that opens with a doctype, `<html>`, `<head>` or `<body>` tag. */
const HTML_DOCUMENT_START = /^\s*<(?:!doctype|html|head|body)\b/i;
/** Matches any of a fixed set of structural HTML tags anywhere in the content. */
const STRUCTURAL_HTML_TAGS = /<(?:html|head|body|div|p|span|section|article|main|nav|footer|header)\b/i;
/**
 * Decides whether fetched content should be kept as raw text instead of being
 * converted from HTML.
 *
 * @param {string} url - Checked with `isRawTextContentUrl`.
 * @param {string} content - The fetched body.
 * @returns {boolean} For raw-text URLs: `true` unless the body starts like an
 *   HTML document. Otherwise: `true` only when `isRawTextContent` accepts the
 *   body and it contains no structural HTML tag.
 */
function shouldPreserveRawContent(url, content) {
    if (isRawTextContentUrl(url)) {
        const startsLikeHtmlDocument = HTML_DOCUMENT_START.test(content.trim());
        return !startsLikeHtmlDocument;
    }
    const isRawText = isRawTextContent(content);
    return isRawText ? !STRUCTURAL_HTML_TAGS.test(content) : false;
}
|
|
639
|
-
/**
 * Builds the markdown payload for content that is kept as raw text.
 *
 * @param {object} params
 * @param {string} params.rawContent - The raw markdown text.
 * @param {string} params.url - Source URL; a falsy value skips link resolution.
 * @param {boolean} [params.includeMetadataFooter] - When truthy, the content is
 *   passed through `addSourceToMarkdown` first.
 * @returns {{content: string, title: *}} The content and the title extracted
 *   from the original raw text.
 */
function buildRawMarkdownPayload(params) {
    const { rawContent, url, includeMetadataFooter } = params;
    const title = extractTitleFromRawMarkdown(rawContent);
    const withSource = includeMetadataFooter
        ? addSourceToMarkdown(rawContent, url)
        : rawContent;
    const content = url ? resolveRelativeUrls(withSource, url) : withSource;
    return { content, title };
}
|
|
649
|
-
/**
 * Returns a transform result for content that should be preserved as raw
 * markdown, or `null` when the content must go through HTML conversion.
 *
 * @param {object} params
 * @param {string} params.html - The fetched body.
 * @param {string} params.url - Source URL.
 * @param {boolean} [params.includeMetadataFooter] - Forwarded to `buildRawMarkdownPayload`.
 * @param {boolean} [params.inputTruncated] - Reported as `truncated` (defaults to `false`).
 * @returns {{markdown: string, title: *, truncated: boolean} | null}
 */
function tryTransformRawContent(params) {
    if (!shouldPreserveRawContent(params.url, params.html)) {
        return null;
    }
    // Redact before logging, as the other log sites in this module do, so
    // credentials or tokens embedded in the URL never reach the logs.
    // NOTE(review): assumes `redactUrl` yields a string-like value - confirm.
    logDebug('Preserving raw markdown content', {
        url: String(redactUrl(params.url)).substring(0, 80),
    }, Loggers.LOG_TRANSFORM);
    const { content, title } = buildRawMarkdownPayload({
        rawContent: params.html,
        url: params.url,
        includeMetadataFooter: params.includeMetadataFooter,
    });
    return {
        markdown: content,
        title,
        truncated: params.inputTruncated ?? false,
    };
}
|
|
666
|
-
/** Minimum article-text / visible-text ratio for an extraction to be accepted. */
const MIN_CONTENT_RATIO = 0.15;
/** Below this visible-text length the ratio check is skipped. */
const MIN_HTML_LENGTH_FOR_GATE = 100;
/**
 * Judges whether an extracted article captured enough of the original page.
 *
 * @param {{textContent: string} | null | undefined} article - Extracted article.
 * @param {*} originalHtmlOrDocument - Passed to `getVisibleTextLength`.
 * @returns {boolean} `false` without an article; `true` when the original's
 *   visible text is shorter than `MIN_HTML_LENGTH_FOR_GATE`; otherwise whether
 *   the length ratio reaches `MIN_CONTENT_RATIO`.
 */
export function isExtractionSufficient(article, originalHtmlOrDocument) {
    if (!article) {
        return false;
    }
    const extractedLength = article.textContent.length;
    const visibleLength = getVisibleTextLength(originalHtmlOrDocument);
    const isTooShortToJudge = visibleLength < MIN_HTML_LENGTH_FOR_GATE;
    return isTooShortToJudge || extractedLength / visibleLength >= MIN_CONTENT_RATIO;
}
|
|
677
|
-
/** Trimmed `innerHTML` length a content-root candidate must exceed (see `findContentRoot`). */
const MIN_CONTENT_ROOT_LENGTH = 100;
/** Upper bound on the leading characters sampled by `hasBinaryIndicators`. */
const BINARY_SAMPLE_SIZE = 2000;
|
|
679
|
-
/**
 * Reports whether an extracted article is available to use as the content source.
 *
 * Both `null` and `undefined` count as "no article", matching the falsy
 * checks used by the other article consumers in this module.
 *
 * @param {object | null | undefined} article - Extraction result, if any.
 * @returns {boolean} `true` when an article object was provided.
 */
export function determineContentExtractionSource(article) {
    return article !== null && article !== undefined;
}
|
|
682
|
-
/**
 * Builds the metadata block attached to transformed content.
 *
 * @param {string} url - Source URL recorded in the block.
 * @param {object | null} article - Extracted article (`title`, `byline`), if any.
 * @param {object} extractedMeta - Page metadata (`title`, `description`, `author`).
 * @param {boolean} shouldExtractFromArticle - Prefer article fields when an article exists.
 * @param {boolean} includeMetadataFooter - When falsy, no block is produced.
 * @returns {object | undefined} A block with `type`, `url`, `fetchedAt` and
 *   whichever optional fields are defined on the chosen source.
 */
export function createContentMetadataBlock(url, article, extractedMeta, shouldExtractFromArticle, includeMetadataFooter) {
    if (!includeMetadataFooter) {
        return undefined;
    }
    const block = {
        type: 'metadata',
        url,
        fetchedAt: new Date().toISOString(),
    };
    if (!shouldExtractFromArticle || !article) {
        // Page-level metadata: copy only the fields that are defined.
        for (const field of ['title', 'description', 'author']) {
            if (extractedMeta[field] !== undefined) {
                block[field] = extractedMeta[field];
            }
        }
        return block;
    }
    if (article.title !== undefined) {
        block.title = normalizeDocumentTitle(article.title, url);
    }
    if (article.byline !== undefined) {
        block.author = article.byline;
    }
    return block;
}
|
|
707
|
-
/**
 * Prepares a document for markdown conversion and records its primary heading.
 *
 * The heading is looked up both before and after `prepareDocumentForMarkdown`
 * runs; the post-preparation value wins, with the earlier one as fallback.
 *
 * @param {object} document - Document passed to `prepareDocumentForMarkdown`.
 * @param {string} url - Source URL.
 * @param {AbortSignal} [signal] - Optional abort signal.
 * @returns {{document: object, primaryHeading: *}}
 */
function prepareContentSourceDocument(document, url, signal) {
    const headingBeforePrepare = TransformHeuristics.findPrimaryHeading(document);
    prepareDocumentForMarkdown(document, url, signal);
    const headingAfterPrepare = TransformHeuristics.findPrimaryHeading(document);
    return {
        document,
        primaryHeading: headingAfterPrepare ?? headingBeforePrepare,
    };
}
|
|
715
|
-
/**
 * Chooses the title for the content.
 *
 * @param {object} params
 * @param {boolean} [params.preferPrimaryHeading] - Whether the primary heading may be used.
 * @param {string} [params.primaryHeading] - Heading candidate.
 * @param {string} [params.title] - Fallback title.
 * @returns {{title: string | undefined}} The primary heading when preferred and
 *   not nullish, otherwise `params.title`.
 */
function resolveContentTitle(params) {
    const headingCandidate = params.preferPrimaryHeading
        ? params.primaryHeading
        : undefined;
    return { title: headingCandidate ?? params.title };
}
|
|
722
|
-
/**
 * Resolves the title for a content source.
 *
 * The primary heading is preferred when the URL is a GitHub repository root or
 * when `shouldPreferPrimaryHeadingTitle` favours it over the candidate title.
 *
 * @param {{primaryHeading?: string}} base - Shared source fields.
 * @param {string | undefined} candidateTitle - Title from the article or page metadata.
 * @param {string} url - Source URL.
 * @returns {{title: string | undefined}} Result of `resolveContentTitle`.
 */
function resolveSourceTitle(base, candidateTitle, url) {
    const { primaryHeading } = base;
    const preferPrimaryHeading = TransformHeuristics.isGithubRepositoryRootUrl(url) ||
        shouldPreferPrimaryHeadingTitle(primaryHeading, candidateTitle);
    return resolveContentTitle({
        primaryHeading,
        title: candidateTitle,
        preferPrimaryHeading,
    });
}
|
|
730
|
-
/**
 * CSS selectors probed, in order, for the page's main content region.
 * Order matters: consumers stop at the first selector that yields a usable match.
 */
const CONTENT_REGION_SELECTORS = [
    // Semantic landmarks.
    'article',
    'main',
    '[role="main"]',
    // Conventional ids and class names.
    '#content',
    '#main-content',
    '.content',
    '.main-content',
    '.post-content',
    '.article-content',
    '.entry-content',
    // Attribute markers and body wrappers.
    '[itemprop="articleBody"]',
    '[data-content]',
    '.post-body',
    '.article-body',
];
/** Additional region selectors consulted only when searching for a heading. */
const HEADING_REGION_EXTRA_SELECTORS = [
    '.markdown-body',
    '[itemprop="text"]',
];
|
|
750
|
-
/**
 * Finds the inner HTML of the first content region with enough markup.
 *
 * Each selector in `CONTENT_REGION_SELECTORS` is tried in order; a match is
 * accepted when its `innerHTML` is a string whose trimmed length exceeds
 * `MIN_CONTENT_ROOT_LENGTH`.
 *
 * @param {{querySelector: Function}} document - Document to search.
 * @returns {string | undefined} The untrimmed `innerHTML` of the accepted
 *   region, or `undefined` when none qualifies.
 */
function findContentRoot(document) {
    for (const selector of CONTENT_REGION_SELECTORS) {
        const markup = document.querySelector(selector)?.innerHTML;
        if (typeof markup !== 'string') {
            continue;
        }
        if (markup.trim().length > MIN_CONTENT_ROOT_LENGTH) {
            return markup;
        }
    }
    return undefined;
}
|
|
763
|
-
/** Heading selectors tried against the whole document. */
const PRIMARY_HEADING_SELECTORS_GLOBAL = [
    '[data-title="true"]',
    'h1',
];
/** Heading selectors tried inside a content region; also accepts `h2`. */
const PRIMARY_HEADING_SELECTORS_LOCAL = [
    '[data-title="true"]',
    'h1',
    'h2',
];
|
|
769
|
-
/**
 * Returns the trimmed text of the first selector match that has any text.
 *
 * @param {{querySelector: Function}} root - Node to search within.
 * @param {string[]} selectors - Selectors tried in order.
 * @returns {string | undefined} The first non-empty trimmed text, or
 *   `undefined` when no selector yields one.
 */
function extractHeadingText(root, selectors) {
    for (const selector of selectors) {
        const label = root.querySelector(selector)?.textContent.trim();
        if (label) {
            return label;
        }
    }
    return undefined;
}
|
|
780
|
-
/**
 * Finds the document's primary heading text.
 *
 * A document-wide search with the global selectors runs first. If it finds
 * nothing, each content region (plus the heading-only extra regions) is
 * searched in order with the local selectors.
 *
 * @param {{querySelector: Function}} document - Document to search.
 * @returns {string | undefined} The heading text, or `undefined` when none is found.
 */
function findPrimaryHeading(document) {
    const documentWide = extractHeadingText(document, PRIMARY_HEADING_SELECTORS_GLOBAL);
    if (documentWide) {
        return documentWide;
    }
    const regionSelectors = CONTENT_REGION_SELECTORS.concat(HEADING_REGION_EXTRA_SELECTORS);
    for (const regionSelector of regionSelectors) {
        const region = document.querySelector(regionSelector);
        const scoped = region
            ? extractHeadingText(region, PRIMARY_HEADING_SELECTORS_LOCAL)
            : undefined;
        if (scoped) {
            return scoped;
        }
    }
    return undefined;
}
|
|
797
|
-
/**
 * Groups the heuristic helpers so callers reach them through one object
 * (`TransformHeuristics.findContentRoot(...)` and so on).
 */
const TransformHeuristics = {
    findContentRoot,
    findPrimaryHeading,
    isGithubRepositoryRootUrl,
};
|
|
802
|
-
/**
 * Builds the content source from an evaluated article document.
 *
 * The article document is prepared for markdown, the title is resolved from
 * the article title (normalized) or, when that is undefined, the page
 * metadata title, and the document body's `innerHTML` becomes the source HTML.
 *
 * @param {object} base - Shared source fields, spread into the result.
 * @param {object} params - `evaluatedArticleDoc`, `article`, `extractedMeta`, `url`, `signal`.
 * @returns {object} Source with `sourceHtml`, the resolved title fields and
 *   `skipNoiseRemoval: true`.
 */
function buildArticleSource(base, params) {
    const { evaluatedArticleDoc, article, extractedMeta, url, signal } = params;
    prepareDocumentForMarkdown(evaluatedArticleDoc, url, signal);
    let candidateTitle;
    if (article.title === undefined) {
        candidateTitle = extractedMeta.title;
    }
    else {
        candidateTitle = normalizeDocumentTitle(article.title, url);
    }
    const resolvedTitle = resolveSourceTitle(base, candidateTitle, url);
    return {
        ...base,
        sourceHtml: evaluatedArticleDoc.body.innerHTML,
        ...resolvedTitle,
        skipNoiseRemoval: true,
    };
}
|
|
816
|
-
/**
 * Builds the content source from the prepared full document.
 *
 * Uses the detected content root's HTML when one is found, otherwise the
 * serialized document.
 *
 * @param {object} base - Shared source fields, spread into the result.
 * @param {object} params - `resolvedDocument`, `html`, `extractedMeta`, `url`.
 * @returns {object} Source with `sourceHtml`, the resolved title fields,
 *   `skipNoiseRemoval: true` and the `document`.
 */
function buildDocumentSource(base, params) {
    const { resolvedDocument, html, extractedMeta, url } = params;
    const contentRootHtml = TransformHeuristics.findContentRoot(resolvedDocument);
    const resolvedTitle = resolveSourceTitle(base, extractedMeta.title, url);
    const sourceHtml = contentRootHtml ?? serializeDocumentForMarkdown(resolvedDocument, html);
    return {
        ...base,
        sourceHtml,
        ...resolvedTitle,
        skipNoiseRemoval: true,
        document: resolvedDocument,
    };
}
|
|
828
|
-
/**
 * Builds the content source directly from the unprocessed HTML.
 *
 * @param {object} base - Shared source fields, spread into the result.
 * @param {{html: string, extractedMeta: {title?: string}}} params
 * @returns {object} `base` plus `sourceHtml` and the metadata `title`.
 */
function buildRawSource(base, params) {
    const { html: sourceHtml, extractedMeta } = params;
    return {
        ...base,
        sourceHtml,
        title: extractedMeta.title,
    };
}
|
|
835
|
-
/**
 * Assembles the fields shared by every content-source variant.
 *
 * @param {object} input - `html`, `url`, `article`, `extractedMeta`,
 *   `includeMetadataFooter`, `evaluatedArticleDoc`, `document`, `truncated`, `signal`.
 * @returns {{base: object, preparedDocument: object | undefined}} The shared
 *   fields plus the prepared document (only when a document was supplied).
 */
function resolveBaseContentSource(input) {
    const { html, url, article, extractedMeta, includeMetadataFooter, evaluatedArticleDoc, document, truncated, signal, } = input;
    // Article-derived metadata is requested whenever the evaluated article doc is not null.
    const preferArticleMetadata = evaluatedArticleDoc !== null;
    const metadata = createContentMetadataBlock(url, article, extractedMeta, preferArticleMetadata, includeMetadataFooter);
    let preparedDocument;
    if (document) {
        preparedDocument = prepareContentSourceDocument(document, url, signal);
    }
    const base = {
        favicon: extractedMeta.favicon,
        metadata,
        extractedMetadata: extractedMeta,
        truncated,
        primaryHeading: preparedDocument?.primaryHeading,
        originalHtml: html,
    };
    return { base, preparedDocument };
}
|
|
851
|
-
/**
 * Picks and builds the content source for the transform.
 *
 * Priority: evaluated article (when both the article and its evaluated
 * document exist), then the prepared full document, then the raw HTML.
 *
 * @param {object} input - Extraction inputs; see `resolveBaseContentSource`.
 * @returns {object} The chosen content source.
 */
function buildContentSource(input) {
    const { base, preparedDocument } = resolveBaseContentSource(input);
    const { html, url, article, extractedMeta, evaluatedArticleDoc, signal } = input;
    if (evaluatedArticleDoc && article) {
        return buildArticleSource(base, {
            evaluatedArticleDoc,
            article,
            extractedMeta,
            url,
            signal,
        });
    }
    if (preparedDocument) {
        return buildDocumentSource(base, {
            resolvedDocument: preparedDocument.document,
            html,
            extractedMeta,
            url,
        });
    }
    return buildRawSource(base, { html, extractedMeta });
}
|
|
875
|
-
/**
 * Extracts article, metadata and document from the HTML and builds the
 * content source to convert.
 *
 * @param {object} params - `html`, `url`, `includeMetadataFooter`, `signal`, `inputTruncated`.
 * @returns {object} The content source produced by `buildContentSource`.
 */
function resolveContentSource(params) {
    const { html, url, signal, inputTruncated, includeMetadataFooter } = params;
    const extraction = extractContentContext(html, url, {
        extractArticle: true,
        signal,
        inputTruncated,
    });
    const { article, metadata: extractedMeta, document, truncated } = extraction;
    // Only evaluate the article when extraction produced one.
    let evaluatedArticleDoc = null;
    if (article) {
        evaluatedArticleDoc = evaluateArticleContent(article, document);
    }
    return buildContentSource({
        html,
        url,
        article,
        extractedMeta,
        includeMetadataFooter,
        evaluatedArticleDoc,
        document,
        truncated: truncated ?? false,
        signal,
    });
}
|
|
896
|
-
/**
 * Runs the HTML-to-markdown conversion as the tracked `transform:markdown` stage.
 *
 * @param {{context: object, url: string, signal?: AbortSignal}} stage -
 *   The content source (`context`) plus request URL and abort signal.
 * @returns {*} Whatever `stageTracker.run` returns for the `htmlToMarkdown` call.
 */
function renderMarkdownStage({ context, url, signal, }) {
    const render = () => {
        const { sourceHtml, metadata, document, skipNoiseRemoval } = context;
        return htmlToMarkdown(sourceHtml, metadata, {
            url,
            signal,
            document,
            skipNoiseRemoval,
        });
    };
    return stageTracker.run(url, 'transform:markdown', render);
}
|
|
904
|
-
/**
 * Applies the post-conversion passes to rendered markdown and packages the result.
 *
 * Passes run in order: GitHub primary-heading stripping, synthetic title
 * prepending, Next.js flight supplementation, and section finalization.
 *
 * @param {{context: object, url: string, signal?: AbortSignal}} stage - Render context.
 * @param {string} markdown - Markdown produced by the render stage.
 * @returns {{markdown: string, title: *, truncated: *, metadata: *}}
 */
function postprocessMarkdownStage({ context, url, signal }, markdown) {
    const withoutGithubHeading = maybeStripGithubPrimaryHeading(markdown, context.primaryHeading, url);
    const withTitle = maybePrependSyntheticTitle(withoutGithubHeading, context);
    const supplemented = supplementMarkdownFromNextFlight(withTitle, context.originalHtml);
    // The `signal` key is only included when a signal was supplied.
    const finalizeOptions = signal ? { signal, url } : { url };
    const finalized = finalizeMarkdownSections(supplemented, finalizeOptions);
    return {
        markdown: finalized,
        title: context.title,
        truncated: context.truncated,
        metadata: context.extractedMetadata,
    };
}
|
|
916
|
-
/**
 * Renders a content source to markdown and post-processes the output.
 *
 * @param {object} context - Content source to convert.
 * @param {string} url - Source URL.
 * @param {AbortSignal} [signal] - Optional abort signal.
 * @returns {object} Result of `postprocessMarkdownStage`.
 */
function buildMarkdownFromContext(context, url, signal) {
    const stageInput = { context, url, signal };
    const rendered = renderMarkdownStage(stageInput);
    return postprocessMarkdownStage(stageInput, rendered);
}
|
|
921
|
-
/**
 * Produces the transform result for a page.
 *
 * The raw-content path (`transform:raw`) is tried first. When it yields
 * nothing, the content source is extracted (`transform:extract`) and
 * converted to markdown.
 *
 * @param {string} html - The fetched body.
 * @param {string} url - Source URL.
 * @param {object} options - Reads `includeMetadataFooter` and `inputTruncated`.
 * @param {AbortSignal} [signal] - Abort signal for the extraction and conversion.
 * @returns {object} The transform result.
 */
function resolveTransformContentResult(html, url, options, signal) {
    const preserved = stageTracker.run(url, 'transform:raw', () => tryTransformRawContent({
        html,
        url,
        includeMetadataFooter: options.includeMetadataFooter,
        inputTruncated: options.inputTruncated,
    }));
    if (preserved) {
        return preserved;
    }
    const extractSource = () => resolveContentSource({
        html,
        url,
        includeMetadataFooter: options.includeMetadataFooter,
        signal,
        inputTruncated: options.inputTruncated,
    });
    const contentSource = stageTracker.run(url, 'transform:extract', extractSource);
    return buildMarkdownFromContext(contentSource, url, signal);
}
|
|
939
|
-
/** U+FFFD, the Unicode replacement character. */
const REPLACEMENT_CHAR = '\ufffd';
/** Fraction of the sample that must be exceeded by replacement characters. */
const BINARY_INDICATOR_THRESHOLD = 0.1;
/**
 * Detects content that looks like binary data.
 *
 * @param {string} content - Text to inspect.
 * @returns {boolean} `false` for empty content; `true` when the content holds
 *   a NUL character anywhere, or when the replacement characters found within
 *   the first `BINARY_SAMPLE_SIZE` characters exceed
 *   `BINARY_INDICATOR_THRESHOLD` of the sample length.
 */
function hasBinaryIndicators(content) {
    if (!content) {
        return false;
    }
    if (content.includes('\x00')) {
        return true;
    }
    const sampleSize = Math.min(content.length, BINARY_SAMPLE_SIZE);
    let hits = 0;
    // Hop between occurrences and stop at the first one outside the sample.
    for (let at = content.indexOf(REPLACEMENT_CHAR); at !== -1 && at < sampleSize; at = content.indexOf(REPLACEMENT_CHAR, at + 1)) {
        hits += 1;
    }
    return hits > sampleSize * BINARY_INDICATOR_THRESHOLD;
}
|
|
955
|
-
/**
 * Transforms HTML to markdown synchronously on the calling thread.
 *
 * A transform signal is derived from `options.signal`; the work then runs
 * under `stageTracker.runTrackedSync`, validating the content for binary
 * indicators before converting it.
 *
 * @param {string} html - The HTML to transform.
 * @param {string} url - Source URL.
 * @param {object} options - Transform options; `signal` is read here.
 * @returns {*} Whatever `stageTracker.runTrackedSync` returns for the transform.
 */
export function transformHtmlToMarkdownInProcess(html, url, options) {
    const signal = buildTransformSignal(options.signal);
    const transform = () => {
        validateBinaryContent(html, url);
        return resolveTransformContentResult(html, url, options, signal);
    };
    return stageTracker.runTrackedSync(url, signal, transform);
}
|
|
962
|
-
/**
 * Rejects content that appears to be binary.
 *
 * @param {string} html - Content to check with `hasBinaryIndicators`.
 * @param {string} url - Source URL attached to the error.
 * @returns {void}
 * @throws {FetchError} Status 415, reason `binary_content_detected`, when
 *   binary indicators are found.
 */
function validateBinaryContent(html, url) {
    if (!hasBinaryIndicators(html)) {
        return;
    }
    throw new FetchError('Content appears to be binary data (high replacement character ratio or null bytes)', url, 415, { reason: 'binary_content_detected', stage: 'transform:validate' });
}
|
|
968
|
-
/**
 * Public accessor for the transform worker pool statistics.
 *
 * @returns {*} Whatever `getWorkerPoolStats` reports.
 */
export function getTransformPoolStats() {
    const stats = getWorkerPoolStats();
    return stats;
}
|
|
971
|
-
/**
 * Shuts down the transform worker pool and waits for the shutdown to finish.
 *
 * @returns {Promise<void>} Resolves once `shutdownWorkerPool` has settled.
 */
export async function shutdownTransformWorkerPool() {
    await shutdownWorkerPool();
}
|
|
974
|
-
/**
 * Decodes the input (string or buffer) and transforms it on the calling thread.
 *
 * @param {*} htmlOrBuffer - Input passed to `decodeInput`.
 * @param {string} url - Source URL.
 * @param {object} options - Transform options; `encoding` is used for decoding.
 * @returns {*} Result of `transformHtmlToMarkdownInProcess`.
 */
function transformInputInProcess(htmlOrBuffer, url, options) {
    const html = decodeInput(htmlOrBuffer, options.encoding);
    return transformHtmlToMarkdownInProcess(html, url, options);
}
|
|
977
|
-
/**
 * Selects the options that are forwarded to the worker pool.
 *
 * @param {object} options - Caller-supplied transform options.
 * @returns {object} A new object that always carries `includeMetadataFooter`,
 *   plus `signal` and `inputTruncated` only when they are truthy.
 */
function workerTransformOptions(options) {
    const { includeMetadataFooter, signal, inputTruncated } = options;
    const forwarded = { includeMetadataFooter };
    if (signal) {
        forwarded.signal = signal;
    }
    if (inputTruncated) {
        forwarded.inputTruncated = inputTruncated;
    }
    return forwarded;
}
|
|
986
|
-
/**
 * Transforms the input through the worker pool.
 *
 * Falls back to the in-process transform when the pool reports zero capacity.
 * For non-string input, a truthy `options.encoding` is forwarded to the pool.
 *
 * @param {*} htmlOrBuffer - HTML string or encoded buffer.
 * @param {string} url - Source URL.
 * @param {object} options - Transform options.
 * @returns {Promise<*>} The transform result.
 */
async function transformWithWorkerPool(htmlOrBuffer, url, options) {
    const pool = getOrCreateWorkerPool();
    if (pool.getCapacity() === 0) {
        return transformInputInProcess(htmlOrBuffer, url, options);
    }
    const sharedOptions = workerTransformOptions(options);
    const isText = typeof htmlOrBuffer === 'string';
    const poolOptions = isText
        ? sharedOptions
        : {
            ...sharedOptions,
            ...(options.encoding ? { encoding: options.encoding } : {}),
        };
    return pool.transform(htmlOrBuffer, url, poolOptions);
}
|
|
999
|
-
/**
 * Decides how to recover from a failed worker-pool transform.
 *
 * - Queue-full `FetchError`: log a warning and transform in-process.
 * - Otherwise the abort signal is checked, any other `FetchError` is re-thrown,
 *   and a non-`Error` value is thrown after conversion with `toError`.
 * - Any remaining `Error`: log a warning with its message and transform in-process.
 *
 * @param {*} error - The value thrown by the worker-pool transform.
 * @param {*} htmlOrBuffer - Original transform input.
 * @param {string} url - Source URL (redacted before logging).
 * @param {object} options - Transform options; `signal` is checked for abort.
 * @returns {*} Result of the in-process transform.
 */
function resolveWorkerFallback(error, htmlOrBuffer, url, options) {
    const poolStats = getWorkerPoolStats();
    // Shared tail of both fallback paths: warn, then run in-process.
    const fallBackInProcess = (message, extraFields) => {
        logWarn(message, {
            url: redactUrl(url),
            ...extraFields,
            ...(poolStats ?? {}),
        }, Loggers.LOG_TRANSFORM);
        return transformInputInProcess(htmlOrBuffer, url, options);
    };
    const queueIsFull = error instanceof FetchError &&
        error.details['reason'] === SystemErrors.QUEUE_FULL;
    if (queueIsFull) {
        return fallBackInProcess('Transform worker queue full; falling back to in-process', {});
    }
    throwIfAborted(options.signal, url, 'transform:worker-fallback');
    if (error instanceof FetchError) {
        throw error;
    }
    if (!(error instanceof Error)) {
        throw toError(error);
    }
    return fallBackInProcess('Transform worker failed; falling back to in-process', {
        error: getErrorMessage(error),
    });
}
|
|
1023
|
-
/**
 * Runs the worker-pool transform as the tracked `transform:worker` stage,
 * routing any failure through `resolveWorkerFallback`.
 *
 * @param {*} htmlOrBuffer - HTML string or encoded buffer.
 * @param {string} url - Source URL.
 * @param {object} options - Transform options.
 * @returns {Promise<*>} The transform result.
 */
async function runWorkerTransformWithFallback(htmlOrBuffer, url, options) {
    const attempt = async () => {
        try {
            return await transformWithWorkerPool(htmlOrBuffer, url, options);
        }
        catch (failure) {
            return resolveWorkerFallback(failure, htmlOrBuffer, url, options);
        }
    };
    return stageTracker.runAsync(url, 'transform:worker', attempt);
}
|
|
1033
|
-
/**
 * Transforms a string or buffer input to markdown under
 * `stageTracker.runTrackedAsync`.
 *
 * @param {*} htmlOrBuffer - HTML string or encoded buffer.
 * @param {string} url - Source URL.
 * @param {object} options - Transform options; `signal` is handed to the tracker.
 * @returns {Promise<*>} The transform result.
 */
async function transformInputToMarkdown(htmlOrBuffer, url, options) {
    const runTransform = () => runWorkerTransformWithFallback(htmlOrBuffer, url, options);
    return stageTracker.runTrackedAsync(url, options.signal, runTransform);
}
|
|
1036
|
-
/**
 * Public entry point: transforms an HTML string to markdown.
 *
 * @param {string} html - HTML to transform.
 * @param {string} url - Source URL.
 * @param {object} options - Transform options.
 * @returns {Promise<*>} The transform result from `transformInputToMarkdown`.
 */
export async function transformHtmlToMarkdown(html, url, options) {
    const pending = transformInputToMarkdown(html, url, options);
    return pending;
}
|
|
1039
|
-
/**
 * Public entry point: transforms an encoded HTML buffer to markdown.
 *
 * @param {*} htmlBuffer - Buffer holding the HTML bytes.
 * @param {string} url - Source URL.
 * @param {object} options - Transform options.
 * @returns {Promise<*>} The transform result from `transformInputToMarkdown`.
 */
export async function transformBufferToMarkdown(htmlBuffer, url, options) {
    const pending = transformInputToMarkdown(htmlBuffer, url, options);
    return pending;
}
|
|
1042
|
-
export { cleanupMarkdownArtifacts, finalizeMarkdownSections, processFencedContent, };
|