@j0hanz/superfetch 1.2.1 → 1.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +60 -45
- package/dist/config/formatting.d.ts +1 -1
- package/dist/config/types/content.d.ts +3 -3
- package/dist/config/types/runtime.d.ts +1 -1
- package/dist/config/types/tools.d.ts +12 -12
- package/dist/http/cors.js +23 -23
- package/dist/http/download-routes.js +11 -5
- package/dist/http/mcp-routes.js +2 -13
- package/dist/http/mcp-validation.js +1 -1
- package/dist/http/server-middleware.js +5 -3
- package/dist/http/server.js +2 -0
- package/dist/index.js +5 -0
- package/dist/middleware/error-handler.js +1 -1
- package/dist/resources/cached-content.js +8 -4
- package/dist/server.js +2 -0
- package/dist/services/cache.d.ts +2 -1
- package/dist/services/cache.js +23 -7
- package/dist/services/context.d.ts +4 -4
- package/dist/services/context.js +11 -1
- package/dist/services/extractor.js +26 -21
- package/dist/services/fetcher/agents.js +55 -1
- package/dist/services/fetcher/interceptors.d.ts +22 -0
- package/dist/services/fetcher/interceptors.js +57 -26
- package/dist/services/fetcher/response.d.ts +1 -1
- package/dist/services/fetcher/response.js +37 -16
- package/dist/services/fetcher.d.ts +1 -1
- package/dist/services/fetcher.js +9 -8
- package/dist/services/metadata-collector.d.ts +10 -0
- package/dist/services/metadata-collector.js +11 -0
- package/dist/services/parser.d.ts +5 -1
- package/dist/services/parser.js +82 -11
- package/dist/services/transform-worker-pool.d.ts +14 -0
- package/dist/services/transform-worker-pool.js +167 -0
- package/dist/tools/handlers/fetch-markdown.tool.d.ts +9 -1
- package/dist/tools/handlers/fetch-markdown.tool.js +58 -30
- package/dist/tools/handlers/fetch-single.shared.d.ts +8 -3
- package/dist/tools/handlers/fetch-single.shared.js +42 -17
- package/dist/tools/handlers/fetch-url.tool.js +46 -16
- package/dist/tools/index.js +13 -0
- package/dist/tools/schemas.d.ts +19 -16
- package/dist/tools/schemas.js +25 -4
- package/dist/tools/utils/common.js +20 -16
- package/dist/tools/utils/content-transform-async.d.ts +6 -0
- package/dist/tools/utils/content-transform-async.js +33 -0
- package/dist/tools/utils/content-transform.d.ts +4 -1
- package/dist/tools/utils/content-transform.js +37 -3
- package/dist/tools/utils/fetch-pipeline.js +26 -15
- package/dist/utils/content-cleaner.d.ts +1 -1
- package/dist/utils/download-url.d.ts +9 -1
- package/dist/utils/download-url.js +9 -6
- package/dist/utils/tool-error-handler.d.ts +2 -2
- package/dist/utils/tool-error-handler.js +7 -7
- package/dist/utils/url-validator.d.ts +5 -0
- package/dist/utils/url-validator.js +45 -3
- package/dist/workers/transform-worker.d.ts +1 -0
- package/dist/workers/transform-worker.js +50 -0
- package/package.json +4 -6
|
@@ -3,17 +3,7 @@ import { Readability } from '@mozilla/readability';
|
|
|
3
3
|
import { getErrorMessage } from '../utils/error-utils.js';
|
|
4
4
|
import { truncateHtml } from '../utils/html-truncator.js';
|
|
5
5
|
import { logError, logInfo, logWarn } from './logger.js';
|
|
6
|
-
|
|
7
|
-
const sources = state[field];
|
|
8
|
-
return sources.og ?? sources.twitter ?? sources.standard;
|
|
9
|
-
}
|
|
10
|
-
function createMetaCollectorState() {
|
|
11
|
-
return {
|
|
12
|
-
title: {},
|
|
13
|
-
description: {},
|
|
14
|
-
author: {},
|
|
15
|
-
};
|
|
16
|
-
}
|
|
6
|
+
import { createMetaCollectorState, resolveMetaField, } from './metadata-collector.js';
|
|
17
7
|
function collectMetaTag(state, tag) {
|
|
18
8
|
const content = getMetaContent(tag);
|
|
19
9
|
if (!content)
|
|
@@ -76,11 +66,17 @@ function extractMetadata(document) {
|
|
|
76
66
|
const state = createMetaCollectorState();
|
|
77
67
|
scanMetaTags(document, state);
|
|
78
68
|
ensureTitleFallback(document, state);
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
69
|
+
const metadata = {};
|
|
70
|
+
const title = resolveMetaField(state, 'title');
|
|
71
|
+
const description = resolveMetaField(state, 'description');
|
|
72
|
+
const author = resolveMetaField(state, 'author');
|
|
73
|
+
if (title !== undefined)
|
|
74
|
+
metadata.title = title;
|
|
75
|
+
if (description !== undefined)
|
|
76
|
+
metadata.description = description;
|
|
77
|
+
if (author !== undefined)
|
|
78
|
+
metadata.author = author;
|
|
79
|
+
return metadata;
|
|
84
80
|
}
|
|
85
81
|
function isReadabilityCompatible(doc) {
|
|
86
82
|
if (!doc || typeof doc !== 'object')
|
|
@@ -113,14 +109,23 @@ function parseReadabilityArticle(document) {
|
|
|
113
109
|
}
|
|
114
110
|
}
|
|
115
111
|
function mapReadabilityResult(parsed) {
|
|
116
|
-
|
|
117
|
-
title: toOptional(parsed.title),
|
|
118
|
-
byline: toOptional(parsed.byline),
|
|
112
|
+
const article = {
|
|
119
113
|
content: parsed.content ?? '',
|
|
120
114
|
textContent: parsed.textContent ?? '',
|
|
121
|
-
excerpt: toOptional(parsed.excerpt),
|
|
122
|
-
siteName: toOptional(parsed.siteName),
|
|
123
115
|
};
|
|
116
|
+
const title = toOptional(parsed.title);
|
|
117
|
+
if (title !== undefined)
|
|
118
|
+
article.title = title;
|
|
119
|
+
const byline = toOptional(parsed.byline);
|
|
120
|
+
if (byline !== undefined)
|
|
121
|
+
article.byline = byline;
|
|
122
|
+
const excerpt = toOptional(parsed.excerpt);
|
|
123
|
+
if (excerpt !== undefined)
|
|
124
|
+
article.excerpt = excerpt;
|
|
125
|
+
const siteName = toOptional(parsed.siteName);
|
|
126
|
+
if (siteName !== undefined)
|
|
127
|
+
article.siteName = siteName;
|
|
128
|
+
return article;
|
|
124
129
|
}
|
|
125
130
|
function toOptional(value) {
|
|
126
131
|
return value ?? undefined;
|
|
@@ -3,9 +3,26 @@ import os from 'node:os';
|
|
|
3
3
|
import { Agent } from 'undici';
|
|
4
4
|
import { createErrorWithCode } from '../../utils/error-utils.js';
|
|
5
5
|
import { isBlockedIp } from '../../utils/url-validator.js';
|
|
6
|
+
const DNS_LOOKUP_TIMEOUT_MS = 5000;
|
|
6
7
|
function resolveDns(hostname, options, callback) {
|
|
7
8
|
const { normalizedOptions, useAll, resolvedFamily } = buildLookupContext(options);
|
|
8
|
-
|
|
9
|
+
const lookupOptions = buildLookupOptions(normalizedOptions);
|
|
10
|
+
let done = false;
|
|
11
|
+
const timer = setTimeout(() => {
|
|
12
|
+
if (done)
|
|
13
|
+
return;
|
|
14
|
+
done = true;
|
|
15
|
+
callback(createErrorWithCode(`DNS lookup timed out for ${hostname}`, 'ETIMEOUT'), []);
|
|
16
|
+
}, DNS_LOOKUP_TIMEOUT_MS);
|
|
17
|
+
timer.unref();
|
|
18
|
+
const safeCallback = (err, address, family) => {
|
|
19
|
+
if (done)
|
|
20
|
+
return;
|
|
21
|
+
done = true;
|
|
22
|
+
clearTimeout(timer);
|
|
23
|
+
callback(err, address, family);
|
|
24
|
+
};
|
|
25
|
+
dns.lookup(hostname, lookupOptions, createLookupCallback(hostname, resolvedFamily, useAll, safeCallback));
|
|
9
26
|
}
|
|
10
27
|
function normalizeLookupOptions(options) {
|
|
11
28
|
return typeof options === 'number' ? { family: options } : options;
|
|
@@ -18,6 +35,29 @@ function buildLookupContext(options) {
|
|
|
18
35
|
resolvedFamily: resolveFamily(normalizedOptions.family),
|
|
19
36
|
};
|
|
20
37
|
}
|
|
38
|
+
const DEFAULT_DNS_ORDER = 'verbatim';
|
|
39
|
+
function resolveResultOrder(options) {
|
|
40
|
+
if (options.order)
|
|
41
|
+
return options.order;
|
|
42
|
+
const legacyVerbatim = getLegacyVerbatim(options);
|
|
43
|
+
if (legacyVerbatim !== undefined) {
|
|
44
|
+
return legacyVerbatim ? 'verbatim' : 'ipv4first';
|
|
45
|
+
}
|
|
46
|
+
return DEFAULT_DNS_ORDER;
|
|
47
|
+
}
|
|
48
|
+
function getLegacyVerbatim(options) {
|
|
49
|
+
const legacy = options.verbatim;
|
|
50
|
+
return typeof legacy === 'boolean' ? legacy : undefined;
|
|
51
|
+
}
|
|
52
|
+
function buildLookupOptions(normalizedOptions) {
|
|
53
|
+
const options = {
|
|
54
|
+
...normalizedOptions,
|
|
55
|
+
order: resolveResultOrder(normalizedOptions),
|
|
56
|
+
all: true,
|
|
57
|
+
};
|
|
58
|
+
delete options.verbatim;
|
|
59
|
+
return options;
|
|
60
|
+
}
|
|
21
61
|
function createLookupCallback(hostname, resolvedFamily, useAll, callback) {
|
|
22
62
|
return (err, addresses) => {
|
|
23
63
|
handleLookupResult(err, addresses, hostname, resolvedFamily, useAll, callback);
|
|
@@ -42,6 +82,11 @@ function handleLookupResult(error, addresses, hostname, resolvedFamily, useAll,
|
|
|
42
82
|
return;
|
|
43
83
|
}
|
|
44
84
|
const list = normalizeLookupResults(addresses, resolvedFamily);
|
|
85
|
+
const invalidFamilyError = findInvalidFamilyError(list, hostname);
|
|
86
|
+
if (invalidFamilyError) {
|
|
87
|
+
callback(invalidFamilyError, list);
|
|
88
|
+
return;
|
|
89
|
+
}
|
|
45
90
|
const blockedError = findBlockedIpError(list, hostname);
|
|
46
91
|
if (blockedError) {
|
|
47
92
|
callback(blockedError, list);
|
|
@@ -89,6 +134,15 @@ function findBlockedIpError(list, hostname) {
|
|
|
89
134
|
}
|
|
90
135
|
return null;
|
|
91
136
|
}
|
|
137
|
+
function findInvalidFamilyError(list, hostname) {
|
|
138
|
+
for (const addr of list) {
|
|
139
|
+
const family = typeof addr === 'string' ? 0 : addr.family;
|
|
140
|
+
if (family === 4 || family === 6)
|
|
141
|
+
continue;
|
|
142
|
+
return createErrorWithCode(`Invalid address family returned for ${hostname}`, 'EINVAL');
|
|
143
|
+
}
|
|
144
|
+
return null;
|
|
145
|
+
}
|
|
92
146
|
function createNoDnsResultsError(hostname) {
|
|
93
147
|
return createErrorWithCode(`No DNS results returned for ${hostname}`, 'ENODATA');
|
|
94
148
|
}
|
|
@@ -1,3 +1,25 @@
|
|
|
1
|
+
export type FetchChannelEvent = {
|
|
2
|
+
v: 1;
|
|
3
|
+
type: 'start';
|
|
4
|
+
requestId: string;
|
|
5
|
+
method: string;
|
|
6
|
+
url: string;
|
|
7
|
+
} | {
|
|
8
|
+
v: 1;
|
|
9
|
+
type: 'end';
|
|
10
|
+
requestId: string;
|
|
11
|
+
status: number;
|
|
12
|
+
duration: number;
|
|
13
|
+
} | {
|
|
14
|
+
v: 1;
|
|
15
|
+
type: 'error';
|
|
16
|
+
requestId: string;
|
|
17
|
+
url: string;
|
|
18
|
+
error: string;
|
|
19
|
+
code?: string;
|
|
20
|
+
status?: number;
|
|
21
|
+
duration: number;
|
|
22
|
+
};
|
|
1
23
|
interface FetchTelemetryContext {
|
|
2
24
|
requestId: string;
|
|
3
25
|
startTime: number;
|
|
@@ -4,21 +4,44 @@ import { performance } from 'node:perf_hooks';
|
|
|
4
4
|
import { isSystemError } from '../../utils/error-utils.js';
|
|
5
5
|
import { logDebug, logError, logWarn } from '../logger.js';
|
|
6
6
|
const fetchChannel = diagnosticsChannel.channel('superfetch.fetch');
|
|
7
|
+
function redactUrl(rawUrl) {
|
|
8
|
+
try {
|
|
9
|
+
const url = new URL(rawUrl);
|
|
10
|
+
url.username = '';
|
|
11
|
+
url.password = '';
|
|
12
|
+
url.hash = '';
|
|
13
|
+
url.search = '';
|
|
14
|
+
return url.toString();
|
|
15
|
+
}
|
|
16
|
+
catch {
|
|
17
|
+
return rawUrl;
|
|
18
|
+
}
|
|
19
|
+
}
|
|
20
|
+
function publishFetchEvent(event) {
|
|
21
|
+
if (!fetchChannel.hasSubscribers)
|
|
22
|
+
return;
|
|
23
|
+
try {
|
|
24
|
+
fetchChannel.publish(event);
|
|
25
|
+
}
|
|
26
|
+
catch {
|
|
27
|
+
// Avoid crashing the publisher if a subscriber throws.
|
|
28
|
+
}
|
|
29
|
+
}
|
|
7
30
|
export function startFetchTelemetry(url, method) {
|
|
31
|
+
const safeUrl = redactUrl(url);
|
|
8
32
|
const context = {
|
|
9
33
|
requestId: randomUUID(),
|
|
10
34
|
startTime: performance.now(),
|
|
11
|
-
url,
|
|
35
|
+
url: safeUrl,
|
|
12
36
|
method: method.toUpperCase(),
|
|
13
37
|
};
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
}
|
|
38
|
+
publishFetchEvent({
|
|
39
|
+
v: 1,
|
|
40
|
+
type: 'start',
|
|
41
|
+
requestId: context.requestId,
|
|
42
|
+
method: context.method,
|
|
43
|
+
url: context.url,
|
|
44
|
+
});
|
|
22
45
|
logDebug('HTTP Request', {
|
|
23
46
|
requestId: context.requestId,
|
|
24
47
|
method: context.method,
|
|
@@ -38,9 +61,8 @@ export function recordFetchResponse(context, response, contentSize) {
|
|
|
38
61
|
logSlowRequestIfNeeded(context, duration);
|
|
39
62
|
}
|
|
40
63
|
function publishFetchEnd(context, status, duration) {
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
fetchChannel.publish({
|
|
64
|
+
publishFetchEvent({
|
|
65
|
+
v: 1,
|
|
44
66
|
type: 'end',
|
|
45
67
|
requestId: context.requestId,
|
|
46
68
|
status,
|
|
@@ -48,13 +70,18 @@ function publishFetchEnd(context, status, duration) {
|
|
|
48
70
|
});
|
|
49
71
|
}
|
|
50
72
|
function buildResponseMeta(response, contentSize, duration) {
|
|
51
|
-
const contentType = response.headers.get('content-type') ?? undefined;
|
|
52
73
|
const contentLength = response.headers.get('content-length') ?? contentSize?.toString();
|
|
53
|
-
|
|
54
|
-
contentType,
|
|
74
|
+
const meta = {
|
|
55
75
|
duration: `${Math.round(duration)}ms`,
|
|
56
|
-
size: contentLength,
|
|
57
76
|
};
|
|
77
|
+
const contentType = response.headers.get('content-type');
|
|
78
|
+
if (contentType !== null) {
|
|
79
|
+
meta.contentType = contentType;
|
|
80
|
+
}
|
|
81
|
+
if (contentLength !== undefined) {
|
|
82
|
+
meta.size = contentLength;
|
|
83
|
+
}
|
|
84
|
+
return meta;
|
|
58
85
|
}
|
|
59
86
|
function logSlowRequestIfNeeded(context, duration) {
|
|
60
87
|
if (duration <= 5000)
|
|
@@ -69,17 +96,21 @@ export function recordFetchError(context, error, status) {
|
|
|
69
96
|
const duration = performance.now() - context.startTime;
|
|
70
97
|
const err = error instanceof Error ? error : new Error(String(error));
|
|
71
98
|
const code = isSystemError(err) ? err.code : undefined;
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
99
|
+
const event = {
|
|
100
|
+
v: 1,
|
|
101
|
+
type: 'error',
|
|
102
|
+
requestId: context.requestId,
|
|
103
|
+
url: context.url,
|
|
104
|
+
error: err.message,
|
|
105
|
+
duration,
|
|
106
|
+
};
|
|
107
|
+
if (code !== undefined) {
|
|
108
|
+
event.code = code;
|
|
109
|
+
}
|
|
110
|
+
if (status !== undefined) {
|
|
111
|
+
event.status = status;
|
|
82
112
|
}
|
|
113
|
+
publishFetchEvent(event);
|
|
83
114
|
const log = status === 429 ? logWarn : logError;
|
|
84
115
|
log('HTTP Request Error', {
|
|
85
116
|
requestId: context.requestId,
|
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
import { Readable, Writable } from 'node:stream';
|
|
2
|
+
import { pipeline } from 'node:stream/promises';
|
|
1
3
|
import { FetchError } from '../../errors/app-error.js';
|
|
2
4
|
function assertContentLengthWithinLimit(response, url, maxBytes) {
|
|
3
5
|
const contentLengthHeader = response.headers.get('content-length');
|
|
@@ -9,30 +11,49 @@ function assertContentLengthWithinLimit(response, url, maxBytes) {
|
|
|
9
11
|
}
|
|
10
12
|
throw new FetchError(`Response exceeds maximum size of ${maxBytes} bytes`, url);
|
|
11
13
|
}
|
|
12
|
-
async function readStreamWithLimit(stream, url, maxBytes) {
|
|
13
|
-
const reader = stream.getReader();
|
|
14
|
+
async function readStreamWithLimit(stream, url, maxBytes, signal) {
|
|
14
15
|
const decoder = new TextDecoder();
|
|
15
16
|
let total = 0;
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
break;
|
|
21
|
-
total += value.byteLength;
|
|
22
|
-
if (total > maxBytes) {
|
|
23
|
-
await reader.cancel();
|
|
24
|
-
throw new FetchError(`Response exceeds maximum size of ${maxBytes} bytes`, url);
|
|
17
|
+
let text = '';
|
|
18
|
+
const toBuffer = (chunk) => {
|
|
19
|
+
if (typeof chunk === 'string') {
|
|
20
|
+
return Buffer.from(chunk);
|
|
25
21
|
}
|
|
26
|
-
|
|
22
|
+
return Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk);
|
|
23
|
+
};
|
|
24
|
+
const sink = new Writable({
|
|
25
|
+
write(chunk, _encoding, callback) {
|
|
26
|
+
const buffer = toBuffer(chunk);
|
|
27
|
+
total += buffer.length;
|
|
28
|
+
if (total > maxBytes) {
|
|
29
|
+
callback(new FetchError(`Response exceeds maximum size of ${maxBytes} bytes`, url));
|
|
30
|
+
return;
|
|
31
|
+
}
|
|
32
|
+
text += decoder.decode(buffer, { stream: true });
|
|
33
|
+
callback();
|
|
34
|
+
},
|
|
35
|
+
final(callback) {
|
|
36
|
+
text += decoder.decode();
|
|
37
|
+
callback();
|
|
38
|
+
},
|
|
39
|
+
});
|
|
40
|
+
try {
|
|
41
|
+
const readable = Readable.fromWeb(stream, { signal });
|
|
42
|
+
await pipeline(readable, sink, { signal });
|
|
27
43
|
}
|
|
28
|
-
|
|
29
|
-
|
|
44
|
+
catch (error) {
|
|
45
|
+
if (signal?.aborted) {
|
|
46
|
+
throw new FetchError('Request was aborted during response read', url, 499, { reason: 'aborted' });
|
|
47
|
+
}
|
|
48
|
+
throw error;
|
|
49
|
+
}
|
|
50
|
+
return { text, size: total };
|
|
30
51
|
}
|
|
31
|
-
export async function readResponseText(response, url, maxBytes) {
|
|
52
|
+
export async function readResponseText(response, url, maxBytes, signal) {
|
|
32
53
|
assertContentLengthWithinLimit(response, url, maxBytes);
|
|
33
54
|
if (!response.body) {
|
|
34
55
|
const text = await response.text();
|
|
35
56
|
return { text, size: Buffer.byteLength(text) };
|
|
36
57
|
}
|
|
37
|
-
return readStreamWithLimit(response.body, url, maxBytes);
|
|
58
|
+
return readStreamWithLimit(response.body, url, maxBytes, signal);
|
|
38
59
|
}
|
|
@@ -1,4 +1,4 @@
|
|
|
1
1
|
import type { FetchOptions } from '../config/types/runtime.js';
|
|
2
2
|
import { destroyAgents } from './fetcher/agents.js';
|
|
3
3
|
export { destroyAgents };
|
|
4
|
-
export declare function
|
|
4
|
+
export declare function fetchNormalizedUrlWithRetry(normalizedUrl: string, options?: FetchOptions, maxRetries?: number): Promise<string>;
|
package/dist/services/fetcher.js
CHANGED
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
import { config } from '../config/index.js';
|
|
2
2
|
import { normalizeHeaderRecord } from '../utils/header-normalizer.js';
|
|
3
|
-
import { validateAndNormalizeUrl } from '../utils/url-validator.js';
|
|
4
3
|
import { destroyAgents, dispatcher } from './fetcher/agents.js';
|
|
5
4
|
import { createHttpError, createRateLimitError, mapFetchError, } from './fetcher/errors.js';
|
|
6
5
|
import { recordFetchError, recordFetchResponse, startFetchTelemetry, } from './fetcher/interceptors.js';
|
|
@@ -39,7 +38,7 @@ function buildRequestInit(headers, signal) {
|
|
|
39
38
|
dispatcher,
|
|
40
39
|
};
|
|
41
40
|
}
|
|
42
|
-
async function handleFetchResponse(response, finalUrl, telemetry) {
|
|
41
|
+
async function handleFetchResponse(response, finalUrl, telemetry, signal) {
|
|
43
42
|
if (response.status === 429) {
|
|
44
43
|
void response.body?.cancel();
|
|
45
44
|
throw createRateLimitError(finalUrl, response.headers.get('retry-after'));
|
|
@@ -48,7 +47,7 @@ async function handleFetchResponse(response, finalUrl, telemetry) {
|
|
|
48
47
|
void response.body?.cancel();
|
|
49
48
|
throw createHttpError(finalUrl, response.status, response.statusText);
|
|
50
49
|
}
|
|
51
|
-
const { text, size } = await readResponseText(response, finalUrl, config.fetcher.maxContentLength);
|
|
50
|
+
const { text, size } = await readResponseText(response, finalUrl, config.fetcher.maxContentLength, signal);
|
|
52
51
|
recordFetchResponse(telemetry, response, size);
|
|
53
52
|
return text;
|
|
54
53
|
}
|
|
@@ -57,7 +56,7 @@ async function fetchWithTelemetry(normalizedUrl, requestInit, timeoutMs) {
|
|
|
57
56
|
try {
|
|
58
57
|
const { response, url: finalUrl } = await fetchWithRedirects(normalizedUrl, requestInit, config.fetcher.maxRedirects);
|
|
59
58
|
telemetry.url = finalUrl;
|
|
60
|
-
return await handleFetchResponse(response, finalUrl, telemetry);
|
|
59
|
+
return await handleFetchResponse(response, finalUrl, telemetry, requestInit.signal ?? undefined);
|
|
61
60
|
}
|
|
62
61
|
catch (error) {
|
|
63
62
|
const mapped = mapFetchError(error, normalizedUrl, timeoutMs);
|
|
@@ -66,17 +65,19 @@ async function fetchWithTelemetry(normalizedUrl, requestInit, timeoutMs) {
|
|
|
66
65
|
throw mapped;
|
|
67
66
|
}
|
|
68
67
|
}
|
|
69
|
-
export async function
|
|
70
|
-
const normalizedUrl = await validateAndNormalizeUrl(url);
|
|
68
|
+
export async function fetchNormalizedUrlWithRetry(normalizedUrl, options, maxRetries = 3) {
|
|
71
69
|
const context = buildRequestContext(options);
|
|
72
70
|
return executeWithRetry(normalizedUrl, maxRetries, async () => runFetch(normalizedUrl, context), context.signal);
|
|
73
71
|
}
|
|
74
72
|
function buildRequestContext(options) {
|
|
75
|
-
|
|
73
|
+
const context = {
|
|
76
74
|
timeoutMs: options?.timeout ?? config.fetcher.timeout,
|
|
77
75
|
headers: buildHeaders(options?.customHeaders),
|
|
78
|
-
signal: options?.signal,
|
|
79
76
|
};
|
|
77
|
+
if (options?.signal) {
|
|
78
|
+
context.signal = options.signal;
|
|
79
|
+
}
|
|
80
|
+
return context;
|
|
80
81
|
}
|
|
81
82
|
async function runFetch(normalizedUrl, context) {
|
|
82
83
|
const signal = buildRequestSignal(context.timeoutMs, context.signal);
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
import type { ExtractedMetadata } from '../config/types/content.js';
|
|
2
|
+
export type MetaSource = 'og' | 'twitter' | 'standard';
|
|
3
|
+
export type MetaField = keyof ExtractedMetadata;
|
|
4
|
+
export interface MetaCollectorState {
|
|
5
|
+
title: Partial<Record<MetaSource, string>>;
|
|
6
|
+
description: Partial<Record<MetaSource, string>>;
|
|
7
|
+
author: Partial<Record<MetaSource, string>>;
|
|
8
|
+
}
|
|
9
|
+
export declare function createMetaCollectorState(): MetaCollectorState;
|
|
10
|
+
export declare function resolveMetaField(state: MetaCollectorState, field: MetaField): string | undefined;
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
export function createMetaCollectorState() {
|
|
2
|
+
return {
|
|
3
|
+
title: {},
|
|
4
|
+
description: {},
|
|
5
|
+
author: {},
|
|
6
|
+
};
|
|
7
|
+
}
|
|
8
|
+
export function resolveMetaField(state, field) {
|
|
9
|
+
const sources = state[field];
|
|
10
|
+
return sources.og ?? sources.twitter ?? sources.standard;
|
|
11
|
+
}
|
|
@@ -1,2 +1,6 @@
|
|
|
1
|
-
import type { ContentBlockUnion } from '../config/types/content.js';
|
|
1
|
+
import type { ContentBlockUnion, ExtractedMetadata } from '../config/types/content.js';
|
|
2
2
|
export declare function parseHtml(html: string): ContentBlockUnion[];
|
|
3
|
+
export declare function parseHtmlWithMetadata(html: string): {
|
|
4
|
+
blocks: ContentBlockUnion[];
|
|
5
|
+
metadata: ExtractedMetadata;
|
|
6
|
+
};
|
package/dist/services/parser.js
CHANGED
|
@@ -6,7 +6,57 @@ import { getErrorMessage } from '../utils/error-utils.js';
|
|
|
6
6
|
import { truncateHtml } from '../utils/html-truncator.js';
|
|
7
7
|
import { sanitizeText } from '../utils/sanitizer.js';
|
|
8
8
|
import { logWarn } from './logger.js';
|
|
9
|
+
import { createMetaCollectorState, resolveMetaField, } from './metadata-collector.js';
|
|
9
10
|
const CONTENT_SELECTOR = 'h1, h2, h3, h4, h5, h6, p, ul, ol, pre, code:not(pre code), table, img, blockquote';
|
|
11
|
+
function extractMetadata($) {
|
|
12
|
+
const state = createMetaCollectorState();
|
|
13
|
+
$('meta').each((_, element) => {
|
|
14
|
+
const content = $(element).attr('content')?.trim();
|
|
15
|
+
if (!content)
|
|
16
|
+
return;
|
|
17
|
+
const property = $(element).attr('property');
|
|
18
|
+
if (property?.startsWith('og:')) {
|
|
19
|
+
const key = property.replace('og:', '');
|
|
20
|
+
if (key === 'title')
|
|
21
|
+
state.title.og = content;
|
|
22
|
+
if (key === 'description')
|
|
23
|
+
state.description.og = content;
|
|
24
|
+
return;
|
|
25
|
+
}
|
|
26
|
+
const name = $(element).attr('name');
|
|
27
|
+
if (name?.startsWith('twitter:')) {
|
|
28
|
+
const key = name.replace('twitter:', '');
|
|
29
|
+
if (key === 'title')
|
|
30
|
+
state.title.twitter = content;
|
|
31
|
+
if (key === 'description')
|
|
32
|
+
state.description.twitter = content;
|
|
33
|
+
return;
|
|
34
|
+
}
|
|
35
|
+
if (name === 'description') {
|
|
36
|
+
state.description.standard = content;
|
|
37
|
+
}
|
|
38
|
+
if (name === 'author') {
|
|
39
|
+
state.author.standard = content;
|
|
40
|
+
}
|
|
41
|
+
});
|
|
42
|
+
if (!state.title.standard) {
|
|
43
|
+
const titleText = $('title').first().text().trim();
|
|
44
|
+
if (titleText) {
|
|
45
|
+
state.title.standard = titleText;
|
|
46
|
+
}
|
|
47
|
+
}
|
|
48
|
+
const metadata = {};
|
|
49
|
+
const title = resolveMetaField(state, 'title');
|
|
50
|
+
const description = resolveMetaField(state, 'description');
|
|
51
|
+
const author = resolveMetaField(state, 'author');
|
|
52
|
+
if (title !== undefined)
|
|
53
|
+
metadata.title = title;
|
|
54
|
+
if (description !== undefined)
|
|
55
|
+
metadata.description = description;
|
|
56
|
+
if (author !== undefined)
|
|
57
|
+
metadata.author = author;
|
|
58
|
+
return metadata;
|
|
59
|
+
}
|
|
10
60
|
function parseHeading($, element) {
|
|
11
61
|
const rawText = sanitizeText($(element).text());
|
|
12
62
|
const text = cleanHeading(rawText);
|
|
@@ -55,11 +105,14 @@ function parseCode($, element) {
|
|
|
55
105
|
const dataLang = $(element).attr('data-language') ?? '';
|
|
56
106
|
const language = resolveLanguageFromAttributes(className, dataLang) ??
|
|
57
107
|
detectLanguageFromCode(text);
|
|
58
|
-
|
|
108
|
+
const block = {
|
|
59
109
|
type: 'code',
|
|
60
|
-
language,
|
|
61
110
|
text,
|
|
62
111
|
};
|
|
112
|
+
if (language !== undefined) {
|
|
113
|
+
block.language = language;
|
|
114
|
+
}
|
|
115
|
+
return block;
|
|
63
116
|
}
|
|
64
117
|
function parseTable($, element) {
|
|
65
118
|
const headers = [];
|
|
@@ -90,21 +143,23 @@ function parseTable($, element) {
|
|
|
90
143
|
});
|
|
91
144
|
if (rows.length === 0)
|
|
92
145
|
return null;
|
|
93
|
-
return
|
|
94
|
-
type: 'table',
|
|
95
|
-
|
|
96
|
-
rows,
|
|
97
|
-
};
|
|
146
|
+
return headers.length > 0
|
|
147
|
+
? { type: 'table', headers, rows }
|
|
148
|
+
: { type: 'table', rows };
|
|
98
149
|
}
|
|
99
150
|
function parseImage($, element) {
|
|
100
151
|
const src = $(element).attr('src');
|
|
101
152
|
if (!src)
|
|
102
153
|
return null;
|
|
103
|
-
|
|
154
|
+
const alt = $(element).attr('alt');
|
|
155
|
+
const image = {
|
|
104
156
|
type: 'image',
|
|
105
157
|
src,
|
|
106
|
-
alt: $(element).attr('alt') ?? undefined,
|
|
107
158
|
};
|
|
159
|
+
if (alt !== undefined) {
|
|
160
|
+
image.alt = alt;
|
|
161
|
+
}
|
|
162
|
+
return image;
|
|
108
163
|
}
|
|
109
164
|
function parseBlockquote($, element) {
|
|
110
165
|
const rawText = sanitizeText($(element).text());
|
|
@@ -173,6 +228,10 @@ function loadHtml(html) {
|
|
|
173
228
|
return null;
|
|
174
229
|
}
|
|
175
230
|
}
|
|
231
|
+
function prepareCheerio(html) {
|
|
232
|
+
const processedHtml = truncateHtml(html);
|
|
233
|
+
return loadHtml(processedHtml);
|
|
234
|
+
}
|
|
176
235
|
function removeNoiseElements($) {
|
|
177
236
|
$('script, style, noscript, iframe, svg').remove();
|
|
178
237
|
}
|
|
@@ -198,10 +257,22 @@ function safeParseElement($, element) {
|
|
|
198
257
|
export function parseHtml(html) {
|
|
199
258
|
if (!html || typeof html !== 'string')
|
|
200
259
|
return [];
|
|
201
|
-
const
|
|
202
|
-
const $ = loadHtml(processedHtml);
|
|
260
|
+
const $ = prepareCheerio(html);
|
|
203
261
|
if (!$)
|
|
204
262
|
return [];
|
|
205
263
|
removeNoiseElements($);
|
|
206
264
|
return filterBlocks(collectBlocks($));
|
|
207
265
|
}
|
|
266
|
+
export function parseHtmlWithMetadata(html) {
|
|
267
|
+
if (!html || typeof html !== 'string') {
|
|
268
|
+
return { blocks: [], metadata: {} };
|
|
269
|
+
}
|
|
270
|
+
const $ = prepareCheerio(html);
|
|
271
|
+
if (!$) {
|
|
272
|
+
return { blocks: [], metadata: {} };
|
|
273
|
+
}
|
|
274
|
+
const metadata = extractMetadata($);
|
|
275
|
+
removeNoiseElements($);
|
|
276
|
+
const blocks = filterBlocks(collectBlocks($));
|
|
277
|
+
return { blocks, metadata };
|
|
278
|
+
}
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import type { JsonlTransformResult, MarkdownTransformResult, TransformOptions } from '../config/types/content.js';
|
|
2
|
+
type TransformMode = 'jsonl' | 'markdown' | 'markdown-blocks';
|
|
3
|
+
export interface TransformJob {
|
|
4
|
+
mode: TransformMode;
|
|
5
|
+
html: string;
|
|
6
|
+
url: string;
|
|
7
|
+
options: TransformOptions & {
|
|
8
|
+
includeContentBlocks?: boolean;
|
|
9
|
+
};
|
|
10
|
+
}
|
|
11
|
+
type TransformResult = JsonlTransformResult | MarkdownTransformResult;
|
|
12
|
+
export declare function runTransformInWorker(job: TransformJob): Promise<TransformResult | null>;
|
|
13
|
+
export declare function destroyTransformWorkers(): void;
|
|
14
|
+
export {};
|