@j0hanz/superfetch 1.2.1 → 1.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/http/download-routes.js +2 -1
- package/dist/http/server-middleware.js +4 -3
- package/dist/services/cache.d.ts +1 -0
- package/dist/services/cache.js +3 -0
- package/dist/services/context.d.ts +2 -0
- package/dist/services/context.js +10 -0
- package/dist/services/fetcher/agents.js +55 -1
- package/dist/services/fetcher/interceptors.js +44 -23
- package/dist/services/fetcher/response.d.ts +1 -1
- package/dist/services/fetcher/response.js +27 -14
- package/dist/services/fetcher.d.ts +1 -0
- package/dist/services/fetcher.js +6 -3
- package/dist/services/parser.d.ts +5 -1
- package/dist/services/parser.js +72 -2
- package/dist/tools/schemas.d.ts +29 -29
- package/dist/tools/schemas.js +21 -4
- package/dist/tools/utils/content-transform.js +31 -2
- package/dist/tools/utils/fetch-pipeline.js +8 -5
- package/dist/utils/url-validator.d.ts +5 -0
- package/dist/utils/url-validator.js +7 -3
- package/package.json +1 -1
|
@@ -80,7 +80,8 @@ export function resolveDownloadPayload(params, cacheEntry) {
|
|
|
80
80
|
const content = resolvePayloadContent(payload, params.namespace);
|
|
81
81
|
if (!content)
|
|
82
82
|
return null;
|
|
83
|
-
const
|
|
83
|
+
const safeTitle = typeof payload.title === 'string' ? payload.title : undefined;
|
|
84
|
+
const fileName = generateSafeFilename(cacheEntry.url, cacheEntry.title ?? safeTitle, params.hash, resolveExtension(params.namespace));
|
|
84
85
|
return {
|
|
85
86
|
content,
|
|
86
87
|
contentType: resolveContentType(params.namespace),
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { randomUUID } from 'node:crypto';
|
|
2
2
|
import { config } from '../config/index.js';
|
|
3
|
-
import {
|
|
3
|
+
import { bindToRequestContext, runWithRequestContext, } from '../services/context.js';
|
|
4
4
|
import { getSessionId } from './sessions.js';
|
|
5
5
|
const LOOPBACK_HOSTS = new Set(['localhost', '127.0.0.1', '::1']);
|
|
6
6
|
function normalizeHost(value) {
|
|
@@ -84,8 +84,9 @@ export function createContextMiddleware() {
|
|
|
84
84
|
return (req, _res, next) => {
|
|
85
85
|
const requestId = randomUUID();
|
|
86
86
|
const sessionId = getSessionId(req);
|
|
87
|
-
|
|
88
|
-
next
|
|
87
|
+
runWithRequestContext({ requestId, sessionId }, () => {
|
|
88
|
+
const boundNext = bindToRequestContext(next);
|
|
89
|
+
boundNext();
|
|
89
90
|
});
|
|
90
91
|
};
|
|
91
92
|
}
|
package/dist/services/cache.d.ts
CHANGED
|
@@ -18,4 +18,5 @@ export declare function onCacheUpdate(listener: CacheUpdateListener): () => void
|
|
|
18
18
|
export declare function get(cacheKey: string | null): CacheEntry | undefined;
|
|
19
19
|
export declare function set(cacheKey: string | null, content: string, metadata: CacheEntryMetadata): void;
|
|
20
20
|
export declare function keys(): string[];
|
|
21
|
+
export declare function isEnabled(): boolean;
|
|
21
22
|
export {};
|
package/dist/services/cache.js
CHANGED
|
@@ -175,6 +175,9 @@ export function set(cacheKey, content, metadata) {
|
|
|
175
175
|
export function keys() {
|
|
176
176
|
return Array.from(contentCache.keys());
|
|
177
177
|
}
|
|
178
|
+
export function isEnabled() {
|
|
179
|
+
return config.cache.enabled;
|
|
180
|
+
}
|
|
178
181
|
function buildCacheEntry(cacheKey, content, metadata) {
|
|
179
182
|
return {
|
|
180
183
|
url: metadata.url,
|
|
@@ -4,6 +4,8 @@ interface RequestContext {
|
|
|
4
4
|
sessionId?: string;
|
|
5
5
|
}
|
|
6
6
|
export declare const requestContext: AsyncLocalStorage<RequestContext>;
|
|
7
|
+
export declare function runWithRequestContext<T>(context: RequestContext, fn: () => T): T;
|
|
8
|
+
export declare function bindToRequestContext<T extends (...args: unknown[]) => unknown>(fn: T): T;
|
|
7
9
|
export declare function getRequestId(): string | undefined;
|
|
8
10
|
export declare function getSessionId(): string | undefined;
|
|
9
11
|
export {};
|
package/dist/services/context.js
CHANGED
|
@@ -1,5 +1,15 @@
|
|
|
1
1
|
import { AsyncLocalStorage } from 'node:async_hooks';
|
|
2
2
|
export const requestContext = new AsyncLocalStorage();
|
|
3
|
+
export function runWithRequestContext(context, fn) {
|
|
4
|
+
return requestContext.run(context, fn);
|
|
5
|
+
}
|
|
6
|
+
export function bindToRequestContext(fn) {
|
|
7
|
+
const store = requestContext.getStore();
|
|
8
|
+
if (!store) {
|
|
9
|
+
return fn;
|
|
10
|
+
}
|
|
11
|
+
return ((...args) => requestContext.run(store, () => fn(...args)));
|
|
12
|
+
}
|
|
3
13
|
export function getRequestId() {
|
|
4
14
|
return requestContext.getStore()?.requestId;
|
|
5
15
|
}
|
|
@@ -3,9 +3,26 @@ import os from 'node:os';
|
|
|
3
3
|
import { Agent } from 'undici';
|
|
4
4
|
import { createErrorWithCode } from '../../utils/error-utils.js';
|
|
5
5
|
import { isBlockedIp } from '../../utils/url-validator.js';
|
|
6
|
+
const DNS_LOOKUP_TIMEOUT_MS = 5000;
|
|
6
7
|
function resolveDns(hostname, options, callback) {
|
|
7
8
|
const { normalizedOptions, useAll, resolvedFamily } = buildLookupContext(options);
|
|
8
|
-
|
|
9
|
+
const lookupOptions = buildLookupOptions(normalizedOptions);
|
|
10
|
+
let done = false;
|
|
11
|
+
const timer = setTimeout(() => {
|
|
12
|
+
if (done)
|
|
13
|
+
return;
|
|
14
|
+
done = true;
|
|
15
|
+
callback(createErrorWithCode(`DNS lookup timed out for ${hostname}`, 'ETIMEOUT'), []);
|
|
16
|
+
}, DNS_LOOKUP_TIMEOUT_MS);
|
|
17
|
+
timer.unref();
|
|
18
|
+
const safeCallback = (err, address, family) => {
|
|
19
|
+
if (done)
|
|
20
|
+
return;
|
|
21
|
+
done = true;
|
|
22
|
+
clearTimeout(timer);
|
|
23
|
+
callback(err, address, family);
|
|
24
|
+
};
|
|
25
|
+
dns.lookup(hostname, lookupOptions, createLookupCallback(hostname, resolvedFamily, useAll, safeCallback));
|
|
9
26
|
}
|
|
10
27
|
function normalizeLookupOptions(options) {
|
|
11
28
|
return typeof options === 'number' ? { family: options } : options;
|
|
@@ -18,6 +35,29 @@ function buildLookupContext(options) {
|
|
|
18
35
|
resolvedFamily: resolveFamily(normalizedOptions.family),
|
|
19
36
|
};
|
|
20
37
|
}
|
|
38
|
+
const DEFAULT_DNS_ORDER = 'verbatim';
|
|
39
|
+
function resolveResultOrder(options) {
|
|
40
|
+
if (options.order)
|
|
41
|
+
return options.order;
|
|
42
|
+
const legacyVerbatim = getLegacyVerbatim(options);
|
|
43
|
+
if (legacyVerbatim !== undefined) {
|
|
44
|
+
return legacyVerbatim ? 'verbatim' : 'ipv4first';
|
|
45
|
+
}
|
|
46
|
+
return DEFAULT_DNS_ORDER;
|
|
47
|
+
}
|
|
48
|
+
function getLegacyVerbatim(options) {
|
|
49
|
+
const legacy = options.verbatim;
|
|
50
|
+
return typeof legacy === 'boolean' ? legacy : undefined;
|
|
51
|
+
}
|
|
52
|
+
function buildLookupOptions(normalizedOptions) {
|
|
53
|
+
const options = {
|
|
54
|
+
...normalizedOptions,
|
|
55
|
+
order: resolveResultOrder(normalizedOptions),
|
|
56
|
+
all: true,
|
|
57
|
+
};
|
|
58
|
+
delete options.verbatim;
|
|
59
|
+
return options;
|
|
60
|
+
}
|
|
21
61
|
function createLookupCallback(hostname, resolvedFamily, useAll, callback) {
|
|
22
62
|
return (err, addresses) => {
|
|
23
63
|
handleLookupResult(err, addresses, hostname, resolvedFamily, useAll, callback);
|
|
@@ -42,6 +82,11 @@ function handleLookupResult(error, addresses, hostname, resolvedFamily, useAll,
|
|
|
42
82
|
return;
|
|
43
83
|
}
|
|
44
84
|
const list = normalizeLookupResults(addresses, resolvedFamily);
|
|
85
|
+
const invalidFamilyError = findInvalidFamilyError(list, hostname);
|
|
86
|
+
if (invalidFamilyError) {
|
|
87
|
+
callback(invalidFamilyError, list);
|
|
88
|
+
return;
|
|
89
|
+
}
|
|
45
90
|
const blockedError = findBlockedIpError(list, hostname);
|
|
46
91
|
if (blockedError) {
|
|
47
92
|
callback(blockedError, list);
|
|
@@ -89,6 +134,15 @@ function findBlockedIpError(list, hostname) {
|
|
|
89
134
|
}
|
|
90
135
|
return null;
|
|
91
136
|
}
|
|
137
|
+
function findInvalidFamilyError(list, hostname) {
|
|
138
|
+
for (const addr of list) {
|
|
139
|
+
const family = typeof addr === 'string' ? 0 : addr.family;
|
|
140
|
+
if (family === 4 || family === 6)
|
|
141
|
+
continue;
|
|
142
|
+
return createErrorWithCode(`Invalid address family returned for ${hostname}`, 'EINVAL');
|
|
143
|
+
}
|
|
144
|
+
return null;
|
|
145
|
+
}
|
|
92
146
|
function createNoDnsResultsError(hostname) {
|
|
93
147
|
return createErrorWithCode(`No DNS results returned for ${hostname}`, 'ENODATA');
|
|
94
148
|
}
|
|
@@ -4,21 +4,44 @@ import { performance } from 'node:perf_hooks';
|
|
|
4
4
|
import { isSystemError } from '../../utils/error-utils.js';
|
|
5
5
|
import { logDebug, logError, logWarn } from '../logger.js';
|
|
6
6
|
const fetchChannel = diagnosticsChannel.channel('superfetch.fetch');
|
|
7
|
+
function redactUrl(rawUrl) {
|
|
8
|
+
try {
|
|
9
|
+
const url = new URL(rawUrl);
|
|
10
|
+
url.username = '';
|
|
11
|
+
url.password = '';
|
|
12
|
+
url.hash = '';
|
|
13
|
+
url.search = '';
|
|
14
|
+
return url.toString();
|
|
15
|
+
}
|
|
16
|
+
catch {
|
|
17
|
+
return rawUrl;
|
|
18
|
+
}
|
|
19
|
+
}
|
|
20
|
+
function publishFetchEvent(event) {
|
|
21
|
+
if (!fetchChannel.hasSubscribers)
|
|
22
|
+
return;
|
|
23
|
+
try {
|
|
24
|
+
fetchChannel.publish(event);
|
|
25
|
+
}
|
|
26
|
+
catch {
|
|
27
|
+
// Avoid crashing the publisher if a subscriber throws.
|
|
28
|
+
}
|
|
29
|
+
}
|
|
7
30
|
export function startFetchTelemetry(url, method) {
|
|
31
|
+
const safeUrl = redactUrl(url);
|
|
8
32
|
const context = {
|
|
9
33
|
requestId: randomUUID(),
|
|
10
34
|
startTime: performance.now(),
|
|
11
|
-
url,
|
|
35
|
+
url: safeUrl,
|
|
12
36
|
method: method.toUpperCase(),
|
|
13
37
|
};
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
}
|
|
38
|
+
publishFetchEvent({
|
|
39
|
+
v: 1,
|
|
40
|
+
type: 'start',
|
|
41
|
+
requestId: context.requestId,
|
|
42
|
+
method: context.method,
|
|
43
|
+
url: context.url,
|
|
44
|
+
});
|
|
22
45
|
logDebug('HTTP Request', {
|
|
23
46
|
requestId: context.requestId,
|
|
24
47
|
method: context.method,
|
|
@@ -38,9 +61,8 @@ export function recordFetchResponse(context, response, contentSize) {
|
|
|
38
61
|
logSlowRequestIfNeeded(context, duration);
|
|
39
62
|
}
|
|
40
63
|
function publishFetchEnd(context, status, duration) {
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
fetchChannel.publish({
|
|
64
|
+
publishFetchEvent({
|
|
65
|
+
v: 1,
|
|
44
66
|
type: 'end',
|
|
45
67
|
requestId: context.requestId,
|
|
46
68
|
status,
|
|
@@ -69,17 +91,16 @@ export function recordFetchError(context, error, status) {
|
|
|
69
91
|
const duration = performance.now() - context.startTime;
|
|
70
92
|
const err = error instanceof Error ? error : new Error(String(error));
|
|
71
93
|
const code = isSystemError(err) ? err.code : undefined;
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
}
|
|
94
|
+
publishFetchEvent({
|
|
95
|
+
v: 1,
|
|
96
|
+
type: 'error',
|
|
97
|
+
requestId: context.requestId,
|
|
98
|
+
url: context.url,
|
|
99
|
+
error: err.message,
|
|
100
|
+
code,
|
|
101
|
+
status,
|
|
102
|
+
duration,
|
|
103
|
+
});
|
|
83
104
|
const log = status === 429 ? logWarn : logError;
|
|
84
105
|
log('HTTP Request Error', {
|
|
85
106
|
requestId: context.requestId,
|
|
@@ -9,30 +9,43 @@ function assertContentLengthWithinLimit(response, url, maxBytes) {
|
|
|
9
9
|
}
|
|
10
10
|
throw new FetchError(`Response exceeds maximum size of ${maxBytes} bytes`, url);
|
|
11
11
|
}
|
|
12
|
-
|
|
12
|
+
function throwIfReadAborted(url, signal) {
|
|
13
|
+
if (!signal?.aborted)
|
|
14
|
+
return;
|
|
15
|
+
throw new FetchError('Request was aborted during response read', url, 499, {
|
|
16
|
+
reason: 'aborted',
|
|
17
|
+
});
|
|
18
|
+
}
|
|
19
|
+
async function readStreamWithLimit(stream, url, maxBytes, signal) {
|
|
13
20
|
const reader = stream.getReader();
|
|
14
21
|
const decoder = new TextDecoder();
|
|
15
22
|
let total = 0;
|
|
16
23
|
const chunks = [];
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
24
|
+
try {
|
|
25
|
+
for (;;) {
|
|
26
|
+
throwIfReadAborted(url, signal);
|
|
27
|
+
const { value, done } = await reader.read();
|
|
28
|
+
if (done)
|
|
29
|
+
break;
|
|
30
|
+
total += value.byteLength;
|
|
31
|
+
if (total > maxBytes) {
|
|
32
|
+
await reader.cancel();
|
|
33
|
+
throw new FetchError(`Response exceeds maximum size of ${maxBytes} bytes`, url);
|
|
34
|
+
}
|
|
35
|
+
chunks.push(decoder.decode(value, { stream: true }));
|
|
25
36
|
}
|
|
26
|
-
chunks.push(decoder.decode(
|
|
37
|
+
chunks.push(decoder.decode());
|
|
38
|
+
return { text: chunks.join(''), size: total };
|
|
39
|
+
}
|
|
40
|
+
finally {
|
|
41
|
+
reader.releaseLock();
|
|
27
42
|
}
|
|
28
|
-
chunks.push(decoder.decode());
|
|
29
|
-
return { text: chunks.join(''), size: total };
|
|
30
43
|
}
|
|
31
|
-
export async function readResponseText(response, url, maxBytes) {
|
|
44
|
+
export async function readResponseText(response, url, maxBytes, signal) {
|
|
32
45
|
assertContentLengthWithinLimit(response, url, maxBytes);
|
|
33
46
|
if (!response.body) {
|
|
34
47
|
const text = await response.text();
|
|
35
48
|
return { text, size: Buffer.byteLength(text) };
|
|
36
49
|
}
|
|
37
|
-
return readStreamWithLimit(response.body, url, maxBytes);
|
|
50
|
+
return readStreamWithLimit(response.body, url, maxBytes, signal);
|
|
38
51
|
}
|
|
@@ -2,3 +2,4 @@ import type { FetchOptions } from '../config/types/runtime.js';
|
|
|
2
2
|
import { destroyAgents } from './fetcher/agents.js';
|
|
3
3
|
export { destroyAgents };
|
|
4
4
|
export declare function fetchUrlWithRetry(url: string, options?: FetchOptions, maxRetries?: number): Promise<string>;
|
|
5
|
+
export declare function fetchNormalizedUrlWithRetry(normalizedUrl: string, options?: FetchOptions, maxRetries?: number): Promise<string>;
|
package/dist/services/fetcher.js
CHANGED
|
@@ -39,7 +39,7 @@ function buildRequestInit(headers, signal) {
|
|
|
39
39
|
dispatcher,
|
|
40
40
|
};
|
|
41
41
|
}
|
|
42
|
-
async function handleFetchResponse(response, finalUrl, telemetry) {
|
|
42
|
+
async function handleFetchResponse(response, finalUrl, telemetry, signal) {
|
|
43
43
|
if (response.status === 429) {
|
|
44
44
|
void response.body?.cancel();
|
|
45
45
|
throw createRateLimitError(finalUrl, response.headers.get('retry-after'));
|
|
@@ -48,7 +48,7 @@ async function handleFetchResponse(response, finalUrl, telemetry) {
|
|
|
48
48
|
void response.body?.cancel();
|
|
49
49
|
throw createHttpError(finalUrl, response.status, response.statusText);
|
|
50
50
|
}
|
|
51
|
-
const { text, size } = await readResponseText(response, finalUrl, config.fetcher.maxContentLength);
|
|
51
|
+
const { text, size } = await readResponseText(response, finalUrl, config.fetcher.maxContentLength, signal);
|
|
52
52
|
recordFetchResponse(telemetry, response, size);
|
|
53
53
|
return text;
|
|
54
54
|
}
|
|
@@ -57,7 +57,7 @@ async function fetchWithTelemetry(normalizedUrl, requestInit, timeoutMs) {
|
|
|
57
57
|
try {
|
|
58
58
|
const { response, url: finalUrl } = await fetchWithRedirects(normalizedUrl, requestInit, config.fetcher.maxRedirects);
|
|
59
59
|
telemetry.url = finalUrl;
|
|
60
|
-
return await handleFetchResponse(response, finalUrl, telemetry);
|
|
60
|
+
return await handleFetchResponse(response, finalUrl, telemetry, requestInit.signal ?? undefined);
|
|
61
61
|
}
|
|
62
62
|
catch (error) {
|
|
63
63
|
const mapped = mapFetchError(error, normalizedUrl, timeoutMs);
|
|
@@ -68,6 +68,9 @@ async function fetchWithTelemetry(normalizedUrl, requestInit, timeoutMs) {
|
|
|
68
68
|
}
|
|
69
69
|
export async function fetchUrlWithRetry(url, options, maxRetries = 3) {
|
|
70
70
|
const normalizedUrl = await validateAndNormalizeUrl(url);
|
|
71
|
+
return fetchNormalizedUrlWithRetry(normalizedUrl, options, maxRetries);
|
|
72
|
+
}
|
|
73
|
+
export async function fetchNormalizedUrlWithRetry(normalizedUrl, options, maxRetries = 3) {
|
|
71
74
|
const context = buildRequestContext(options);
|
|
72
75
|
return executeWithRetry(normalizedUrl, maxRetries, async () => runFetch(normalizedUrl, context), context.signal);
|
|
73
76
|
}
|
|
@@ -1,2 +1,6 @@
|
|
|
1
|
-
import type { ContentBlockUnion } from '../config/types/content.js';
|
|
1
|
+
import type { ContentBlockUnion, ExtractedMetadata } from '../config/types/content.js';
|
|
2
2
|
export declare function parseHtml(html: string): ContentBlockUnion[];
|
|
3
|
+
export declare function parseHtmlWithMetadata(html: string): {
|
|
4
|
+
blocks: ContentBlockUnion[];
|
|
5
|
+
metadata: ExtractedMetadata;
|
|
6
|
+
};
|
package/dist/services/parser.js
CHANGED
|
@@ -7,6 +7,60 @@ import { truncateHtml } from '../utils/html-truncator.js';
|
|
|
7
7
|
import { sanitizeText } from '../utils/sanitizer.js';
|
|
8
8
|
import { logWarn } from './logger.js';
|
|
9
9
|
const CONTENT_SELECTOR = 'h1, h2, h3, h4, h5, h6, p, ul, ol, pre, code:not(pre code), table, img, blockquote';
|
|
10
|
+
function createMetaCollectorState() {
|
|
11
|
+
return {
|
|
12
|
+
title: {},
|
|
13
|
+
description: {},
|
|
14
|
+
author: {},
|
|
15
|
+
};
|
|
16
|
+
}
|
|
17
|
+
function resolveMetaField(state, field) {
|
|
18
|
+
const sources = state[field];
|
|
19
|
+
return sources.og ?? sources.twitter ?? sources.standard;
|
|
20
|
+
}
|
|
21
|
+
function extractMetadata($) {
|
|
22
|
+
const state = createMetaCollectorState();
|
|
23
|
+
$('meta').each((_, element) => {
|
|
24
|
+
const content = $(element).attr('content')?.trim();
|
|
25
|
+
if (!content)
|
|
26
|
+
return;
|
|
27
|
+
const property = $(element).attr('property');
|
|
28
|
+
if (property?.startsWith('og:')) {
|
|
29
|
+
const key = property.replace('og:', '');
|
|
30
|
+
if (key === 'title')
|
|
31
|
+
state.title.og = content;
|
|
32
|
+
if (key === 'description')
|
|
33
|
+
state.description.og = content;
|
|
34
|
+
return;
|
|
35
|
+
}
|
|
36
|
+
const name = $(element).attr('name');
|
|
37
|
+
if (name?.startsWith('twitter:')) {
|
|
38
|
+
const key = name.replace('twitter:', '');
|
|
39
|
+
if (key === 'title')
|
|
40
|
+
state.title.twitter = content;
|
|
41
|
+
if (key === 'description')
|
|
42
|
+
state.description.twitter = content;
|
|
43
|
+
return;
|
|
44
|
+
}
|
|
45
|
+
if (name === 'description') {
|
|
46
|
+
state.description.standard = content;
|
|
47
|
+
}
|
|
48
|
+
if (name === 'author') {
|
|
49
|
+
state.author.standard = content;
|
|
50
|
+
}
|
|
51
|
+
});
|
|
52
|
+
if (!state.title.standard) {
|
|
53
|
+
const titleText = $('title').first().text().trim();
|
|
54
|
+
if (titleText) {
|
|
55
|
+
state.title.standard = titleText;
|
|
56
|
+
}
|
|
57
|
+
}
|
|
58
|
+
return {
|
|
59
|
+
title: resolveMetaField(state, 'title'),
|
|
60
|
+
description: resolveMetaField(state, 'description'),
|
|
61
|
+
author: resolveMetaField(state, 'author'),
|
|
62
|
+
};
|
|
63
|
+
}
|
|
10
64
|
function parseHeading($, element) {
|
|
11
65
|
const rawText = sanitizeText($(element).text());
|
|
12
66
|
const text = cleanHeading(rawText);
|
|
@@ -173,6 +227,10 @@ function loadHtml(html) {
|
|
|
173
227
|
return null;
|
|
174
228
|
}
|
|
175
229
|
}
|
|
230
|
+
function prepareCheerio(html) {
|
|
231
|
+
const processedHtml = truncateHtml(html);
|
|
232
|
+
return loadHtml(processedHtml);
|
|
233
|
+
}
|
|
176
234
|
function removeNoiseElements($) {
|
|
177
235
|
$('script, style, noscript, iframe, svg').remove();
|
|
178
236
|
}
|
|
@@ -198,10 +256,22 @@ function safeParseElement($, element) {
|
|
|
198
256
|
export function parseHtml(html) {
|
|
199
257
|
if (!html || typeof html !== 'string')
|
|
200
258
|
return [];
|
|
201
|
-
const
|
|
202
|
-
const $ = loadHtml(processedHtml);
|
|
259
|
+
const $ = prepareCheerio(html);
|
|
203
260
|
if (!$)
|
|
204
261
|
return [];
|
|
205
262
|
removeNoiseElements($);
|
|
206
263
|
return filterBlocks(collectBlocks($));
|
|
207
264
|
}
|
|
265
|
+
export function parseHtmlWithMetadata(html) {
|
|
266
|
+
if (!html || typeof html !== 'string') {
|
|
267
|
+
return { blocks: [], metadata: {} };
|
|
268
|
+
}
|
|
269
|
+
const $ = prepareCheerio(html);
|
|
270
|
+
if (!$) {
|
|
271
|
+
return { blocks: [], metadata: {} };
|
|
272
|
+
}
|
|
273
|
+
const metadata = extractMetadata($);
|
|
274
|
+
removeNoiseElements($);
|
|
275
|
+
const blocks = filterBlocks(collectBlocks($));
|
|
276
|
+
return { blocks, metadata };
|
|
277
|
+
}
|
package/dist/tools/schemas.d.ts
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { z } from 'zod';
|
|
2
2
|
export declare const fetchUrlInputSchema: z.ZodObject<{
|
|
3
|
-
customHeaders: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString
|
|
3
|
+
customHeaders: z.ZodOptional<z.ZodEffects<z.ZodRecord<z.ZodString, z.ZodString>, Record<string, string>, Record<string, string>>>;
|
|
4
4
|
timeout: z.ZodDefault<z.ZodNumber>;
|
|
5
5
|
retries: z.ZodDefault<z.ZodNumber>;
|
|
6
6
|
} & {
|
|
@@ -13,25 +13,25 @@ export declare const fetchUrlInputSchema: z.ZodObject<{
|
|
|
13
13
|
format: z.ZodDefault<z.ZodEnum<["jsonl", "markdown"]>>;
|
|
14
14
|
}, "strict", z.ZodTypeAny, {
|
|
15
15
|
url: string;
|
|
16
|
+
timeout: number;
|
|
17
|
+
retries: number;
|
|
16
18
|
extractMainContent: boolean;
|
|
17
19
|
includeMetadata: boolean;
|
|
18
|
-
retries: number;
|
|
19
20
|
format: "jsonl" | "markdown";
|
|
20
|
-
timeout: number;
|
|
21
21
|
customHeaders?: Record<string, string> | undefined;
|
|
22
22
|
maxContentLength?: number | undefined;
|
|
23
23
|
}, {
|
|
24
24
|
url: string;
|
|
25
25
|
customHeaders?: Record<string, string> | undefined;
|
|
26
|
+
timeout?: number | undefined;
|
|
27
|
+
retries?: number | undefined;
|
|
26
28
|
extractMainContent?: boolean | undefined;
|
|
27
29
|
includeMetadata?: boolean | undefined;
|
|
28
|
-
retries?: number | undefined;
|
|
29
|
-
format?: "jsonl" | "markdown" | undefined;
|
|
30
30
|
maxContentLength?: number | undefined;
|
|
31
|
-
|
|
31
|
+
format?: "jsonl" | "markdown" | undefined;
|
|
32
32
|
}>;
|
|
33
33
|
export declare const fetchMarkdownInputSchema: z.ZodObject<{
|
|
34
|
-
customHeaders: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString
|
|
34
|
+
customHeaders: z.ZodOptional<z.ZodEffects<z.ZodRecord<z.ZodString, z.ZodString>, Record<string, string>, Record<string, string>>>;
|
|
35
35
|
timeout: z.ZodDefault<z.ZodNumber>;
|
|
36
36
|
retries: z.ZodDefault<z.ZodNumber>;
|
|
37
37
|
} & {
|
|
@@ -42,20 +42,20 @@ export declare const fetchMarkdownInputSchema: z.ZodObject<{
|
|
|
42
42
|
maxContentLength: z.ZodOptional<z.ZodNumber>;
|
|
43
43
|
}, "strict", z.ZodTypeAny, {
|
|
44
44
|
url: string;
|
|
45
|
+
timeout: number;
|
|
46
|
+
retries: number;
|
|
45
47
|
extractMainContent: boolean;
|
|
46
48
|
includeMetadata: boolean;
|
|
47
|
-
retries: number;
|
|
48
|
-
timeout: number;
|
|
49
49
|
customHeaders?: Record<string, string> | undefined;
|
|
50
50
|
maxContentLength?: number | undefined;
|
|
51
51
|
}, {
|
|
52
52
|
url: string;
|
|
53
53
|
customHeaders?: Record<string, string> | undefined;
|
|
54
|
+
timeout?: number | undefined;
|
|
55
|
+
retries?: number | undefined;
|
|
54
56
|
extractMainContent?: boolean | undefined;
|
|
55
57
|
includeMetadata?: boolean | undefined;
|
|
56
|
-
retries?: number | undefined;
|
|
57
58
|
maxContentLength?: number | undefined;
|
|
58
|
-
timeout?: number | undefined;
|
|
59
59
|
}>;
|
|
60
60
|
export declare const fetchUrlOutputSchema: z.ZodObject<{
|
|
61
61
|
url: z.ZodString;
|
|
@@ -74,31 +74,31 @@ export declare const fetchUrlOutputSchema: z.ZodObject<{
|
|
|
74
74
|
errorCode: z.ZodOptional<z.ZodString>;
|
|
75
75
|
}, "strict", z.ZodTypeAny, {
|
|
76
76
|
url: string;
|
|
77
|
+
format: "jsonl" | "markdown";
|
|
77
78
|
contentBlocks: number;
|
|
78
79
|
fetchedAt: string;
|
|
79
|
-
format: "jsonl" | "markdown";
|
|
80
80
|
cached: boolean;
|
|
81
81
|
error?: string | undefined;
|
|
82
|
-
content?: string | undefined;
|
|
83
82
|
title?: string | undefined;
|
|
84
|
-
|
|
83
|
+
content?: string | undefined;
|
|
84
|
+
contentSize?: number | undefined;
|
|
85
85
|
resourceUri?: string | undefined;
|
|
86
86
|
resourceMimeType?: string | undefined;
|
|
87
|
-
|
|
87
|
+
truncated?: boolean | undefined;
|
|
88
88
|
errorCode?: string | undefined;
|
|
89
89
|
}, {
|
|
90
90
|
url: string;
|
|
91
|
+
format: "jsonl" | "markdown";
|
|
91
92
|
contentBlocks: number;
|
|
92
93
|
fetchedAt: string;
|
|
93
|
-
format: "jsonl" | "markdown";
|
|
94
94
|
cached: boolean;
|
|
95
95
|
error?: string | undefined;
|
|
96
|
-
content?: string | undefined;
|
|
97
96
|
title?: string | undefined;
|
|
98
|
-
|
|
97
|
+
content?: string | undefined;
|
|
98
|
+
contentSize?: number | undefined;
|
|
99
99
|
resourceUri?: string | undefined;
|
|
100
100
|
resourceMimeType?: string | undefined;
|
|
101
|
-
|
|
101
|
+
truncated?: boolean | undefined;
|
|
102
102
|
errorCode?: string | undefined;
|
|
103
103
|
}>;
|
|
104
104
|
export declare const fetchMarkdownOutputSchema: z.ZodObject<{
|
|
@@ -111,13 +111,13 @@ export declare const fetchMarkdownOutputSchema: z.ZodObject<{
|
|
|
111
111
|
fileName: z.ZodString;
|
|
112
112
|
expiresAt: z.ZodString;
|
|
113
113
|
}, "strip", z.ZodTypeAny, {
|
|
114
|
+
downloadUrl: string;
|
|
114
115
|
fileName: string;
|
|
115
116
|
expiresAt: string;
|
|
116
|
-
downloadUrl: string;
|
|
117
117
|
}, {
|
|
118
|
+
downloadUrl: string;
|
|
118
119
|
fileName: string;
|
|
119
120
|
expiresAt: string;
|
|
120
|
-
downloadUrl: string;
|
|
121
121
|
}>>;
|
|
122
122
|
} & {
|
|
123
123
|
contentSize: z.ZodOptional<z.ZodNumber>;
|
|
@@ -134,16 +134,16 @@ export declare const fetchMarkdownOutputSchema: z.ZodObject<{
|
|
|
134
134
|
error?: string | undefined;
|
|
135
135
|
markdown?: string | undefined;
|
|
136
136
|
title?: string | undefined;
|
|
137
|
-
|
|
137
|
+
contentSize?: number | undefined;
|
|
138
138
|
resourceUri?: string | undefined;
|
|
139
139
|
resourceMimeType?: string | undefined;
|
|
140
|
-
|
|
140
|
+
truncated?: boolean | undefined;
|
|
141
|
+
errorCode?: string | undefined;
|
|
141
142
|
file?: {
|
|
143
|
+
downloadUrl: string;
|
|
142
144
|
fileName: string;
|
|
143
145
|
expiresAt: string;
|
|
144
|
-
downloadUrl: string;
|
|
145
146
|
} | undefined;
|
|
146
|
-
errorCode?: string | undefined;
|
|
147
147
|
}, {
|
|
148
148
|
url: string;
|
|
149
149
|
fetchedAt: string;
|
|
@@ -151,14 +151,14 @@ export declare const fetchMarkdownOutputSchema: z.ZodObject<{
|
|
|
151
151
|
error?: string | undefined;
|
|
152
152
|
markdown?: string | undefined;
|
|
153
153
|
title?: string | undefined;
|
|
154
|
-
|
|
154
|
+
contentSize?: number | undefined;
|
|
155
155
|
resourceUri?: string | undefined;
|
|
156
156
|
resourceMimeType?: string | undefined;
|
|
157
|
-
|
|
157
|
+
truncated?: boolean | undefined;
|
|
158
|
+
errorCode?: string | undefined;
|
|
158
159
|
file?: {
|
|
160
|
+
downloadUrl: string;
|
|
159
161
|
fileName: string;
|
|
160
162
|
expiresAt: string;
|
|
161
|
-
downloadUrl: string;
|
|
162
163
|
} | undefined;
|
|
163
|
-
errorCode?: string | undefined;
|
|
164
164
|
}>;
|
package/dist/tools/schemas.js
CHANGED
|
@@ -1,8 +1,16 @@
|
|
|
1
1
|
import { z } from 'zod';
|
|
2
2
|
import { config } from '../config/index.js';
|
|
3
|
+
const MAX_HEADER_NAME_LENGTH = 128;
|
|
4
|
+
const MAX_HEADER_VALUE_LENGTH = 2048;
|
|
5
|
+
const MAX_HEADER_COUNT = 50;
|
|
6
|
+
const MAX_CONTENT_LENGTH = config.constants.maxContentSize;
|
|
7
|
+
const customHeadersSchema = z
|
|
8
|
+
.record(z.string().max(MAX_HEADER_NAME_LENGTH), z.string().max(MAX_HEADER_VALUE_LENGTH))
|
|
9
|
+
.refine((headers) => Object.keys(headers).length <= MAX_HEADER_COUNT, {
|
|
10
|
+
message: `customHeaders must have at most ${MAX_HEADER_COUNT} entries`,
|
|
11
|
+
});
|
|
3
12
|
const requestOptionsSchema = z.object({
|
|
4
|
-
customHeaders:
|
|
5
|
-
.record(z.string())
|
|
13
|
+
customHeaders: customHeadersSchema
|
|
6
14
|
.optional()
|
|
7
15
|
.describe('Custom HTTP headers for the request'),
|
|
8
16
|
timeout: z
|
|
@@ -30,6 +38,7 @@ const extractionOptionsSchema = z.object({
|
|
|
30
38
|
maxContentLength: z
|
|
31
39
|
.number()
|
|
32
40
|
.positive()
|
|
41
|
+
.max(MAX_CONTENT_LENGTH)
|
|
33
42
|
.optional()
|
|
34
43
|
.describe('Maximum content length in characters'),
|
|
35
44
|
});
|
|
@@ -64,14 +73,22 @@ const fileDownloadSchema = z.object({
|
|
|
64
73
|
});
|
|
65
74
|
export const fetchUrlInputSchema = requestOptionsSchema
|
|
66
75
|
.extend({
|
|
67
|
-
url: z
|
|
76
|
+
url: z
|
|
77
|
+
.string()
|
|
78
|
+
.min(1)
|
|
79
|
+
.max(config.constants.maxUrlLength)
|
|
80
|
+
.describe('The URL to fetch'),
|
|
68
81
|
})
|
|
69
82
|
.merge(extractionOptionsSchema)
|
|
70
83
|
.merge(formatOptionsSchema)
|
|
71
84
|
.strict();
|
|
72
85
|
export const fetchMarkdownInputSchema = requestOptionsSchema
|
|
73
86
|
.extend({
|
|
74
|
-
url: z
|
|
87
|
+
url: z
|
|
88
|
+
.string()
|
|
89
|
+
.min(1)
|
|
90
|
+
.max(config.constants.maxUrlLength)
|
|
91
|
+
.describe('The URL to fetch'),
|
|
75
92
|
})
|
|
76
93
|
.merge(extractionOptionsSchema)
|
|
77
94
|
.strict();
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { TRUNCATION_MARKER } from '../../config/formatting.js';
|
|
2
2
|
import { extractContent } from '../../services/extractor.js';
|
|
3
|
-
import { parseHtml } from '../../services/parser.js';
|
|
3
|
+
import { parseHtml, parseHtmlWithMetadata } from '../../services/parser.js';
|
|
4
4
|
import { sanitizeText } from '../../utils/sanitizer.js';
|
|
5
5
|
import { toJsonl } from '../../transformers/jsonl.transformer.js';
|
|
6
6
|
import { htmlToMarkdown } from '../../transformers/markdown.transformer.js';
|
|
@@ -56,7 +56,10 @@ function decodeHtmlEntities(value) {
|
|
|
56
56
|
}
|
|
57
57
|
function buildJsonlPayload(context, maxContentLength) {
|
|
58
58
|
const contentBlocks = parseHtml(context.sourceHtml);
|
|
59
|
-
|
|
59
|
+
return buildJsonlPayloadFromBlocks(contentBlocks, context.metadata, maxContentLength);
|
|
60
|
+
}
|
|
61
|
+
function buildJsonlPayloadFromBlocks(contentBlocks, metadata, maxContentLength) {
|
|
62
|
+
const { content, truncated } = truncateContent(toJsonl(contentBlocks, metadata), maxContentLength);
|
|
60
63
|
return {
|
|
61
64
|
content,
|
|
62
65
|
contentBlocks: contentBlocks.length,
|
|
@@ -69,6 +72,17 @@ function buildMarkdownPayload(context, maxContentLength) {
|
|
|
69
72
|
return { content, truncated };
|
|
70
73
|
}
|
|
71
74
|
export function transformHtmlToJsonl(html, url, options) {
|
|
75
|
+
if (!options.extractMainContent && options.includeMetadata) {
|
|
76
|
+
const parsed = parseHtmlWithMetadata(html);
|
|
77
|
+
const metadataBlock = createContentMetadataBlock(url, null, parsed.metadata, false, true);
|
|
78
|
+
const { content, contentBlocks, truncated } = buildJsonlPayloadFromBlocks(parsed.blocks, metadataBlock, options.maxContentLength);
|
|
79
|
+
return {
|
|
80
|
+
content,
|
|
81
|
+
contentBlocks,
|
|
82
|
+
title: parsed.metadata.title,
|
|
83
|
+
...(truncated && { truncated }),
|
|
84
|
+
};
|
|
85
|
+
}
|
|
72
86
|
const context = resolveContentSource(html, url, options);
|
|
73
87
|
const { content, contentBlocks, truncated } = buildJsonlPayload(context, options.maxContentLength);
|
|
74
88
|
return {
|
|
@@ -88,6 +102,21 @@ export function transformHtmlToMarkdown(html, url, options) {
|
|
|
88
102
|
};
|
|
89
103
|
}
|
|
90
104
|
export function transformHtmlToMarkdownWithBlocks(html, url, options) {
|
|
105
|
+
if (!options.extractMainContent && options.includeMetadata) {
|
|
106
|
+
const parsed = parseHtmlWithMetadata(html);
|
|
107
|
+
const context = {
|
|
108
|
+
sourceHtml: html,
|
|
109
|
+
title: parsed.metadata.title,
|
|
110
|
+
metadata: createContentMetadataBlock(url, null, parsed.metadata, false, true),
|
|
111
|
+
};
|
|
112
|
+
const { content, truncated } = buildMarkdownPayload(context, options.maxContentLength);
|
|
113
|
+
return {
|
|
114
|
+
content,
|
|
115
|
+
contentBlocks: parsed.blocks.length,
|
|
116
|
+
title: context.title,
|
|
117
|
+
...(truncated && { truncated }),
|
|
118
|
+
};
|
|
119
|
+
}
|
|
91
120
|
const context = resolveContentSource(html, url, options);
|
|
92
121
|
const contentBlocks = parseHtml(context.sourceHtml);
|
|
93
122
|
const { content, truncated } = buildMarkdownPayload(context, options.maxContentLength);
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import * as cache from '../../services/cache.js';
|
|
2
|
-
import {
|
|
2
|
+
import { fetchNormalizedUrlWithRetry } from '../../services/fetcher.js';
|
|
3
3
|
import { logDebug } from '../../services/logger.js';
|
|
4
|
-
import {
|
|
4
|
+
import { assertResolvedAddressesAllowed, normalizeUrl, } from '../../utils/url-validator.js';
|
|
5
5
|
import { appendHeaderVary } from './cache-vary.js';
|
|
6
6
|
function attemptCacheRetrieval(cacheKey, deserialize, cacheNamespace, normalizedUrl) {
|
|
7
7
|
if (!cacheKey)
|
|
@@ -42,16 +42,19 @@ function attemptCacheRetrieval(cacheKey, deserialize, cacheNamespace, normalized
|
|
|
42
42
|
* @returns Promise resolving to the pipeline result
|
|
43
43
|
*/
|
|
44
44
|
export async function executeFetchPipeline(options) {
|
|
45
|
-
const normalizedUrl =
|
|
45
|
+
const { normalizedUrl, hostname } = normalizeUrl(options.url);
|
|
46
46
|
const cacheKey = resolveCacheKey(options, normalizedUrl);
|
|
47
47
|
const cachedResult = attemptCacheRetrieval(cacheKey, options.deserialize, options.cacheNamespace, normalizedUrl);
|
|
48
48
|
if (cachedResult)
|
|
49
49
|
return cachedResult;
|
|
50
|
+
await assertResolvedAddressesAllowed(hostname);
|
|
50
51
|
const fetchOptions = buildFetchOptions(options);
|
|
51
52
|
logDebug('Fetching URL', { url: normalizedUrl, retries: options.retries });
|
|
52
|
-
const html = await
|
|
53
|
+
const html = await fetchNormalizedUrlWithRetry(normalizedUrl, fetchOptions, options.retries);
|
|
53
54
|
const data = options.transform(html, normalizedUrl);
|
|
54
|
-
|
|
55
|
+
if (cache.isEnabled()) {
|
|
56
|
+
persistCache(cacheKey, data, options.serialize, normalizedUrl);
|
|
57
|
+
}
|
|
55
58
|
return buildPipelineResult(normalizedUrl, data, cacheKey);
|
|
56
59
|
}
|
|
57
60
|
function resolveCacheKey(options, normalizedUrl) {
|
|
@@ -1,2 +1,7 @@
|
|
|
1
1
|
export declare function isBlockedIp(ip: string): boolean;
|
|
2
|
+
export declare function assertResolvedAddressesAllowed(hostname: string): Promise<void>;
|
|
3
|
+
export declare function normalizeUrl(urlString: string): {
|
|
4
|
+
normalizedUrl: string;
|
|
5
|
+
hostname: string;
|
|
6
|
+
};
|
|
2
7
|
export declare function validateAndNormalizeUrl(urlString: string): Promise<string>;
|
|
@@ -78,7 +78,7 @@ function lookupWithTimeout(hostname) {
|
|
|
78
78
|
});
|
|
79
79
|
});
|
|
80
80
|
}
|
|
81
|
-
async function assertResolvedAddressesAllowed(hostname) {
|
|
81
|
+
export async function assertResolvedAddressesAllowed(hostname) {
|
|
82
82
|
try {
|
|
83
83
|
const result = await lookupWithTimeout(hostname);
|
|
84
84
|
const addresses = Array.isArray(result) ? result : [result];
|
|
@@ -102,7 +102,7 @@ async function assertResolvedAddressesAllowed(hostname) {
|
|
|
102
102
|
throw createValidationError(String(error));
|
|
103
103
|
}
|
|
104
104
|
}
|
|
105
|
-
export
|
|
105
|
+
export function normalizeUrl(urlString) {
|
|
106
106
|
const trimmedUrl = requireTrimmedUrl(urlString);
|
|
107
107
|
assertUrlLength(trimmedUrl);
|
|
108
108
|
const url = parseUrl(trimmedUrl);
|
|
@@ -110,8 +110,12 @@ export async function validateAndNormalizeUrl(urlString) {
|
|
|
110
110
|
assertNoCredentials(url);
|
|
111
111
|
const hostname = normalizeHostname(url);
|
|
112
112
|
assertHostnameAllowed(hostname);
|
|
113
|
+
return { normalizedUrl: url.href, hostname };
|
|
114
|
+
}
|
|
115
|
+
export async function validateAndNormalizeUrl(urlString) {
|
|
116
|
+
const { normalizedUrl, hostname } = normalizeUrl(urlString);
|
|
113
117
|
await assertResolvedAddressesAllowed(hostname);
|
|
114
|
-
return
|
|
118
|
+
return normalizedUrl;
|
|
115
119
|
}
|
|
116
120
|
const VALIDATION_ERROR_CODE = 'VALIDATION_ERROR';
|
|
117
121
|
function createValidationError(message) {
|
package/package.json
CHANGED