@j0hanz/superfetch 2.0.0 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +139 -46
- package/dist/cache.d.ts +42 -0
- package/dist/cache.js +565 -0
- package/dist/config/env-parsers.d.ts +1 -0
- package/dist/config/env-parsers.js +12 -0
- package/dist/config/index.d.ts +7 -0
- package/dist/config/index.js +20 -8
- package/dist/config/types/content.d.ts +1 -0
- package/dist/config.d.ts +77 -0
- package/dist/config.js +261 -0
- package/dist/crypto.d.ts +2 -0
- package/dist/crypto.js +32 -0
- package/dist/errors.d.ts +10 -0
- package/dist/errors.js +28 -0
- package/dist/fetch.d.ts +40 -0
- package/dist/fetch.js +910 -0
- package/dist/http/auth.js +161 -2
- package/dist/http/base-middleware.d.ts +7 -0
- package/dist/http/base-middleware.js +143 -0
- package/dist/http/cors.d.ts +0 -5
- package/dist/http/cors.js +0 -6
- package/dist/http/download-routes.js +6 -2
- package/dist/http/error-handler.d.ts +2 -0
- package/dist/http/error-handler.js +55 -0
- package/dist/http/host-allowlist.d.ts +3 -0
- package/dist/http/host-allowlist.js +117 -0
- package/dist/http/mcp-routes.d.ts +8 -2
- package/dist/http/mcp-routes.js +101 -8
- package/dist/http/mcp-session-eviction.d.ts +3 -0
- package/dist/http/mcp-session-eviction.js +24 -0
- package/dist/http/mcp-session-init.d.ts +7 -0
- package/dist/http/mcp-session-init.js +94 -0
- package/dist/http/mcp-session-slots.d.ts +17 -0
- package/dist/http/mcp-session-slots.js +55 -0
- package/dist/http/mcp-session-transport-init.d.ts +7 -0
- package/dist/http/mcp-session-transport-init.js +41 -0
- package/dist/http/mcp-session-types.d.ts +5 -0
- package/dist/http/mcp-session-types.js +1 -0
- package/dist/http/mcp-session.d.ts +9 -9
- package/dist/http/mcp-session.js +5 -114
- package/dist/http/mcp-sessions.d.ts +41 -0
- package/dist/http/mcp-sessions.js +392 -0
- package/dist/http/rate-limit.js +2 -2
- package/dist/http/server-middleware.d.ts +6 -1
- package/dist/http/server-middleware.js +3 -117
- package/dist/http/server-shutdown.js +1 -1
- package/dist/http/server-tuning.d.ts +9 -0
- package/dist/http/server-tuning.js +45 -0
- package/dist/http/server.js +206 -9
- package/dist/http/session-cleanup.js +8 -5
- package/dist/http.d.ts +78 -0
- package/dist/http.js +1437 -0
- package/dist/index.js +3 -3
- package/dist/mcp.d.ts +3 -0
- package/dist/mcp.js +94 -0
- package/dist/middleware/error-handler.d.ts +1 -1
- package/dist/middleware/error-handler.js +31 -30
- package/dist/observability.d.ts +16 -0
- package/dist/observability.js +78 -0
- package/dist/resources/cached-content-params.d.ts +5 -0
- package/dist/resources/cached-content-params.js +36 -0
- package/dist/resources/cached-content.js +33 -33
- package/dist/server.js +21 -6
- package/dist/services/cache-events.d.ts +8 -0
- package/dist/services/cache-events.js +19 -0
- package/dist/services/cache.d.ts +5 -4
- package/dist/services/cache.js +49 -45
- package/dist/services/context.d.ts +2 -0
- package/dist/services/context.js +3 -0
- package/dist/services/extractor.d.ts +1 -0
- package/dist/services/extractor.js +77 -40
- package/dist/services/fetcher/agents.js +1 -1
- package/dist/services/fetcher/dns-selection.js +1 -1
- package/dist/services/fetcher/interceptors.js +29 -60
- package/dist/services/fetcher/redirects.js +12 -4
- package/dist/services/fetcher/response.js +18 -8
- package/dist/services/fetcher.d.ts +23 -0
- package/dist/services/fetcher.js +553 -13
- package/dist/services/logger.js +4 -1
- package/dist/services/telemetry.d.ts +19 -0
- package/dist/services/telemetry.js +43 -0
- package/dist/services/transform-worker-pool.d.ts +10 -3
- package/dist/services/transform-worker-pool.js +213 -184
- package/dist/tools/handlers/fetch-single.shared.d.ts +11 -3
- package/dist/tools/handlers/fetch-single.shared.js +131 -2
- package/dist/tools/handlers/fetch-url.tool.d.ts +6 -0
- package/dist/tools/handlers/fetch-url.tool.js +56 -12
- package/dist/tools/index.d.ts +1 -0
- package/dist/tools/index.js +13 -1
- package/dist/tools/schemas.d.ts +2 -0
- package/dist/tools/schemas.js +8 -0
- package/dist/tools/utils/content-shaping.js +19 -4
- package/dist/tools/utils/content-transform-core.d.ts +5 -0
- package/dist/tools/utils/content-transform-core.js +180 -0
- package/dist/tools/utils/content-transform-workers.d.ts +1 -0
- package/dist/tools/utils/content-transform-workers.js +1 -0
- package/dist/tools/utils/content-transform.d.ts +2 -1
- package/dist/tools/utils/content-transform.js +37 -136
- package/dist/tools/utils/fetch-pipeline.js +47 -56
- package/dist/tools/utils/frontmatter.d.ts +3 -0
- package/dist/tools/utils/frontmatter.js +73 -0
- package/dist/tools/utils/markdown-heuristics.d.ts +1 -0
- package/dist/tools/utils/markdown-heuristics.js +19 -0
- package/dist/tools/utils/markdown-signals.d.ts +1 -0
- package/dist/tools/utils/markdown-signals.js +19 -0
- package/dist/tools/utils/raw-markdown-frontmatter.d.ts +3 -0
- package/dist/tools/utils/raw-markdown-frontmatter.js +73 -0
- package/dist/tools/utils/raw-markdown.d.ts +6 -0
- package/dist/tools/utils/raw-markdown.js +149 -0
- package/dist/tools.d.ts +104 -0
- package/dist/tools.js +421 -0
- package/dist/transform.d.ts +69 -0
- package/dist/transform.js +1509 -0
- package/dist/transformers/markdown/fenced-code-rule.d.ts +2 -0
- package/dist/transformers/markdown/fenced-code-rule.js +38 -0
- package/dist/transformers/markdown/frontmatter.d.ts +2 -0
- package/dist/transformers/markdown/frontmatter.js +45 -0
- package/dist/transformers/markdown/noise-rule.d.ts +2 -0
- package/dist/transformers/markdown/noise-rule.js +80 -0
- package/dist/transformers/markdown/turndown-instance.d.ts +2 -0
- package/dist/transformers/markdown/turndown-instance.js +19 -0
- package/dist/transformers/markdown.d.ts +5 -0
- package/dist/transformers/markdown.js +314 -0
- package/dist/transformers/markdown.transformer.js +2 -189
- package/dist/utils/cancellation.d.ts +1 -0
- package/dist/utils/cancellation.js +18 -0
- package/dist/utils/code-language-bash.d.ts +1 -0
- package/dist/utils/code-language-bash.js +48 -0
- package/dist/utils/code-language-core.d.ts +2 -0
- package/dist/utils/code-language-core.js +13 -0
- package/dist/utils/code-language-detectors.d.ts +5 -0
- package/dist/utils/code-language-detectors.js +142 -0
- package/dist/utils/code-language-helpers.d.ts +5 -0
- package/dist/utils/code-language-helpers.js +62 -0
- package/dist/utils/code-language-parsing.d.ts +5 -0
- package/dist/utils/code-language-parsing.js +62 -0
- package/dist/utils/code-language.js +250 -46
- package/dist/utils/error-details.d.ts +3 -0
- package/dist/utils/error-details.js +12 -0
- package/dist/utils/filename-generator.js +14 -3
- package/dist/utils/host-normalizer.d.ts +1 -0
- package/dist/utils/host-normalizer.js +37 -0
- package/dist/utils/ip-address.d.ts +4 -0
- package/dist/utils/ip-address.js +6 -0
- package/dist/utils/tool-error-handler.js +12 -17
- package/dist/utils/url-redactor.d.ts +1 -0
- package/dist/utils/url-redactor.js +13 -0
- package/dist/utils/url-validator.js +35 -20
- package/dist/workers/transform-worker.js +82 -38
- package/package.json +13 -10
package/dist/services/cache.js
CHANGED
|
@@ -1,10 +1,28 @@
|
|
|
1
1
|
import { setInterval as setIntervalPromise } from 'node:timers/promises';
|
|
2
2
|
import { config } from '../config/index.js';
|
|
3
|
-
import { getErrorMessage } from '../utils/error-
|
|
3
|
+
import { getErrorMessage } from '../utils/error-details.js';
|
|
4
4
|
import { parseCacheKey } from './cache-keys.js';
|
|
5
5
|
import { logWarn } from './logger.js';
|
|
6
6
|
const contentCache = new Map();
|
|
7
7
|
let cleanupController = null;
|
|
8
|
+
const updateListeners = new Set();
|
|
9
|
+
export function onCacheUpdate(listener) {
|
|
10
|
+
updateListeners.add(listener);
|
|
11
|
+
return () => {
|
|
12
|
+
updateListeners.delete(listener);
|
|
13
|
+
};
|
|
14
|
+
}
|
|
15
|
+
function notifyCacheUpdate(cacheKey) {
|
|
16
|
+
if (updateListeners.size === 0)
|
|
17
|
+
return;
|
|
18
|
+
const parts = parseCacheKey(cacheKey);
|
|
19
|
+
if (!parts)
|
|
20
|
+
return;
|
|
21
|
+
const event = { cacheKey, ...parts };
|
|
22
|
+
for (const listener of updateListeners) {
|
|
23
|
+
listener(event);
|
|
24
|
+
}
|
|
25
|
+
}
|
|
8
26
|
function startCleanupLoop() {
|
|
9
27
|
if (cleanupController)
|
|
10
28
|
return;
|
|
@@ -17,15 +35,14 @@ function startCleanupLoop() {
|
|
|
17
35
|
}
|
|
18
36
|
async function runCleanupLoop(signal) {
|
|
19
37
|
const intervalMs = Math.floor(config.cache.ttl * 1000);
|
|
20
|
-
for await (const
|
|
38
|
+
for await (const getNow of setIntervalPromise(intervalMs, Date.now, {
|
|
21
39
|
signal,
|
|
22
40
|
ref: false,
|
|
23
41
|
})) {
|
|
24
|
-
enforceCacheLimits();
|
|
42
|
+
enforceCacheLimits(getNow());
|
|
25
43
|
}
|
|
26
44
|
}
|
|
27
|
-
function enforceCacheLimits() {
|
|
28
|
-
const now = Date.now();
|
|
45
|
+
function enforceCacheLimits(now) {
|
|
29
46
|
for (const [key, item] of contentCache.entries()) {
|
|
30
47
|
if (now > item.expiresAt) {
|
|
31
48
|
contentCache.delete(key);
|
|
@@ -33,21 +50,6 @@ function enforceCacheLimits() {
|
|
|
33
50
|
}
|
|
34
51
|
trimCacheToMaxKeys();
|
|
35
52
|
}
|
|
36
|
-
const updateListeners = new Set();
|
|
37
|
-
export function onCacheUpdate(listener) {
|
|
38
|
-
updateListeners.add(listener);
|
|
39
|
-
return () => {
|
|
40
|
-
updateListeners.delete(listener);
|
|
41
|
-
};
|
|
42
|
-
}
|
|
43
|
-
function emitCacheUpdate(cacheKey) {
|
|
44
|
-
const parts = parseCacheKey(cacheKey);
|
|
45
|
-
if (!parts)
|
|
46
|
-
return;
|
|
47
|
-
for (const listener of updateListeners) {
|
|
48
|
-
listener({ cacheKey, ...parts });
|
|
49
|
-
}
|
|
50
|
-
}
|
|
51
53
|
export function get(cacheKey) {
|
|
52
54
|
if (!isCacheReadable(cacheKey))
|
|
53
55
|
return undefined;
|
|
@@ -69,16 +71,17 @@ function runCacheOperation(cacheKey, message, operation) {
|
|
|
69
71
|
}
|
|
70
72
|
}
|
|
71
73
|
function readCacheEntry(cacheKey) {
|
|
72
|
-
|
|
74
|
+
const now = Date.now();
|
|
75
|
+
return readCacheItem(cacheKey, now)?.entry;
|
|
73
76
|
}
|
|
74
|
-
function isExpired(item) {
|
|
75
|
-
return
|
|
77
|
+
function isExpired(item, now) {
|
|
78
|
+
return now > item.expiresAt;
|
|
76
79
|
}
|
|
77
|
-
function readCacheItem(cacheKey) {
|
|
80
|
+
function readCacheItem(cacheKey, now) {
|
|
78
81
|
const item = contentCache.get(cacheKey);
|
|
79
82
|
if (!item)
|
|
80
83
|
return undefined;
|
|
81
|
-
if (isExpired(item)) {
|
|
84
|
+
if (isExpired(item, now)) {
|
|
82
85
|
contentCache.delete(cacheKey);
|
|
83
86
|
return undefined;
|
|
84
87
|
}
|
|
@@ -89,8 +92,15 @@ export function set(cacheKey, content, metadata) {
|
|
|
89
92
|
return;
|
|
90
93
|
runCacheOperation(cacheKey, 'Cache set error', () => {
|
|
91
94
|
startCleanupLoop();
|
|
92
|
-
const
|
|
93
|
-
|
|
95
|
+
const now = Date.now();
|
|
96
|
+
const expiresAtMs = now + config.cache.ttl * 1000;
|
|
97
|
+
const entry = buildCacheEntry({
|
|
98
|
+
content,
|
|
99
|
+
metadata,
|
|
100
|
+
fetchedAtMs: now,
|
|
101
|
+
expiresAtMs,
|
|
102
|
+
});
|
|
103
|
+
persistCacheEntry(cacheKey, entry, expiresAtMs);
|
|
94
104
|
});
|
|
95
105
|
}
|
|
96
106
|
export function keys() {
|
|
@@ -99,20 +109,19 @@ export function keys() {
|
|
|
99
109
|
export function isEnabled() {
|
|
100
110
|
return config.cache.enabled;
|
|
101
111
|
}
|
|
102
|
-
function buildCacheEntry(content, metadata) {
|
|
112
|
+
function buildCacheEntry({ content, metadata, fetchedAtMs, expiresAtMs, }) {
|
|
103
113
|
return {
|
|
104
114
|
url: metadata.url,
|
|
105
115
|
content,
|
|
106
|
-
fetchedAt: new Date().toISOString(),
|
|
107
|
-
expiresAt: new Date(
|
|
116
|
+
fetchedAt: new Date(fetchedAtMs).toISOString(),
|
|
117
|
+
expiresAt: new Date(expiresAtMs).toISOString(),
|
|
108
118
|
...(metadata.title === undefined ? {} : { title: metadata.title }),
|
|
109
119
|
};
|
|
110
120
|
}
|
|
111
|
-
function persistCacheEntry(cacheKey, entry) {
|
|
112
|
-
|
|
113
|
-
contentCache.set(cacheKey, { entry, expiresAt });
|
|
121
|
+
function persistCacheEntry(cacheKey, entry, expiresAtMs) {
|
|
122
|
+
contentCache.set(cacheKey, { entry, expiresAt: expiresAtMs });
|
|
114
123
|
trimCacheToMaxKeys();
|
|
115
|
-
|
|
124
|
+
notifyCacheUpdate(cacheKey);
|
|
116
125
|
}
|
|
117
126
|
function trimCacheToMaxKeys() {
|
|
118
127
|
if (contentCache.size <= config.cache.maxKeys)
|
|
@@ -120,19 +129,14 @@ function trimCacheToMaxKeys() {
|
|
|
120
129
|
removeOldestEntries(contentCache.size - config.cache.maxKeys);
|
|
121
130
|
}
|
|
122
131
|
function removeOldestEntries(count) {
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
if (removed >= count)
|
|
130
|
-
return;
|
|
132
|
+
const iterator = contentCache.keys();
|
|
133
|
+
for (let removed = 0; removed < count; removed += 1) {
|
|
134
|
+
const next = iterator.next();
|
|
135
|
+
if (next.done)
|
|
136
|
+
break;
|
|
137
|
+
contentCache.delete(next.value);
|
|
131
138
|
}
|
|
132
139
|
}
|
|
133
|
-
function resolveExpiryTimestamp() {
|
|
134
|
-
return Date.now() + config.cache.ttl * 1000;
|
|
135
|
-
}
|
|
136
140
|
function logCacheError(message, cacheKey, error) {
|
|
137
141
|
logWarn(message, {
|
|
138
142
|
key: cacheKey.length > 100 ? cacheKey.slice(0, 100) : cacheKey,
|
|
@@ -1,8 +1,10 @@
|
|
|
1
1
|
interface RequestContext {
|
|
2
2
|
readonly requestId: string;
|
|
3
3
|
readonly sessionId?: string;
|
|
4
|
+
readonly operationId?: string;
|
|
4
5
|
}
|
|
5
6
|
export declare function runWithRequestContext<T>(context: RequestContext, fn: () => T): T;
|
|
6
7
|
export declare function getRequestId(): string | undefined;
|
|
7
8
|
export declare function getSessionId(): string | undefined;
|
|
9
|
+
export declare function getOperationId(): string | undefined;
|
|
8
10
|
export {};
|
package/dist/services/context.js
CHANGED
|
@@ -1,28 +1,31 @@
|
|
|
1
1
|
import { parseHTML } from 'linkedom';
|
|
2
2
|
import { Readability } from '@mozilla/readability';
|
|
3
|
-
import {
|
|
3
|
+
import { FetchError } from '../errors/app-error.js';
|
|
4
|
+
import { throwIfAborted } from '../utils/cancellation.js';
|
|
5
|
+
import { getErrorMessage } from '../utils/error-details.js';
|
|
4
6
|
import { isRecord } from '../utils/guards.js';
|
|
5
7
|
import { truncateHtml } from '../utils/html-truncator.js';
|
|
6
8
|
import { logError, logInfo, logWarn } from './logger.js';
|
|
7
9
|
import { extractMetadata } from './metadata-collector.js';
|
|
10
|
+
import { endTransformStage, startTransformStage } from './telemetry.js';
|
|
8
11
|
function isReadabilityCompatible(doc) {
|
|
9
12
|
if (!isRecord(doc))
|
|
10
13
|
return false;
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
return
|
|
14
|
+
return hasDocumentElement(doc) && hasQuerySelectors(doc);
|
|
15
|
+
}
|
|
16
|
+
function hasDocumentElement(record) {
|
|
17
|
+
return 'documentElement' in record;
|
|
18
|
+
}
|
|
19
|
+
function hasQuerySelectors(record) {
|
|
20
|
+
return (typeof record.querySelectorAll === 'function' &&
|
|
21
|
+
typeof record.querySelector === 'function');
|
|
18
22
|
}
|
|
19
23
|
function extractArticle(document) {
|
|
20
24
|
if (!isReadabilityCompatible(document)) {
|
|
21
25
|
logWarn('Document not compatible with Readability');
|
|
22
26
|
return null;
|
|
23
27
|
}
|
|
24
|
-
|
|
25
|
-
return parsed ? mapReadabilityResult(parsed) : null;
|
|
28
|
+
return mapParsedArticle(parseReadabilityArticle(document));
|
|
26
29
|
}
|
|
27
30
|
function parseReadabilityArticle(document) {
|
|
28
31
|
try {
|
|
@@ -31,33 +34,42 @@ function parseReadabilityArticle(document) {
|
|
|
31
34
|
return reader.parse();
|
|
32
35
|
}
|
|
33
36
|
catch (error) {
|
|
34
|
-
logError('Failed to extract article with Readability', error
|
|
37
|
+
logError('Failed to extract article with Readability', asError(error));
|
|
35
38
|
return null;
|
|
36
39
|
}
|
|
37
40
|
}
|
|
41
|
+
function asError(error) {
|
|
42
|
+
if (error instanceof Error) {
|
|
43
|
+
return error;
|
|
44
|
+
}
|
|
45
|
+
return undefined;
|
|
46
|
+
}
|
|
47
|
+
function mapParsedArticle(parsed) {
|
|
48
|
+
return parsed ? mapReadabilityResult(parsed) : null;
|
|
49
|
+
}
|
|
38
50
|
function mapReadabilityResult(parsed) {
|
|
39
|
-
|
|
51
|
+
return {
|
|
40
52
|
content: parsed.content ?? '',
|
|
41
53
|
textContent: parsed.textContent ?? '',
|
|
54
|
+
...buildOptionalArticleFields(parsed),
|
|
42
55
|
};
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
if (
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
}
|
|
57
|
-
function
|
|
58
|
-
|
|
59
|
-
}
|
|
60
|
-
export function extractContent(html, url, options = { extractArticle: true }) {
|
|
56
|
+
}
|
|
57
|
+
function buildOptionalArticleFields(parsed) {
|
|
58
|
+
const optional = {};
|
|
59
|
+
addOptionalField(optional, 'title', parsed.title);
|
|
60
|
+
addOptionalField(optional, 'byline', parsed.byline);
|
|
61
|
+
addOptionalField(optional, 'excerpt', parsed.excerpt);
|
|
62
|
+
addOptionalField(optional, 'siteName', parsed.siteName);
|
|
63
|
+
return optional;
|
|
64
|
+
}
|
|
65
|
+
function addOptionalField(target, key, value) {
|
|
66
|
+
if (value == null)
|
|
67
|
+
return;
|
|
68
|
+
target[key] = value;
|
|
69
|
+
}
|
|
70
|
+
export function extractContent(html, url, options = {
|
|
71
|
+
extractArticle: true,
|
|
72
|
+
}) {
|
|
61
73
|
if (!isValidInput(html, url)) {
|
|
62
74
|
return { article: null, metadata: {} };
|
|
63
75
|
}
|
|
@@ -65,29 +77,54 @@ export function extractContent(html, url, options = { extractArticle: true }) {
|
|
|
65
77
|
}
|
|
66
78
|
function tryExtractContent(html, url, options) {
|
|
67
79
|
try {
|
|
80
|
+
throwIfAborted(options.signal, url, 'extract:begin');
|
|
81
|
+
const parseStage = startTransformStage(url, 'extract:parse');
|
|
68
82
|
const { document } = parseHTML(truncateHtml(html));
|
|
83
|
+
endTransformStage(parseStage);
|
|
84
|
+
throwIfAborted(options.signal, url, 'extract:parsed');
|
|
69
85
|
applyBaseUri(document, url);
|
|
86
|
+
const metadataStage = startTransformStage(url, 'extract:metadata');
|
|
70
87
|
const metadata = extractMetadata(document);
|
|
88
|
+
endTransformStage(metadataStage);
|
|
89
|
+
throwIfAborted(options.signal, url, 'extract:metadata');
|
|
90
|
+
let article;
|
|
91
|
+
if (options.extractArticle) {
|
|
92
|
+
const articleStage = startTransformStage(url, 'extract:article');
|
|
93
|
+
article = resolveArticleExtraction(document, options.extractArticle);
|
|
94
|
+
endTransformStage(articleStage);
|
|
95
|
+
}
|
|
96
|
+
else {
|
|
97
|
+
article = null;
|
|
98
|
+
}
|
|
99
|
+
throwIfAborted(options.signal, url, 'extract:article');
|
|
71
100
|
return {
|
|
72
|
-
article
|
|
101
|
+
article,
|
|
73
102
|
metadata,
|
|
74
103
|
};
|
|
75
104
|
}
|
|
76
105
|
catch (error) {
|
|
106
|
+
if (error instanceof FetchError) {
|
|
107
|
+
throw error;
|
|
108
|
+
}
|
|
109
|
+
throwIfAborted(options.signal, url, 'extract:error');
|
|
77
110
|
logError('Failed to extract content', error instanceof Error ? error : undefined);
|
|
78
111
|
return { article: null, metadata: {} };
|
|
79
112
|
}
|
|
80
113
|
}
|
|
81
114
|
function isValidInput(html, url) {
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
115
|
+
return (validateRequiredString(html, 'extractContent called with invalid HTML input') && validateRequiredString(url, 'extractContent called with invalid URL'));
|
|
116
|
+
}
|
|
117
|
+
function validateRequiredString(value, message) {
|
|
118
|
+
if (isNonEmptyString(value))
|
|
119
|
+
return true;
|
|
120
|
+
logWarn(message);
|
|
121
|
+
return false;
|
|
122
|
+
}
|
|
123
|
+
function isNonEmptyString(value) {
|
|
124
|
+
return typeof value === 'string' && value.length > 0;
|
|
125
|
+
}
|
|
126
|
+
function resolveArticleExtraction(document, shouldExtract) {
|
|
127
|
+
return shouldExtract ? extractArticle(document) : null;
|
|
91
128
|
}
|
|
92
129
|
function applyBaseUri(document, url) {
|
|
93
130
|
try {
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import dns from 'node:dns';
|
|
2
2
|
import os from 'node:os';
|
|
3
3
|
import { Agent } from 'undici';
|
|
4
|
-
import { createErrorWithCode } from '../../utils/error-
|
|
4
|
+
import { createErrorWithCode } from '../../utils/error-details.js';
|
|
5
5
|
import { isRecord } from '../../utils/guards.js';
|
|
6
6
|
import { handleLookupResult } from './dns-selection.js';
|
|
7
7
|
const DNS_LOOKUP_TIMEOUT_MS = 5000;
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { createErrorWithCode } from '../../utils/error-
|
|
1
|
+
import { createErrorWithCode } from '../../utils/error-details.js';
|
|
2
2
|
import { isBlockedIp } from '../../utils/url-validator.js';
|
|
3
3
|
function normalizeLookupResults(addresses, family) {
|
|
4
4
|
if (Array.isArray(addresses)) {
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import { randomUUID } from 'node:crypto';
|
|
2
2
|
import diagnosticsChannel from 'node:diagnostics_channel';
|
|
3
3
|
import { performance } from 'node:perf_hooks';
|
|
4
|
-
import { isSystemError } from '../../utils/error-
|
|
4
|
+
import { isSystemError } from '../../utils/error-details.js';
|
|
5
5
|
import { logDebug, logError, logWarn } from '../logger.js';
|
|
6
6
|
const fetchChannel = diagnosticsChannel.channel('superfetch.fetch');
|
|
7
7
|
function redactUrl(rawUrl) {
|
|
@@ -27,7 +27,14 @@ function publishFetchEvent(event) {
|
|
|
27
27
|
// Avoid crashing the publisher if a subscriber throws.
|
|
28
28
|
}
|
|
29
29
|
}
|
|
30
|
-
function
|
|
30
|
+
export function startFetchTelemetry(url, method) {
|
|
31
|
+
const safeUrl = redactUrl(url);
|
|
32
|
+
const context = {
|
|
33
|
+
requestId: randomUUID(),
|
|
34
|
+
startTime: performance.now(),
|
|
35
|
+
url: safeUrl,
|
|
36
|
+
method: method.toUpperCase(),
|
|
37
|
+
};
|
|
31
38
|
publishFetchEvent({
|
|
32
39
|
v: 1,
|
|
33
40
|
type: 'start',
|
|
@@ -40,65 +47,40 @@ function publishAndLogFetchStart(context) {
|
|
|
40
47
|
method: context.method,
|
|
41
48
|
url: context.url,
|
|
42
49
|
});
|
|
43
|
-
}
|
|
44
|
-
export function startFetchTelemetry(url, method) {
|
|
45
|
-
const safeUrl = redactUrl(url);
|
|
46
|
-
const context = {
|
|
47
|
-
requestId: randomUUID(),
|
|
48
|
-
startTime: performance.now(),
|
|
49
|
-
url: safeUrl,
|
|
50
|
-
method: method.toUpperCase(),
|
|
51
|
-
};
|
|
52
|
-
publishAndLogFetchStart(context);
|
|
53
50
|
return context;
|
|
54
51
|
}
|
|
55
52
|
export function recordFetchResponse(context, response, contentSize) {
|
|
56
53
|
const duration = performance.now() - context.startTime;
|
|
57
|
-
|
|
58
|
-
logDebug('HTTP Response', {
|
|
59
|
-
requestId: context.requestId,
|
|
60
|
-
status: response.status,
|
|
61
|
-
url: context.url,
|
|
62
|
-
...buildResponseMeta(response, contentSize, duration),
|
|
63
|
-
});
|
|
64
|
-
logSlowRequestIfNeeded(context, duration);
|
|
65
|
-
}
|
|
66
|
-
function publishFetchEnd(context, status, duration) {
|
|
54
|
+
const durationLabel = `${Math.round(duration)}ms`;
|
|
67
55
|
publishFetchEvent({
|
|
68
56
|
v: 1,
|
|
69
57
|
type: 'end',
|
|
70
58
|
requestId: context.requestId,
|
|
71
|
-
status,
|
|
59
|
+
status: response.status,
|
|
72
60
|
duration,
|
|
73
61
|
});
|
|
74
|
-
}
|
|
75
|
-
function buildResponseMeta(response, contentSize, duration) {
|
|
76
|
-
const contentLength = response.headers.get('content-length') ?? contentSize?.toString();
|
|
77
|
-
const meta = {
|
|
78
|
-
duration: `${Math.round(duration)}ms`,
|
|
79
|
-
};
|
|
80
62
|
const contentType = response.headers.get('content-type');
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
if (contentLength !== undefined) {
|
|
85
|
-
meta.size = contentLength;
|
|
86
|
-
}
|
|
87
|
-
return meta;
|
|
88
|
-
}
|
|
89
|
-
function logSlowRequestIfNeeded(context, duration) {
|
|
90
|
-
if (duration <= 5000)
|
|
91
|
-
return;
|
|
92
|
-
logWarn('Slow HTTP request detected', {
|
|
63
|
+
const contentLength = response.headers.get('content-length') ??
|
|
64
|
+
(contentSize === undefined ? undefined : String(contentSize));
|
|
65
|
+
logDebug('HTTP Response', {
|
|
93
66
|
requestId: context.requestId,
|
|
67
|
+
status: response.status,
|
|
94
68
|
url: context.url,
|
|
95
|
-
duration:
|
|
69
|
+
duration: durationLabel,
|
|
70
|
+
...(contentType ? { contentType } : {}),
|
|
71
|
+
...(contentLength ? { size: contentLength } : {}),
|
|
96
72
|
});
|
|
73
|
+
if (duration > 5000) {
|
|
74
|
+
logWarn('Slow HTTP request detected', {
|
|
75
|
+
requestId: context.requestId,
|
|
76
|
+
url: context.url,
|
|
77
|
+
duration: durationLabel,
|
|
78
|
+
});
|
|
79
|
+
}
|
|
97
80
|
}
|
|
98
|
-
function
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
function buildFetchErrorEvent(context, err, duration, status) {
|
|
81
|
+
export function recordFetchError(context, error, status) {
|
|
82
|
+
const duration = performance.now() - context.startTime;
|
|
83
|
+
const err = error instanceof Error ? error : new Error(String(error));
|
|
102
84
|
const event = {
|
|
103
85
|
v: 1,
|
|
104
86
|
type: 'error',
|
|
@@ -107,10 +89,6 @@ function buildFetchErrorEvent(context, err, duration, status) {
|
|
|
107
89
|
error: err.message,
|
|
108
90
|
duration,
|
|
109
91
|
};
|
|
110
|
-
addOptionalErrorFields(event, err, status);
|
|
111
|
-
return event;
|
|
112
|
-
}
|
|
113
|
-
function addOptionalErrorFields(event, err, status) {
|
|
114
92
|
const code = isSystemError(err) ? err.code : undefined;
|
|
115
93
|
if (code !== undefined) {
|
|
116
94
|
event.code = code;
|
|
@@ -118,17 +96,8 @@ function addOptionalErrorFields(event, err, status) {
|
|
|
118
96
|
if (status !== undefined) {
|
|
119
97
|
event.status = status;
|
|
120
98
|
}
|
|
121
|
-
}
|
|
122
|
-
function selectErrorLogger(status) {
|
|
123
|
-
return status === 429 ? logWarn : logError;
|
|
124
|
-
}
|
|
125
|
-
export function recordFetchError(context, error, status) {
|
|
126
|
-
const duration = performance.now() - context.startTime;
|
|
127
|
-
const err = normalizeError(error);
|
|
128
|
-
const event = buildFetchErrorEvent(context, err, duration, status);
|
|
129
99
|
publishFetchEvent(event);
|
|
130
|
-
const log =
|
|
131
|
-
const code = isSystemError(err) ? err.code : undefined;
|
|
100
|
+
const log = status === 429 ? logWarn : logError;
|
|
132
101
|
log('HTTP Request Error', {
|
|
133
102
|
requestId: context.requestId,
|
|
134
103
|
url: context.url,
|
|
@@ -1,11 +1,19 @@
|
|
|
1
1
|
import { FetchError } from '../../errors/app-error.js';
|
|
2
|
-
import { createErrorWithCode } from '../../utils/error-
|
|
2
|
+
import { createErrorWithCode } from '../../utils/error-details.js';
|
|
3
3
|
import { isRecord } from '../../utils/guards.js';
|
|
4
4
|
import { validateAndNormalizeUrl } from '../../utils/url-validator.js';
|
|
5
5
|
const REDIRECT_STATUSES = new Set([301, 302, 303, 307, 308]);
|
|
6
6
|
function isRedirectStatus(status) {
|
|
7
7
|
return REDIRECT_STATUSES.has(status);
|
|
8
8
|
}
|
|
9
|
+
function cancelResponseBody(response) {
|
|
10
|
+
const cancelPromise = response.body?.cancel();
|
|
11
|
+
if (cancelPromise) {
|
|
12
|
+
cancelPromise.catch(() => {
|
|
13
|
+
// Best-effort cancellation; ignore failures.
|
|
14
|
+
});
|
|
15
|
+
}
|
|
16
|
+
}
|
|
9
17
|
async function performFetchCycle(currentUrl, init, redirectLimit, redirectCount) {
|
|
10
18
|
const response = await fetch(currentUrl, { ...init, redirect: 'manual' });
|
|
11
19
|
if (!isRedirectStatus(response.status)) {
|
|
@@ -13,7 +21,7 @@ async function performFetchCycle(currentUrl, init, redirectLimit, redirectCount)
|
|
|
13
21
|
}
|
|
14
22
|
assertRedirectWithinLimit(response, currentUrl, redirectLimit, redirectCount);
|
|
15
23
|
const location = getRedirectLocation(response, currentUrl);
|
|
16
|
-
|
|
24
|
+
cancelResponseBody(response);
|
|
17
25
|
return {
|
|
18
26
|
response,
|
|
19
27
|
nextUrl: resolveRedirectTarget(currentUrl, location),
|
|
@@ -22,14 +30,14 @@ async function performFetchCycle(currentUrl, init, redirectLimit, redirectCount)
|
|
|
22
30
|
function assertRedirectWithinLimit(response, currentUrl, redirectLimit, redirectCount) {
|
|
23
31
|
if (redirectCount < redirectLimit)
|
|
24
32
|
return;
|
|
25
|
-
|
|
33
|
+
cancelResponseBody(response);
|
|
26
34
|
throw new FetchError('Too many redirects', currentUrl);
|
|
27
35
|
}
|
|
28
36
|
function getRedirectLocation(response, currentUrl) {
|
|
29
37
|
const location = response.headers.get('location');
|
|
30
38
|
if (location)
|
|
31
39
|
return location;
|
|
32
|
-
|
|
40
|
+
cancelResponseBody(response);
|
|
33
41
|
throw new FetchError('Redirect response missing Location header', currentUrl);
|
|
34
42
|
}
|
|
35
43
|
function annotateRedirectError(error, url) {
|
|
@@ -7,7 +7,7 @@ function assertContentLengthWithinLimit(response, url, maxBytes) {
|
|
|
7
7
|
if (Number.isNaN(contentLength) || contentLength <= maxBytes) {
|
|
8
8
|
return;
|
|
9
9
|
}
|
|
10
|
-
|
|
10
|
+
cancelResponseBody(response);
|
|
11
11
|
throw new FetchError(`Response exceeds maximum size of ${maxBytes} bytes`, url);
|
|
12
12
|
}
|
|
13
13
|
function createReadState() {
|
|
@@ -36,6 +36,14 @@ function createAbortError(url) {
|
|
|
36
36
|
reason: 'aborted',
|
|
37
37
|
});
|
|
38
38
|
}
|
|
39
|
+
function cancelResponseBody(response) {
|
|
40
|
+
const cancelPromise = response.body?.cancel();
|
|
41
|
+
if (cancelPromise) {
|
|
42
|
+
cancelPromise.catch(() => {
|
|
43
|
+
// Best-effort cancellation; ignore failures.
|
|
44
|
+
});
|
|
45
|
+
}
|
|
46
|
+
}
|
|
39
47
|
async function cancelReaderQuietly(reader) {
|
|
40
48
|
try {
|
|
41
49
|
await reader.cancel();
|
|
@@ -50,6 +58,14 @@ async function throwIfAborted(signal, url, reader) {
|
|
|
50
58
|
await cancelReaderQuietly(reader);
|
|
51
59
|
throw createAbortError(url);
|
|
52
60
|
}
|
|
61
|
+
async function handleReadFailure(error, signal, url, reader) {
|
|
62
|
+
const aborted = signal?.aborted ?? false;
|
|
63
|
+
await cancelReaderQuietly(reader);
|
|
64
|
+
if (aborted) {
|
|
65
|
+
throw createAbortError(url);
|
|
66
|
+
}
|
|
67
|
+
throw error;
|
|
68
|
+
}
|
|
53
69
|
async function readAllChunks(reader, state, url, maxBytes, signal) {
|
|
54
70
|
await throwIfAborted(signal, url, reader);
|
|
55
71
|
let result = await reader.read();
|
|
@@ -66,13 +82,7 @@ async function readStreamWithLimit(stream, url, maxBytes, signal) {
|
|
|
66
82
|
await readAllChunks(reader, state, url, maxBytes, signal);
|
|
67
83
|
}
|
|
68
84
|
catch (error) {
|
|
69
|
-
|
|
70
|
-
await cancelReaderQuietly(reader);
|
|
71
|
-
}
|
|
72
|
-
if (signal?.aborted) {
|
|
73
|
-
throw createAbortError(url);
|
|
74
|
-
}
|
|
75
|
-
throw error;
|
|
85
|
+
await handleReadFailure(error, signal, url, reader);
|
|
76
86
|
}
|
|
77
87
|
finally {
|
|
78
88
|
reader.releaseLock();
|
|
@@ -1,2 +1,25 @@
|
|
|
1
|
+
import type { Dispatcher } from 'undici';
|
|
1
2
|
import type { FetchOptions } from '../config/types/runtime.js';
|
|
3
|
+
export declare const dispatcher: Dispatcher;
|
|
4
|
+
export declare function destroyAgents(): void;
|
|
5
|
+
interface FetchTelemetryContext {
|
|
6
|
+
requestId: string;
|
|
7
|
+
startTime: number;
|
|
8
|
+
url: string;
|
|
9
|
+
method: string;
|
|
10
|
+
contextRequestId?: string;
|
|
11
|
+
operationId?: string;
|
|
12
|
+
}
|
|
13
|
+
export declare function startFetchTelemetry(url: string, method: string): FetchTelemetryContext;
|
|
14
|
+
export declare function recordFetchResponse(context: FetchTelemetryContext, response: Response, contentSize?: number): void;
|
|
15
|
+
export declare function recordFetchError(context: FetchTelemetryContext, error: unknown, status?: number): void;
|
|
16
|
+
export declare function fetchWithRedirects(url: string, init: RequestInit, maxRedirects: number): Promise<{
|
|
17
|
+
response: Response;
|
|
18
|
+
url: string;
|
|
19
|
+
}>;
|
|
20
|
+
export declare function readResponseText(response: Response, url: string, maxBytes: number, signal?: AbortSignal): Promise<{
|
|
21
|
+
text: string;
|
|
22
|
+
size: number;
|
|
23
|
+
}>;
|
|
2
24
|
export declare function fetchNormalizedUrl(normalizedUrl: string, options?: FetchOptions): Promise<string>;
|
|
25
|
+
export {};
|