@j0hanz/superfetch 1.2.4 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +116 -152
- package/dist/config/auth-config.d.ts +16 -0
- package/dist/config/auth-config.js +53 -0
- package/dist/config/constants.d.ts +11 -13
- package/dist/config/constants.js +1 -3
- package/dist/config/env-parsers.d.ts +7 -0
- package/dist/config/env-parsers.js +84 -0
- package/dist/config/formatting.d.ts +2 -2
- package/dist/config/index.d.ts +47 -53
- package/dist/config/index.js +25 -59
- package/dist/config/types/content.d.ts +1 -49
- package/dist/config/types/runtime.d.ts +8 -16
- package/dist/config/types/tools.d.ts +2 -28
- package/dist/http/accept-policy.d.ts +3 -0
- package/dist/http/accept-policy.js +45 -0
- package/dist/http/async-handler.d.ts +2 -0
- package/dist/http/async-handler.js +5 -0
- package/dist/http/auth-introspection.d.ts +2 -0
- package/dist/http/auth-introspection.js +141 -0
- package/dist/http/auth-static.d.ts +2 -0
- package/dist/http/auth-static.js +23 -0
- package/dist/http/auth.d.ts +3 -2
- package/dist/http/auth.js +98 -26
- package/dist/http/cors.d.ts +6 -6
- package/dist/http/cors.js +7 -42
- package/dist/http/download-routes.d.ts +0 -12
- package/dist/http/download-routes.js +21 -58
- package/dist/http/jsonrpc-http.d.ts +2 -0
- package/dist/http/jsonrpc-http.js +10 -0
- package/dist/http/mcp-routes.d.ts +0 -1
- package/dist/http/mcp-routes.js +43 -30
- package/dist/http/mcp-session-helpers.d.ts +0 -1
- package/dist/http/mcp-session-helpers.js +1 -1
- package/dist/http/mcp-session-transport.d.ts +7 -0
- package/dist/http/mcp-session-transport.js +57 -0
- package/dist/http/mcp-session.js +60 -73
- package/dist/http/mcp-validation.d.ts +1 -0
- package/dist/http/mcp-validation.js +11 -10
- package/dist/http/protocol-policy.d.ts +2 -0
- package/dist/http/protocol-policy.js +31 -0
- package/dist/http/rate-limit.js +5 -2
- package/dist/http/server-config.d.ts +1 -0
- package/dist/http/server-config.js +40 -0
- package/dist/http/server-middleware.d.ts +2 -9
- package/dist/http/server-middleware.js +96 -43
- package/dist/http/server-shutdown.d.ts +4 -0
- package/dist/http/server-shutdown.js +43 -0
- package/dist/http/server.js +52 -64
- package/dist/http/session-cleanup.js +1 -1
- package/dist/middleware/error-handler.js +1 -3
- package/dist/resources/cached-content.js +50 -108
- package/dist/resources/index.js +0 -82
- package/dist/server.js +51 -30
- package/dist/services/cache-keys.d.ts +7 -0
- package/dist/services/cache-keys.js +57 -0
- package/dist/services/cache.d.ts +1 -7
- package/dist/services/cache.js +53 -119
- package/dist/services/context.d.ts +0 -1
- package/dist/services/context.js +0 -7
- package/dist/services/extractor.js +10 -82
- package/dist/services/fetcher/agents.d.ts +2 -2
- package/dist/services/fetcher/agents.js +34 -95
- package/dist/services/fetcher/dns-selection.d.ts +2 -0
- package/dist/services/fetcher/dns-selection.js +72 -0
- package/dist/services/fetcher/interceptors.d.ts +0 -22
- package/dist/services/fetcher/interceptors.js +30 -13
- package/dist/services/fetcher/redirects.js +4 -3
- package/dist/services/fetcher/response.js +66 -31
- package/dist/services/fetcher.d.ts +1 -3
- package/dist/services/fetcher.js +14 -33
- package/dist/services/fifo-queue.d.ts +8 -0
- package/dist/services/fifo-queue.js +25 -0
- package/dist/services/logger.js +2 -2
- package/dist/services/metadata-collector.d.ts +1 -9
- package/dist/services/metadata-collector.js +71 -2
- package/dist/services/transform-worker-pool.d.ts +4 -14
- package/dist/services/transform-worker-pool.js +177 -129
- package/dist/services/transform-worker-types.d.ts +32 -0
- package/dist/services/transform-worker-types.js +14 -0
- package/dist/tools/handlers/fetch-markdown.tool.d.ts +3 -4
- package/dist/tools/handlers/fetch-markdown.tool.js +20 -72
- package/dist/tools/handlers/fetch-single.shared.d.ts +1 -20
- package/dist/tools/handlers/fetch-single.shared.js +44 -87
- package/dist/tools/handlers/fetch-url.tool.d.ts +1 -1
- package/dist/tools/handlers/fetch-url.tool.js +46 -123
- package/dist/tools/index.js +21 -40
- package/dist/tools/schemas.d.ts +1 -51
- package/dist/tools/schemas.js +2 -108
- package/dist/tools/utils/cached-markdown.d.ts +5 -0
- package/dist/tools/utils/cached-markdown.js +46 -0
- package/dist/tools/utils/content-shaping.d.ts +4 -0
- package/dist/tools/utils/content-shaping.js +52 -0
- package/dist/tools/utils/content-transform.d.ts +2 -17
- package/dist/tools/utils/content-transform.js +120 -114
- package/dist/tools/utils/fetch-pipeline.d.ts +0 -8
- package/dist/tools/utils/fetch-pipeline.js +65 -62
- package/dist/tools/utils/inline-content.d.ts +1 -2
- package/dist/tools/utils/inline-content.js +4 -7
- package/dist/transformers/markdown.transformer.js +109 -34
- package/dist/utils/cached-payload.d.ts +7 -0
- package/dist/utils/cached-payload.js +36 -0
- package/dist/utils/error-utils.js +1 -1
- package/dist/utils/filename-generator.js +21 -10
- package/dist/utils/guards.d.ts +1 -0
- package/dist/utils/guards.js +3 -0
- package/dist/utils/header-normalizer.d.ts +0 -3
- package/dist/utils/header-normalizer.js +3 -3
- package/dist/utils/tool-error-handler.d.ts +2 -2
- package/dist/utils/tool-error-handler.js +11 -38
- package/dist/utils/url-transformer.d.ts +7 -0
- package/dist/utils/url-transformer.js +147 -0
- package/dist/utils/url-validator.d.ts +1 -2
- package/dist/utils/url-validator.js +20 -93
- package/dist/workers/content-transform.worker.d.ts +1 -0
- package/dist/workers/content-transform.worker.js +40 -0
- package/package.json +13 -16
package/dist/services/cache.d.ts
CHANGED
|
@@ -1,8 +1,5 @@
|
|
|
1
1
|
import type { CacheEntry } from '../config/types/content.js';
|
|
2
|
-
|
|
3
|
-
namespace: string;
|
|
4
|
-
urlHash: string;
|
|
5
|
-
}
|
|
2
|
+
import type { CacheKeyParts } from './cache-keys.js';
|
|
6
3
|
interface CacheUpdateEvent extends CacheKeyParts {
|
|
7
4
|
cacheKey: string;
|
|
8
5
|
}
|
|
@@ -11,9 +8,6 @@ interface CacheEntryMetadata {
|
|
|
11
8
|
title?: string;
|
|
12
9
|
}
|
|
13
10
|
type CacheUpdateListener = (event: CacheUpdateEvent) => void;
|
|
14
|
-
export declare function createCacheKey(namespace: string, url: string, vary?: Record<string, unknown> | string): string | null;
|
|
15
|
-
export declare function parseCacheKey(cacheKey: string): CacheKeyParts | null;
|
|
16
|
-
export declare function toResourceUri(cacheKey: string): string | null;
|
|
17
11
|
export declare function onCacheUpdate(listener: CacheUpdateListener): () => void;
|
|
18
12
|
export declare function get(cacheKey: string | null): CacheEntry | undefined;
|
|
19
13
|
export declare function set(cacheKey: string | null, content: string, metadata: CacheEntryMetadata): void;
|
package/dist/services/cache.js
CHANGED
|
@@ -1,8 +1,7 @@
|
|
|
1
1
|
import { setInterval as setIntervalPromise } from 'node:timers/promises';
|
|
2
|
-
import { CACHE_HASH } from '../config/constants.js';
|
|
3
2
|
import { config } from '../config/index.js';
|
|
4
|
-
import { sha256Hex } from '../utils/crypto.js';
|
|
5
3
|
import { getErrorMessage } from '../utils/error-utils.js';
|
|
4
|
+
import { parseCacheKey } from './cache-keys.js';
|
|
6
5
|
import { logWarn } from './logger.js';
|
|
7
6
|
const contentCache = new Map();
|
|
8
7
|
let cleanupController = null;
|
|
@@ -22,90 +21,19 @@ async function runCleanupLoop(signal) {
|
|
|
22
21
|
signal,
|
|
23
22
|
ref: false,
|
|
24
23
|
})) {
|
|
25
|
-
|
|
24
|
+
enforceCacheLimits();
|
|
26
25
|
}
|
|
27
26
|
}
|
|
28
|
-
function
|
|
27
|
+
function enforceCacheLimits() {
|
|
29
28
|
const now = Date.now();
|
|
30
29
|
for (const [key, item] of contentCache.entries()) {
|
|
31
30
|
if (now > item.expiresAt) {
|
|
32
31
|
contentCache.delete(key);
|
|
33
32
|
}
|
|
34
33
|
}
|
|
35
|
-
|
|
36
|
-
return;
|
|
37
|
-
const keysToRemove = contentCache.size - config.cache.maxKeys;
|
|
38
|
-
const iterator = contentCache.keys();
|
|
39
|
-
for (let i = 0; i < keysToRemove; i++) {
|
|
40
|
-
const { value, done } = iterator.next();
|
|
41
|
-
if (done)
|
|
42
|
-
break;
|
|
43
|
-
contentCache.delete(value);
|
|
44
|
-
}
|
|
34
|
+
trimCacheToMaxKeys();
|
|
45
35
|
}
|
|
46
36
|
const updateListeners = new Set();
|
|
47
|
-
function stableStringify(value) {
|
|
48
|
-
if (value === null || value === undefined) {
|
|
49
|
-
return '';
|
|
50
|
-
}
|
|
51
|
-
if (typeof value !== 'object') {
|
|
52
|
-
return JSON.stringify(value);
|
|
53
|
-
}
|
|
54
|
-
if (Array.isArray(value)) {
|
|
55
|
-
return `[${value.map((item) => stableStringify(item)).join(',')}]`;
|
|
56
|
-
}
|
|
57
|
-
const entries = Object.entries(value)
|
|
58
|
-
.filter(([, entryValue]) => entryValue !== undefined)
|
|
59
|
-
.sort(([a], [b]) => a.localeCompare(b))
|
|
60
|
-
.map(([key, entryValue]) => `${JSON.stringify(key)}:${stableStringify(entryValue)}`);
|
|
61
|
-
return `{${entries.join(',')}}`;
|
|
62
|
-
}
|
|
63
|
-
function createHashFragment(input, length) {
|
|
64
|
-
return sha256Hex(input).substring(0, length);
|
|
65
|
-
}
|
|
66
|
-
/**
|
|
67
|
-
* Constructs a cache key from namespace, URL hash, and optional vary hash.
|
|
68
|
-
* Format: "namespace:urlHash" or "namespace:urlHash.varyHash" if vary params exist.
|
|
69
|
-
* @param namespace - Cache namespace (e.g., "fetch-markdown")
|
|
70
|
-
* @param urlHash - SHA-256 hash of the URL (truncated to 16 chars)
|
|
71
|
-
* @param varyHash - Optional hash of vary parameters (e.g., headers, options)
|
|
72
|
-
* @returns Complete cache key string
|
|
73
|
-
*/
|
|
74
|
-
function buildCacheKey(namespace, urlHash, varyHash) {
|
|
75
|
-
return varyHash
|
|
76
|
-
? `${namespace}:${urlHash}.${varyHash}`
|
|
77
|
-
: `${namespace}:${urlHash}`;
|
|
78
|
-
}
|
|
79
|
-
function getVaryHash(vary) {
|
|
80
|
-
if (!vary)
|
|
81
|
-
return undefined;
|
|
82
|
-
const varyString = typeof vary === 'string' ? vary : stableStringify(vary);
|
|
83
|
-
if (!varyString)
|
|
84
|
-
return undefined;
|
|
85
|
-
return createHashFragment(varyString, CACHE_HASH.VARY_HASH_LENGTH);
|
|
86
|
-
}
|
|
87
|
-
export function createCacheKey(namespace, url, vary) {
|
|
88
|
-
if (!namespace || !url)
|
|
89
|
-
return null;
|
|
90
|
-
const urlHash = createHashFragment(url, CACHE_HASH.URL_HASH_LENGTH);
|
|
91
|
-
const varyHash = getVaryHash(vary);
|
|
92
|
-
return buildCacheKey(namespace, urlHash, varyHash);
|
|
93
|
-
}
|
|
94
|
-
export function parseCacheKey(cacheKey) {
|
|
95
|
-
if (!cacheKey)
|
|
96
|
-
return null;
|
|
97
|
-
const [namespace, ...rest] = cacheKey.split(':');
|
|
98
|
-
const urlHash = rest.join(':');
|
|
99
|
-
if (!namespace || !urlHash)
|
|
100
|
-
return null;
|
|
101
|
-
return { namespace, urlHash };
|
|
102
|
-
}
|
|
103
|
-
export function toResourceUri(cacheKey) {
|
|
104
|
-
const parts = parseCacheKey(cacheKey);
|
|
105
|
-
if (!parts)
|
|
106
|
-
return null;
|
|
107
|
-
return `superfetch://cache/${parts.namespace}/${parts.urlHash}`;
|
|
108
|
-
}
|
|
109
37
|
export function onCacheUpdate(listener) {
|
|
110
38
|
updateListeners.add(listener);
|
|
111
39
|
return () => {
|
|
@@ -123,21 +51,30 @@ function emitCacheUpdate(cacheKey) {
|
|
|
123
51
|
export function get(cacheKey) {
|
|
124
52
|
if (!isCacheReadable(cacheKey))
|
|
125
53
|
return undefined;
|
|
54
|
+
return runCacheOperation(cacheKey, 'Cache get error', () => readCacheEntry(cacheKey));
|
|
55
|
+
}
|
|
56
|
+
function isCacheReadable(cacheKey) {
|
|
57
|
+
return config.cache.enabled && Boolean(cacheKey);
|
|
58
|
+
}
|
|
59
|
+
function isCacheWritable(cacheKey, content) {
|
|
60
|
+
return config.cache.enabled && Boolean(cacheKey) && Boolean(content);
|
|
61
|
+
}
|
|
62
|
+
function runCacheOperation(cacheKey, message, operation) {
|
|
126
63
|
try {
|
|
127
|
-
return
|
|
64
|
+
return operation();
|
|
128
65
|
}
|
|
129
66
|
catch (error) {
|
|
130
|
-
|
|
131
|
-
key: cacheKey.substring(0, 100),
|
|
132
|
-
error: getErrorMessage(error),
|
|
133
|
-
});
|
|
67
|
+
logCacheError(message, cacheKey, error);
|
|
134
68
|
return undefined;
|
|
135
69
|
}
|
|
136
70
|
}
|
|
137
|
-
function isCacheReadable(cacheKey) {
|
|
138
|
-
return config.cache.enabled && Boolean(cacheKey);
|
|
139
|
-
}
|
|
140
71
|
function readCacheEntry(cacheKey) {
|
|
72
|
+
return readCacheItem(cacheKey)?.entry;
|
|
73
|
+
}
|
|
74
|
+
function isExpired(item) {
|
|
75
|
+
return Date.now() > item.expiresAt;
|
|
76
|
+
}
|
|
77
|
+
function readCacheItem(cacheKey) {
|
|
141
78
|
const item = contentCache.get(cacheKey);
|
|
142
79
|
if (!item)
|
|
143
80
|
return undefined;
|
|
@@ -145,29 +82,16 @@ function readCacheEntry(cacheKey) {
|
|
|
145
82
|
contentCache.delete(cacheKey);
|
|
146
83
|
return undefined;
|
|
147
84
|
}
|
|
148
|
-
return item
|
|
149
|
-
}
|
|
150
|
-
function isExpired(item) {
|
|
151
|
-
return Date.now() > item.expiresAt;
|
|
85
|
+
return item;
|
|
152
86
|
}
|
|
153
87
|
export function set(cacheKey, content, metadata) {
|
|
154
|
-
if (!
|
|
155
|
-
return;
|
|
156
|
-
if (!cacheKey)
|
|
157
|
-
return;
|
|
158
|
-
if (!content)
|
|
88
|
+
if (!isCacheWritable(cacheKey, content))
|
|
159
89
|
return;
|
|
160
|
-
|
|
90
|
+
runCacheOperation(cacheKey, 'Cache set error', () => {
|
|
161
91
|
startCleanupLoop();
|
|
162
|
-
const entry = buildCacheEntry(
|
|
92
|
+
const entry = buildCacheEntry(content, metadata);
|
|
163
93
|
persistCacheEntry(cacheKey, entry);
|
|
164
|
-
}
|
|
165
|
-
catch (error) {
|
|
166
|
-
logWarn('Cache set error', {
|
|
167
|
-
key: cacheKey.substring(0, 100),
|
|
168
|
-
error: getErrorMessage(error),
|
|
169
|
-
});
|
|
170
|
-
}
|
|
94
|
+
});
|
|
171
95
|
}
|
|
172
96
|
export function keys() {
|
|
173
97
|
return Array.from(contentCache.keys());
|
|
@@ -175,33 +99,43 @@ export function keys() {
|
|
|
175
99
|
export function isEnabled() {
|
|
176
100
|
return config.cache.enabled;
|
|
177
101
|
}
|
|
178
|
-
function buildCacheEntry(
|
|
179
|
-
|
|
102
|
+
function buildCacheEntry(content, metadata) {
|
|
103
|
+
return {
|
|
180
104
|
url: metadata.url,
|
|
181
105
|
content,
|
|
182
106
|
fetchedAt: new Date().toISOString(),
|
|
183
|
-
expiresAt: new Date(
|
|
107
|
+
expiresAt: new Date(resolveExpiryTimestamp()).toISOString(),
|
|
108
|
+
...(metadata.title === undefined ? {} : { title: metadata.title }),
|
|
184
109
|
};
|
|
185
|
-
if (metadata.title !== undefined) {
|
|
186
|
-
entry.title = metadata.title;
|
|
187
|
-
}
|
|
188
|
-
return entry;
|
|
189
110
|
}
|
|
190
111
|
function persistCacheEntry(cacheKey, entry) {
|
|
191
|
-
const expiresAt =
|
|
112
|
+
const expiresAt = resolveExpiryTimestamp();
|
|
192
113
|
contentCache.set(cacheKey, { entry, expiresAt });
|
|
193
|
-
|
|
114
|
+
trimCacheToMaxKeys();
|
|
194
115
|
emitCacheUpdate(cacheKey);
|
|
195
116
|
}
|
|
196
|
-
function
|
|
117
|
+
function trimCacheToMaxKeys() {
|
|
197
118
|
if (contentCache.size <= config.cache.maxKeys)
|
|
198
119
|
return;
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
120
|
+
removeOldestEntries(contentCache.size - config.cache.maxKeys);
|
|
121
|
+
}
|
|
122
|
+
function removeOldestEntries(count) {
|
|
123
|
+
if (count <= 0)
|
|
124
|
+
return;
|
|
125
|
+
let removed = 0;
|
|
126
|
+
for (const key of contentCache.keys()) {
|
|
127
|
+
contentCache.delete(key);
|
|
128
|
+
removed += 1;
|
|
129
|
+
if (removed >= count)
|
|
130
|
+
return;
|
|
206
131
|
}
|
|
207
132
|
}
|
|
133
|
+
function resolveExpiryTimestamp() {
|
|
134
|
+
return Date.now() + config.cache.ttl * 1000;
|
|
135
|
+
}
|
|
136
|
+
function logCacheError(message, cacheKey, error) {
|
|
137
|
+
logWarn(message, {
|
|
138
|
+
key: cacheKey.length > 100 ? cacheKey.slice(0, 100) : cacheKey,
|
|
139
|
+
error: getErrorMessage(error),
|
|
140
|
+
});
|
|
141
|
+
}
|
|
@@ -3,7 +3,6 @@ interface RequestContext {
|
|
|
3
3
|
readonly sessionId?: string;
|
|
4
4
|
}
|
|
5
5
|
export declare function runWithRequestContext<T>(context: RequestContext, fn: () => T): T;
|
|
6
|
-
export declare function bindToRequestContext<T extends (...args: unknown[]) => unknown>(fn: T): T;
|
|
7
6
|
export declare function getRequestId(): string | undefined;
|
|
8
7
|
export declare function getSessionId(): string | undefined;
|
|
9
8
|
export {};
|
package/dist/services/context.js
CHANGED
|
@@ -3,13 +3,6 @@ const requestContext = new AsyncLocalStorage();
|
|
|
3
3
|
export function runWithRequestContext(context, fn) {
|
|
4
4
|
return requestContext.run(context, fn);
|
|
5
5
|
}
|
|
6
|
-
export function bindToRequestContext(fn) {
|
|
7
|
-
const store = requestContext.getStore();
|
|
8
|
-
if (!store) {
|
|
9
|
-
return fn;
|
|
10
|
-
}
|
|
11
|
-
return ((...args) => requestContext.run(store, () => fn(...args)));
|
|
12
|
-
}
|
|
13
6
|
export function getRequestId() {
|
|
14
7
|
return requestContext.getStore()?.requestId;
|
|
15
8
|
}
|
|
@@ -1,91 +1,18 @@
|
|
|
1
1
|
import { parseHTML } from 'linkedom';
|
|
2
2
|
import { Readability } from '@mozilla/readability';
|
|
3
3
|
import { getErrorMessage } from '../utils/error-utils.js';
|
|
4
|
+
import { isRecord } from '../utils/guards.js';
|
|
4
5
|
import { truncateHtml } from '../utils/html-truncator.js';
|
|
5
6
|
import { logError, logInfo, logWarn } from './logger.js';
|
|
6
|
-
import {
|
|
7
|
-
function collectMetaTag(state, tag) {
|
|
8
|
-
const content = getMetaContent(tag);
|
|
9
|
-
if (!content)
|
|
10
|
-
return;
|
|
11
|
-
if (collectOpenGraphMeta(state, tag, content))
|
|
12
|
-
return;
|
|
13
|
-
if (collectTwitterMeta(state, tag, content))
|
|
14
|
-
return;
|
|
15
|
-
collectStandardMeta(state, tag, content);
|
|
16
|
-
}
|
|
17
|
-
function getMetaContent(tag) {
|
|
18
|
-
return tag.getAttribute('content')?.trim() ?? null;
|
|
19
|
-
}
|
|
20
|
-
function collectOpenGraphMeta(state, tag, content) {
|
|
21
|
-
const property = tag.getAttribute('property');
|
|
22
|
-
if (!property?.startsWith('og:'))
|
|
23
|
-
return false;
|
|
24
|
-
const key = property.replace('og:', '');
|
|
25
|
-
if (key === 'title')
|
|
26
|
-
state.title.og = content;
|
|
27
|
-
if (key === 'description')
|
|
28
|
-
state.description.og = content;
|
|
29
|
-
return true;
|
|
30
|
-
}
|
|
31
|
-
function collectTwitterMeta(state, tag, content) {
|
|
32
|
-
const name = tag.getAttribute('name');
|
|
33
|
-
if (!name?.startsWith('twitter:'))
|
|
34
|
-
return false;
|
|
35
|
-
const key = name.replace('twitter:', '');
|
|
36
|
-
if (key === 'title')
|
|
37
|
-
state.title.twitter = content;
|
|
38
|
-
if (key === 'description')
|
|
39
|
-
state.description.twitter = content;
|
|
40
|
-
return true;
|
|
41
|
-
}
|
|
42
|
-
function collectStandardMeta(state, tag, content) {
|
|
43
|
-
const name = tag.getAttribute('name');
|
|
44
|
-
if (name === 'description') {
|
|
45
|
-
state.description.standard = content;
|
|
46
|
-
}
|
|
47
|
-
if (name === 'author') {
|
|
48
|
-
state.author.standard = content;
|
|
49
|
-
}
|
|
50
|
-
}
|
|
51
|
-
function scanMetaTags(document, state) {
|
|
52
|
-
const metaTags = document.querySelectorAll('meta');
|
|
53
|
-
for (const tag of metaTags) {
|
|
54
|
-
collectMetaTag(state, tag);
|
|
55
|
-
}
|
|
56
|
-
}
|
|
57
|
-
function ensureTitleFallback(document, state) {
|
|
58
|
-
if (state.title.standard)
|
|
59
|
-
return;
|
|
60
|
-
const titleEl = document.querySelector('title');
|
|
61
|
-
if (titleEl?.textContent) {
|
|
62
|
-
state.title.standard = titleEl.textContent.trim();
|
|
63
|
-
}
|
|
64
|
-
}
|
|
65
|
-
function extractMetadata(document) {
|
|
66
|
-
const state = createMetaCollectorState();
|
|
67
|
-
scanMetaTags(document, state);
|
|
68
|
-
ensureTitleFallback(document, state);
|
|
69
|
-
const metadata = {};
|
|
70
|
-
const title = resolveMetaField(state, 'title');
|
|
71
|
-
const description = resolveMetaField(state, 'description');
|
|
72
|
-
const author = resolveMetaField(state, 'author');
|
|
73
|
-
if (title !== undefined)
|
|
74
|
-
metadata.title = title;
|
|
75
|
-
if (description !== undefined)
|
|
76
|
-
metadata.description = description;
|
|
77
|
-
if (author !== undefined)
|
|
78
|
-
metadata.author = author;
|
|
79
|
-
return metadata;
|
|
80
|
-
}
|
|
7
|
+
import { extractMetadata } from './metadata-collector.js';
|
|
81
8
|
function isReadabilityCompatible(doc) {
|
|
82
|
-
if (!doc
|
|
9
|
+
if (!isRecord(doc))
|
|
83
10
|
return false;
|
|
84
11
|
if (!('documentElement' in doc))
|
|
85
12
|
return false;
|
|
86
|
-
if (
|
|
13
|
+
if (typeof doc.querySelectorAll !== 'function')
|
|
87
14
|
return false;
|
|
88
|
-
if (
|
|
15
|
+
if (typeof doc.querySelector !== 'function')
|
|
89
16
|
return false;
|
|
90
17
|
return true;
|
|
91
18
|
}
|
|
@@ -138,12 +65,13 @@ export function extractContent(html, url, options = { extractArticle: true }) {
|
|
|
138
65
|
}
|
|
139
66
|
function tryExtractContent(html, url, options) {
|
|
140
67
|
try {
|
|
141
|
-
const
|
|
142
|
-
const { document } = parseHTML(processedHtml);
|
|
68
|
+
const { document } = parseHTML(truncateHtml(html));
|
|
143
69
|
applyBaseUri(document, url);
|
|
144
70
|
const metadata = extractMetadata(document);
|
|
145
|
-
|
|
146
|
-
|
|
71
|
+
return {
|
|
72
|
+
article: options.extractArticle ? extractArticle(document) : null,
|
|
73
|
+
metadata,
|
|
74
|
+
};
|
|
147
75
|
}
|
|
148
76
|
catch (error) {
|
|
149
77
|
logError('Failed to extract content', error instanceof Error ? error : undefined);
|
|
@@ -1,3 +1,3 @@
|
|
|
1
|
-
import {
|
|
2
|
-
export declare const dispatcher:
|
|
1
|
+
import { type Dispatcher } from 'undici';
|
|
2
|
+
export declare const dispatcher: Dispatcher;
|
|
3
3
|
export declare function destroyAgents(): void;
|
|
@@ -2,26 +2,14 @@ import dns from 'node:dns';
|
|
|
2
2
|
import os from 'node:os';
|
|
3
3
|
import { Agent } from 'undici';
|
|
4
4
|
import { createErrorWithCode } from '../../utils/error-utils.js';
|
|
5
|
-
import {
|
|
5
|
+
import { isRecord } from '../../utils/guards.js';
|
|
6
|
+
import { handleLookupResult } from './dns-selection.js';
|
|
6
7
|
const DNS_LOOKUP_TIMEOUT_MS = 5000;
|
|
7
8
|
function resolveDns(hostname, options, callback) {
|
|
8
9
|
const { normalizedOptions, useAll, resolvedFamily } = buildLookupContext(options);
|
|
9
10
|
const lookupOptions = buildLookupOptions(normalizedOptions);
|
|
10
|
-
|
|
11
|
-
const
|
|
12
|
-
if (done)
|
|
13
|
-
return;
|
|
14
|
-
done = true;
|
|
15
|
-
callback(createErrorWithCode(`DNS lookup timed out for ${hostname}`, 'ETIMEOUT'), []);
|
|
16
|
-
}, DNS_LOOKUP_TIMEOUT_MS);
|
|
17
|
-
timer.unref();
|
|
18
|
-
const safeCallback = (err, address, family) => {
|
|
19
|
-
if (done)
|
|
20
|
-
return;
|
|
21
|
-
done = true;
|
|
22
|
-
clearTimeout(timer);
|
|
23
|
-
callback(err, address, family);
|
|
24
|
-
};
|
|
11
|
+
const timeout = createLookupTimeout(hostname, callback);
|
|
12
|
+
const safeCallback = wrapLookupCallback(callback, timeout);
|
|
25
13
|
dns.lookup(hostname, lookupOptions, createLookupCallback(hostname, resolvedFamily, useAll, safeCallback));
|
|
26
14
|
}
|
|
27
15
|
function normalizeLookupOptions(options) {
|
|
@@ -46,17 +34,19 @@ function resolveResultOrder(options) {
|
|
|
46
34
|
return DEFAULT_DNS_ORDER;
|
|
47
35
|
}
|
|
48
36
|
function getLegacyVerbatim(options) {
|
|
49
|
-
|
|
50
|
-
|
|
37
|
+
if (isRecord(options)) {
|
|
38
|
+
const { verbatim } = options;
|
|
39
|
+
return typeof verbatim === 'boolean' ? verbatim : undefined;
|
|
40
|
+
}
|
|
41
|
+
return undefined;
|
|
51
42
|
}
|
|
52
43
|
function buildLookupOptions(normalizedOptions) {
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
44
|
+
return {
|
|
45
|
+
family: normalizedOptions.family,
|
|
46
|
+
hints: normalizedOptions.hints,
|
|
56
47
|
all: true,
|
|
48
|
+
order: resolveResultOrder(normalizedOptions),
|
|
57
49
|
};
|
|
58
|
-
delete options.verbatim;
|
|
59
|
-
return options;
|
|
60
50
|
}
|
|
61
51
|
function createLookupCallback(hostname, resolvedFamily, useAll, callback) {
|
|
62
52
|
return (err, addresses) => {
|
|
@@ -70,81 +60,30 @@ function resolveFamily(family) {
|
|
|
70
60
|
return 6;
|
|
71
61
|
return family;
|
|
72
62
|
}
|
|
73
|
-
function
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
return;
|
|
83
|
-
}
|
|
84
|
-
const list = normalizeLookupResults(addresses, resolvedFamily);
|
|
85
|
-
const invalidFamilyError = findInvalidFamilyError(list, hostname);
|
|
86
|
-
if (invalidFamilyError) {
|
|
87
|
-
callback(invalidFamilyError, list);
|
|
88
|
-
return;
|
|
89
|
-
}
|
|
90
|
-
const blockedError = findBlockedIpError(list, hostname);
|
|
91
|
-
if (blockedError) {
|
|
92
|
-
callback(blockedError, list);
|
|
93
|
-
return;
|
|
94
|
-
}
|
|
95
|
-
const selection = selectLookupResult(list, useAll, hostname);
|
|
96
|
-
if (selection.error) {
|
|
97
|
-
callback(selection.error, selection.fallback);
|
|
98
|
-
return;
|
|
99
|
-
}
|
|
100
|
-
callback(null, selection.address, selection.family);
|
|
101
|
-
}
|
|
102
|
-
function selectLookupResult(list, useAll, hostname) {
|
|
103
|
-
if (list.length === 0) {
|
|
104
|
-
return {
|
|
105
|
-
error: createNoDnsResultsError(hostname),
|
|
106
|
-
fallback: [],
|
|
107
|
-
address: [],
|
|
108
|
-
};
|
|
109
|
-
}
|
|
110
|
-
if (useAll) {
|
|
111
|
-
return { address: list, fallback: list };
|
|
112
|
-
}
|
|
113
|
-
const first = list.at(0);
|
|
114
|
-
if (!first) {
|
|
115
|
-
return {
|
|
116
|
-
error: createNoDnsResultsError(hostname),
|
|
117
|
-
fallback: [],
|
|
118
|
-
address: [],
|
|
119
|
-
};
|
|
120
|
-
}
|
|
63
|
+
function createLookupTimeout(hostname, callback) {
|
|
64
|
+
let done = false;
|
|
65
|
+
const timer = setTimeout(() => {
|
|
66
|
+
if (done)
|
|
67
|
+
return;
|
|
68
|
+
done = true;
|
|
69
|
+
callback(createErrorWithCode(`DNS lookup timed out for ${hostname}`, 'ETIMEOUT'), []);
|
|
70
|
+
}, DNS_LOOKUP_TIMEOUT_MS);
|
|
71
|
+
timer.unref();
|
|
121
72
|
return {
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
73
|
+
isDone: () => done,
|
|
74
|
+
markDone: () => {
|
|
75
|
+
done = true;
|
|
76
|
+
clearTimeout(timer);
|
|
77
|
+
},
|
|
125
78
|
};
|
|
126
79
|
}
|
|
127
|
-
function
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
}
|
|
135
|
-
return null;
|
|
136
|
-
}
|
|
137
|
-
function findInvalidFamilyError(list, hostname) {
|
|
138
|
-
for (const addr of list) {
|
|
139
|
-
const family = typeof addr === 'string' ? 0 : addr.family;
|
|
140
|
-
if (family === 4 || family === 6)
|
|
141
|
-
continue;
|
|
142
|
-
return createErrorWithCode(`Invalid address family returned for ${hostname}`, 'EINVAL');
|
|
143
|
-
}
|
|
144
|
-
return null;
|
|
145
|
-
}
|
|
146
|
-
function createNoDnsResultsError(hostname) {
|
|
147
|
-
return createErrorWithCode(`No DNS results returned for ${hostname}`, 'ENODATA');
|
|
80
|
+
function wrapLookupCallback(callback, timeout) {
|
|
81
|
+
return (err, address, family) => {
|
|
82
|
+
if (timeout.isDone())
|
|
83
|
+
return;
|
|
84
|
+
timeout.markDone();
|
|
85
|
+
callback(err, address, family);
|
|
86
|
+
};
|
|
148
87
|
}
|
|
149
88
|
function getAgentOptions() {
|
|
150
89
|
const cpuCount = os.availableParallelism();
|
|
@@ -0,0 +1,2 @@
|
|
|
1
|
+
import type { LookupAddress } from 'node:dns';
|
|
2
|
+
export declare function handleLookupResult(error: NodeJS.ErrnoException | null, addresses: string | LookupAddress[], hostname: string, resolvedFamily: number | undefined, useAll: boolean, callback: (err: NodeJS.ErrnoException | null, address: string | LookupAddress[], family?: number) => void): void;
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
import { createErrorWithCode } from '../../utils/error-utils.js';
|
|
2
|
+
import { isBlockedIp } from '../../utils/url-validator.js';
|
|
3
|
+
function normalizeLookupResults(addresses, family) {
|
|
4
|
+
if (Array.isArray(addresses)) {
|
|
5
|
+
return addresses;
|
|
6
|
+
}
|
|
7
|
+
return [{ address: addresses, family: family ?? 4 }];
|
|
8
|
+
}
|
|
9
|
+
function findBlockedIpError(list, hostname) {
|
|
10
|
+
for (const addr of list) {
|
|
11
|
+
const ip = typeof addr === 'string' ? addr : addr.address;
|
|
12
|
+
if (!isBlockedIp(ip)) {
|
|
13
|
+
continue;
|
|
14
|
+
}
|
|
15
|
+
return createErrorWithCode(`Blocked IP detected for ${hostname}`, 'EBLOCKED');
|
|
16
|
+
}
|
|
17
|
+
return null;
|
|
18
|
+
}
|
|
19
|
+
function findInvalidFamilyError(list, hostname) {
|
|
20
|
+
for (const addr of list) {
|
|
21
|
+
const family = typeof addr === 'string' ? 0 : addr.family;
|
|
22
|
+
if (family === 4 || family === 6)
|
|
23
|
+
continue;
|
|
24
|
+
return createErrorWithCode(`Invalid address family returned for ${hostname}`, 'EINVAL');
|
|
25
|
+
}
|
|
26
|
+
return null;
|
|
27
|
+
}
|
|
28
|
+
function createNoDnsResultsError(hostname) {
|
|
29
|
+
return createErrorWithCode(`No DNS results returned for ${hostname}`, 'ENODATA');
|
|
30
|
+
}
|
|
31
|
+
function createEmptySelection(hostname) {
|
|
32
|
+
return {
|
|
33
|
+
error: createNoDnsResultsError(hostname),
|
|
34
|
+
fallback: [],
|
|
35
|
+
address: [],
|
|
36
|
+
};
|
|
37
|
+
}
|
|
38
|
+
function selectLookupResult(list, useAll, hostname) {
|
|
39
|
+
if (list.length === 0)
|
|
40
|
+
return createEmptySelection(hostname);
|
|
41
|
+
if (useAll)
|
|
42
|
+
return { address: list, fallback: list };
|
|
43
|
+
const first = list.at(0);
|
|
44
|
+
if (!first)
|
|
45
|
+
return createEmptySelection(hostname);
|
|
46
|
+
return {
|
|
47
|
+
address: first.address,
|
|
48
|
+
family: first.family,
|
|
49
|
+
fallback: list,
|
|
50
|
+
};
|
|
51
|
+
}
|
|
52
|
+
function findLookupError(list, hostname) {
|
|
53
|
+
return (findInvalidFamilyError(list, hostname) ?? findBlockedIpError(list, hostname));
|
|
54
|
+
}
|
|
55
|
+
export function handleLookupResult(error, addresses, hostname, resolvedFamily, useAll, callback) {
|
|
56
|
+
if (error) {
|
|
57
|
+
callback(error, addresses);
|
|
58
|
+
return;
|
|
59
|
+
}
|
|
60
|
+
const list = normalizeLookupResults(addresses, resolvedFamily);
|
|
61
|
+
const lookupError = findLookupError(list, hostname);
|
|
62
|
+
if (lookupError) {
|
|
63
|
+
callback(lookupError, list);
|
|
64
|
+
return;
|
|
65
|
+
}
|
|
66
|
+
const selection = selectLookupResult(list, useAll, hostname);
|
|
67
|
+
if (selection.error) {
|
|
68
|
+
callback(selection.error, selection.fallback);
|
|
69
|
+
return;
|
|
70
|
+
}
|
|
71
|
+
callback(null, selection.address, selection.family);
|
|
72
|
+
}
|
|
@@ -1,25 +1,3 @@
|
|
|
1
|
-
export type FetchChannelEvent = {
|
|
2
|
-
v: 1;
|
|
3
|
-
type: 'start';
|
|
4
|
-
requestId: string;
|
|
5
|
-
method: string;
|
|
6
|
-
url: string;
|
|
7
|
-
} | {
|
|
8
|
-
v: 1;
|
|
9
|
-
type: 'end';
|
|
10
|
-
requestId: string;
|
|
11
|
-
status: number;
|
|
12
|
-
duration: number;
|
|
13
|
-
} | {
|
|
14
|
-
v: 1;
|
|
15
|
-
type: 'error';
|
|
16
|
-
requestId: string;
|
|
17
|
-
url: string;
|
|
18
|
-
error: string;
|
|
19
|
-
code?: string;
|
|
20
|
-
status?: number;
|
|
21
|
-
duration: number;
|
|
22
|
-
};
|
|
23
1
|
interface FetchTelemetryContext {
|
|
24
2
|
requestId: string;
|
|
25
3
|
startTime: number;
|