@j0hanz/superfetch 1.2.5 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +131 -156
- package/dist/config/auth-config.d.ts +16 -0
- package/dist/config/auth-config.js +53 -0
- package/dist/config/constants.d.ts +11 -13
- package/dist/config/constants.js +1 -3
- package/dist/config/env-parsers.d.ts +7 -0
- package/dist/config/env-parsers.js +84 -0
- package/dist/config/formatting.d.ts +2 -2
- package/dist/config/index.d.ts +47 -53
- package/dist/config/index.js +35 -64
- package/dist/config/types/content.d.ts +1 -49
- package/dist/config/types/runtime.d.ts +8 -16
- package/dist/config/types/tools.d.ts +2 -28
- package/dist/http/accept-policy.d.ts +3 -0
- package/dist/http/accept-policy.js +45 -0
- package/dist/http/async-handler.d.ts +2 -0
- package/dist/http/async-handler.js +5 -0
- package/dist/http/auth-introspection.d.ts +2 -0
- package/dist/http/auth-introspection.js +141 -0
- package/dist/http/auth-static.d.ts +2 -0
- package/dist/http/auth-static.js +23 -0
- package/dist/http/auth.d.ts +3 -2
- package/dist/http/auth.js +254 -23
- package/dist/http/cors.d.ts +6 -6
- package/dist/http/cors.js +7 -42
- package/dist/http/download-routes.d.ts +0 -12
- package/dist/http/download-routes.js +21 -58
- package/dist/http/host-allowlist.d.ts +3 -0
- package/dist/http/host-allowlist.js +117 -0
- package/dist/http/jsonrpc-http.d.ts +2 -0
- package/dist/http/jsonrpc-http.js +10 -0
- package/dist/http/mcp-routes.d.ts +8 -3
- package/dist/http/mcp-routes.js +137 -31
- package/dist/http/mcp-session-eviction.d.ts +3 -0
- package/dist/http/mcp-session-eviction.js +24 -0
- package/dist/http/mcp-session-helpers.d.ts +0 -1
- package/dist/http/mcp-session-helpers.js +1 -1
- package/dist/http/mcp-session-init.d.ts +7 -0
- package/dist/http/mcp-session-init.js +94 -0
- package/dist/http/mcp-session-slots.d.ts +17 -0
- package/dist/http/mcp-session-slots.js +55 -0
- package/dist/http/mcp-session-transport-init.d.ts +7 -0
- package/dist/http/mcp-session-transport-init.js +41 -0
- package/dist/http/mcp-session-transport.d.ts +7 -0
- package/dist/http/mcp-session-transport.js +57 -0
- package/dist/http/mcp-session-types.d.ts +5 -0
- package/dist/http/mcp-session-types.js +1 -0
- package/dist/http/mcp-session.d.ts +9 -9
- package/dist/http/mcp-session.js +15 -137
- package/dist/http/mcp-sessions.d.ts +43 -0
- package/dist/http/mcp-sessions.js +392 -0
- package/dist/http/mcp-validation.d.ts +1 -0
- package/dist/http/mcp-validation.js +11 -10
- package/dist/http/protocol-policy.d.ts +2 -0
- package/dist/http/protocol-policy.js +31 -0
- package/dist/http/rate-limit.js +7 -4
- package/dist/http/server-config.d.ts +1 -0
- package/dist/http/server-config.js +40 -0
- package/dist/http/server-middleware.d.ts +7 -9
- package/dist/http/server-middleware.js +9 -70
- package/dist/http/server-shutdown.d.ts +4 -0
- package/dist/http/server-shutdown.js +43 -0
- package/dist/http/server.d.ts +10 -0
- package/dist/http/server.js +546 -61
- package/dist/http/session-cleanup.js +8 -5
- package/dist/middleware/error-handler.d.ts +1 -1
- package/dist/middleware/error-handler.js +32 -33
- package/dist/resources/cached-content-params.d.ts +5 -0
- package/dist/resources/cached-content-params.js +36 -0
- package/dist/resources/cached-content.js +67 -125
- package/dist/resources/index.js +0 -82
- package/dist/server.js +50 -29
- package/dist/services/cache-events.d.ts +8 -0
- package/dist/services/cache-events.js +19 -0
- package/dist/services/cache-keys.d.ts +7 -0
- package/dist/services/cache-keys.js +57 -0
- package/dist/services/cache.d.ts +4 -9
- package/dist/services/cache.js +77 -139
- package/dist/services/context.d.ts +0 -1
- package/dist/services/context.js +0 -7
- package/dist/services/extractor.js +55 -116
- package/dist/services/fetcher/agents.d.ts +2 -2
- package/dist/services/fetcher/agents.js +35 -96
- package/dist/services/fetcher/dns-selection.d.ts +2 -0
- package/dist/services/fetcher/dns-selection.js +72 -0
- package/dist/services/fetcher/interceptors.d.ts +0 -22
- package/dist/services/fetcher/interceptors.js +18 -32
- package/dist/services/fetcher/redirects.js +16 -7
- package/dist/services/fetcher/response.js +79 -34
- package/dist/services/fetcher.d.ts +22 -3
- package/dist/services/fetcher.js +544 -44
- package/dist/services/fifo-queue.d.ts +8 -0
- package/dist/services/fifo-queue.js +25 -0
- package/dist/services/logger.js +2 -2
- package/dist/services/metadata-collector.d.ts +1 -9
- package/dist/services/metadata-collector.js +71 -2
- package/dist/services/transform-worker-pool.d.ts +4 -14
- package/dist/services/transform-worker-pool.js +177 -129
- package/dist/services/transform-worker-types.d.ts +32 -0
- package/dist/services/transform-worker-types.js +14 -0
- package/dist/tools/handlers/fetch-markdown.tool.d.ts +3 -4
- package/dist/tools/handlers/fetch-markdown.tool.js +20 -72
- package/dist/tools/handlers/fetch-single.shared.d.ts +11 -22
- package/dist/tools/handlers/fetch-single.shared.js +175 -89
- package/dist/tools/handlers/fetch-url.tool.d.ts +7 -1
- package/dist/tools/handlers/fetch-url.tool.js +84 -119
- package/dist/tools/index.js +21 -40
- package/dist/tools/schemas.d.ts +1 -51
- package/dist/tools/schemas.js +1 -107
- package/dist/tools/utils/cached-markdown.d.ts +5 -0
- package/dist/tools/utils/cached-markdown.js +46 -0
- package/dist/tools/utils/content-shaping.d.ts +4 -0
- package/dist/tools/utils/content-shaping.js +67 -0
- package/dist/tools/utils/content-transform.d.ts +5 -17
- package/dist/tools/utils/content-transform.js +134 -114
- package/dist/tools/utils/fetch-pipeline.d.ts +0 -8
- package/dist/tools/utils/fetch-pipeline.js +57 -63
- package/dist/tools/utils/frontmatter.d.ts +3 -0
- package/dist/tools/utils/frontmatter.js +73 -0
- package/dist/tools/utils/inline-content.d.ts +1 -2
- package/dist/tools/utils/inline-content.js +4 -7
- package/dist/tools/utils/markdown-heuristics.d.ts +1 -0
- package/dist/tools/utils/markdown-heuristics.js +19 -0
- package/dist/tools/utils/markdown-signals.d.ts +1 -0
- package/dist/tools/utils/markdown-signals.js +19 -0
- package/dist/tools/utils/raw-markdown-frontmatter.d.ts +3 -0
- package/dist/tools/utils/raw-markdown-frontmatter.js +73 -0
- package/dist/tools/utils/raw-markdown.d.ts +6 -0
- package/dist/tools/utils/raw-markdown.js +135 -0
- package/dist/transformers/markdown/fenced-code-rule.d.ts +2 -0
- package/dist/transformers/markdown/fenced-code-rule.js +38 -0
- package/dist/transformers/markdown/frontmatter.d.ts +2 -0
- package/dist/transformers/markdown/frontmatter.js +45 -0
- package/dist/transformers/markdown/noise-rule.d.ts +2 -0
- package/dist/transformers/markdown/noise-rule.js +80 -0
- package/dist/transformers/markdown/turndown-instance.d.ts +2 -0
- package/dist/transformers/markdown/turndown-instance.js +19 -0
- package/dist/transformers/markdown.d.ts +2 -0
- package/dist/transformers/markdown.js +185 -0
- package/dist/transformers/markdown.transformer.js +5 -117
- package/dist/utils/cached-payload.d.ts +7 -0
- package/dist/utils/cached-payload.js +36 -0
- package/dist/utils/code-language-bash.d.ts +1 -0
- package/dist/utils/code-language-bash.js +48 -0
- package/dist/utils/code-language-core.d.ts +2 -0
- package/dist/utils/code-language-core.js +13 -0
- package/dist/utils/code-language-detectors.d.ts +5 -0
- package/dist/utils/code-language-detectors.js +142 -0
- package/dist/utils/code-language-helpers.d.ts +5 -0
- package/dist/utils/code-language-helpers.js +62 -0
- package/dist/utils/code-language-parsing.d.ts +5 -0
- package/dist/utils/code-language-parsing.js +62 -0
- package/dist/utils/code-language.d.ts +9 -0
- package/dist/utils/code-language.js +250 -46
- package/dist/utils/error-details.d.ts +3 -0
- package/dist/utils/error-details.js +12 -0
- package/dist/utils/error-utils.js +1 -1
- package/dist/utils/filename-generator.js +34 -12
- package/dist/utils/guards.d.ts +1 -0
- package/dist/utils/guards.js +3 -0
- package/dist/utils/header-normalizer.d.ts +0 -3
- package/dist/utils/header-normalizer.js +3 -3
- package/dist/utils/ip-address.d.ts +4 -0
- package/dist/utils/ip-address.js +6 -0
- package/dist/utils/tool-error-handler.d.ts +2 -2
- package/dist/utils/tool-error-handler.js +14 -46
- package/dist/utils/url-transformer.d.ts +7 -0
- package/dist/utils/url-transformer.js +147 -0
- package/dist/utils/url-validator.d.ts +1 -2
- package/dist/utils/url-validator.js +53 -114
- package/dist/workers/content-transform.worker.d.ts +1 -0
- package/dist/workers/content-transform.worker.js +40 -0
- package/package.json +17 -18
package/dist/services/cache.d.ts
CHANGED
|
@@ -1,20 +1,15 @@
|
|
|
1
1
|
import type { CacheEntry } from '../config/types/content.js';
|
|
2
|
-
interface
|
|
2
|
+
export interface CacheUpdateEvent {
|
|
3
|
+
cacheKey: string;
|
|
3
4
|
namespace: string;
|
|
4
5
|
urlHash: string;
|
|
5
6
|
}
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
}
|
|
7
|
+
type CacheUpdateListener = (event: CacheUpdateEvent) => void;
|
|
8
|
+
export declare function onCacheUpdate(listener: CacheUpdateListener): () => void;
|
|
9
9
|
interface CacheEntryMetadata {
|
|
10
10
|
url: string;
|
|
11
11
|
title?: string;
|
|
12
12
|
}
|
|
13
|
-
type CacheUpdateListener = (event: CacheUpdateEvent) => void;
|
|
14
|
-
export declare function createCacheKey(namespace: string, url: string, vary?: Record<string, unknown> | string): string | null;
|
|
15
|
-
export declare function parseCacheKey(cacheKey: string): CacheKeyParts | null;
|
|
16
|
-
export declare function toResourceUri(cacheKey: string): string | null;
|
|
17
|
-
export declare function onCacheUpdate(listener: CacheUpdateListener): () => void;
|
|
18
13
|
export declare function get(cacheKey: string | null): CacheEntry | undefined;
|
|
19
14
|
export declare function set(cacheKey: string | null, content: string, metadata: CacheEntryMetadata): void;
|
|
20
15
|
export declare function keys(): readonly string[];
|
package/dist/services/cache.js
CHANGED
|
@@ -1,11 +1,28 @@
|
|
|
1
1
|
import { setInterval as setIntervalPromise } from 'node:timers/promises';
|
|
2
|
-
import { CACHE_HASH } from '../config/constants.js';
|
|
3
2
|
import { config } from '../config/index.js';
|
|
4
|
-
import {
|
|
5
|
-
import {
|
|
3
|
+
import { getErrorMessage } from '../utils/error-details.js';
|
|
4
|
+
import { parseCacheKey } from './cache-keys.js';
|
|
6
5
|
import { logWarn } from './logger.js';
|
|
7
6
|
const contentCache = new Map();
|
|
8
7
|
let cleanupController = null;
|
|
8
|
+
const updateListeners = new Set();
|
|
9
|
+
export function onCacheUpdate(listener) {
|
|
10
|
+
updateListeners.add(listener);
|
|
11
|
+
return () => {
|
|
12
|
+
updateListeners.delete(listener);
|
|
13
|
+
};
|
|
14
|
+
}
|
|
15
|
+
function notifyCacheUpdate(cacheKey) {
|
|
16
|
+
if (updateListeners.size === 0)
|
|
17
|
+
return;
|
|
18
|
+
const parts = parseCacheKey(cacheKey);
|
|
19
|
+
if (!parts)
|
|
20
|
+
return;
|
|
21
|
+
const event = { cacheKey, ...parts };
|
|
22
|
+
for (const listener of updateListeners) {
|
|
23
|
+
listener(event);
|
|
24
|
+
}
|
|
25
|
+
}
|
|
9
26
|
function startCleanupLoop() {
|
|
10
27
|
if (cleanupController)
|
|
11
28
|
return;
|
|
@@ -18,156 +35,73 @@ function startCleanupLoop() {
|
|
|
18
35
|
}
|
|
19
36
|
async function runCleanupLoop(signal) {
|
|
20
37
|
const intervalMs = Math.floor(config.cache.ttl * 1000);
|
|
21
|
-
for await (const
|
|
38
|
+
for await (const getNow of setIntervalPromise(intervalMs, Date.now, {
|
|
22
39
|
signal,
|
|
23
40
|
ref: false,
|
|
24
41
|
})) {
|
|
25
|
-
|
|
42
|
+
enforceCacheLimits(getNow());
|
|
26
43
|
}
|
|
27
44
|
}
|
|
28
|
-
function
|
|
29
|
-
const now = Date.now();
|
|
45
|
+
function enforceCacheLimits(now) {
|
|
30
46
|
for (const [key, item] of contentCache.entries()) {
|
|
31
47
|
if (now > item.expiresAt) {
|
|
32
48
|
contentCache.delete(key);
|
|
33
49
|
}
|
|
34
50
|
}
|
|
35
|
-
|
|
36
|
-
return;
|
|
37
|
-
const keysToRemove = contentCache.size - config.cache.maxKeys;
|
|
38
|
-
const iterator = contentCache.keys();
|
|
39
|
-
for (let i = 0; i < keysToRemove; i++) {
|
|
40
|
-
const { value, done } = iterator.next();
|
|
41
|
-
if (done)
|
|
42
|
-
break;
|
|
43
|
-
contentCache.delete(value);
|
|
44
|
-
}
|
|
51
|
+
trimCacheToMaxKeys();
|
|
45
52
|
}
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
if (value === null || value === undefined) {
|
|
49
|
-
return '';
|
|
50
|
-
}
|
|
51
|
-
if (typeof value !== 'object') {
|
|
52
|
-
return JSON.stringify(value);
|
|
53
|
-
}
|
|
54
|
-
if (Array.isArray(value)) {
|
|
55
|
-
return `[${value.map((item) => stableStringify(item)).join(',')}]`;
|
|
56
|
-
}
|
|
57
|
-
const entries = Object.entries(value)
|
|
58
|
-
.filter(([, entryValue]) => entryValue !== undefined)
|
|
59
|
-
.sort(([a], [b]) => a.localeCompare(b))
|
|
60
|
-
.map(([key, entryValue]) => `${JSON.stringify(key)}:${stableStringify(entryValue)}`);
|
|
61
|
-
return `{${entries.join(',')}}`;
|
|
62
|
-
}
|
|
63
|
-
function createHashFragment(input, length) {
|
|
64
|
-
return sha256Hex(input).substring(0, length);
|
|
65
|
-
}
|
|
66
|
-
/**
|
|
67
|
-
* Constructs a cache key from namespace, URL hash, and optional vary hash.
|
|
68
|
-
* Format: "namespace:urlHash" or "namespace:urlHash.varyHash" if vary params exist.
|
|
69
|
-
* @param namespace - Cache namespace (e.g., "fetch-markdown")
|
|
70
|
-
* @param urlHash - SHA-256 hash of the URL (truncated to 16 chars)
|
|
71
|
-
* @param varyHash - Optional hash of vary parameters (e.g., headers, options)
|
|
72
|
-
* @returns Complete cache key string
|
|
73
|
-
*/
|
|
74
|
-
function buildCacheKey(namespace, urlHash, varyHash) {
|
|
75
|
-
return varyHash
|
|
76
|
-
? `${namespace}:${urlHash}.${varyHash}`
|
|
77
|
-
: `${namespace}:${urlHash}`;
|
|
78
|
-
}
|
|
79
|
-
function getVaryHash(vary) {
|
|
80
|
-
if (!vary)
|
|
81
|
-
return undefined;
|
|
82
|
-
const varyString = typeof vary === 'string' ? vary : stableStringify(vary);
|
|
83
|
-
if (!varyString)
|
|
53
|
+
export function get(cacheKey) {
|
|
54
|
+
if (!isCacheReadable(cacheKey))
|
|
84
55
|
return undefined;
|
|
85
|
-
return
|
|
86
|
-
}
|
|
87
|
-
export function createCacheKey(namespace, url, vary) {
|
|
88
|
-
if (!namespace || !url)
|
|
89
|
-
return null;
|
|
90
|
-
const urlHash = createHashFragment(url, CACHE_HASH.URL_HASH_LENGTH);
|
|
91
|
-
const varyHash = getVaryHash(vary);
|
|
92
|
-
return buildCacheKey(namespace, urlHash, varyHash);
|
|
93
|
-
}
|
|
94
|
-
export function parseCacheKey(cacheKey) {
|
|
95
|
-
if (!cacheKey)
|
|
96
|
-
return null;
|
|
97
|
-
const [namespace, ...rest] = cacheKey.split(':');
|
|
98
|
-
const urlHash = rest.join(':');
|
|
99
|
-
if (!namespace || !urlHash)
|
|
100
|
-
return null;
|
|
101
|
-
return { namespace, urlHash };
|
|
102
|
-
}
|
|
103
|
-
export function toResourceUri(cacheKey) {
|
|
104
|
-
const parts = parseCacheKey(cacheKey);
|
|
105
|
-
if (!parts)
|
|
106
|
-
return null;
|
|
107
|
-
return `superfetch://cache/${parts.namespace}/${parts.urlHash}`;
|
|
56
|
+
return runCacheOperation(cacheKey, 'Cache get error', () => readCacheEntry(cacheKey));
|
|
108
57
|
}
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
return () => {
|
|
112
|
-
updateListeners.delete(listener);
|
|
113
|
-
};
|
|
58
|
+
function isCacheReadable(cacheKey) {
|
|
59
|
+
return config.cache.enabled && Boolean(cacheKey);
|
|
114
60
|
}
|
|
115
|
-
function
|
|
116
|
-
|
|
117
|
-
if (!parts)
|
|
118
|
-
return;
|
|
119
|
-
for (const listener of updateListeners) {
|
|
120
|
-
listener({ cacheKey, ...parts });
|
|
121
|
-
}
|
|
61
|
+
function isCacheWritable(cacheKey, content) {
|
|
62
|
+
return config.cache.enabled && Boolean(cacheKey) && Boolean(content);
|
|
122
63
|
}
|
|
123
|
-
|
|
124
|
-
if (!isCacheReadable(cacheKey))
|
|
125
|
-
return undefined;
|
|
64
|
+
function runCacheOperation(cacheKey, message, operation) {
|
|
126
65
|
try {
|
|
127
|
-
return
|
|
66
|
+
return operation();
|
|
128
67
|
}
|
|
129
68
|
catch (error) {
|
|
130
|
-
|
|
131
|
-
key: cacheKey.substring(0, 100),
|
|
132
|
-
error: getErrorMessage(error),
|
|
133
|
-
});
|
|
69
|
+
logCacheError(message, cacheKey, error);
|
|
134
70
|
return undefined;
|
|
135
71
|
}
|
|
136
72
|
}
|
|
137
|
-
function isCacheReadable(cacheKey) {
|
|
138
|
-
return config.cache.enabled && Boolean(cacheKey);
|
|
139
|
-
}
|
|
140
73
|
function readCacheEntry(cacheKey) {
|
|
74
|
+
const now = Date.now();
|
|
75
|
+
return readCacheItem(cacheKey, now)?.entry;
|
|
76
|
+
}
|
|
77
|
+
function isExpired(item, now) {
|
|
78
|
+
return now > item.expiresAt;
|
|
79
|
+
}
|
|
80
|
+
function readCacheItem(cacheKey, now) {
|
|
141
81
|
const item = contentCache.get(cacheKey);
|
|
142
82
|
if (!item)
|
|
143
83
|
return undefined;
|
|
144
|
-
if (isExpired(item)) {
|
|
84
|
+
if (isExpired(item, now)) {
|
|
145
85
|
contentCache.delete(cacheKey);
|
|
146
86
|
return undefined;
|
|
147
87
|
}
|
|
148
|
-
return item
|
|
149
|
-
}
|
|
150
|
-
function isExpired(item) {
|
|
151
|
-
return Date.now() > item.expiresAt;
|
|
88
|
+
return item;
|
|
152
89
|
}
|
|
153
90
|
export function set(cacheKey, content, metadata) {
|
|
154
|
-
if (!
|
|
91
|
+
if (!isCacheWritable(cacheKey, content))
|
|
155
92
|
return;
|
|
156
|
-
|
|
157
|
-
return;
|
|
158
|
-
if (!content)
|
|
159
|
-
return;
|
|
160
|
-
try {
|
|
93
|
+
runCacheOperation(cacheKey, 'Cache set error', () => {
|
|
161
94
|
startCleanupLoop();
|
|
162
|
-
const
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
95
|
+
const now = Date.now();
|
|
96
|
+
const expiresAtMs = now + config.cache.ttl * 1000;
|
|
97
|
+
const entry = buildCacheEntry({
|
|
98
|
+
content,
|
|
99
|
+
metadata,
|
|
100
|
+
fetchedAtMs: now,
|
|
101
|
+
expiresAtMs,
|
|
169
102
|
});
|
|
170
|
-
|
|
103
|
+
persistCacheEntry(cacheKey, entry, expiresAtMs);
|
|
104
|
+
});
|
|
171
105
|
}
|
|
172
106
|
export function keys() {
|
|
173
107
|
return Array.from(contentCache.keys());
|
|
@@ -175,33 +109,37 @@ export function keys() {
|
|
|
175
109
|
export function isEnabled() {
|
|
176
110
|
return config.cache.enabled;
|
|
177
111
|
}
|
|
178
|
-
function buildCacheEntry(
|
|
179
|
-
|
|
112
|
+
function buildCacheEntry({ content, metadata, fetchedAtMs, expiresAtMs, }) {
|
|
113
|
+
return {
|
|
180
114
|
url: metadata.url,
|
|
181
115
|
content,
|
|
182
|
-
fetchedAt: new Date().toISOString(),
|
|
183
|
-
expiresAt: new Date(
|
|
116
|
+
fetchedAt: new Date(fetchedAtMs).toISOString(),
|
|
117
|
+
expiresAt: new Date(expiresAtMs).toISOString(),
|
|
118
|
+
...(metadata.title === undefined ? {} : { title: metadata.title }),
|
|
184
119
|
};
|
|
185
|
-
if (metadata.title !== undefined) {
|
|
186
|
-
entry.title = metadata.title;
|
|
187
|
-
}
|
|
188
|
-
return entry;
|
|
189
120
|
}
|
|
190
|
-
function persistCacheEntry(cacheKey, entry) {
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
emitCacheUpdate(cacheKey);
|
|
121
|
+
function persistCacheEntry(cacheKey, entry, expiresAtMs) {
|
|
122
|
+
contentCache.set(cacheKey, { entry, expiresAt: expiresAtMs });
|
|
123
|
+
trimCacheToMaxKeys();
|
|
124
|
+
notifyCacheUpdate(cacheKey);
|
|
195
125
|
}
|
|
196
|
-
function
|
|
126
|
+
function trimCacheToMaxKeys() {
|
|
197
127
|
if (contentCache.size <= config.cache.maxKeys)
|
|
198
128
|
return;
|
|
199
|
-
|
|
129
|
+
removeOldestEntries(contentCache.size - config.cache.maxKeys);
|
|
130
|
+
}
|
|
131
|
+
function removeOldestEntries(count) {
|
|
200
132
|
const iterator = contentCache.keys();
|
|
201
|
-
for (let
|
|
202
|
-
const
|
|
203
|
-
if (done)
|
|
133
|
+
for (let removed = 0; removed < count; removed += 1) {
|
|
134
|
+
const next = iterator.next();
|
|
135
|
+
if (next.done)
|
|
204
136
|
break;
|
|
205
|
-
contentCache.delete(value);
|
|
137
|
+
contentCache.delete(next.value);
|
|
206
138
|
}
|
|
207
139
|
}
|
|
140
|
+
function logCacheError(message, cacheKey, error) {
|
|
141
|
+
logWarn(message, {
|
|
142
|
+
key: cacheKey.length > 100 ? cacheKey.slice(0, 100) : cacheKey,
|
|
143
|
+
error: getErrorMessage(error),
|
|
144
|
+
});
|
|
145
|
+
}
|
|
@@ -3,7 +3,6 @@ interface RequestContext {
|
|
|
3
3
|
readonly sessionId?: string;
|
|
4
4
|
}
|
|
5
5
|
export declare function runWithRequestContext<T>(context: RequestContext, fn: () => T): T;
|
|
6
|
-
export declare function bindToRequestContext<T extends (...args: unknown[]) => unknown>(fn: T): T;
|
|
7
6
|
export declare function getRequestId(): string | undefined;
|
|
8
7
|
export declare function getSessionId(): string | undefined;
|
|
9
8
|
export {};
|
package/dist/services/context.js
CHANGED
|
@@ -3,13 +3,6 @@ const requestContext = new AsyncLocalStorage();
|
|
|
3
3
|
export function runWithRequestContext(context, fn) {
|
|
4
4
|
return requestContext.run(context, fn);
|
|
5
5
|
}
|
|
6
|
-
export function bindToRequestContext(fn) {
|
|
7
|
-
const store = requestContext.getStore();
|
|
8
|
-
if (!store) {
|
|
9
|
-
return fn;
|
|
10
|
-
}
|
|
11
|
-
return ((...args) => requestContext.run(store, () => fn(...args)));
|
|
12
|
-
}
|
|
13
6
|
export function getRequestId() {
|
|
14
7
|
return requestContext.getStore()?.requestId;
|
|
15
8
|
}
|
|
@@ -1,101 +1,28 @@
|
|
|
1
1
|
import { parseHTML } from 'linkedom';
|
|
2
2
|
import { Readability } from '@mozilla/readability';
|
|
3
|
-
import { getErrorMessage } from '../utils/error-
|
|
3
|
+
import { getErrorMessage } from '../utils/error-details.js';
|
|
4
|
+
import { isRecord } from '../utils/guards.js';
|
|
4
5
|
import { truncateHtml } from '../utils/html-truncator.js';
|
|
5
6
|
import { logError, logInfo, logWarn } from './logger.js';
|
|
6
|
-
import {
|
|
7
|
-
function
|
|
8
|
-
|
|
9
|
-
if (!content)
|
|
10
|
-
return;
|
|
11
|
-
if (collectOpenGraphMeta(state, tag, content))
|
|
12
|
-
return;
|
|
13
|
-
if (collectTwitterMeta(state, tag, content))
|
|
14
|
-
return;
|
|
15
|
-
collectStandardMeta(state, tag, content);
|
|
16
|
-
}
|
|
17
|
-
function getMetaContent(tag) {
|
|
18
|
-
return tag.getAttribute('content')?.trim() ?? null;
|
|
19
|
-
}
|
|
20
|
-
function collectOpenGraphMeta(state, tag, content) {
|
|
21
|
-
const property = tag.getAttribute('property');
|
|
22
|
-
if (!property?.startsWith('og:'))
|
|
23
|
-
return false;
|
|
24
|
-
const key = property.replace('og:', '');
|
|
25
|
-
if (key === 'title')
|
|
26
|
-
state.title.og = content;
|
|
27
|
-
if (key === 'description')
|
|
28
|
-
state.description.og = content;
|
|
29
|
-
return true;
|
|
30
|
-
}
|
|
31
|
-
function collectTwitterMeta(state, tag, content) {
|
|
32
|
-
const name = tag.getAttribute('name');
|
|
33
|
-
if (!name?.startsWith('twitter:'))
|
|
7
|
+
import { extractMetadata } from './metadata-collector.js';
|
|
8
|
+
function isReadabilityCompatible(doc) {
|
|
9
|
+
if (!isRecord(doc))
|
|
34
10
|
return false;
|
|
35
|
-
|
|
36
|
-
if (key === 'title')
|
|
37
|
-
state.title.twitter = content;
|
|
38
|
-
if (key === 'description')
|
|
39
|
-
state.description.twitter = content;
|
|
40
|
-
return true;
|
|
41
|
-
}
|
|
42
|
-
function collectStandardMeta(state, tag, content) {
|
|
43
|
-
const name = tag.getAttribute('name');
|
|
44
|
-
if (name === 'description') {
|
|
45
|
-
state.description.standard = content;
|
|
46
|
-
}
|
|
47
|
-
if (name === 'author') {
|
|
48
|
-
state.author.standard = content;
|
|
49
|
-
}
|
|
11
|
+
return hasDocumentElement(doc) && hasQuerySelectors(doc);
|
|
50
12
|
}
|
|
51
|
-
function
|
|
52
|
-
|
|
53
|
-
for (const tag of metaTags) {
|
|
54
|
-
collectMetaTag(state, tag);
|
|
55
|
-
}
|
|
56
|
-
}
|
|
57
|
-
function ensureTitleFallback(document, state) {
|
|
58
|
-
if (state.title.standard)
|
|
59
|
-
return;
|
|
60
|
-
const titleEl = document.querySelector('title');
|
|
61
|
-
if (titleEl?.textContent) {
|
|
62
|
-
state.title.standard = titleEl.textContent.trim();
|
|
63
|
-
}
|
|
13
|
+
function hasDocumentElement(record) {
|
|
14
|
+
return 'documentElement' in record;
|
|
64
15
|
}
|
|
65
|
-
function
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
ensureTitleFallback(document, state);
|
|
69
|
-
const metadata = {};
|
|
70
|
-
const title = resolveMetaField(state, 'title');
|
|
71
|
-
const description = resolveMetaField(state, 'description');
|
|
72
|
-
const author = resolveMetaField(state, 'author');
|
|
73
|
-
if (title !== undefined)
|
|
74
|
-
metadata.title = title;
|
|
75
|
-
if (description !== undefined)
|
|
76
|
-
metadata.description = description;
|
|
77
|
-
if (author !== undefined)
|
|
78
|
-
metadata.author = author;
|
|
79
|
-
return metadata;
|
|
80
|
-
}
|
|
81
|
-
function isReadabilityCompatible(doc) {
|
|
82
|
-
if (!doc || typeof doc !== 'object')
|
|
83
|
-
return false;
|
|
84
|
-
if (!('documentElement' in doc))
|
|
85
|
-
return false;
|
|
86
|
-
if (!('querySelectorAll' in doc))
|
|
87
|
-
return false;
|
|
88
|
-
if (!('querySelector' in doc))
|
|
89
|
-
return false;
|
|
90
|
-
return true;
|
|
16
|
+
function hasQuerySelectors(record) {
|
|
17
|
+
return (typeof record.querySelectorAll === 'function' &&
|
|
18
|
+
typeof record.querySelector === 'function');
|
|
91
19
|
}
|
|
92
20
|
function extractArticle(document) {
|
|
93
21
|
if (!isReadabilityCompatible(document)) {
|
|
94
22
|
logWarn('Document not compatible with Readability');
|
|
95
23
|
return null;
|
|
96
24
|
}
|
|
97
|
-
|
|
98
|
-
return parsed ? mapReadabilityResult(parsed) : null;
|
|
25
|
+
return mapParsedArticle(parseReadabilityArticle(document));
|
|
99
26
|
}
|
|
100
27
|
function parseReadabilityArticle(document) {
|
|
101
28
|
try {
|
|
@@ -104,31 +31,38 @@ function parseReadabilityArticle(document) {
|
|
|
104
31
|
return reader.parse();
|
|
105
32
|
}
|
|
106
33
|
catch (error) {
|
|
107
|
-
logError('Failed to extract article with Readability', error
|
|
34
|
+
logError('Failed to extract article with Readability', asError(error));
|
|
108
35
|
return null;
|
|
109
36
|
}
|
|
110
37
|
}
|
|
38
|
+
function asError(error) {
|
|
39
|
+
if (error instanceof Error) {
|
|
40
|
+
return error;
|
|
41
|
+
}
|
|
42
|
+
return undefined;
|
|
43
|
+
}
|
|
44
|
+
function mapParsedArticle(parsed) {
|
|
45
|
+
return parsed ? mapReadabilityResult(parsed) : null;
|
|
46
|
+
}
|
|
111
47
|
function mapReadabilityResult(parsed) {
|
|
112
|
-
|
|
48
|
+
return {
|
|
113
49
|
content: parsed.content ?? '',
|
|
114
50
|
textContent: parsed.textContent ?? '',
|
|
51
|
+
...buildOptionalArticleFields(parsed),
|
|
115
52
|
};
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
if (
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
}
|
|
130
|
-
function toOptional(value) {
|
|
131
|
-
return value ?? undefined;
|
|
53
|
+
}
|
|
54
|
+
function buildOptionalArticleFields(parsed) {
|
|
55
|
+
const optional = {};
|
|
56
|
+
addOptionalField(optional, 'title', parsed.title);
|
|
57
|
+
addOptionalField(optional, 'byline', parsed.byline);
|
|
58
|
+
addOptionalField(optional, 'excerpt', parsed.excerpt);
|
|
59
|
+
addOptionalField(optional, 'siteName', parsed.siteName);
|
|
60
|
+
return optional;
|
|
61
|
+
}
|
|
62
|
+
function addOptionalField(target, key, value) {
|
|
63
|
+
if (value == null)
|
|
64
|
+
return;
|
|
65
|
+
target[key] = value;
|
|
132
66
|
}
|
|
133
67
|
export function extractContent(html, url, options = { extractArticle: true }) {
|
|
134
68
|
if (!isValidInput(html, url)) {
|
|
@@ -138,12 +72,13 @@ export function extractContent(html, url, options = { extractArticle: true }) {
|
|
|
138
72
|
}
|
|
139
73
|
function tryExtractContent(html, url, options) {
|
|
140
74
|
try {
|
|
141
|
-
const
|
|
142
|
-
const { document } = parseHTML(processedHtml);
|
|
75
|
+
const { document } = parseHTML(truncateHtml(html));
|
|
143
76
|
applyBaseUri(document, url);
|
|
144
77
|
const metadata = extractMetadata(document);
|
|
145
|
-
|
|
146
|
-
|
|
78
|
+
return {
|
|
79
|
+
article: resolveArticleExtraction(document, options.extractArticle),
|
|
80
|
+
metadata,
|
|
81
|
+
};
|
|
147
82
|
}
|
|
148
83
|
catch (error) {
|
|
149
84
|
logError('Failed to extract content', error instanceof Error ? error : undefined);
|
|
@@ -151,15 +86,19 @@ function tryExtractContent(html, url, options) {
|
|
|
151
86
|
}
|
|
152
87
|
}
|
|
153
88
|
function isValidInput(html, url) {
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
89
|
+
return (validateRequiredString(html, 'extractContent called with invalid HTML input') && validateRequiredString(url, 'extractContent called with invalid URL'));
|
|
90
|
+
}
|
|
91
|
+
function validateRequiredString(value, message) {
|
|
92
|
+
if (isNonEmptyString(value))
|
|
93
|
+
return true;
|
|
94
|
+
logWarn(message);
|
|
95
|
+
return false;
|
|
96
|
+
}
|
|
97
|
+
function isNonEmptyString(value) {
|
|
98
|
+
return typeof value === 'string' && value.length > 0;
|
|
99
|
+
}
|
|
100
|
+
function resolveArticleExtraction(document, shouldExtract) {
|
|
101
|
+
return shouldExtract ? extractArticle(document) : null;
|
|
163
102
|
}
|
|
164
103
|
function applyBaseUri(document, url) {
|
|
165
104
|
try {
|
|
@@ -1,3 +1,3 @@
|
|
|
1
|
-
import {
|
|
2
|
-
export declare const dispatcher:
|
|
1
|
+
import { type Dispatcher } from 'undici';
|
|
2
|
+
export declare const dispatcher: Dispatcher;
|
|
3
3
|
export declare function destroyAgents(): void;
|