@j0hanz/superfetch 1.2.5 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (173) hide show
  1. package/README.md +131 -156
  2. package/dist/config/auth-config.d.ts +16 -0
  3. package/dist/config/auth-config.js +53 -0
  4. package/dist/config/constants.d.ts +11 -13
  5. package/dist/config/constants.js +1 -3
  6. package/dist/config/env-parsers.d.ts +7 -0
  7. package/dist/config/env-parsers.js +84 -0
  8. package/dist/config/formatting.d.ts +2 -2
  9. package/dist/config/index.d.ts +47 -53
  10. package/dist/config/index.js +35 -64
  11. package/dist/config/types/content.d.ts +1 -49
  12. package/dist/config/types/runtime.d.ts +8 -16
  13. package/dist/config/types/tools.d.ts +2 -28
  14. package/dist/http/accept-policy.d.ts +3 -0
  15. package/dist/http/accept-policy.js +45 -0
  16. package/dist/http/async-handler.d.ts +2 -0
  17. package/dist/http/async-handler.js +5 -0
  18. package/dist/http/auth-introspection.d.ts +2 -0
  19. package/dist/http/auth-introspection.js +141 -0
  20. package/dist/http/auth-static.d.ts +2 -0
  21. package/dist/http/auth-static.js +23 -0
  22. package/dist/http/auth.d.ts +3 -2
  23. package/dist/http/auth.js +254 -23
  24. package/dist/http/cors.d.ts +6 -6
  25. package/dist/http/cors.js +7 -42
  26. package/dist/http/download-routes.d.ts +0 -12
  27. package/dist/http/download-routes.js +21 -58
  28. package/dist/http/host-allowlist.d.ts +3 -0
  29. package/dist/http/host-allowlist.js +117 -0
  30. package/dist/http/jsonrpc-http.d.ts +2 -0
  31. package/dist/http/jsonrpc-http.js +10 -0
  32. package/dist/http/mcp-routes.d.ts +8 -3
  33. package/dist/http/mcp-routes.js +137 -31
  34. package/dist/http/mcp-session-eviction.d.ts +3 -0
  35. package/dist/http/mcp-session-eviction.js +24 -0
  36. package/dist/http/mcp-session-helpers.d.ts +0 -1
  37. package/dist/http/mcp-session-helpers.js +1 -1
  38. package/dist/http/mcp-session-init.d.ts +7 -0
  39. package/dist/http/mcp-session-init.js +94 -0
  40. package/dist/http/mcp-session-slots.d.ts +17 -0
  41. package/dist/http/mcp-session-slots.js +55 -0
  42. package/dist/http/mcp-session-transport-init.d.ts +7 -0
  43. package/dist/http/mcp-session-transport-init.js +41 -0
  44. package/dist/http/mcp-session-transport.d.ts +7 -0
  45. package/dist/http/mcp-session-transport.js +57 -0
  46. package/dist/http/mcp-session-types.d.ts +5 -0
  47. package/dist/http/mcp-session-types.js +1 -0
  48. package/dist/http/mcp-session.d.ts +9 -9
  49. package/dist/http/mcp-session.js +15 -137
  50. package/dist/http/mcp-sessions.d.ts +43 -0
  51. package/dist/http/mcp-sessions.js +392 -0
  52. package/dist/http/mcp-validation.d.ts +1 -0
  53. package/dist/http/mcp-validation.js +11 -10
  54. package/dist/http/protocol-policy.d.ts +2 -0
  55. package/dist/http/protocol-policy.js +31 -0
  56. package/dist/http/rate-limit.js +7 -4
  57. package/dist/http/server-config.d.ts +1 -0
  58. package/dist/http/server-config.js +40 -0
  59. package/dist/http/server-middleware.d.ts +7 -9
  60. package/dist/http/server-middleware.js +9 -70
  61. package/dist/http/server-shutdown.d.ts +4 -0
  62. package/dist/http/server-shutdown.js +43 -0
  63. package/dist/http/server.d.ts +10 -0
  64. package/dist/http/server.js +546 -61
  65. package/dist/http/session-cleanup.js +8 -5
  66. package/dist/middleware/error-handler.d.ts +1 -1
  67. package/dist/middleware/error-handler.js +32 -33
  68. package/dist/resources/cached-content-params.d.ts +5 -0
  69. package/dist/resources/cached-content-params.js +36 -0
  70. package/dist/resources/cached-content.js +67 -125
  71. package/dist/resources/index.js +0 -82
  72. package/dist/server.js +50 -29
  73. package/dist/services/cache-events.d.ts +8 -0
  74. package/dist/services/cache-events.js +19 -0
  75. package/dist/services/cache-keys.d.ts +7 -0
  76. package/dist/services/cache-keys.js +57 -0
  77. package/dist/services/cache.d.ts +4 -9
  78. package/dist/services/cache.js +77 -139
  79. package/dist/services/context.d.ts +0 -1
  80. package/dist/services/context.js +0 -7
  81. package/dist/services/extractor.js +55 -116
  82. package/dist/services/fetcher/agents.d.ts +2 -2
  83. package/dist/services/fetcher/agents.js +35 -96
  84. package/dist/services/fetcher/dns-selection.d.ts +2 -0
  85. package/dist/services/fetcher/dns-selection.js +72 -0
  86. package/dist/services/fetcher/interceptors.d.ts +0 -22
  87. package/dist/services/fetcher/interceptors.js +18 -32
  88. package/dist/services/fetcher/redirects.js +16 -7
  89. package/dist/services/fetcher/response.js +79 -34
  90. package/dist/services/fetcher.d.ts +22 -3
  91. package/dist/services/fetcher.js +544 -44
  92. package/dist/services/fifo-queue.d.ts +8 -0
  93. package/dist/services/fifo-queue.js +25 -0
  94. package/dist/services/logger.js +2 -2
  95. package/dist/services/metadata-collector.d.ts +1 -9
  96. package/dist/services/metadata-collector.js +71 -2
  97. package/dist/services/transform-worker-pool.d.ts +4 -14
  98. package/dist/services/transform-worker-pool.js +177 -129
  99. package/dist/services/transform-worker-types.d.ts +32 -0
  100. package/dist/services/transform-worker-types.js +14 -0
  101. package/dist/tools/handlers/fetch-markdown.tool.d.ts +3 -4
  102. package/dist/tools/handlers/fetch-markdown.tool.js +20 -72
  103. package/dist/tools/handlers/fetch-single.shared.d.ts +11 -22
  104. package/dist/tools/handlers/fetch-single.shared.js +175 -89
  105. package/dist/tools/handlers/fetch-url.tool.d.ts +7 -1
  106. package/dist/tools/handlers/fetch-url.tool.js +84 -119
  107. package/dist/tools/index.js +21 -40
  108. package/dist/tools/schemas.d.ts +1 -51
  109. package/dist/tools/schemas.js +1 -107
  110. package/dist/tools/utils/cached-markdown.d.ts +5 -0
  111. package/dist/tools/utils/cached-markdown.js +46 -0
  112. package/dist/tools/utils/content-shaping.d.ts +4 -0
  113. package/dist/tools/utils/content-shaping.js +67 -0
  114. package/dist/tools/utils/content-transform.d.ts +5 -17
  115. package/dist/tools/utils/content-transform.js +134 -114
  116. package/dist/tools/utils/fetch-pipeline.d.ts +0 -8
  117. package/dist/tools/utils/fetch-pipeline.js +57 -63
  118. package/dist/tools/utils/frontmatter.d.ts +3 -0
  119. package/dist/tools/utils/frontmatter.js +73 -0
  120. package/dist/tools/utils/inline-content.d.ts +1 -2
  121. package/dist/tools/utils/inline-content.js +4 -7
  122. package/dist/tools/utils/markdown-heuristics.d.ts +1 -0
  123. package/dist/tools/utils/markdown-heuristics.js +19 -0
  124. package/dist/tools/utils/markdown-signals.d.ts +1 -0
  125. package/dist/tools/utils/markdown-signals.js +19 -0
  126. package/dist/tools/utils/raw-markdown-frontmatter.d.ts +3 -0
  127. package/dist/tools/utils/raw-markdown-frontmatter.js +73 -0
  128. package/dist/tools/utils/raw-markdown.d.ts +6 -0
  129. package/dist/tools/utils/raw-markdown.js +135 -0
  130. package/dist/transformers/markdown/fenced-code-rule.d.ts +2 -0
  131. package/dist/transformers/markdown/fenced-code-rule.js +38 -0
  132. package/dist/transformers/markdown/frontmatter.d.ts +2 -0
  133. package/dist/transformers/markdown/frontmatter.js +45 -0
  134. package/dist/transformers/markdown/noise-rule.d.ts +2 -0
  135. package/dist/transformers/markdown/noise-rule.js +80 -0
  136. package/dist/transformers/markdown/turndown-instance.d.ts +2 -0
  137. package/dist/transformers/markdown/turndown-instance.js +19 -0
  138. package/dist/transformers/markdown.d.ts +2 -0
  139. package/dist/transformers/markdown.js +185 -0
  140. package/dist/transformers/markdown.transformer.js +5 -117
  141. package/dist/utils/cached-payload.d.ts +7 -0
  142. package/dist/utils/cached-payload.js +36 -0
  143. package/dist/utils/code-language-bash.d.ts +1 -0
  144. package/dist/utils/code-language-bash.js +48 -0
  145. package/dist/utils/code-language-core.d.ts +2 -0
  146. package/dist/utils/code-language-core.js +13 -0
  147. package/dist/utils/code-language-detectors.d.ts +5 -0
  148. package/dist/utils/code-language-detectors.js +142 -0
  149. package/dist/utils/code-language-helpers.d.ts +5 -0
  150. package/dist/utils/code-language-helpers.js +62 -0
  151. package/dist/utils/code-language-parsing.d.ts +5 -0
  152. package/dist/utils/code-language-parsing.js +62 -0
  153. package/dist/utils/code-language.d.ts +9 -0
  154. package/dist/utils/code-language.js +250 -46
  155. package/dist/utils/error-details.d.ts +3 -0
  156. package/dist/utils/error-details.js +12 -0
  157. package/dist/utils/error-utils.js +1 -1
  158. package/dist/utils/filename-generator.js +34 -12
  159. package/dist/utils/guards.d.ts +1 -0
  160. package/dist/utils/guards.js +3 -0
  161. package/dist/utils/header-normalizer.d.ts +0 -3
  162. package/dist/utils/header-normalizer.js +3 -3
  163. package/dist/utils/ip-address.d.ts +4 -0
  164. package/dist/utils/ip-address.js +6 -0
  165. package/dist/utils/tool-error-handler.d.ts +2 -2
  166. package/dist/utils/tool-error-handler.js +14 -46
  167. package/dist/utils/url-transformer.d.ts +7 -0
  168. package/dist/utils/url-transformer.js +147 -0
  169. package/dist/utils/url-validator.d.ts +1 -2
  170. package/dist/utils/url-validator.js +53 -114
  171. package/dist/workers/content-transform.worker.d.ts +1 -0
  172. package/dist/workers/content-transform.worker.js +40 -0
  173. package/package.json +17 -18
@@ -1,20 +1,15 @@
1
1
  import type { CacheEntry } from '../config/types/content.js';
2
- interface CacheKeyParts {
2
+ export interface CacheUpdateEvent {
3
+ cacheKey: string;
3
4
  namespace: string;
4
5
  urlHash: string;
5
6
  }
6
- interface CacheUpdateEvent extends CacheKeyParts {
7
- cacheKey: string;
8
- }
7
+ type CacheUpdateListener = (event: CacheUpdateEvent) => void;
8
+ export declare function onCacheUpdate(listener: CacheUpdateListener): () => void;
9
9
  interface CacheEntryMetadata {
10
10
  url: string;
11
11
  title?: string;
12
12
  }
13
- type CacheUpdateListener = (event: CacheUpdateEvent) => void;
14
- export declare function createCacheKey(namespace: string, url: string, vary?: Record<string, unknown> | string): string | null;
15
- export declare function parseCacheKey(cacheKey: string): CacheKeyParts | null;
16
- export declare function toResourceUri(cacheKey: string): string | null;
17
- export declare function onCacheUpdate(listener: CacheUpdateListener): () => void;
18
13
  export declare function get(cacheKey: string | null): CacheEntry | undefined;
19
14
  export declare function set(cacheKey: string | null, content: string, metadata: CacheEntryMetadata): void;
20
15
  export declare function keys(): readonly string[];
@@ -1,11 +1,28 @@
1
1
  import { setInterval as setIntervalPromise } from 'node:timers/promises';
2
- import { CACHE_HASH } from '../config/constants.js';
3
2
  import { config } from '../config/index.js';
4
- import { sha256Hex } from '../utils/crypto.js';
5
- import { getErrorMessage } from '../utils/error-utils.js';
3
+ import { getErrorMessage } from '../utils/error-details.js';
4
+ import { parseCacheKey } from './cache-keys.js';
6
5
  import { logWarn } from './logger.js';
7
6
  const contentCache = new Map();
8
7
  let cleanupController = null;
8
+ const updateListeners = new Set();
9
+ export function onCacheUpdate(listener) {
10
+ updateListeners.add(listener);
11
+ return () => {
12
+ updateListeners.delete(listener);
13
+ };
14
+ }
15
+ function notifyCacheUpdate(cacheKey) {
16
+ if (updateListeners.size === 0)
17
+ return;
18
+ const parts = parseCacheKey(cacheKey);
19
+ if (!parts)
20
+ return;
21
+ const event = { cacheKey, ...parts };
22
+ for (const listener of updateListeners) {
23
+ listener(event);
24
+ }
25
+ }
9
26
  function startCleanupLoop() {
10
27
  if (cleanupController)
11
28
  return;
@@ -18,156 +35,73 @@ function startCleanupLoop() {
18
35
  }
19
36
  async function runCleanupLoop(signal) {
20
37
  const intervalMs = Math.floor(config.cache.ttl * 1000);
21
- for await (const _ of setIntervalPromise(intervalMs, undefined, {
38
+ for await (const getNow of setIntervalPromise(intervalMs, Date.now, {
22
39
  signal,
23
40
  ref: false,
24
41
  })) {
25
- evictEntries();
42
+ enforceCacheLimits(getNow());
26
43
  }
27
44
  }
28
- function evictEntries() {
29
- const now = Date.now();
45
+ function enforceCacheLimits(now) {
30
46
  for (const [key, item] of contentCache.entries()) {
31
47
  if (now > item.expiresAt) {
32
48
  contentCache.delete(key);
33
49
  }
34
50
  }
35
- if (contentCache.size <= config.cache.maxKeys)
36
- return;
37
- const keysToRemove = contentCache.size - config.cache.maxKeys;
38
- const iterator = contentCache.keys();
39
- for (let i = 0; i < keysToRemove; i++) {
40
- const { value, done } = iterator.next();
41
- if (done)
42
- break;
43
- contentCache.delete(value);
44
- }
51
+ trimCacheToMaxKeys();
45
52
  }
46
- const updateListeners = new Set();
47
- function stableStringify(value) {
48
- if (value === null || value === undefined) {
49
- return '';
50
- }
51
- if (typeof value !== 'object') {
52
- return JSON.stringify(value);
53
- }
54
- if (Array.isArray(value)) {
55
- return `[${value.map((item) => stableStringify(item)).join(',')}]`;
56
- }
57
- const entries = Object.entries(value)
58
- .filter(([, entryValue]) => entryValue !== undefined)
59
- .sort(([a], [b]) => a.localeCompare(b))
60
- .map(([key, entryValue]) => `${JSON.stringify(key)}:${stableStringify(entryValue)}`);
61
- return `{${entries.join(',')}}`;
62
- }
63
- function createHashFragment(input, length) {
64
- return sha256Hex(input).substring(0, length);
65
- }
66
- /**
67
- * Constructs a cache key from namespace, URL hash, and optional vary hash.
68
- * Format: "namespace:urlHash" or "namespace:urlHash.varyHash" if vary params exist.
69
- * @param namespace - Cache namespace (e.g., "fetch-markdown")
70
- * @param urlHash - SHA-256 hash of the URL (truncated to 16 chars)
71
- * @param varyHash - Optional hash of vary parameters (e.g., headers, options)
72
- * @returns Complete cache key string
73
- */
74
- function buildCacheKey(namespace, urlHash, varyHash) {
75
- return varyHash
76
- ? `${namespace}:${urlHash}.${varyHash}`
77
- : `${namespace}:${urlHash}`;
78
- }
79
- function getVaryHash(vary) {
80
- if (!vary)
81
- return undefined;
82
- const varyString = typeof vary === 'string' ? vary : stableStringify(vary);
83
- if (!varyString)
53
+ export function get(cacheKey) {
54
+ if (!isCacheReadable(cacheKey))
84
55
  return undefined;
85
- return createHashFragment(varyString, CACHE_HASH.VARY_HASH_LENGTH);
86
- }
87
- export function createCacheKey(namespace, url, vary) {
88
- if (!namespace || !url)
89
- return null;
90
- const urlHash = createHashFragment(url, CACHE_HASH.URL_HASH_LENGTH);
91
- const varyHash = getVaryHash(vary);
92
- return buildCacheKey(namespace, urlHash, varyHash);
93
- }
94
- export function parseCacheKey(cacheKey) {
95
- if (!cacheKey)
96
- return null;
97
- const [namespace, ...rest] = cacheKey.split(':');
98
- const urlHash = rest.join(':');
99
- if (!namespace || !urlHash)
100
- return null;
101
- return { namespace, urlHash };
102
- }
103
- export function toResourceUri(cacheKey) {
104
- const parts = parseCacheKey(cacheKey);
105
- if (!parts)
106
- return null;
107
- return `superfetch://cache/${parts.namespace}/${parts.urlHash}`;
56
+ return runCacheOperation(cacheKey, 'Cache get error', () => readCacheEntry(cacheKey));
108
57
  }
109
- export function onCacheUpdate(listener) {
110
- updateListeners.add(listener);
111
- return () => {
112
- updateListeners.delete(listener);
113
- };
58
+ function isCacheReadable(cacheKey) {
59
+ return config.cache.enabled && Boolean(cacheKey);
114
60
  }
115
- function emitCacheUpdate(cacheKey) {
116
- const parts = parseCacheKey(cacheKey);
117
- if (!parts)
118
- return;
119
- for (const listener of updateListeners) {
120
- listener({ cacheKey, ...parts });
121
- }
61
+ function isCacheWritable(cacheKey, content) {
62
+ return config.cache.enabled && Boolean(cacheKey) && Boolean(content);
122
63
  }
123
- export function get(cacheKey) {
124
- if (!isCacheReadable(cacheKey))
125
- return undefined;
64
+ function runCacheOperation(cacheKey, message, operation) {
126
65
  try {
127
- return readCacheEntry(cacheKey);
66
+ return operation();
128
67
  }
129
68
  catch (error) {
130
- logWarn('Cache get error', {
131
- key: cacheKey.substring(0, 100),
132
- error: getErrorMessage(error),
133
- });
69
+ logCacheError(message, cacheKey, error);
134
70
  return undefined;
135
71
  }
136
72
  }
137
- function isCacheReadable(cacheKey) {
138
- return config.cache.enabled && Boolean(cacheKey);
139
- }
140
73
  function readCacheEntry(cacheKey) {
74
+ const now = Date.now();
75
+ return readCacheItem(cacheKey, now)?.entry;
76
+ }
77
+ function isExpired(item, now) {
78
+ return now > item.expiresAt;
79
+ }
80
+ function readCacheItem(cacheKey, now) {
141
81
  const item = contentCache.get(cacheKey);
142
82
  if (!item)
143
83
  return undefined;
144
- if (isExpired(item)) {
84
+ if (isExpired(item, now)) {
145
85
  contentCache.delete(cacheKey);
146
86
  return undefined;
147
87
  }
148
- return item.entry;
149
- }
150
- function isExpired(item) {
151
- return Date.now() > item.expiresAt;
88
+ return item;
152
89
  }
153
90
  export function set(cacheKey, content, metadata) {
154
- if (!config.cache.enabled)
91
+ if (!isCacheWritable(cacheKey, content))
155
92
  return;
156
- if (!cacheKey)
157
- return;
158
- if (!content)
159
- return;
160
- try {
93
+ runCacheOperation(cacheKey, 'Cache set error', () => {
161
94
  startCleanupLoop();
162
- const entry = buildCacheEntry(cacheKey, content, metadata);
163
- persistCacheEntry(cacheKey, entry);
164
- }
165
- catch (error) {
166
- logWarn('Cache set error', {
167
- key: cacheKey.substring(0, 100),
168
- error: getErrorMessage(error),
95
+ const now = Date.now();
96
+ const expiresAtMs = now + config.cache.ttl * 1000;
97
+ const entry = buildCacheEntry({
98
+ content,
99
+ metadata,
100
+ fetchedAtMs: now,
101
+ expiresAtMs,
169
102
  });
170
- }
103
+ persistCacheEntry(cacheKey, entry, expiresAtMs);
104
+ });
171
105
  }
172
106
  export function keys() {
173
107
  return Array.from(contentCache.keys());
@@ -175,33 +109,37 @@ export function keys() {
175
109
  export function isEnabled() {
176
110
  return config.cache.enabled;
177
111
  }
178
- function buildCacheEntry(cacheKey, content, metadata) {
179
- const entry = {
112
+ function buildCacheEntry({ content, metadata, fetchedAtMs, expiresAtMs, }) {
113
+ return {
180
114
  url: metadata.url,
181
115
  content,
182
- fetchedAt: new Date().toISOString(),
183
- expiresAt: new Date(Date.now() + config.cache.ttl * 1000).toISOString(),
116
+ fetchedAt: new Date(fetchedAtMs).toISOString(),
117
+ expiresAt: new Date(expiresAtMs).toISOString(),
118
+ ...(metadata.title === undefined ? {} : { title: metadata.title }),
184
119
  };
185
- if (metadata.title !== undefined) {
186
- entry.title = metadata.title;
187
- }
188
- return entry;
189
120
  }
190
- function persistCacheEntry(cacheKey, entry) {
191
- const expiresAt = Date.now() + config.cache.ttl * 1000;
192
- contentCache.set(cacheKey, { entry, expiresAt });
193
- enforceMaxKeysLimit();
194
- emitCacheUpdate(cacheKey);
121
+ function persistCacheEntry(cacheKey, entry, expiresAtMs) {
122
+ contentCache.set(cacheKey, { entry, expiresAt: expiresAtMs });
123
+ trimCacheToMaxKeys();
124
+ notifyCacheUpdate(cacheKey);
195
125
  }
196
- function enforceMaxKeysLimit() {
126
+ function trimCacheToMaxKeys() {
197
127
  if (contentCache.size <= config.cache.maxKeys)
198
128
  return;
199
- const keysToRemove = contentCache.size - config.cache.maxKeys;
129
+ removeOldestEntries(contentCache.size - config.cache.maxKeys);
130
+ }
131
+ function removeOldestEntries(count) {
200
132
  const iterator = contentCache.keys();
201
- for (let i = 0; i < keysToRemove; i++) {
202
- const { value, done } = iterator.next();
203
- if (done)
133
+ for (let removed = 0; removed < count; removed += 1) {
134
+ const next = iterator.next();
135
+ if (next.done)
204
136
  break;
205
- contentCache.delete(value);
137
+ contentCache.delete(next.value);
206
138
  }
207
139
  }
140
+ function logCacheError(message, cacheKey, error) {
141
+ logWarn(message, {
142
+ key: cacheKey.length > 100 ? cacheKey.slice(0, 100) : cacheKey,
143
+ error: getErrorMessage(error),
144
+ });
145
+ }
@@ -3,7 +3,6 @@ interface RequestContext {
3
3
  readonly sessionId?: string;
4
4
  }
5
5
  export declare function runWithRequestContext<T>(context: RequestContext, fn: () => T): T;
6
- export declare function bindToRequestContext<T extends (...args: unknown[]) => unknown>(fn: T): T;
7
6
  export declare function getRequestId(): string | undefined;
8
7
  export declare function getSessionId(): string | undefined;
9
8
  export {};
@@ -3,13 +3,6 @@ const requestContext = new AsyncLocalStorage();
3
3
  export function runWithRequestContext(context, fn) {
4
4
  return requestContext.run(context, fn);
5
5
  }
6
- export function bindToRequestContext(fn) {
7
- const store = requestContext.getStore();
8
- if (!store) {
9
- return fn;
10
- }
11
- return ((...args) => requestContext.run(store, () => fn(...args)));
12
- }
13
6
  export function getRequestId() {
14
7
  return requestContext.getStore()?.requestId;
15
8
  }
@@ -1,101 +1,28 @@
1
1
  import { parseHTML } from 'linkedom';
2
2
  import { Readability } from '@mozilla/readability';
3
- import { getErrorMessage } from '../utils/error-utils.js';
3
+ import { getErrorMessage } from '../utils/error-details.js';
4
+ import { isRecord } from '../utils/guards.js';
4
5
  import { truncateHtml } from '../utils/html-truncator.js';
5
6
  import { logError, logInfo, logWarn } from './logger.js';
6
- import { createMetaCollectorState, resolveMetaField, } from './metadata-collector.js';
7
- function collectMetaTag(state, tag) {
8
- const content = getMetaContent(tag);
9
- if (!content)
10
- return;
11
- if (collectOpenGraphMeta(state, tag, content))
12
- return;
13
- if (collectTwitterMeta(state, tag, content))
14
- return;
15
- collectStandardMeta(state, tag, content);
16
- }
17
- function getMetaContent(tag) {
18
- return tag.getAttribute('content')?.trim() ?? null;
19
- }
20
- function collectOpenGraphMeta(state, tag, content) {
21
- const property = tag.getAttribute('property');
22
- if (!property?.startsWith('og:'))
23
- return false;
24
- const key = property.replace('og:', '');
25
- if (key === 'title')
26
- state.title.og = content;
27
- if (key === 'description')
28
- state.description.og = content;
29
- return true;
30
- }
31
- function collectTwitterMeta(state, tag, content) {
32
- const name = tag.getAttribute('name');
33
- if (!name?.startsWith('twitter:'))
7
+ import { extractMetadata } from './metadata-collector.js';
8
+ function isReadabilityCompatible(doc) {
9
+ if (!isRecord(doc))
34
10
  return false;
35
- const key = name.replace('twitter:', '');
36
- if (key === 'title')
37
- state.title.twitter = content;
38
- if (key === 'description')
39
- state.description.twitter = content;
40
- return true;
41
- }
42
- function collectStandardMeta(state, tag, content) {
43
- const name = tag.getAttribute('name');
44
- if (name === 'description') {
45
- state.description.standard = content;
46
- }
47
- if (name === 'author') {
48
- state.author.standard = content;
49
- }
11
+ return hasDocumentElement(doc) && hasQuerySelectors(doc);
50
12
  }
51
- function scanMetaTags(document, state) {
52
- const metaTags = document.querySelectorAll('meta');
53
- for (const tag of metaTags) {
54
- collectMetaTag(state, tag);
55
- }
56
- }
57
- function ensureTitleFallback(document, state) {
58
- if (state.title.standard)
59
- return;
60
- const titleEl = document.querySelector('title');
61
- if (titleEl?.textContent) {
62
- state.title.standard = titleEl.textContent.trim();
63
- }
13
+ function hasDocumentElement(record) {
14
+ return 'documentElement' in record;
64
15
  }
65
- function extractMetadata(document) {
66
- const state = createMetaCollectorState();
67
- scanMetaTags(document, state);
68
- ensureTitleFallback(document, state);
69
- const metadata = {};
70
- const title = resolveMetaField(state, 'title');
71
- const description = resolveMetaField(state, 'description');
72
- const author = resolveMetaField(state, 'author');
73
- if (title !== undefined)
74
- metadata.title = title;
75
- if (description !== undefined)
76
- metadata.description = description;
77
- if (author !== undefined)
78
- metadata.author = author;
79
- return metadata;
80
- }
81
- function isReadabilityCompatible(doc) {
82
- if (!doc || typeof doc !== 'object')
83
- return false;
84
- if (!('documentElement' in doc))
85
- return false;
86
- if (!('querySelectorAll' in doc))
87
- return false;
88
- if (!('querySelector' in doc))
89
- return false;
90
- return true;
16
+ function hasQuerySelectors(record) {
17
+ return (typeof record.querySelectorAll === 'function' &&
18
+ typeof record.querySelector === 'function');
91
19
  }
92
20
  function extractArticle(document) {
93
21
  if (!isReadabilityCompatible(document)) {
94
22
  logWarn('Document not compatible with Readability');
95
23
  return null;
96
24
  }
97
- const parsed = parseReadabilityArticle(document);
98
- return parsed ? mapReadabilityResult(parsed) : null;
25
+ return mapParsedArticle(parseReadabilityArticle(document));
99
26
  }
100
27
  function parseReadabilityArticle(document) {
101
28
  try {
@@ -104,31 +31,38 @@ function parseReadabilityArticle(document) {
104
31
  return reader.parse();
105
32
  }
106
33
  catch (error) {
107
- logError('Failed to extract article with Readability', error instanceof Error ? error : undefined);
34
+ logError('Failed to extract article with Readability', asError(error));
108
35
  return null;
109
36
  }
110
37
  }
38
+ function asError(error) {
39
+ if (error instanceof Error) {
40
+ return error;
41
+ }
42
+ return undefined;
43
+ }
44
+ function mapParsedArticle(parsed) {
45
+ return parsed ? mapReadabilityResult(parsed) : null;
46
+ }
111
47
  function mapReadabilityResult(parsed) {
112
- const article = {
48
+ return {
113
49
  content: parsed.content ?? '',
114
50
  textContent: parsed.textContent ?? '',
51
+ ...buildOptionalArticleFields(parsed),
115
52
  };
116
- const title = toOptional(parsed.title);
117
- if (title !== undefined)
118
- article.title = title;
119
- const byline = toOptional(parsed.byline);
120
- if (byline !== undefined)
121
- article.byline = byline;
122
- const excerpt = toOptional(parsed.excerpt);
123
- if (excerpt !== undefined)
124
- article.excerpt = excerpt;
125
- const siteName = toOptional(parsed.siteName);
126
- if (siteName !== undefined)
127
- article.siteName = siteName;
128
- return article;
129
- }
130
- function toOptional(value) {
131
- return value ?? undefined;
53
+ }
54
+ function buildOptionalArticleFields(parsed) {
55
+ const optional = {};
56
+ addOptionalField(optional, 'title', parsed.title);
57
+ addOptionalField(optional, 'byline', parsed.byline);
58
+ addOptionalField(optional, 'excerpt', parsed.excerpt);
59
+ addOptionalField(optional, 'siteName', parsed.siteName);
60
+ return optional;
61
+ }
62
+ function addOptionalField(target, key, value) {
63
+ if (value == null)
64
+ return;
65
+ target[key] = value;
132
66
  }
133
67
  export function extractContent(html, url, options = { extractArticle: true }) {
134
68
  if (!isValidInput(html, url)) {
@@ -138,12 +72,13 @@ export function extractContent(html, url, options = { extractArticle: true }) {
138
72
  }
139
73
  function tryExtractContent(html, url, options) {
140
74
  try {
141
- const processedHtml = truncateHtml(html);
142
- const { document } = parseHTML(processedHtml);
75
+ const { document } = parseHTML(truncateHtml(html));
143
76
  applyBaseUri(document, url);
144
77
  const metadata = extractMetadata(document);
145
- const article = options.extractArticle ? extractArticle(document) : null;
146
- return { article, metadata };
78
+ return {
79
+ article: resolveArticleExtraction(document, options.extractArticle),
80
+ metadata,
81
+ };
147
82
  }
148
83
  catch (error) {
149
84
  logError('Failed to extract content', error instanceof Error ? error : undefined);
@@ -151,15 +86,19 @@ function tryExtractContent(html, url, options) {
151
86
  }
152
87
  }
153
88
  function isValidInput(html, url) {
154
- if (!html || typeof html !== 'string') {
155
- logWarn('extractContent called with invalid HTML input');
156
- return false;
157
- }
158
- if (!url || typeof url !== 'string') {
159
- logWarn('extractContent called with invalid URL');
160
- return false;
161
- }
162
- return true;
89
+ return (validateRequiredString(html, 'extractContent called with invalid HTML input') && validateRequiredString(url, 'extractContent called with invalid URL'));
90
+ }
91
+ function validateRequiredString(value, message) {
92
+ if (isNonEmptyString(value))
93
+ return true;
94
+ logWarn(message);
95
+ return false;
96
+ }
97
+ function isNonEmptyString(value) {
98
+ return typeof value === 'string' && value.length > 0;
99
+ }
100
+ function resolveArticleExtraction(document, shouldExtract) {
101
+ return shouldExtract ? extractArticle(document) : null;
163
102
  }
164
103
  function applyBaseUri(document, url) {
165
104
  try {
@@ -1,3 +1,3 @@
1
- import { Agent } from 'undici';
2
- export declare const dispatcher: Agent;
1
+ import { type Dispatcher } from 'undici';
2
+ export declare const dispatcher: Dispatcher;
3
3
  export declare function destroyAgents(): void;