@j0hanz/superfetch 1.2.1 → 1.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -80,7 +80,8 @@ export function resolveDownloadPayload(params, cacheEntry) {
80
80
  const content = resolvePayloadContent(payload, params.namespace);
81
81
  if (!content)
82
82
  return null;
83
- const fileName = generateSafeFilename(cacheEntry.url, cacheEntry.title ?? payload.title, params.hash, resolveExtension(params.namespace));
83
+ const safeTitle = typeof payload.title === 'string' ? payload.title : undefined;
84
+ const fileName = generateSafeFilename(cacheEntry.url, cacheEntry.title ?? safeTitle, params.hash, resolveExtension(params.namespace));
84
85
  return {
85
86
  content,
86
87
  contentType: resolveContentType(params.namespace),
@@ -1,6 +1,6 @@
1
1
  import { randomUUID } from 'node:crypto';
2
2
  import { config } from '../config/index.js';
3
- import { requestContext } from '../services/context.js';
3
+ import { bindToRequestContext, runWithRequestContext, } from '../services/context.js';
4
4
  import { getSessionId } from './sessions.js';
5
5
  const LOOPBACK_HOSTS = new Set(['localhost', '127.0.0.1', '::1']);
6
6
  function normalizeHost(value) {
@@ -84,8 +84,9 @@ export function createContextMiddleware() {
84
84
  return (req, _res, next) => {
85
85
  const requestId = randomUUID();
86
86
  const sessionId = getSessionId(req);
87
- requestContext.run({ requestId, sessionId }, () => {
88
- next();
87
+ runWithRequestContext({ requestId, sessionId }, () => {
88
+ const boundNext = bindToRequestContext(next);
89
+ boundNext();
89
90
  });
90
91
  };
91
92
  }
@@ -18,4 +18,5 @@ export declare function onCacheUpdate(listener: CacheUpdateListener): () => void
18
18
  export declare function get(cacheKey: string | null): CacheEntry | undefined;
19
19
  export declare function set(cacheKey: string | null, content: string, metadata: CacheEntryMetadata): void;
20
20
  export declare function keys(): string[];
21
+ export declare function isEnabled(): boolean;
21
22
  export {};
@@ -175,6 +175,9 @@ export function set(cacheKey, content, metadata) {
175
175
  export function keys() {
176
176
  return Array.from(contentCache.keys());
177
177
  }
178
+ export function isEnabled() {
179
+ return config.cache.enabled;
180
+ }
178
181
  function buildCacheEntry(cacheKey, content, metadata) {
179
182
  return {
180
183
  url: metadata.url,
@@ -4,6 +4,8 @@ interface RequestContext {
4
4
  sessionId?: string;
5
5
  }
6
6
  export declare const requestContext: AsyncLocalStorage<RequestContext>;
7
+ export declare function runWithRequestContext<T>(context: RequestContext, fn: () => T): T;
8
+ export declare function bindToRequestContext<T extends (...args: unknown[]) => unknown>(fn: T): T;
7
9
  export declare function getRequestId(): string | undefined;
8
10
  export declare function getSessionId(): string | undefined;
9
11
  export {};
@@ -1,5 +1,15 @@
1
1
  import { AsyncLocalStorage } from 'node:async_hooks';
2
2
  export const requestContext = new AsyncLocalStorage();
3
+ export function runWithRequestContext(context, fn) {
4
+ return requestContext.run(context, fn);
5
+ }
6
+ export function bindToRequestContext(fn) {
7
+ const store = requestContext.getStore();
8
+ if (!store) {
9
+ return fn;
10
+ }
11
+ return ((...args) => requestContext.run(store, () => fn(...args)));
12
+ }
3
13
  export function getRequestId() {
4
14
  return requestContext.getStore()?.requestId;
5
15
  }
@@ -3,9 +3,26 @@ import os from 'node:os';
3
3
  import { Agent } from 'undici';
4
4
  import { createErrorWithCode } from '../../utils/error-utils.js';
5
5
  import { isBlockedIp } from '../../utils/url-validator.js';
6
+ const DNS_LOOKUP_TIMEOUT_MS = 5000;
6
7
  function resolveDns(hostname, options, callback) {
7
8
  const { normalizedOptions, useAll, resolvedFamily } = buildLookupContext(options);
8
- dns.lookup(hostname, { ...normalizedOptions, all: true }, createLookupCallback(hostname, resolvedFamily, useAll, callback));
9
+ const lookupOptions = buildLookupOptions(normalizedOptions);
10
+ let done = false;
11
+ const timer = setTimeout(() => {
12
+ if (done)
13
+ return;
14
+ done = true;
15
+ callback(createErrorWithCode(`DNS lookup timed out for ${hostname}`, 'ETIMEOUT'), []);
16
+ }, DNS_LOOKUP_TIMEOUT_MS);
17
+ timer.unref();
18
+ const safeCallback = (err, address, family) => {
19
+ if (done)
20
+ return;
21
+ done = true;
22
+ clearTimeout(timer);
23
+ callback(err, address, family);
24
+ };
25
+ dns.lookup(hostname, lookupOptions, createLookupCallback(hostname, resolvedFamily, useAll, safeCallback));
9
26
  }
10
27
  function normalizeLookupOptions(options) {
11
28
  return typeof options === 'number' ? { family: options } : options;
@@ -18,6 +35,29 @@ function buildLookupContext(options) {
18
35
  resolvedFamily: resolveFamily(normalizedOptions.family),
19
36
  };
20
37
  }
38
+ const DEFAULT_DNS_ORDER = 'verbatim';
39
+ function resolveResultOrder(options) {
40
+ if (options.order)
41
+ return options.order;
42
+ const legacyVerbatim = getLegacyVerbatim(options);
43
+ if (legacyVerbatim !== undefined) {
44
+ return legacyVerbatim ? 'verbatim' : 'ipv4first';
45
+ }
46
+ return DEFAULT_DNS_ORDER;
47
+ }
48
+ function getLegacyVerbatim(options) {
49
+ const legacy = options.verbatim;
50
+ return typeof legacy === 'boolean' ? legacy : undefined;
51
+ }
52
+ function buildLookupOptions(normalizedOptions) {
53
+ const options = {
54
+ ...normalizedOptions,
55
+ order: resolveResultOrder(normalizedOptions),
56
+ all: true,
57
+ };
58
+ delete options.verbatim;
59
+ return options;
60
+ }
21
61
  function createLookupCallback(hostname, resolvedFamily, useAll, callback) {
22
62
  return (err, addresses) => {
23
63
  handleLookupResult(err, addresses, hostname, resolvedFamily, useAll, callback);
@@ -42,6 +82,11 @@ function handleLookupResult(error, addresses, hostname, resolvedFamily, useAll,
42
82
  return;
43
83
  }
44
84
  const list = normalizeLookupResults(addresses, resolvedFamily);
85
+ const invalidFamilyError = findInvalidFamilyError(list, hostname);
86
+ if (invalidFamilyError) {
87
+ callback(invalidFamilyError, list);
88
+ return;
89
+ }
45
90
  const blockedError = findBlockedIpError(list, hostname);
46
91
  if (blockedError) {
47
92
  callback(blockedError, list);
@@ -89,6 +134,15 @@ function findBlockedIpError(list, hostname) {
89
134
  }
90
135
  return null;
91
136
  }
137
+ function findInvalidFamilyError(list, hostname) {
138
+ for (const addr of list) {
139
+ const family = typeof addr === 'string' ? 0 : addr.family;
140
+ if (family === 4 || family === 6)
141
+ continue;
142
+ return createErrorWithCode(`Invalid address family returned for ${hostname}`, 'EINVAL');
143
+ }
144
+ return null;
145
+ }
92
146
  function createNoDnsResultsError(hostname) {
93
147
  return createErrorWithCode(`No DNS results returned for ${hostname}`, 'ENODATA');
94
148
  }
@@ -4,21 +4,44 @@ import { performance } from 'node:perf_hooks';
4
4
  import { isSystemError } from '../../utils/error-utils.js';
5
5
  import { logDebug, logError, logWarn } from '../logger.js';
6
6
  const fetchChannel = diagnosticsChannel.channel('superfetch.fetch');
7
+ function redactUrl(rawUrl) {
8
+ try {
9
+ const url = new URL(rawUrl);
10
+ url.username = '';
11
+ url.password = '';
12
+ url.hash = '';
13
+ url.search = '';
14
+ return url.toString();
15
+ }
16
+ catch {
17
+ return rawUrl;
18
+ }
19
+ }
20
+ function publishFetchEvent(event) {
21
+ if (!fetchChannel.hasSubscribers)
22
+ return;
23
+ try {
24
+ fetchChannel.publish(event);
25
+ }
26
+ catch {
27
+ // Avoid crashing the publisher if a subscriber throws.
28
+ }
29
+ }
7
30
  export function startFetchTelemetry(url, method) {
31
+ const safeUrl = redactUrl(url);
8
32
  const context = {
9
33
  requestId: randomUUID(),
10
34
  startTime: performance.now(),
11
- url,
35
+ url: safeUrl,
12
36
  method: method.toUpperCase(),
13
37
  };
14
- if (fetchChannel.hasSubscribers) {
15
- fetchChannel.publish({
16
- type: 'start',
17
- requestId: context.requestId,
18
- method: context.method,
19
- url: context.url,
20
- });
21
- }
38
+ publishFetchEvent({
39
+ v: 1,
40
+ type: 'start',
41
+ requestId: context.requestId,
42
+ method: context.method,
43
+ url: context.url,
44
+ });
22
45
  logDebug('HTTP Request', {
23
46
  requestId: context.requestId,
24
47
  method: context.method,
@@ -38,9 +61,8 @@ export function recordFetchResponse(context, response, contentSize) {
38
61
  logSlowRequestIfNeeded(context, duration);
39
62
  }
40
63
  function publishFetchEnd(context, status, duration) {
41
- if (!fetchChannel.hasSubscribers)
42
- return;
43
- fetchChannel.publish({
64
+ publishFetchEvent({
65
+ v: 1,
44
66
  type: 'end',
45
67
  requestId: context.requestId,
46
68
  status,
@@ -69,17 +91,16 @@ export function recordFetchError(context, error, status) {
69
91
  const duration = performance.now() - context.startTime;
70
92
  const err = error instanceof Error ? error : new Error(String(error));
71
93
  const code = isSystemError(err) ? err.code : undefined;
72
- if (fetchChannel.hasSubscribers) {
73
- fetchChannel.publish({
74
- type: 'error',
75
- requestId: context.requestId,
76
- url: context.url,
77
- error: err.message,
78
- code,
79
- status,
80
- duration,
81
- });
82
- }
94
+ publishFetchEvent({
95
+ v: 1,
96
+ type: 'error',
97
+ requestId: context.requestId,
98
+ url: context.url,
99
+ error: err.message,
100
+ code,
101
+ status,
102
+ duration,
103
+ });
83
104
  const log = status === 429 ? logWarn : logError;
84
105
  log('HTTP Request Error', {
85
106
  requestId: context.requestId,
@@ -1,4 +1,4 @@
1
- export declare function readResponseText(response: Response, url: string, maxBytes: number): Promise<{
1
+ export declare function readResponseText(response: Response, url: string, maxBytes: number, signal?: AbortSignal): Promise<{
2
2
  text: string;
3
3
  size: number;
4
4
  }>;
@@ -9,30 +9,43 @@ function assertContentLengthWithinLimit(response, url, maxBytes) {
9
9
  }
10
10
  throw new FetchError(`Response exceeds maximum size of ${maxBytes} bytes`, url);
11
11
  }
12
- async function readStreamWithLimit(stream, url, maxBytes) {
12
+ function throwIfReadAborted(url, signal) {
13
+ if (!signal?.aborted)
14
+ return;
15
+ throw new FetchError('Request was aborted during response read', url, 499, {
16
+ reason: 'aborted',
17
+ });
18
+ }
19
+ async function readStreamWithLimit(stream, url, maxBytes, signal) {
13
20
  const reader = stream.getReader();
14
21
  const decoder = new TextDecoder();
15
22
  let total = 0;
16
23
  const chunks = [];
17
- for (;;) {
18
- const { value, done } = await reader.read();
19
- if (done)
20
- break;
21
- total += value.byteLength;
22
- if (total > maxBytes) {
23
- await reader.cancel();
24
- throw new FetchError(`Response exceeds maximum size of ${maxBytes} bytes`, url);
24
+ try {
25
+ for (;;) {
26
+ throwIfReadAborted(url, signal);
27
+ const { value, done } = await reader.read();
28
+ if (done)
29
+ break;
30
+ total += value.byteLength;
31
+ if (total > maxBytes) {
32
+ await reader.cancel();
33
+ throw new FetchError(`Response exceeds maximum size of ${maxBytes} bytes`, url);
34
+ }
35
+ chunks.push(decoder.decode(value, { stream: true }));
25
36
  }
26
- chunks.push(decoder.decode(value, { stream: true }));
37
+ chunks.push(decoder.decode());
38
+ return { text: chunks.join(''), size: total };
39
+ }
40
+ finally {
41
+ reader.releaseLock();
27
42
  }
28
- chunks.push(decoder.decode());
29
- return { text: chunks.join(''), size: total };
30
43
  }
31
- export async function readResponseText(response, url, maxBytes) {
44
+ export async function readResponseText(response, url, maxBytes, signal) {
32
45
  assertContentLengthWithinLimit(response, url, maxBytes);
33
46
  if (!response.body) {
34
47
  const text = await response.text();
35
48
  return { text, size: Buffer.byteLength(text) };
36
49
  }
37
- return readStreamWithLimit(response.body, url, maxBytes);
50
+ return readStreamWithLimit(response.body, url, maxBytes, signal);
38
51
  }
@@ -2,3 +2,4 @@ import type { FetchOptions } from '../config/types/runtime.js';
2
2
  import { destroyAgents } from './fetcher/agents.js';
3
3
  export { destroyAgents };
4
4
  export declare function fetchUrlWithRetry(url: string, options?: FetchOptions, maxRetries?: number): Promise<string>;
5
+ export declare function fetchNormalizedUrlWithRetry(normalizedUrl: string, options?: FetchOptions, maxRetries?: number): Promise<string>;
@@ -39,7 +39,7 @@ function buildRequestInit(headers, signal) {
39
39
  dispatcher,
40
40
  };
41
41
  }
42
- async function handleFetchResponse(response, finalUrl, telemetry) {
42
+ async function handleFetchResponse(response, finalUrl, telemetry, signal) {
43
43
  if (response.status === 429) {
44
44
  void response.body?.cancel();
45
45
  throw createRateLimitError(finalUrl, response.headers.get('retry-after'));
@@ -48,7 +48,7 @@ async function handleFetchResponse(response, finalUrl, telemetry) {
48
48
  void response.body?.cancel();
49
49
  throw createHttpError(finalUrl, response.status, response.statusText);
50
50
  }
51
- const { text, size } = await readResponseText(response, finalUrl, config.fetcher.maxContentLength);
51
+ const { text, size } = await readResponseText(response, finalUrl, config.fetcher.maxContentLength, signal);
52
52
  recordFetchResponse(telemetry, response, size);
53
53
  return text;
54
54
  }
@@ -57,7 +57,7 @@ async function fetchWithTelemetry(normalizedUrl, requestInit, timeoutMs) {
57
57
  try {
58
58
  const { response, url: finalUrl } = await fetchWithRedirects(normalizedUrl, requestInit, config.fetcher.maxRedirects);
59
59
  telemetry.url = finalUrl;
60
- return await handleFetchResponse(response, finalUrl, telemetry);
60
+ return await handleFetchResponse(response, finalUrl, telemetry, requestInit.signal ?? undefined);
61
61
  }
62
62
  catch (error) {
63
63
  const mapped = mapFetchError(error, normalizedUrl, timeoutMs);
@@ -68,6 +68,9 @@ async function fetchWithTelemetry(normalizedUrl, requestInit, timeoutMs) {
68
68
  }
69
69
  export async function fetchUrlWithRetry(url, options, maxRetries = 3) {
70
70
  const normalizedUrl = await validateAndNormalizeUrl(url);
71
+ return fetchNormalizedUrlWithRetry(normalizedUrl, options, maxRetries);
72
+ }
73
+ export async function fetchNormalizedUrlWithRetry(normalizedUrl, options, maxRetries = 3) {
71
74
  const context = buildRequestContext(options);
72
75
  return executeWithRetry(normalizedUrl, maxRetries, async () => runFetch(normalizedUrl, context), context.signal);
73
76
  }
@@ -1,2 +1,6 @@
1
- import type { ContentBlockUnion } from '../config/types/content.js';
1
+ import type { ContentBlockUnion, ExtractedMetadata } from '../config/types/content.js';
2
2
  export declare function parseHtml(html: string): ContentBlockUnion[];
3
+ export declare function parseHtmlWithMetadata(html: string): {
4
+ blocks: ContentBlockUnion[];
5
+ metadata: ExtractedMetadata;
6
+ };
@@ -7,6 +7,60 @@ import { truncateHtml } from '../utils/html-truncator.js';
7
7
  import { sanitizeText } from '../utils/sanitizer.js';
8
8
  import { logWarn } from './logger.js';
9
9
  const CONTENT_SELECTOR = 'h1, h2, h3, h4, h5, h6, p, ul, ol, pre, code:not(pre code), table, img, blockquote';
10
+ function createMetaCollectorState() {
11
+ return {
12
+ title: {},
13
+ description: {},
14
+ author: {},
15
+ };
16
+ }
17
+ function resolveMetaField(state, field) {
18
+ const sources = state[field];
19
+ return sources.og ?? sources.twitter ?? sources.standard;
20
+ }
21
+ function extractMetadata($) {
22
+ const state = createMetaCollectorState();
23
+ $('meta').each((_, element) => {
24
+ const content = $(element).attr('content')?.trim();
25
+ if (!content)
26
+ return;
27
+ const property = $(element).attr('property');
28
+ if (property?.startsWith('og:')) {
29
+ const key = property.replace('og:', '');
30
+ if (key === 'title')
31
+ state.title.og = content;
32
+ if (key === 'description')
33
+ state.description.og = content;
34
+ return;
35
+ }
36
+ const name = $(element).attr('name');
37
+ if (name?.startsWith('twitter:')) {
38
+ const key = name.replace('twitter:', '');
39
+ if (key === 'title')
40
+ state.title.twitter = content;
41
+ if (key === 'description')
42
+ state.description.twitter = content;
43
+ return;
44
+ }
45
+ if (name === 'description') {
46
+ state.description.standard = content;
47
+ }
48
+ if (name === 'author') {
49
+ state.author.standard = content;
50
+ }
51
+ });
52
+ if (!state.title.standard) {
53
+ const titleText = $('title').first().text().trim();
54
+ if (titleText) {
55
+ state.title.standard = titleText;
56
+ }
57
+ }
58
+ return {
59
+ title: resolveMetaField(state, 'title'),
60
+ description: resolveMetaField(state, 'description'),
61
+ author: resolveMetaField(state, 'author'),
62
+ };
63
+ }
10
64
  function parseHeading($, element) {
11
65
  const rawText = sanitizeText($(element).text());
12
66
  const text = cleanHeading(rawText);
@@ -173,6 +227,10 @@ function loadHtml(html) {
173
227
  return null;
174
228
  }
175
229
  }
230
+ function prepareCheerio(html) {
231
+ const processedHtml = truncateHtml(html);
232
+ return loadHtml(processedHtml);
233
+ }
176
234
  function removeNoiseElements($) {
177
235
  $('script, style, noscript, iframe, svg').remove();
178
236
  }
@@ -198,10 +256,22 @@ function safeParseElement($, element) {
198
256
  export function parseHtml(html) {
199
257
  if (!html || typeof html !== 'string')
200
258
  return [];
201
- const processedHtml = truncateHtml(html);
202
- const $ = loadHtml(processedHtml);
259
+ const $ = prepareCheerio(html);
203
260
  if (!$)
204
261
  return [];
205
262
  removeNoiseElements($);
206
263
  return filterBlocks(collectBlocks($));
207
264
  }
265
+ export function parseHtmlWithMetadata(html) {
266
+ if (!html || typeof html !== 'string') {
267
+ return { blocks: [], metadata: {} };
268
+ }
269
+ const $ = prepareCheerio(html);
270
+ if (!$) {
271
+ return { blocks: [], metadata: {} };
272
+ }
273
+ const metadata = extractMetadata($);
274
+ removeNoiseElements($);
275
+ const blocks = filterBlocks(collectBlocks($));
276
+ return { blocks, metadata };
277
+ }
@@ -1,6 +1,6 @@
1
1
  import { z } from 'zod';
2
2
  export declare const fetchUrlInputSchema: z.ZodObject<{
3
- customHeaders: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
3
+ customHeaders: z.ZodOptional<z.ZodEffects<z.ZodRecord<z.ZodString, z.ZodString>, Record<string, string>, Record<string, string>>>;
4
4
  timeout: z.ZodDefault<z.ZodNumber>;
5
5
  retries: z.ZodDefault<z.ZodNumber>;
6
6
  } & {
@@ -13,25 +13,25 @@ export declare const fetchUrlInputSchema: z.ZodObject<{
13
13
  format: z.ZodDefault<z.ZodEnum<["jsonl", "markdown"]>>;
14
14
  }, "strict", z.ZodTypeAny, {
15
15
  url: string;
16
+ timeout: number;
17
+ retries: number;
16
18
  extractMainContent: boolean;
17
19
  includeMetadata: boolean;
18
- retries: number;
19
20
  format: "jsonl" | "markdown";
20
- timeout: number;
21
21
  customHeaders?: Record<string, string> | undefined;
22
22
  maxContentLength?: number | undefined;
23
23
  }, {
24
24
  url: string;
25
25
  customHeaders?: Record<string, string> | undefined;
26
+ timeout?: number | undefined;
27
+ retries?: number | undefined;
26
28
  extractMainContent?: boolean | undefined;
27
29
  includeMetadata?: boolean | undefined;
28
- retries?: number | undefined;
29
- format?: "jsonl" | "markdown" | undefined;
30
30
  maxContentLength?: number | undefined;
31
- timeout?: number | undefined;
31
+ format?: "jsonl" | "markdown" | undefined;
32
32
  }>;
33
33
  export declare const fetchMarkdownInputSchema: z.ZodObject<{
34
- customHeaders: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
34
+ customHeaders: z.ZodOptional<z.ZodEffects<z.ZodRecord<z.ZodString, z.ZodString>, Record<string, string>, Record<string, string>>>;
35
35
  timeout: z.ZodDefault<z.ZodNumber>;
36
36
  retries: z.ZodDefault<z.ZodNumber>;
37
37
  } & {
@@ -42,20 +42,20 @@ export declare const fetchMarkdownInputSchema: z.ZodObject<{
42
42
  maxContentLength: z.ZodOptional<z.ZodNumber>;
43
43
  }, "strict", z.ZodTypeAny, {
44
44
  url: string;
45
+ timeout: number;
46
+ retries: number;
45
47
  extractMainContent: boolean;
46
48
  includeMetadata: boolean;
47
- retries: number;
48
- timeout: number;
49
49
  customHeaders?: Record<string, string> | undefined;
50
50
  maxContentLength?: number | undefined;
51
51
  }, {
52
52
  url: string;
53
53
  customHeaders?: Record<string, string> | undefined;
54
+ timeout?: number | undefined;
55
+ retries?: number | undefined;
54
56
  extractMainContent?: boolean | undefined;
55
57
  includeMetadata?: boolean | undefined;
56
- retries?: number | undefined;
57
58
  maxContentLength?: number | undefined;
58
- timeout?: number | undefined;
59
59
  }>;
60
60
  export declare const fetchUrlOutputSchema: z.ZodObject<{
61
61
  url: z.ZodString;
@@ -74,31 +74,31 @@ export declare const fetchUrlOutputSchema: z.ZodObject<{
74
74
  errorCode: z.ZodOptional<z.ZodString>;
75
75
  }, "strict", z.ZodTypeAny, {
76
76
  url: string;
77
+ format: "jsonl" | "markdown";
77
78
  contentBlocks: number;
78
79
  fetchedAt: string;
79
- format: "jsonl" | "markdown";
80
80
  cached: boolean;
81
81
  error?: string | undefined;
82
- content?: string | undefined;
83
82
  title?: string | undefined;
84
- truncated?: boolean | undefined;
83
+ content?: string | undefined;
84
+ contentSize?: number | undefined;
85
85
  resourceUri?: string | undefined;
86
86
  resourceMimeType?: string | undefined;
87
- contentSize?: number | undefined;
87
+ truncated?: boolean | undefined;
88
88
  errorCode?: string | undefined;
89
89
  }, {
90
90
  url: string;
91
+ format: "jsonl" | "markdown";
91
92
  contentBlocks: number;
92
93
  fetchedAt: string;
93
- format: "jsonl" | "markdown";
94
94
  cached: boolean;
95
95
  error?: string | undefined;
96
- content?: string | undefined;
97
96
  title?: string | undefined;
98
- truncated?: boolean | undefined;
97
+ content?: string | undefined;
98
+ contentSize?: number | undefined;
99
99
  resourceUri?: string | undefined;
100
100
  resourceMimeType?: string | undefined;
101
- contentSize?: number | undefined;
101
+ truncated?: boolean | undefined;
102
102
  errorCode?: string | undefined;
103
103
  }>;
104
104
  export declare const fetchMarkdownOutputSchema: z.ZodObject<{
@@ -111,13 +111,13 @@ export declare const fetchMarkdownOutputSchema: z.ZodObject<{
111
111
  fileName: z.ZodString;
112
112
  expiresAt: z.ZodString;
113
113
  }, "strip", z.ZodTypeAny, {
114
+ downloadUrl: string;
114
115
  fileName: string;
115
116
  expiresAt: string;
116
- downloadUrl: string;
117
117
  }, {
118
+ downloadUrl: string;
118
119
  fileName: string;
119
120
  expiresAt: string;
120
- downloadUrl: string;
121
121
  }>>;
122
122
  } & {
123
123
  contentSize: z.ZodOptional<z.ZodNumber>;
@@ -134,16 +134,16 @@ export declare const fetchMarkdownOutputSchema: z.ZodObject<{
134
134
  error?: string | undefined;
135
135
  markdown?: string | undefined;
136
136
  title?: string | undefined;
137
- truncated?: boolean | undefined;
137
+ contentSize?: number | undefined;
138
138
  resourceUri?: string | undefined;
139
139
  resourceMimeType?: string | undefined;
140
- contentSize?: number | undefined;
140
+ truncated?: boolean | undefined;
141
+ errorCode?: string | undefined;
141
142
  file?: {
143
+ downloadUrl: string;
142
144
  fileName: string;
143
145
  expiresAt: string;
144
- downloadUrl: string;
145
146
  } | undefined;
146
- errorCode?: string | undefined;
147
147
  }, {
148
148
  url: string;
149
149
  fetchedAt: string;
@@ -151,14 +151,14 @@ export declare const fetchMarkdownOutputSchema: z.ZodObject<{
151
151
  error?: string | undefined;
152
152
  markdown?: string | undefined;
153
153
  title?: string | undefined;
154
- truncated?: boolean | undefined;
154
+ contentSize?: number | undefined;
155
155
  resourceUri?: string | undefined;
156
156
  resourceMimeType?: string | undefined;
157
- contentSize?: number | undefined;
157
+ truncated?: boolean | undefined;
158
+ errorCode?: string | undefined;
158
159
  file?: {
160
+ downloadUrl: string;
159
161
  fileName: string;
160
162
  expiresAt: string;
161
- downloadUrl: string;
162
163
  } | undefined;
163
- errorCode?: string | undefined;
164
164
  }>;
@@ -1,8 +1,16 @@
1
1
  import { z } from 'zod';
2
2
  import { config } from '../config/index.js';
3
+ const MAX_HEADER_NAME_LENGTH = 128;
4
+ const MAX_HEADER_VALUE_LENGTH = 2048;
5
+ const MAX_HEADER_COUNT = 50;
6
+ const MAX_CONTENT_LENGTH = config.constants.maxContentSize;
7
+ const customHeadersSchema = z
8
+ .record(z.string().max(MAX_HEADER_NAME_LENGTH), z.string().max(MAX_HEADER_VALUE_LENGTH))
9
+ .refine((headers) => Object.keys(headers).length <= MAX_HEADER_COUNT, {
10
+ message: `customHeaders must have at most ${MAX_HEADER_COUNT} entries`,
11
+ });
3
12
  const requestOptionsSchema = z.object({
4
- customHeaders: z
5
- .record(z.string())
13
+ customHeaders: customHeadersSchema
6
14
  .optional()
7
15
  .describe('Custom HTTP headers for the request'),
8
16
  timeout: z
@@ -30,6 +38,7 @@ const extractionOptionsSchema = z.object({
30
38
  maxContentLength: z
31
39
  .number()
32
40
  .positive()
41
+ .max(MAX_CONTENT_LENGTH)
33
42
  .optional()
34
43
  .describe('Maximum content length in characters'),
35
44
  });
@@ -64,14 +73,22 @@ const fileDownloadSchema = z.object({
64
73
  });
65
74
  export const fetchUrlInputSchema = requestOptionsSchema
66
75
  .extend({
67
- url: z.string().min(1).describe('The URL to fetch'),
76
+ url: z
77
+ .string()
78
+ .min(1)
79
+ .max(config.constants.maxUrlLength)
80
+ .describe('The URL to fetch'),
68
81
  })
69
82
  .merge(extractionOptionsSchema)
70
83
  .merge(formatOptionsSchema)
71
84
  .strict();
72
85
  export const fetchMarkdownInputSchema = requestOptionsSchema
73
86
  .extend({
74
- url: z.string().min(1).describe('The URL to fetch'),
87
+ url: z
88
+ .string()
89
+ .min(1)
90
+ .max(config.constants.maxUrlLength)
91
+ .describe('The URL to fetch'),
75
92
  })
76
93
  .merge(extractionOptionsSchema)
77
94
  .strict();
@@ -1,6 +1,6 @@
1
1
  import { TRUNCATION_MARKER } from '../../config/formatting.js';
2
2
  import { extractContent } from '../../services/extractor.js';
3
- import { parseHtml } from '../../services/parser.js';
3
+ import { parseHtml, parseHtmlWithMetadata } from '../../services/parser.js';
4
4
  import { sanitizeText } from '../../utils/sanitizer.js';
5
5
  import { toJsonl } from '../../transformers/jsonl.transformer.js';
6
6
  import { htmlToMarkdown } from '../../transformers/markdown.transformer.js';
@@ -56,7 +56,10 @@ function decodeHtmlEntities(value) {
56
56
  }
57
57
  function buildJsonlPayload(context, maxContentLength) {
58
58
  const contentBlocks = parseHtml(context.sourceHtml);
59
- const { content, truncated } = truncateContent(toJsonl(contentBlocks, context.metadata), maxContentLength);
59
+ return buildJsonlPayloadFromBlocks(contentBlocks, context.metadata, maxContentLength);
60
+ }
61
+ function buildJsonlPayloadFromBlocks(contentBlocks, metadata, maxContentLength) {
62
+ const { content, truncated } = truncateContent(toJsonl(contentBlocks, metadata), maxContentLength);
60
63
  return {
61
64
  content,
62
65
  contentBlocks: contentBlocks.length,
@@ -69,6 +72,17 @@ function buildMarkdownPayload(context, maxContentLength) {
69
72
  return { content, truncated };
70
73
  }
71
74
  export function transformHtmlToJsonl(html, url, options) {
75
+ if (!options.extractMainContent && options.includeMetadata) {
76
+ const parsed = parseHtmlWithMetadata(html);
77
+ const metadataBlock = createContentMetadataBlock(url, null, parsed.metadata, false, true);
78
+ const { content, contentBlocks, truncated } = buildJsonlPayloadFromBlocks(parsed.blocks, metadataBlock, options.maxContentLength);
79
+ return {
80
+ content,
81
+ contentBlocks,
82
+ title: parsed.metadata.title,
83
+ ...(truncated && { truncated }),
84
+ };
85
+ }
72
86
  const context = resolveContentSource(html, url, options);
73
87
  const { content, contentBlocks, truncated } = buildJsonlPayload(context, options.maxContentLength);
74
88
  return {
@@ -88,6 +102,21 @@ export function transformHtmlToMarkdown(html, url, options) {
88
102
  };
89
103
  }
90
104
  export function transformHtmlToMarkdownWithBlocks(html, url, options) {
105
+ if (!options.extractMainContent && options.includeMetadata) {
106
+ const parsed = parseHtmlWithMetadata(html);
107
+ const context = {
108
+ sourceHtml: html,
109
+ title: parsed.metadata.title,
110
+ metadata: createContentMetadataBlock(url, null, parsed.metadata, false, true),
111
+ };
112
+ const { content, truncated } = buildMarkdownPayload(context, options.maxContentLength);
113
+ return {
114
+ content,
115
+ contentBlocks: parsed.blocks.length,
116
+ title: context.title,
117
+ ...(truncated && { truncated }),
118
+ };
119
+ }
91
120
  const context = resolveContentSource(html, url, options);
92
121
  const contentBlocks = parseHtml(context.sourceHtml);
93
122
  const { content, truncated } = buildMarkdownPayload(context, options.maxContentLength);
@@ -1,7 +1,7 @@
1
1
  import * as cache from '../../services/cache.js';
2
- import { fetchUrlWithRetry } from '../../services/fetcher.js';
2
+ import { fetchNormalizedUrlWithRetry } from '../../services/fetcher.js';
3
3
  import { logDebug } from '../../services/logger.js';
4
- import { validateAndNormalizeUrl } from '../../utils/url-validator.js';
4
+ import { assertResolvedAddressesAllowed, normalizeUrl, } from '../../utils/url-validator.js';
5
5
  import { appendHeaderVary } from './cache-vary.js';
6
6
  function attemptCacheRetrieval(cacheKey, deserialize, cacheNamespace, normalizedUrl) {
7
7
  if (!cacheKey)
@@ -42,16 +42,19 @@ function attemptCacheRetrieval(cacheKey, deserialize, cacheNamespace, normalized
42
42
  * @returns Promise resolving to the pipeline result
43
43
  */
44
44
  export async function executeFetchPipeline(options) {
45
- const normalizedUrl = await validateAndNormalizeUrl(options.url);
45
+ const { normalizedUrl, hostname } = normalizeUrl(options.url);
46
46
  const cacheKey = resolveCacheKey(options, normalizedUrl);
47
47
  const cachedResult = attemptCacheRetrieval(cacheKey, options.deserialize, options.cacheNamespace, normalizedUrl);
48
48
  if (cachedResult)
49
49
  return cachedResult;
50
+ await assertResolvedAddressesAllowed(hostname);
50
51
  const fetchOptions = buildFetchOptions(options);
51
52
  logDebug('Fetching URL', { url: normalizedUrl, retries: options.retries });
52
- const html = await fetchUrlWithRetry(normalizedUrl, fetchOptions, options.retries);
53
+ const html = await fetchNormalizedUrlWithRetry(normalizedUrl, fetchOptions, options.retries);
53
54
  const data = options.transform(html, normalizedUrl);
54
- persistCache(cacheKey, data, options.serialize, normalizedUrl);
55
+ if (cache.isEnabled()) {
56
+ persistCache(cacheKey, data, options.serialize, normalizedUrl);
57
+ }
55
58
  return buildPipelineResult(normalizedUrl, data, cacheKey);
56
59
  }
57
60
  function resolveCacheKey(options, normalizedUrl) {
@@ -1,2 +1,7 @@
1
1
  export declare function isBlockedIp(ip: string): boolean;
2
+ export declare function assertResolvedAddressesAllowed(hostname: string): Promise<void>;
3
+ export declare function normalizeUrl(urlString: string): {
4
+ normalizedUrl: string;
5
+ hostname: string;
6
+ };
2
7
  export declare function validateAndNormalizeUrl(urlString: string): Promise<string>;
@@ -78,7 +78,7 @@ function lookupWithTimeout(hostname) {
78
78
  });
79
79
  });
80
80
  }
81
- async function assertResolvedAddressesAllowed(hostname) {
81
+ export async function assertResolvedAddressesAllowed(hostname) {
82
82
  try {
83
83
  const result = await lookupWithTimeout(hostname);
84
84
  const addresses = Array.isArray(result) ? result : [result];
@@ -102,7 +102,7 @@ async function assertResolvedAddressesAllowed(hostname) {
102
102
  throw createValidationError(String(error));
103
103
  }
104
104
  }
105
- export async function validateAndNormalizeUrl(urlString) {
105
+ export function normalizeUrl(urlString) {
106
106
  const trimmedUrl = requireTrimmedUrl(urlString);
107
107
  assertUrlLength(trimmedUrl);
108
108
  const url = parseUrl(trimmedUrl);
@@ -110,8 +110,12 @@ export async function validateAndNormalizeUrl(urlString) {
110
110
  assertNoCredentials(url);
111
111
  const hostname = normalizeHostname(url);
112
112
  assertHostnameAllowed(hostname);
113
+ return { normalizedUrl: url.href, hostname };
114
+ }
115
+ export async function validateAndNormalizeUrl(urlString) {
116
+ const { normalizedUrl, hostname } = normalizeUrl(urlString);
113
117
  await assertResolvedAddressesAllowed(hostname);
114
- return url.href;
118
+ return normalizedUrl;
115
119
  }
116
120
  const VALIDATION_ERROR_CODE = 'VALIDATION_ERROR';
117
121
  function createValidationError(message) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@j0hanz/superfetch",
3
- "version": "1.2.1",
3
+ "version": "1.2.2",
4
4
  "mcpName": "io.github.j0hanz/superfetch",
5
5
  "description": "Intelligent web content fetcher MCP server that converts HTML to clean, AI-readable JSONL format",
6
6
  "type": "module",