@j0hanz/superfetch 2.0.1 → 2.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. package/README.md +121 -38
  2. package/dist/cache.d.ts +42 -0
  3. package/dist/cache.js +674 -0
  4. package/dist/config/env-parsers.d.ts +1 -0
  5. package/dist/config/env-parsers.js +12 -0
  6. package/dist/config/index.d.ts +7 -0
  7. package/dist/config/index.js +10 -3
  8. package/dist/config/types/content.d.ts +1 -0
  9. package/dist/config.d.ts +82 -0
  10. package/dist/config.js +274 -0
  11. package/dist/crypto.d.ts +2 -0
  12. package/dist/crypto.js +32 -0
  13. package/dist/errors.d.ts +10 -0
  14. package/dist/errors.js +28 -0
  15. package/dist/fetch.d.ts +40 -0
  16. package/dist/fetch.js +930 -0
  17. package/dist/http/base-middleware.d.ts +7 -0
  18. package/dist/http/base-middleware.js +143 -0
  19. package/dist/http/cors.d.ts +0 -5
  20. package/dist/http/cors.js +0 -6
  21. package/dist/http/download-routes.js +6 -2
  22. package/dist/http/error-handler.d.ts +2 -0
  23. package/dist/http/error-handler.js +55 -0
  24. package/dist/http/mcp-routes.js +2 -2
  25. package/dist/http/mcp-sessions.d.ts +3 -5
  26. package/dist/http/mcp-sessions.js +8 -8
  27. package/dist/http/server-tuning.d.ts +9 -0
  28. package/dist/http/server-tuning.js +45 -0
  29. package/dist/http/server.d.ts +0 -10
  30. package/dist/http/server.js +33 -333
  31. package/dist/http.d.ts +86 -0
  32. package/dist/http.js +1507 -0
  33. package/dist/index.js +3 -3
  34. package/dist/instructions.md +96 -0
  35. package/dist/mcp.d.ts +3 -0
  36. package/dist/mcp.js +104 -0
  37. package/dist/observability.d.ts +16 -0
  38. package/dist/observability.js +78 -0
  39. package/dist/server.js +20 -5
  40. package/dist/services/cache.d.ts +1 -1
  41. package/dist/services/context.d.ts +2 -0
  42. package/dist/services/context.js +3 -0
  43. package/dist/services/extractor.d.ts +1 -0
  44. package/dist/services/extractor.js +28 -2
  45. package/dist/services/fetcher.d.ts +2 -0
  46. package/dist/services/fetcher.js +35 -14
  47. package/dist/services/logger.js +4 -1
  48. package/dist/services/telemetry.d.ts +19 -0
  49. package/dist/services/telemetry.js +43 -0
  50. package/dist/services/transform-worker-pool.d.ts +10 -3
  51. package/dist/services/transform-worker-pool.js +213 -184
  52. package/dist/tools/handlers/fetch-url.tool.js +8 -6
  53. package/dist/tools/index.d.ts +1 -0
  54. package/dist/tools/index.js +13 -1
  55. package/dist/tools/schemas.d.ts +2 -0
  56. package/dist/tools/schemas.js +8 -0
  57. package/dist/tools/utils/content-transform-core.d.ts +5 -0
  58. package/dist/tools/utils/content-transform-core.js +180 -0
  59. package/dist/tools/utils/content-transform-workers.d.ts +1 -0
  60. package/dist/tools/utils/content-transform-workers.js +1 -0
  61. package/dist/tools/utils/content-transform.d.ts +3 -5
  62. package/dist/tools/utils/content-transform.js +35 -148
  63. package/dist/tools/utils/raw-markdown.js +15 -1
  64. package/dist/tools.d.ts +109 -0
  65. package/dist/tools.js +434 -0
  66. package/dist/transform.d.ts +69 -0
  67. package/dist/transform.js +1814 -0
  68. package/dist/transformers/markdown.d.ts +4 -1
  69. package/dist/transformers/markdown.js +182 -53
  70. package/dist/utils/cancellation.d.ts +1 -0
  71. package/dist/utils/cancellation.js +18 -0
  72. package/dist/utils/code-language.d.ts +0 -9
  73. package/dist/utils/code-language.js +5 -5
  74. package/dist/utils/host-normalizer.d.ts +1 -0
  75. package/dist/utils/host-normalizer.js +37 -0
  76. package/dist/utils/url-redactor.d.ts +1 -0
  77. package/dist/utils/url-redactor.js +13 -0
  78. package/dist/utils/url-validator.js +8 -5
  79. package/dist/utils.d.ts +1 -0
  80. package/dist/utils.js +3 -0
  81. package/dist/workers/transform-worker.js +80 -38
  82. package/package.json +10 -9
@@ -1,2 +1,5 @@
1
1
  import type { MetadataBlock } from '../config/types/content.js';
2
- export declare function htmlToMarkdown(html: string, metadata?: MetadataBlock): string;
2
+ export declare function htmlToMarkdown(html: string, metadata?: MetadataBlock, options?: {
3
+ url?: string;
4
+ signal?: AbortSignal;
5
+ }): string;
@@ -1,5 +1,9 @@
1
- import TurndownService from 'turndown';
1
+ import { parseHTML } from 'linkedom';
2
+ import { NodeHtmlMarkdown, } from 'node-html-markdown';
2
3
  import { CODE_BLOCK, FRONTMATTER_DELIMITER, joinLines, } from '../config/formatting.js';
4
+ import { FetchError } from '../errors/app-error.js';
5
+ import { endTransformStage, startTransformStage, } from '../services/telemetry.js';
6
+ import { throwIfAborted } from '../utils/cancellation.js';
3
7
  import { detectLanguageFromCode, resolveLanguageFromAttributes, } from '../utils/code-language.js';
4
8
  import { isRecord } from '../utils/guards.js';
5
9
  const YAML_SPECIAL_CHARS = /[:[\]{}"\r\t'|>&*!?,#]|\n/;
@@ -43,6 +47,9 @@ function buildFrontmatter(metadata) {
43
47
  const lines = [FRONTMATTER_DELIMITER];
44
48
  appendFrontmatterField(lines, 'title', metadata.title);
45
49
  appendFrontmatterField(lines, 'source', metadata.url);
50
+ appendFrontmatterField(lines, 'author', metadata.author);
51
+ appendFrontmatterField(lines, 'description', metadata.description);
52
+ appendFrontmatterField(lines, 'fetchedAt', metadata.fetchedAt);
46
53
  lines.push(FRONTMATTER_DELIMITER);
47
54
  return joinLines(lines);
48
55
  }
@@ -51,36 +58,6 @@ function isElement(node) {
51
58
  'getAttribute' in node &&
52
59
  typeof node.getAttribute === 'function');
53
60
  }
54
- function isFencedCodeBlock(node, options) {
55
- return (options.codeBlockStyle === 'fenced' &&
56
- node.nodeName === 'PRE' &&
57
- node.firstChild?.nodeName === 'CODE');
58
- }
59
- function formatFencedCodeBlock(node) {
60
- const codeNode = node.firstChild;
61
- if (!isElement(codeNode))
62
- return '';
63
- const code = codeNode.textContent || '';
64
- const language = resolveCodeLanguage(codeNode, code);
65
- return CODE_BLOCK.format(code, language);
66
- }
67
- function resolveCodeLanguage(codeNode, code) {
68
- const { className, dataLanguage } = readCodeAttributes(codeNode);
69
- const attributeLanguage = resolveLanguageFromAttributes(className, dataLanguage);
70
- return attributeLanguage ?? detectLanguageFromCode(code) ?? '';
71
- }
72
- function readCodeAttributes(codeNode) {
73
- return {
74
- className: codeNode.getAttribute('class') ?? '',
75
- dataLanguage: codeNode.getAttribute('data-language') ?? '',
76
- };
77
- }
78
- function addFencedCodeRule(instance) {
79
- instance.addRule('fencedCodeBlockWithLanguage', {
80
- filter: (node, options) => isFencedCodeBlock(node, options),
81
- replacement: (_content, node) => formatFencedCodeBlock(node),
82
- });
83
- }
84
61
  const STRUCTURAL_TAGS = new Set([
85
62
  'script',
86
63
  'style',
@@ -109,6 +86,67 @@ const PROMO_PATTERN = /banner|promo|announcement|cta|callout|advert|newsletter|s
109
86
  const FIXED_PATTERN = /\b(fixed|sticky)\b/;
110
87
  const HIGH_Z_PATTERN = /\bz-(?:4\d|50)\b/;
111
88
  const ISOLATE_PATTERN = /\bisolate\b/;
89
+ const HTML_DOCUMENT_MARKERS = /<\s*(?:!doctype|html|head|body)\b/i;
90
+ const NOISE_MARKERS = [
91
+ '<script',
92
+ '<style',
93
+ '<noscript',
94
+ '<iframe',
95
+ '<nav',
96
+ '<footer',
97
+ '<aside',
98
+ '<header',
99
+ '<form',
100
+ '<button',
101
+ '<input',
102
+ '<select',
103
+ '<textarea',
104
+ '<svg',
105
+ '<canvas',
106
+ ' aria-hidden="true"',
107
+ " aria-hidden='true'",
108
+ ' hidden',
109
+ ' role="navigation"',
110
+ " role='navigation'",
111
+ ' role="banner"',
112
+ " role='banner'",
113
+ ' role="complementary"',
114
+ " role='complementary'",
115
+ ' role="contentinfo"',
116
+ " role='contentinfo'",
117
+ ' role="tree"',
118
+ " role='tree'",
119
+ ' role="menubar"',
120
+ " role='menubar'",
121
+ ' role="menu"',
122
+ " role='menu'",
123
+ ' banner',
124
+ ' promo',
125
+ ' announcement',
126
+ ' cta',
127
+ ' callout',
128
+ ' advert',
129
+ ' newsletter',
130
+ ' subscribe',
131
+ ' cookie',
132
+ ' consent',
133
+ ' popup',
134
+ ' modal',
135
+ ' overlay',
136
+ ' toast',
137
+ ' fixed',
138
+ ' sticky',
139
+ ' z-50',
140
+ ' z-4',
141
+ ' isolate',
142
+ ];
143
+ function mayContainNoise(html) {
144
+ const haystack = html.toLowerCase();
145
+ return NOISE_MARKERS.some((marker) => haystack.includes(marker));
146
+ }
147
+ function isFullDocumentHtml(html) {
148
+ return HTML_DOCUMENT_MARKERS.test(html);
149
+ }
112
150
  function isStructuralNoiseTag(tagName) {
113
151
  return (STRUCTURAL_TAGS.has(tagName) || tagName === 'svg' || tagName === 'canvas');
114
152
  }
@@ -146,40 +184,131 @@ function isNoiseElement(node) {
146
184
  matchesFixedOrHighZIsolate(metadata.className) ||
147
185
  matchesPromoIdOrClass(metadata.className, metadata.id));
148
186
  }
149
- function isNoiseNode(node) {
150
- return isElement(node) && isNoiseElement(node);
187
+ function removeNoiseFromHtml(html) {
188
+ const shouldParse = isFullDocumentHtml(html) || mayContainNoise(html);
189
+ if (!shouldParse)
190
+ return html;
191
+ const shouldRemove = mayContainNoise(html);
192
+ try {
193
+ const { document } = parseHTML(html);
194
+ if (shouldRemove) {
195
+ const nodes = Array.from(document.querySelectorAll('*'));
196
+ for (let index = nodes.length - 1; index >= 0; index -= 1) {
197
+ const node = nodes[index];
198
+ if (!node)
199
+ continue;
200
+ if (isElement(node) && isNoiseElement(node)) {
201
+ node.remove();
202
+ }
203
+ }
204
+ }
205
+ const { body } = document;
206
+ if (body?.innerHTML)
207
+ return body.innerHTML;
208
+ if (typeof document.toString ===
209
+ 'function') {
210
+ return document.toString();
211
+ }
212
+ const { documentElement } = document;
213
+ if (documentElement?.outerHTML)
214
+ return documentElement.outerHTML;
215
+ return html;
216
+ }
217
+ catch {
218
+ return html;
219
+ }
220
+ }
221
+ function buildInlineCode(content) {
222
+ const runs = content.match(/`+/g);
223
+ const longest = runs?.sort((a, b) => b.length - a.length)[0] ?? '';
224
+ const delimiter = `\`${longest}`;
225
+ const padding = delimiter.length > 1 ? ' ' : '';
226
+ return `${delimiter}${padding}${content}${padding}${delimiter}`;
151
227
  }
152
- function addNoiseRule(instance) {
153
- instance.addRule('removeNoise', {
154
- filter: (node) => isNoiseNode(node),
155
- replacement: () => '',
156
- });
228
+ function isCodeBlock(parent) {
229
+ if (!isRecord(parent))
230
+ return false;
231
+ const tagName = typeof parent.tagName === 'string' ? parent.tagName.toUpperCase() : '';
232
+ return ['PRE', 'WRAPPED-PRE'].includes(tagName);
157
233
  }
158
- let turndownInstance = null;
159
- function createTurndownInstance() {
160
- const instance = new TurndownService({
161
- headingStyle: 'atx',
234
+ function createCodeTranslator() {
235
+ return {
236
+ code: (ctx) => {
237
+ if (!isRecord(ctx)) {
238
+ return {
239
+ spaceIfRepeatingChar: true,
240
+ noEscape: true,
241
+ postprocess: ({ content }) => buildInlineCode(content),
242
+ };
243
+ }
244
+ const { node, parent, visitor } = ctx;
245
+ const getAttribute = isRecord(node) && typeof node.getAttribute === 'function'
246
+ ? node.getAttribute.bind(node)
247
+ : undefined;
248
+ if (!isCodeBlock(parent)) {
249
+ return {
250
+ spaceIfRepeatingChar: true,
251
+ noEscape: true,
252
+ postprocess: ({ content }) => buildInlineCode(content),
253
+ };
254
+ }
255
+ const className = getAttribute?.('class') ?? '';
256
+ const dataLanguage = getAttribute?.('data-language') ?? '';
257
+ const attributeLanguage = resolveLanguageFromAttributes(className, dataLanguage);
258
+ const childTranslators = isRecord(visitor) ? visitor.instance : null;
259
+ const codeBlockTranslators = isRecord(childTranslators) &&
260
+ isRecord(childTranslators
261
+ .codeBlockTranslators)
262
+ ? childTranslators.codeBlockTranslators
263
+ : null;
264
+ return {
265
+ noEscape: true,
266
+ preserveWhitespace: true,
267
+ ...(codeBlockTranslators
268
+ ? { childTranslators: codeBlockTranslators }
269
+ : null),
270
+ postprocess: ({ content }) => {
271
+ const language = attributeLanguage ?? detectLanguageFromCode(content) ?? '';
272
+ return CODE_BLOCK.format(content, language);
273
+ },
274
+ };
275
+ },
276
+ };
277
+ }
278
+ let markdownInstance = null;
279
+ function createMarkdownInstance() {
280
+ return new NodeHtmlMarkdown({
281
+ codeFence: CODE_BLOCK.fence,
162
282
  codeBlockStyle: 'fenced',
163
283
  emDelimiter: '_',
164
- bulletListMarker: '-',
165
- });
166
- addNoiseRule(instance);
167
- addFencedCodeRule(instance);
168
- return instance;
284
+ bulletMarker: '-',
285
+ }, createCodeTranslator());
169
286
  }
170
- function getTurndown() {
171
- turndownInstance ??= createTurndownInstance();
172
- return turndownInstance;
287
+ function getMarkdownConverter() {
288
+ markdownInstance ??= createMarkdownInstance();
289
+ return markdownInstance;
173
290
  }
174
- export function htmlToMarkdown(html, metadata) {
291
+ export function htmlToMarkdown(html, metadata, options) {
292
+ const url = options?.url ?? metadata?.url ?? '';
175
293
  const frontmatter = buildFrontmatter(metadata);
176
294
  if (!html)
177
295
  return frontmatter;
178
296
  try {
179
- const content = getTurndown().turndown(html).trim();
297
+ throwIfAborted(options?.signal, url, 'markdown:begin');
298
+ const noiseStage = startTransformStage(url, 'markdown:noise');
299
+ const cleanedHtml = removeNoiseFromHtml(html);
300
+ endTransformStage(noiseStage);
301
+ throwIfAborted(options?.signal, url, 'markdown:cleaned');
302
+ const translateStage = startTransformStage(url, 'markdown:translate');
303
+ const content = getMarkdownConverter().translate(cleanedHtml).trim();
304
+ endTransformStage(translateStage);
305
+ throwIfAborted(options?.signal, url, 'markdown:translated');
180
306
  return frontmatter ? `${frontmatter}\n${content}` : content;
181
307
  }
182
- catch {
308
+ catch (error) {
309
+ if (error instanceof FetchError) {
310
+ throw error;
311
+ }
183
312
  return frontmatter;
184
313
  }
185
314
  }
@@ -0,0 +1 @@
1
+ export declare function throwIfAborted(signal: AbortSignal | undefined, url: string, stage: string): void;
@@ -0,0 +1,18 @@
1
+ import { FetchError } from '../errors/app-error.js';
2
+ function isTimeoutReason(reason) {
3
+ return reason instanceof Error && reason.name === 'TimeoutError';
4
+ }
5
+ export function throwIfAborted(signal, url, stage) {
6
+ if (!signal?.aborted)
7
+ return;
8
+ if (isTimeoutReason(signal.reason)) {
9
+ throw new FetchError('Request timeout', url, 504, {
10
+ reason: 'timeout',
11
+ stage,
12
+ });
13
+ }
14
+ throw new FetchError('Request was canceled', url, 499, {
15
+ reason: 'aborted',
16
+ stage,
17
+ });
18
+ }
@@ -1,11 +1,2 @@
1
- export declare function containsJsxTag(code: string): boolean;
2
- export declare function containsWord(source: string, word: string): boolean;
3
- export declare function splitLines(content: string): string[];
4
- export declare function extractLanguageFromClassName(className: string): string | undefined;
5
- export declare function resolveLanguageFromDataAttribute(dataLang: string): string | undefined;
6
- export interface CodeDetector {
7
- language: string;
8
- detect: (code: string) => boolean;
9
- }
10
1
  export declare function detectLanguageFromCode(code: string): string | undefined;
11
2
  export declare function resolveLanguageFromAttributes(className: string, dataLang: string): string | undefined;
@@ -1,4 +1,4 @@
1
- export function containsJsxTag(code) {
1
+ function containsJsxTag(code) {
2
2
  for (let index = 0; index < code.length - 1; index += 1) {
3
3
  if (code[index] !== '<')
4
4
  continue;
@@ -10,7 +10,7 @@ export function containsJsxTag(code) {
10
10
  }
11
11
  return false;
12
12
  }
13
- export function containsWord(source, word) {
13
+ function containsWord(source, word) {
14
14
  let startIndex = source.indexOf(word);
15
15
  while (startIndex !== -1) {
16
16
  const before = startIndex === 0 ? '' : source[startIndex - 1];
@@ -22,10 +22,10 @@ export function containsWord(source, word) {
22
22
  }
23
23
  return false;
24
24
  }
25
- export function splitLines(content) {
25
+ function splitLines(content) {
26
26
  return content.split('\n');
27
27
  }
28
- export function extractLanguageFromClassName(className) {
28
+ function extractLanguageFromClassName(className) {
29
29
  const tokens = className.match(/\S+/g);
30
30
  if (!tokens)
31
31
  return undefined;
@@ -41,7 +41,7 @@ export function extractLanguageFromClassName(className) {
41
41
  }
42
42
  return undefined;
43
43
  }
44
- export function resolveLanguageFromDataAttribute(dataLang) {
44
+ function resolveLanguageFromDataAttribute(dataLang) {
45
45
  const trimmed = dataLang.trim();
46
46
  if (!trimmed)
47
47
  return undefined;
@@ -0,0 +1 @@
1
+ export declare function normalizeHost(value: string): string | null;
@@ -0,0 +1,37 @@
1
+ import { isIP } from 'node:net';
2
+ function takeFirstHostValue(value) {
3
+ const first = value.split(',')[0];
4
+ if (!first)
5
+ return null;
6
+ const trimmed = first.trim();
7
+ return trimmed ? trimmed : null;
8
+ }
9
+ function stripIpv6Brackets(value) {
10
+ if (!value.startsWith('['))
11
+ return null;
12
+ const end = value.indexOf(']');
13
+ if (end === -1)
14
+ return null;
15
+ return value.slice(1, end);
16
+ }
17
+ function stripPortIfPresent(value) {
18
+ const colonIndex = value.indexOf(':');
19
+ if (colonIndex === -1)
20
+ return value;
21
+ return value.slice(0, colonIndex);
22
+ }
23
+ export function normalizeHost(value) {
24
+ const trimmed = value.trim().toLowerCase();
25
+ if (!trimmed)
26
+ return null;
27
+ const first = takeFirstHostValue(trimmed);
28
+ if (!first)
29
+ return null;
30
+ const ipv6 = stripIpv6Brackets(first);
31
+ if (ipv6)
32
+ return ipv6;
33
+ if (isIP(first) === 6) {
34
+ return first;
35
+ }
36
+ return stripPortIfPresent(first);
37
+ }
@@ -0,0 +1 @@
1
+ export declare function redactUrl(rawUrl: string): string;
@@ -0,0 +1,13 @@
1
+ export function redactUrl(rawUrl) {
2
+ try {
3
+ const url = new URL(rawUrl);
4
+ url.username = '';
5
+ url.password = '';
6
+ url.hash = '';
7
+ url.search = '';
8
+ return url.toString();
9
+ }
10
+ catch {
11
+ return rawUrl;
12
+ }
13
+ }
@@ -78,6 +78,8 @@ export function normalizeUrl(urlString) {
78
78
  assertNoCredentials(url);
79
79
  const hostname = normalizeHostname(url);
80
80
  assertHostnameAllowed(hostname);
81
+ // Canonicalize hostname to avoid trailing-dot variants and keep url.href consistent.
82
+ url.hostname = hostname;
81
83
  return { normalizedUrl: url.href, hostname };
82
84
  }
83
85
  export function validateAndNormalizeUrl(urlString) {
@@ -103,12 +105,10 @@ function assertUrlLength(url) {
103
105
  throw createValidationError(`URL exceeds maximum length of ${config.constants.maxUrlLength} characters`);
104
106
  }
105
107
  function parseUrl(urlString) {
106
- try {
107
- return new URL(urlString);
108
- }
109
- catch {
108
+ if (!URL.canParse(urlString)) {
110
109
  throw createValidationError('Invalid URL format');
111
110
  }
111
+ return new URL(urlString);
112
112
  }
113
113
  function assertHttpProtocol(url) {
114
114
  if (url.protocol === 'http:' || url.protocol === 'https:')
@@ -121,7 +121,10 @@ function assertNoCredentials(url) {
121
121
  throw createValidationError('URLs with embedded credentials are not allowed');
122
122
  }
123
123
  function normalizeHostname(url) {
124
- const hostname = url.hostname.toLowerCase();
124
+ let hostname = url.hostname.toLowerCase();
125
+ while (hostname.endsWith('.')) {
126
+ hostname = hostname.slice(0, -1);
127
+ }
125
128
  if (!hostname) {
126
129
  throw createValidationError('URL must have a valid hostname');
127
130
  }
@@ -0,0 +1 @@
1
+ export declare function isRecord(value: unknown): value is Record<string, unknown>;
package/dist/utils.js ADDED
@@ -0,0 +1,3 @@
1
+ export function isRecord(value) {
2
+ return typeof value === 'object' && value !== null;
3
+ }
@@ -1,50 +1,92 @@
1
1
  import { parentPort } from 'node:worker_threads';
2
- import { transformHtmlToJsonl, transformHtmlToMarkdown, transformHtmlToMarkdownWithBlocks, } from '../tools/utils/content-transform.js';
3
- function isTransformJob(value) {
4
- if (!value || typeof value !== 'object')
5
- return false;
6
- const record = value;
7
- return (typeof record.id === 'number' &&
8
- typeof record.mode === 'string' &&
9
- typeof record.html === 'string' &&
10
- typeof record.url === 'string');
2
+ import { FetchError, getErrorMessage } from '../errors.js';
3
+ import { transformHtmlToMarkdownInProcess } from '../transform.js';
4
+ import { isRecord } from '../utils.js';
5
+ const controllers = new Map();
6
+ function post(message) {
7
+ parentPort?.postMessage(message);
11
8
  }
12
- function resolveTransform(job) {
13
- if (job.mode === 'markdown') {
14
- return transformHtmlToMarkdown(job.html, job.url, job.options);
9
+ function handleTransform(message) {
10
+ const controller = new AbortController();
11
+ controllers.set(message.id, controller);
12
+ try {
13
+ const result = transformHtmlToMarkdownInProcess(message.html, message.url, {
14
+ includeMetadata: message.includeMetadata,
15
+ signal: controller.signal,
16
+ });
17
+ post({
18
+ type: 'result',
19
+ id: message.id,
20
+ result: {
21
+ markdown: result.markdown,
22
+ ...(result.title === undefined ? {} : { title: result.title }),
23
+ truncated: result.truncated,
24
+ },
25
+ });
15
26
  }
16
- if (job.mode === 'markdown-blocks') {
17
- return transformHtmlToMarkdownWithBlocks(job.html, job.url, {
18
- ...job.options,
19
- includeContentBlocks: job.options.includeContentBlocks ?? true,
27
+ catch (error) {
28
+ if (error instanceof FetchError) {
29
+ post({
30
+ type: 'error',
31
+ id: message.id,
32
+ error: {
33
+ name: error.name,
34
+ message: error.message,
35
+ url: error.url,
36
+ statusCode: error.statusCode,
37
+ details: { ...error.details },
38
+ },
39
+ });
40
+ return;
41
+ }
42
+ post({
43
+ type: 'error',
44
+ id: message.id,
45
+ error: {
46
+ name: error instanceof Error ? error.name : 'Error',
47
+ message: getErrorMessage(error),
48
+ url: message.url,
49
+ },
20
50
  });
21
51
  }
22
- return transformHtmlToJsonl(job.html, job.url, job.options);
52
+ finally {
53
+ controllers.delete(message.id);
54
+ }
23
55
  }
24
- function sendResponse(response) {
25
- if (!parentPort)
56
+ function handleCancel(message) {
57
+ const controller = controllers.get(message.id);
58
+ if (!controller)
26
59
  return;
27
- parentPort.postMessage(response);
60
+ controller.abort(new Error('Canceled'));
28
61
  }
29
- function handleMessage(message) {
30
- if (!isTransformJob(message)) {
31
- sendResponse({
32
- id: -1,
33
- ok: false,
34
- error: 'Invalid transform job payload',
35
- });
62
+ if (!parentPort) {
63
+ throw new Error('transform-worker started without parentPort');
64
+ }
65
+ parentPort.on('message', (raw) => {
66
+ if (!isRecord(raw))
67
+ return;
68
+ const { type } = raw;
69
+ if (type === 'cancel') {
70
+ if (typeof raw.id !== 'string')
71
+ return;
72
+ handleCancel({ type: 'cancel', id: raw.id });
36
73
  return;
37
74
  }
38
- try {
39
- const result = resolveTransform(message);
40
- sendResponse({ id: message.id, ok: true, result });
41
- }
42
- catch (error) {
43
- sendResponse({
44
- id: message.id,
45
- ok: false,
46
- error: error instanceof Error ? error.message : String(error),
75
+ if (type === 'transform') {
76
+ if (typeof raw.id !== 'string')
77
+ return;
78
+ if (typeof raw.html !== 'string')
79
+ return;
80
+ if (typeof raw.url !== 'string')
81
+ return;
82
+ if (typeof raw.includeMetadata !== 'boolean')
83
+ return;
84
+ handleTransform({
85
+ type: 'transform',
86
+ id: raw.id,
87
+ html: raw.html,
88
+ url: raw.url,
89
+ includeMetadata: raw.includeMetadata,
47
90
  });
48
91
  }
49
- }
50
- parentPort?.on('message', handleMessage);
92
+ });
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@j0hanz/superfetch",
3
- "version": "2.0.1",
3
+ "version": "2.1.1",
4
4
  "mcpName": "io.github.j0hanz/superfetch",
5
5
  "description": "Intelligent web content fetcher MCP server that converts HTML to clean, AI-readable Markdown",
6
6
  "type": "module",
@@ -34,12 +34,14 @@
34
34
  "superfetch"
35
35
  ],
36
36
  "scripts": {
37
- "build": "tsc -p tsconfig.build.json && node -e \"require('fs').chmodSync('dist/index.js', '755')\"",
37
+ "build": "tsc -p tsconfig.build.json && node -e \"require('fs').copyFileSync('src/instructions.md','dist/instructions.md')\" && node -e \"require('fs').chmodSync('dist/index.js', '755')\"",
38
38
  "prepare": "npm run build",
39
39
  "dev": "tsx watch src/index.ts",
40
40
  "start": "node dist/index.js",
41
41
  "format": "prettier --write .",
42
42
  "type-check": "tsc --noEmit",
43
+ "type-check:diagnostics": "tsc --noEmit --extendedDiagnostics",
44
+ "type-check:trace": "node -e \"require('fs').rmSync('.ts-trace',{recursive:true,force:true})\" && tsc --noEmit --generateTrace .ts-trace",
43
45
  "lint": "eslint .",
44
46
  "lint:fix": "eslint . --fix",
45
47
  "test": "npm run build --silent && node --test --experimental-transform-types",
@@ -54,29 +56,28 @@
54
56
  "@mozilla/readability": "^0.6.0",
55
57
  "express": "^5.2.1",
56
58
  "linkedom": "^0.18.12",
57
- "turndown": "^7.2.2",
58
- "undici": "^6.23.0",
59
+ "node-html-markdown": "^2.0.0",
60
+ "undici": "^7.18.2",
59
61
  "zod": "^4.3.5"
60
62
  },
61
63
  "devDependencies": {
62
64
  "@eslint/js": "^9.39.2",
63
65
  "@trivago/prettier-plugin-sort-imports": "^6.0.2",
64
66
  "@types/express": "^5.0.6",
65
- "@types/node": "^22.19.3",
66
- "@types/turndown": "^5.0.6",
67
+ "@types/node": "^22.19.5",
67
68
  "eslint": "^9.23.2",
68
69
  "eslint-config-prettier": "^10.1.8",
69
70
  "eslint-plugin-de-morgan": "^2.0.0",
70
71
  "eslint-plugin-depend": "^1.4.0",
71
72
  "eslint-plugin-sonarjs": "^3.0.5",
72
73
  "eslint-plugin-unused-imports": "^4.3.0",
73
- "knip": "^5.80.1",
74
+ "knip": "^5.80.2",
74
75
  "prettier": "^3.7.4",
75
76
  "tsx": "^4.21.0",
76
77
  "typescript": "^5.9.3",
77
- "typescript-eslint": "^8.52.0"
78
+ "typescript-eslint": "^8.53.0"
78
79
  },
79
80
  "engines": {
80
- "node": ">=20.12.0"
81
+ "node": ">=20.18.1"
81
82
  }
82
83
  }