@j0hanz/superfetch 2.4.12 → 2.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cache.d.ts CHANGED
@@ -41,10 +41,6 @@ export declare function get(cacheKey: string | null): CacheEntry | undefined;
41
41
  export declare function set(cacheKey: string | null, content: string, metadata: CacheEntryMetadata): void;
42
42
  export declare function keys(): readonly string[];
43
43
  export declare function isEnabled(): boolean;
44
- export declare function getRecentCachedUrls(): {
45
- url: string;
46
- title?: string;
47
- }[];
48
44
  export declare function registerCachedContentResource(server: McpServer, serverIcons?: McpIcon[]): void;
49
45
  export declare function generateSafeFilename(url: string, title?: string, hashFallback?: string, extension?: string): string;
50
46
  export declare function handleDownload(res: ServerResponse, namespace: string, hash: string): void;
package/dist/cache.js CHANGED
@@ -273,24 +273,6 @@ export function keys() {
273
273
  export function isEnabled() {
274
274
  return store.isEnabled();
275
275
  }
276
- export function getRecentCachedUrls() {
277
- const cacheKeys = store.keys();
278
- const maxResults = 20;
279
- const results = [];
280
- for (let i = 0; i < Math.min(cacheKeys.length, maxResults); i++) {
281
- const key = cacheKeys[i];
282
- if (!key)
283
- continue;
284
- const entry = store.get(key);
285
- if (entry) {
286
- results.push({
287
- url: entry.url,
288
- ...(entry.title ? { title: entry.title } : {}),
289
- });
290
- }
291
- }
292
- return results;
293
- }
294
276
  /* -------------------------------------------------------------------------------------------------
295
277
  * MCP cached content resource (superfetch://cache/markdown/{urlHash})
296
278
  * ------------------------------------------------------------------------------------------------- */
package/dist/config.d.ts CHANGED
@@ -46,6 +46,7 @@ export declare const config: {
46
46
  timeoutMs: number;
47
47
  stageWarnRatio: number;
48
48
  metadataFormat: TransformMetadataFormat;
49
+ maxWorkerScale: number;
49
50
  };
50
51
  tools: {
51
52
  enabled: string[];
package/dist/config.js CHANGED
@@ -103,7 +103,7 @@ const SIZE_LIMITS = {
103
103
  TEN_MB: 10 * 1024 * 1024,
104
104
  };
105
105
  const TIMEOUT = {
106
- DEFAULT_FETCH_TIMEOUT_MS: 15000,
106
+ DEFAULT_FETCH_TIMEOUT_MS: parseInteger(process.env.FETCH_TIMEOUT_MS, 15000, 1000, 60000),
107
107
  DEFAULT_SESSION_TTL_MS: 30 * 60 * 1000,
108
108
  DEFAULT_TRANSFORM_TIMEOUT_MS: parseInteger(process.env.TRANSFORM_TIMEOUT_MS, 30000, 5000, 120000),
109
109
  };
@@ -202,6 +202,7 @@ export const config = {
202
202
  timeoutMs: TIMEOUT.DEFAULT_TRANSFORM_TIMEOUT_MS,
203
203
  stageWarnRatio: parseFloat(process.env.TRANSFORM_STAGE_WARN_RATIO ?? '0.5'),
204
204
  metadataFormat: parseTransformMetadataFormat(process.env.TRANSFORM_METADATA_FORMAT),
205
+ maxWorkerScale: parseInteger(process.env.TRANSFORM_WORKER_MAX_SCALE, 4, 1, 16),
205
206
  },
206
207
  tools: {
207
208
  enabled: parseList(process.env.ENABLED_TOOLS ?? 'fetch-url'),
@@ -268,8 +269,8 @@ export const config = {
268
269
  auth: buildAuthConfig(baseUrl),
269
270
  rateLimit: {
270
271
  enabled: true,
271
- maxRequests: 100,
272
- windowMs: 60000,
272
+ maxRequests: parseInteger(process.env.RATE_LIMIT_MAX, 100, 1, 10000),
273
+ windowMs: parseInteger(process.env.RATE_LIMIT_WINDOW_MS, 60000, 1000, 3600000),
273
274
  cleanupIntervalMs: 60000,
274
275
  },
275
276
  runtime: runtimeState,
@@ -319,6 +319,8 @@ const detector = new LanguageDetector();
319
319
  * Detect programming language from code content using heuristics.
320
320
  */
321
321
  export function detectLanguageFromCode(code) {
322
+ if (!code || code.trim().length === 0)
323
+ return undefined;
322
324
  return detector.detect(code);
323
325
  }
324
326
  /**
@@ -5,8 +5,3 @@ export declare function addSourceToMarkdown(content: string, url: string): strin
5
5
  export declare function isRawTextContent(content: string): boolean;
6
6
  export declare function isLikelyHtmlContent(content: string): boolean;
7
7
  export declare function buildMetadataFooter(metadata?: MetadataBlock, fallbackUrl?: string): string;
8
- /**
9
- * Promote standalone lines that look like headings to proper markdown headings.
10
- * Fence-aware: never modifies content inside fenced code blocks.
11
- */
12
- export declare function promoteOrphanHeadings(markdown: string): string;
@@ -398,7 +398,7 @@ function hasMarkdownSourceLine(content) {
398
398
  }
399
399
  return false;
400
400
  }
401
- function addSourceToMarkdownMarkdownFormat(content, url) {
401
+ function addSourceToMarkdownAsMarkdown(content, url) {
402
402
  if (hasMarkdownSourceLine(content))
403
403
  return content;
404
404
  const lineEnding = detectLineEnding(content);
@@ -423,7 +423,7 @@ function addSourceToMarkdownMarkdownFormat(content, url) {
423
423
  export function addSourceToMarkdown(content, url) {
424
424
  const fm = frontmatter.find(content);
425
425
  if (config.transform.metadataFormat === 'markdown' && !fm) {
426
- return addSourceToMarkdownMarkdownFormat(content, url);
426
+ return addSourceToMarkdownAsMarkdown(content, url);
427
427
  }
428
428
  if (!fm) {
429
429
  // Preserve existing behavior: always uses LF even if content uses CRLF.
@@ -503,28 +503,3 @@ export function buildMetadataFooter(metadata, fallbackUrl) {
503
503
  }
504
504
  return lines.join('\n');
505
505
  }
506
- /* -------------------------------------------------------------------------------------------------
507
- * Heading promotion (fence-aware)
508
- * ------------------------------------------------------------------------------------------------- */
509
- /**
510
- * Promote standalone lines that look like headings to proper markdown headings.
511
- * Fence-aware: never modifies content inside fenced code blocks.
512
- */
513
- export function promoteOrphanHeadings(markdown) {
514
- if (!markdown)
515
- return '';
516
- const lines = markdown.split('\n');
517
- const result = [];
518
- const state = initialFenceState();
519
- for (let i = 0; i < lines.length; i += 1) {
520
- const line = lines[i] ?? '';
521
- const prevLine = i > 0 ? (lines[i - 1] ?? '') : '';
522
- if (state.inFence || isFenceStart(line)) {
523
- result.push(line);
524
- advanceFenceState(line, state);
525
- continue;
526
- }
527
- result.push(orphanHeadingPromoter.processLine(line, prevLine));
528
- }
529
- return result.join('\n');
530
- }
@@ -8,12 +8,10 @@ interface RequestContext {
8
8
  export declare function setMcpServer(server: McpServer): void;
9
9
  export declare function runWithRequestContext<T>(context: RequestContext, fn: () => T): T;
10
10
  export declare function getRequestId(): string | undefined;
11
- export declare function getSessionId(): string | undefined;
12
11
  export declare function getOperationId(): string | undefined;
13
12
  export declare function logInfo(message: string, meta?: LogMetadata): void;
14
13
  export declare function logDebug(message: string, meta?: LogMetadata): void;
15
14
  export declare function logWarn(message: string, meta?: LogMetadata): void;
16
15
  export declare function logError(message: string, error?: Error | LogMetadata): void;
17
16
  export declare function redactUrl(rawUrl: string): string;
18
- export declare function redactHeaders(headers: Record<string, unknown>): Record<string, unknown>;
19
17
  export {};
@@ -11,7 +11,7 @@ export function runWithRequestContext(context, fn) {
11
11
  export function getRequestId() {
12
12
  return requestContext.getStore()?.requestId;
13
13
  }
14
- export function getSessionId() {
14
+ function getSessionId() {
15
15
  return requestContext.getStore()?.sessionId;
16
16
  }
17
17
  export function getOperationId() {
@@ -102,13 +102,3 @@ export function redactUrl(rawUrl) {
102
102
  return rawUrl;
103
103
  }
104
104
  }
105
- export function redactHeaders(headers) {
106
- const redacted = { ...headers };
107
- const sensitiveKeys = ['authorization', 'cookie', 'set-cookie', 'x-api-key'];
108
- for (const key of Object.keys(redacted)) {
109
- if (sensitiveKeys.includes(key.toLowerCase())) {
110
- redacted[key] = '[REDACTED]';
111
- }
112
- }
113
- return redacted;
114
- }
package/dist/tasks.d.ts CHANGED
@@ -31,7 +31,7 @@ export interface CreateTaskResult {
31
31
  };
32
32
  _meta?: Record<string, unknown>;
33
33
  }
34
- export declare class TaskManager {
34
+ declare class TaskManager {
35
35
  private tasks;
36
36
  private waiters;
37
37
  createTask(options?: CreateTaskOptions, statusMessage?: string, ownerKey?: string): TaskState;
@@ -46,7 +46,6 @@ export declare class TaskManager {
46
46
  tasks: TaskState[];
47
47
  nextCursor?: string;
48
48
  };
49
- cleanupExpiredTasks(): number;
50
49
  waitForTerminalTask(taskId: string, ownerKey: string, signal?: AbortSignal): Promise<TaskState | undefined>;
51
50
  private notifyWaiters;
52
51
  private isExpired;
@@ -54,3 +53,4 @@ export declare class TaskManager {
54
53
  private decodeCursor;
55
54
  }
56
55
  export declare const taskManager: TaskManager;
56
+ export {};
package/dist/tasks.js CHANGED
@@ -12,7 +12,7 @@ const TERMINAL_STATUSES = new Set([
12
12
  function isTerminalStatus(status) {
13
13
  return TERMINAL_STATUSES.has(status);
14
14
  }
15
- export class TaskManager {
15
+ class TaskManager {
16
16
  tasks = new Map();
17
17
  waiters = new Map();
18
18
  createTask(options, statusMessage = 'Task started', ownerKey = DEFAULT_OWNER_KEY) {
@@ -93,20 +93,6 @@ export class TaskManager {
93
93
  const nextCursor = nextIndex < allTasks.length ? this.encodeCursor(nextIndex) : undefined;
94
94
  return nextCursor ? { tasks: page, nextCursor } : { tasks: page };
95
95
  }
96
- // Helper to check if task is expired and could be cleaned up
97
- // In a real implementation, this would be called by a periodic job
98
- cleanupExpiredTasks() {
99
- const now = Date.now();
100
- let count = 0;
101
- for (const [id, task] of this.tasks.entries()) {
102
- const created = new Date(task.createdAt).getTime();
103
- if (now - created > task.ttl) {
104
- this.tasks.delete(id);
105
- count++;
106
- }
107
- }
108
- return count;
109
- }
110
96
  async waitForTerminalTask(taskId, ownerKey, signal) {
111
97
  const task = this.getTask(taskId, ownerKey);
112
98
  if (!task)
package/dist/tools.d.ts CHANGED
@@ -72,7 +72,6 @@ export interface ToolHandlerExtra {
72
72
  sendNotification?: (notification: ProgressNotification) => Promise<void>;
73
73
  }
74
74
  export declare const FETCH_URL_TOOL_NAME = "fetch-url";
75
- export declare const FETCH_URL_TOOL_DESCRIPTION: string;
76
75
  interface ProgressReporter {
77
76
  report: (progress: number, message: string) => Promise<void>;
78
77
  }
package/dist/tools.js CHANGED
@@ -46,7 +46,7 @@ const fetchUrlOutputSchema = z.strictObject({
46
46
  .describe('Error message if the request failed'),
47
47
  });
48
48
  export const FETCH_URL_TOOL_NAME = 'fetch-url';
49
- export const FETCH_URL_TOOL_DESCRIPTION = `
49
+ const FETCH_URL_TOOL_DESCRIPTION = `
50
50
  Fetches a webpage and converts it to clean Markdown format optimized for LLM context.
51
51
 
52
52
  This tool is useful for:
package/dist/transform.js CHANGED
@@ -684,12 +684,12 @@ function tryTransformRawContent(params) {
684
684
  /* -------------------------------------------------------------------------------------------------
685
685
  * Quality gates + content source resolution
686
686
  * ------------------------------------------------------------------------------------------------- */
687
- const MIN_CONTENT_RATIO = 0.3;
687
+ const MIN_CONTENT_RATIO = 0.15;
688
688
  const MIN_HTML_LENGTH_FOR_GATE = 100;
689
- const MIN_HEADING_RETENTION_RATIO = 0.7;
690
- const MIN_CODE_BLOCK_RETENTION_RATIO = 0.5;
689
+ const MIN_HEADING_RETENTION_RATIO = 0.3;
690
+ const MIN_CODE_BLOCK_RETENTION_RATIO = 0.15;
691
691
  const MIN_LINE_LENGTH_FOR_TRUNCATION_CHECK = 20;
692
- const MAX_TRUNCATED_LINE_RATIO = 0.5;
692
+ const MAX_TRUNCATED_LINE_RATIO = 0.95;
693
693
  function needsDocumentWrapper(html) {
694
694
  const trimmed = html.trim().toLowerCase();
695
695
  return (!trimmed.startsWith('<!doctype') &&
@@ -780,8 +780,8 @@ export function createContentMetadataBlock(url, article, extractedMeta, shouldEx
780
780
  return metadata;
781
781
  }
782
782
  const CONTENT_ROOT_SELECTORS = [
783
- 'main',
784
783
  'article',
784
+ 'main',
785
785
  '[role="main"]',
786
786
  '#content',
787
787
  '#main-content',
@@ -808,10 +808,9 @@ function findContentRoot(document) {
808
808
  }
809
809
  return undefined;
810
810
  }
811
- function shouldUseArticleContent(article, originalHtmlOrDocument, url) {
811
+ function shouldUseArticleContent(article, originalHtmlOrDocument) {
812
812
  const articleLength = article.textContent.length;
813
813
  const originalLength = getVisibleTextLength(originalHtmlOrDocument);
814
- const safeUrl = url.substring(0, 80);
815
814
  let articleDocument = null;
816
815
  const getArticleDocument = () => {
817
816
  if (articleDocument)
@@ -821,69 +820,45 @@ function shouldUseArticleContent(article, originalHtmlOrDocument, url) {
821
820
  };
822
821
  if (originalLength >= MIN_HTML_LENGTH_FOR_GATE) {
823
822
  const ratio = articleLength / originalLength;
824
- if (ratio < MIN_CONTENT_RATIO) {
825
- logDebug('Quality gate: Readability extraction below threshold, using full HTML', {
826
- url: safeUrl,
827
- articleLength,
828
- });
823
+ if (ratio < MIN_CONTENT_RATIO)
829
824
  return false;
830
- }
831
825
  }
832
826
  const originalHeadings = countHeadingsDom(originalHtmlOrDocument);
833
827
  if (originalHeadings > 0) {
834
828
  const articleHeadings = countHeadingsDom(getArticleDocument());
835
829
  const retentionRatio = articleHeadings / originalHeadings;
836
- if (retentionRatio < MIN_HEADING_RETENTION_RATIO) {
837
- logDebug('Quality gate: Readability broke heading structure, using full HTML', {
838
- url: safeUrl,
839
- originalHeadings,
840
- articleHeadings,
841
- });
830
+ if (retentionRatio < MIN_HEADING_RETENTION_RATIO)
842
831
  return false;
843
- }
844
832
  }
845
833
  const originalCodeBlocks = countCodeBlocksDom(originalHtmlOrDocument);
846
834
  if (originalCodeBlocks > 0) {
847
835
  const articleCodeBlocks = countCodeBlocksDom(getArticleDocument());
848
836
  const codeRetentionRatio = articleCodeBlocks / originalCodeBlocks;
849
- logDebug('Code block retention check', {
850
- url: safeUrl,
851
- originalCodeBlocks,
852
- articleCodeBlocks,
853
- codeRetentionRatio,
854
- });
855
- if (codeRetentionRatio < MIN_CODE_BLOCK_RETENTION_RATIO) {
856
- logDebug('Quality gate: Readability removed code blocks, using full HTML', {
857
- url: safeUrl,
858
- originalCodeBlocks,
859
- articleCodeBlocks,
860
- });
837
+ if (codeRetentionRatio < MIN_CODE_BLOCK_RETENTION_RATIO)
861
838
  return false;
862
- }
863
839
  }
864
- if (hasTruncatedSentences(article.textContent)) {
865
- logDebug('Quality gate: Extracted text has many truncated sentences, using full HTML', {
866
- url: safeUrl,
867
- });
868
- return false;
869
- }
870
- return true;
840
+ return !hasTruncatedSentences(article.textContent);
871
841
  }
872
842
  function buildContentSource(params) {
873
843
  const { html, url, article, extractedMeta, includeMetadata, useArticleContent, document, } = params;
874
844
  const metadata = createContentMetadataBlock(url, article, extractedMeta, useArticleContent, includeMetadata);
875
845
  if (useArticleContent && article) {
876
- return { sourceHtml: article.content, title: article.title, metadata };
846
+ // Apply noise removal to Readability-extracted content to remove
847
+ // author bylines, social share buttons, and other boilerplate
848
+ // that Readability may have included in the article content
849
+ const cleanedArticleHtml = removeNoiseFromHtml(article.content, undefined, url);
850
+ return {
851
+ sourceHtml: cleanedArticleHtml,
852
+ title: article.title,
853
+ metadata,
854
+ skipNoiseRemoval: true, // Already cleaned
855
+ };
877
856
  }
878
857
  if (document) {
879
- removeNoiseFromHtml(html, document, url);
880
- const cleanedDoc = document;
858
+ const cleanedHtml = removeNoiseFromHtml(html, undefined, url);
859
+ const { document: cleanedDoc } = parseHTML(cleanedHtml);
881
860
  const contentRoot = findContentRoot(cleanedDoc);
882
861
  if (contentRoot) {
883
- logDebug('Using content root fallback instead of full HTML', {
884
- url: url.substring(0, 80),
885
- contentLength: contentRoot.length,
886
- });
887
862
  return {
888
863
  sourceHtml: contentRoot,
889
864
  title: extractedMeta.title,
@@ -905,7 +880,7 @@ function resolveContentSource(params) {
905
880
  ...(params.signal ? { signal: params.signal } : {}),
906
881
  });
907
882
  const useArticleContent = article
908
- ? shouldUseArticleContent(article, document, params.url)
883
+ ? shouldUseArticleContent(article, document)
909
884
  : false;
910
885
  return buildContentSource({
911
886
  html: params.html,
@@ -984,7 +959,7 @@ const workerMessageSchema = z.discriminatedUnion('type', [
984
959
  }),
985
960
  ]);
986
961
  const POOL_MIN_WORKERS = 2;
987
- const POOL_MAX_WORKERS = 4;
962
+ const POOL_MAX_WORKERS = config.transform.maxWorkerScale;
988
963
  const POOL_SCALE_THRESHOLD = 0.5;
989
964
  const DEFAULT_TIMEOUT_MS = config.transform.timeoutMs;
990
965
  class WorkerPool {
@@ -1029,6 +1004,13 @@ class WorkerPool {
1029
1004
  getCapacity() {
1030
1005
  return this.capacity;
1031
1006
  }
1007
+ resize(size) {
1008
+ const newCapacity = Math.max(this.minCapacity, Math.min(size, this.maxCapacity));
1009
+ if (newCapacity === this.capacity)
1010
+ return;
1011
+ this.capacity = newCapacity;
1012
+ this.drainQueue();
1013
+ }
1032
1014
  async close() {
1033
1015
  if (this.closed)
1034
1016
  return;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@j0hanz/superfetch",
3
- "version": "2.4.12",
3
+ "version": "2.5.0",
4
4
  "mcpName": "io.github.j0hanz/superfetch",
5
5
  "description": "Intelligent web content fetcher MCP server that converts HTML to clean, AI-readable Markdown",
6
6
  "type": "module",