npm - @j0hanz/superfetch - Versions diffs - 2.3.0 → 2.4.0 - Mend

@j0hanz/superfetch 2.3.0 → 2.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19)

package/README.md +4 -9
package/dist/assets/logo.svg +24835 -0
package/dist/cache.js +58 -4
package/dist/config.d.ts +2 -0
package/dist/config.js +2 -0
package/dist/dom-noise-removal.js +15 -13
package/dist/fetch.js +16 -25
package/dist/http-native.js +19 -3
package/dist/markdown-cleanup.d.ts +6 -12
package/dist/markdown-cleanup.js +243 -25
package/dist/mcp.js +20 -9
package/dist/observability.d.ts +2 -0
package/dist/observability.js +25 -0
package/dist/tools.d.ts +5 -3
package/dist/tools.js +27 -12
package/dist/transform-types.d.ts +38 -0
package/dist/transform.d.ts +12 -6
package/dist/transform.js +120 -265
package/package.json +1 -2

package/dist/transform.js CHANGED Viewed

@@ -1,6 +1,5 @@
 import { randomUUID } from 'node:crypto';
 import diagnosticsChannel from 'node:diagnostics_channel';
-import os from 'node:os';
 import { performance } from 'node:perf_hooks';
 import { Worker } from 'node:worker_threads';
 import { parseHTML } from 'linkedom';
@@ -12,15 +11,9 @@ import { removeNoiseFromHtml } from './dom-noise-removal.js';
 import { FetchError, getErrorMessage } from './errors.js';
 import { isRawTextContentUrl } from './fetch.js';
 import { detectLanguageFromCode, resolveLanguageFromAttributes, } from './language-detection.js';
-import { cleanupMarkdownArtifacts, promoteOrphanHeadings, } from './markdown-cleanup.js';
+import { addSourceToMarkdown, buildMetadataFooter, cleanupMarkdownArtifacts, extractTitleFromRawMarkdown, isLikelyHtmlContent, isRawTextContent, } from './markdown-cleanup.js';
 import { getOperationId, getRequestId, logDebug, logError, logInfo, logWarn, redactUrl, } from './observability.js';
 import { isObject } from './type-guards.js';
-// Re-export language detection for backward compatibility
-export { detectLanguageFromCode, resolveLanguageFromAttributes, } from './language-detection.js';
-// Re-export markdown cleanup for backward compatibility
-export { cleanupMarkdownArtifacts, promoteOrphanHeadings, } from './markdown-cleanup.js';
-// Re-export DOM noise removal for backward compatibility
-export { removeNoiseFromHtml } from './dom-noise-removal.js';
 function getAbortReason(signal) {
     if (!isObject(signal))
         return undefined;
@@ -48,25 +41,48 @@ function publishTransformEvent(event) {
         /* empty */
     }
 }
-export function startTransformStage(url, stage) {
-    if (!transformChannel.hasSubscribers)
+export function startTransformStage(url, stage, budget) {
+    if (!transformChannel.hasSubscribers && !budget)
         return null;
-    return {
+    const remainingBudgetMs = budget
+        ? budget.totalBudgetMs - budget.elapsedMs
+        : undefined;
+    const base = {
         stage,
         startTime: performance.now(),
         url: redactUrl(url),
     };
+    if (remainingBudgetMs !== undefined && budget) {
+        return {
+            ...base,
+            budgetMs: remainingBudgetMs,
+            totalBudgetMs: budget.totalBudgetMs,
+        };
+    }
+    return base;
 }
 export function endTransformStage(context, options) {
     if (!context)
-        return;
+        return 0;
+    const durationMs = performance.now() - context.startTime;
     const requestId = getRequestId();
     const operationId = getOperationId();
+    if (context.totalBudgetMs !== undefined) {
+        const warnThresholdMs = context.totalBudgetMs * config.transform.stageWarnRatio;
+        if (durationMs > warnThresholdMs) {
+            logWarn('Transform stage exceeded warning threshold', {
+                stage: context.stage,
+                durationMs: Math.round(durationMs),
+                thresholdMs: Math.round(warnThresholdMs),
+                url: context.url,
+            });
+        }
+    }
     const event = {
         v: 1,
         type: 'stage',
         stage: context.stage,
-        durationMs: performance.now() - context.startTime,
+        durationMs,
         url: context.url,
         ...(requestId ? { requestId } : {}),
         ...(operationId ? { operationId } : {}),
@@ -75,14 +91,22 @@ export function endTransformStage(context, options) {
             : {}),
     };
     publishTransformEvent(event);
+    return durationMs;
 }
-function runTransformStage(url, stage, fn) {
-    const context = startTransformStage(url, stage);
+function runTransformStage(url, stage, fn, budget) {
+    if (budget && budget.elapsedMs >= budget.totalBudgetMs) {
+        throw new FetchError('Transform budget exhausted', url, 504, {
+            reason: 'timeout',
+            stage: `${stage}:budget_exhausted`,
+            elapsedMs: budget.elapsedMs,
+            totalBudgetMs: budget.totalBudgetMs,
+        });
+    }
+    const context = startTransformStage(url, stage, budget);
     try {
         return fn();
     }
     finally {
-        // Emit duration even if the stage throws; callers decide how to handle the error.
         endTransformStage(context);
     }
 }
@@ -340,21 +364,22 @@ function applyBaseUri(document, url) {
         });
     }
 }
-// DOM noise removal functions moved to ./dom-noise-removal.ts
 function buildInlineCode(content) {
-    const runs = content.match(/`+/g);
-    let longest = '';
-    if (runs) {
-        for (const run of runs) {
-            if (run.length > longest.length) {
-                longest = run;
-            }
+    let maxBackticks = 0;
+    let currentRun = 0;
+    for (const char of content) {
+        if (char === '`') {
+            currentRun++;
+        }
+        else {
+            if (currentRun > maxBackticks)
+                maxBackticks = currentRun;
+            currentRun = 0;
         }
     }
-    // Use a fence longer than any run of backticks in the content.
-    const delimiter = `\`${longest}`;
-    // Only pad when needed to avoid altering code spans unnecessarily.
-    // CommonMark recommends padding when the code starts/ends with a backtick.
+    if (currentRun > maxBackticks)
+        maxBackticks = currentRun;
+    const delimiter = '`'.repeat(maxBackticks + 1);
     const padding = content.startsWith('`') || content.endsWith('`') ? ' ' : '';
     return `${delimiter}${padding}${content}${padding}${delimiter}`;
 }
@@ -531,8 +556,7 @@ function translateHtmlToMarkdown(html, url, signal, document, skipNoiseRemoval)
     throwIfAborted(signal, url, 'markdown:cleaned');
     const content = runTransformStage(url, 'markdown:translate', () => getMarkdownConverter().translate(cleanedHtml).trim());
     throwIfAborted(signal, url, 'markdown:translated');
-    const cleaned = cleanupMarkdownArtifacts(content);
-    return promoteOrphanHeadings(cleaned);
+    return cleanupMarkdownArtifacts(content);
 }
 function appendMetadataFooter(content, metadata, url) {
     const footer = buildMetadataFooter(metadata, url);
@@ -554,223 +578,6 @@ export function htmlToMarkdown(html, metadata, options) {
         return buildMetadataFooter(metadata, url);
     }
 }
-// Markdown cleanup functions moved to ./markdown-cleanup.ts
-function formatFetchedDate(isoString) {
-    try {
-        const date = new Date(isoString);
-        const day = String(date.getDate()).padStart(2, '0');
-        const month = String(date.getMonth() + 1).padStart(2, '0');
-        const year = date.getFullYear();
-        return `${day}-${month}-${year}`;
-    }
-    catch {
-        return isoString;
-    }
-}
-function buildMetadataFooter(metadata, fallbackUrl) {
-    if (!metadata)
-        return '';
-    const lines = ['---', ''];
-    const url = metadata.url || fallbackUrl;
-    const parts = [];
-    if (metadata.title)
-        parts.push(`_${metadata.title}_`);
-    if (metadata.author)
-        parts.push(`_${metadata.author}_`);
-    if (url)
-        parts.push(`[_Original Source_](${url})`);
-    if (metadata.fetchedAt) {
-        const formattedDate = formatFetchedDate(metadata.fetchedAt);
-        parts.push(`_${formattedDate}_`);
-    }
-    if (parts.length > 0) {
-        lines.push(` ${parts.join(' | ')}`);
-    }
-    if (metadata.description) {
-        lines.push(` <sub>${metadata.description}</sub>`);
-    }
-    return lines.join('\n');
-}
-const HEADING_PATTERN = /^#{1,6}\s/m;
-const LIST_PATTERN = /^(?:[-*+])\s/m;
-const HTML_DOCUMENT_PATTERN = /^(<!doctype|<html)/i;
-function containsMarkdownHeading(content) {
-    return HEADING_PATTERN.test(content);
-}
-function containsMarkdownList(content) {
-    return LIST_PATTERN.test(content);
-}
-function containsFencedCodeBlock(content) {
-    const first = content.indexOf('```');
-    if (first === -1)
-        return false;
-    return content.includes('```', first + 3);
-}
-function looksLikeMarkdown(content) {
-    return (containsMarkdownHeading(content) ||
-        containsMarkdownList(content) ||
-        containsFencedCodeBlock(content));
-}
-function detectLineEnding(content) {
-    return content.includes('\r\n') ? '\r\n' : '\n';
-}
-const FRONTMATTER_DELIMITER = '---';
-function findFrontmatterLines(content) {
-    const lineEnding = detectLineEnding(content);
-    const lines = content.split(lineEnding);
-    if (lines[0] !== FRONTMATTER_DELIMITER)
-        return null;
-    const endIndex = lines.indexOf(FRONTMATTER_DELIMITER, 1);
-    if (endIndex === -1)
-        return null;
-    return { lineEnding, lines, endIndex };
-}
-function stripOptionalQuotes(value) {
-    const trimmed = value.trim();
-    if (trimmed.length < 2)
-        return trimmed;
-    const first = trimmed[0];
-    const last = trimmed[trimmed.length - 1];
-    if ((first === '"' && last === '"') || (first === "'" && last === "'")) {
-        return trimmed.slice(1, -1).trim();
-    }
-    return trimmed;
-}
-function parseFrontmatterEntry(line) {
-    const trimmed = line.trim();
-    if (!trimmed)
-        return null;
-    const separatorIndex = trimmed.indexOf(':');
-    if (separatorIndex <= 0)
-        return null;
-    const key = trimmed.slice(0, separatorIndex).trim().toLowerCase();
-    const value = trimmed.slice(separatorIndex + 1);
-    return { key, value };
-}
-function isTitleKey(key) {
-    return key === 'title' || key === 'name';
-}
-function extractTitleFromHeading(content) {
-    const lineEnding = detectLineEnding(content);
-    const lines = content.split(lineEnding);
-    for (const line of lines) {
-        const trimmed = line.trim();
-        if (!trimmed)
-            continue;
-        let index = 0;
-        while (index < trimmed.length && trimmed[index] === '#') {
-            index += 1;
-        }
-        if (index === 0 || index > 6)
-            return undefined;
-        const nextChar = trimmed[index];
-        if (nextChar !== ' ' && nextChar !== '\t')
-            return undefined;
-        const heading = trimmed.slice(index).trim();
-        return heading.length > 0 ? heading : undefined;
-    }
-    return undefined;
-}
-function extractTitleFromRawMarkdown(content) {
-    const frontmatter = findFrontmatterLines(content);
-    if (!frontmatter) {
-        return extractTitleFromHeading(content);
-    }
-    const { lines, endIndex } = frontmatter;
-    const entry = lines
-        .slice(1, endIndex)
-        .map((line) => parseFrontmatterEntry(line))
-        .find((parsed) => parsed !== null && isTitleKey(parsed.key));
-    if (!entry)
-        return undefined;
-    const value = stripOptionalQuotes(entry.value);
-    return value || undefined;
-}
-function hasMarkdownSourceLine(content) {
-    const lineEnding = detectLineEnding(content);
-    const lines = content.split(lineEnding);
-    const limit = Math.min(lines.length, 50);
-    for (let index = 0; index < limit; index += 1) {
-        const line = lines[index];
-        if (!line)
-            continue;
-        if (line.trimStart().toLowerCase().startsWith('source:')) {
-            return true;
-        }
-    }
-    return false;
-}
-function addSourceToMarkdownMarkdownFormat(content, url) {
-    if (hasMarkdownSourceLine(content))
-        return content;
-    const lineEnding = detectLineEnding(content);
-    const lines = content.split(lineEnding);
-    const firstNonEmptyIndex = lines.findIndex((line) => line.trim().length > 0);
-    if (firstNonEmptyIndex !== -1) {
-        const firstLine = lines[firstNonEmptyIndex];
-        if (firstLine && /^#{1,6}\s+/.test(firstLine.trim())) {
-            const insertAt = firstNonEmptyIndex + 1;
-            const updated = [
-                ...lines.slice(0, insertAt),
-                '',
-                `Source: ${url}`,
-                '',
-                ...lines.slice(insertAt),
-            ];
-            return updated.join(lineEnding);
-        }
-    }
-    return [`Source: ${url}`, '', content].join(lineEnding);
-}
-function addSourceToMarkdown(content, url) {
-    const frontmatter = findFrontmatterLines(content);
-    if (config.transform.metadataFormat === 'markdown' && !frontmatter) {
-        return addSourceToMarkdownMarkdownFormat(content, url);
-    }
-    if (!frontmatter) {
-        return `---\nsource: "${url}"\n---\n\n${content}`;
-    }
-    const { lineEnding, lines, endIndex } = frontmatter;
-    const bodyLines = lines.slice(1, endIndex);
-    const hasSource = bodyLines.some((line) => line.trimStart().toLowerCase().startsWith('source:'));
-    if (hasSource)
-        return content;
-    const updatedLines = [
-        lines[0],
-        ...bodyLines,
-        `source: "${url}"`,
-        ...lines.slice(endIndex),
-    ];
-    return updatedLines.join(lineEnding);
-}
-function hasFrontmatter(trimmed) {
-    return trimmed.startsWith('---\n') || trimmed.startsWith('---\r\n');
-}
-function looksLikeHtmlDocument(trimmed) {
-    return HTML_DOCUMENT_PATTERN.test(trimmed);
-}
-function countCommonHtmlTags(content) {
-    const matches = content.match(/<(html|head|body|div|span|script|style|meta|link)\b/gi) ??
-        [];
-    return matches.length;
-}
-function isRawTextContent(content) {
-    const trimmed = content.trim();
-    const isHtmlDocument = looksLikeHtmlDocument(trimmed);
-    const hasMarkdownFrontmatter = hasFrontmatter(trimmed);
-    const hasTooManyHtmlTags = countCommonHtmlTags(content) > 2;
-    const isMarkdown = looksLikeMarkdown(content);
-    return (!isHtmlDocument &&
-        (hasMarkdownFrontmatter || (!hasTooManyHtmlTags && isMarkdown)));
-}
-function isLikelyHtmlContent(content) {
-    const trimmed = content.trim();
-    if (!trimmed)
-        return false;
-    if (looksLikeHtmlDocument(trimmed))
-        return true;
-    return countCommonHtmlTags(content) > 2;
-}
 function shouldPreserveRawContent(url, content) {
     if (isRawTextContentUrl(url)) {
         return !isLikelyHtmlContent(content);
@@ -1189,11 +996,11 @@ const workerMessageSchema = z.discriminatedUnion('type', [
     }),
 ]);
 let pool = null;
+const POOL_MIN_WORKERS = 2;
+const POOL_MAX_WORKERS = 4;
+const POOL_SCALE_THRESHOLD = 0.5;
 function resolveDefaultWorkerCount() {
-    const parallelism = typeof os.availableParallelism === 'function'
-        ? os.availableParallelism()
-        : os.cpus().length;
-    return Math.min(16, Math.max(1, parallelism - 1));
+    return POOL_MIN_WORKERS;
 }
 const DEFAULT_TIMEOUT_MS = config.transform.timeoutMs;
 function getOrCreateTransformWorkerPool() {
@@ -1206,8 +1013,20 @@ export async function shutdownTransformWorkerPool() {
     await pool.close();
     pool = null;
 }
+export function getTransformPoolStats() {
+    if (!pool)
+        return null;
+    return {
+        queueDepth: pool.getQueueDepth(),
+        activeWorkers: pool.getActiveWorkers(),
+        capacity: pool.getCapacity(),
+    };
+}
 class WorkerPool {
     workers = [];
+    capacity;
+    minCapacity;
+    maxCapacity;
     queue = [];
     inflight = new Map();
     timeoutMs;
@@ -1323,12 +1142,11 @@ class WorkerPool {
         });
     }
     constructor(size, timeoutMs) {
-        const safeSize = Math.max(1, size);
+        this.minCapacity = POOL_MIN_WORKERS;
+        this.maxCapacity = POOL_MAX_WORKERS;
+        this.capacity = Math.max(this.minCapacity, Math.min(size, this.maxCapacity));
         this.timeoutMs = timeoutMs;
-        this.queueMax = safeSize * 2;
-        for (let index = 0; index < safeSize; index += 1) {
-            this.workers.push(this.spawnWorker(index));
-        }
+        this.queueMax = this.maxCapacity * 32;
     }
     spawnWorker(workerIndex) {
         const worker = new Worker(new URL('./workers/transform-worker.js', import.meta.url));
@@ -1426,21 +1244,46 @@ class WorkerPool {
             this.drainQueue();
         });
     }
+    /** Scale capacity up if queue pressure exceeds threshold. */
+    maybeScaleUp() {
+        if (this.queue.length > this.capacity * POOL_SCALE_THRESHOLD &&
+            this.capacity < this.maxCapacity) {
+            this.capacity += 1;
+        }
+    }
     drainQueue() {
+        if (this.closed)
+            return;
         if (this.queue.length === 0)
             return;
+        this.maybeScaleUp();
+        // First pass: try to find an idle existing worker
         for (let workerIndex = 0; workerIndex < this.workers.length; workerIndex += 1) {
             const slot = this.workers[workerIndex];
-            if (!slot || slot.busy)
-                continue;
-            const task = this.queue.shift();
-            if (!task)
-                return;
-            this.dispatch(workerIndex, slot, task);
-            if (this.queue.length === 0)
-                return;
+            if (slot && !slot.busy) {
+                this.dispatchQueueTask(workerIndex, slot);
+                if (this.queue.length === 0)
+                    return;
+            }
+        }
+        if (this.workers.length < this.capacity && this.queue.length > 0) {
+            const workerIndex = this.workers.length;
+            const slot = this.spawnWorker(workerIndex);
+            this.workers.push(slot);
+            this.dispatchQueueTask(workerIndex, slot);
+            if (this.workers.length < this.capacity && this.queue.length > 0) {
+                setImmediate(() => {
+                    this.drainQueue();
+                });
+            }
         }
     }
+    dispatchQueueTask(workerIndex, slot) {
+        const task = this.queue.shift();
+        if (!task)
+            return;
+        this.dispatch(workerIndex, slot, task);
+    }
     dispatch(workerIndex, slot, task) {
         if (this.rejectIfAborted(task))
             return;
@@ -1510,11 +1353,23 @@ class WorkerPool {
         task.reject(message);
         this.restartWorker(workerIndex, slot);
     }
+    getQueueDepth() {
+        return this.queue.length;
+    }
+    getActiveWorkers() {
+        return this.workers.filter((s) => s?.busy).length;
+    }
+    getCapacity() {
+        return this.capacity;
+    }
     async close() {
         if (this.closed)
             return;
         this.closed = true;
-        const terminations = this.workers.map((slot) => slot.worker.terminate());
+        const terminations = this.workers
+            .map((slot) => slot?.worker.terminate())
+            .filter((p) => p !== undefined);
+        this.workers.fill(undefined);
         this.workers.length = 0;
         for (const [id, inflight] of this.inflight.entries()) {
             clearTimeout(inflight.timer);

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@j0hanz/superfetch",
-  "version": "2.3.0",
+  "version": "2.4.0",
   "mcpName": "io.github.j0hanz/superfetch",
   "description": "Intelligent web content fetcher MCP server that converts HTML to clean, AI-readable Markdown",
   "type": "module",
@@ -59,7 +59,6 @@
     "@modelcontextprotocol/sdk": "^1.25.3",
     "@mozilla/readability": "^0.6.0",
     "linkedom": "^0.18.12",
-    "lru-cache": "^11.2.5",
     "node-html-markdown": "^2.0.0",
     "undici": "^7.19.2",
     "zod": "^4.3.6"