@j0hanz/superfetch 2.3.0 → 2.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/transform.js CHANGED
@@ -1,6 +1,5 @@
1
1
  import { randomUUID } from 'node:crypto';
2
2
  import diagnosticsChannel from 'node:diagnostics_channel';
3
- import os from 'node:os';
4
3
  import { performance } from 'node:perf_hooks';
5
4
  import { Worker } from 'node:worker_threads';
6
5
  import { parseHTML } from 'linkedom';
@@ -12,15 +11,9 @@ import { removeNoiseFromHtml } from './dom-noise-removal.js';
12
11
  import { FetchError, getErrorMessage } from './errors.js';
13
12
  import { isRawTextContentUrl } from './fetch.js';
14
13
  import { detectLanguageFromCode, resolveLanguageFromAttributes, } from './language-detection.js';
15
- import { cleanupMarkdownArtifacts, promoteOrphanHeadings, } from './markdown-cleanup.js';
14
+ import { addSourceToMarkdown, buildMetadataFooter, cleanupMarkdownArtifacts, extractTitleFromRawMarkdown, isLikelyHtmlContent, isRawTextContent, } from './markdown-cleanup.js';
16
15
  import { getOperationId, getRequestId, logDebug, logError, logInfo, logWarn, redactUrl, } from './observability.js';
17
16
  import { isObject } from './type-guards.js';
18
- // Re-export language detection for backward compatibility
19
- export { detectLanguageFromCode, resolveLanguageFromAttributes, } from './language-detection.js';
20
- // Re-export markdown cleanup for backward compatibility
21
- export { cleanupMarkdownArtifacts, promoteOrphanHeadings, } from './markdown-cleanup.js';
22
- // Re-export DOM noise removal for backward compatibility
23
- export { removeNoiseFromHtml } from './dom-noise-removal.js';
24
17
  function getAbortReason(signal) {
25
18
  if (!isObject(signal))
26
19
  return undefined;
@@ -48,25 +41,48 @@ function publishTransformEvent(event) {
48
41
  /* empty */
49
42
  }
50
43
  }
51
- export function startTransformStage(url, stage) {
52
- if (!transformChannel.hasSubscribers)
44
+ export function startTransformStage(url, stage, budget) {
45
+ if (!transformChannel.hasSubscribers && !budget)
53
46
  return null;
54
- return {
47
+ const remainingBudgetMs = budget
48
+ ? budget.totalBudgetMs - budget.elapsedMs
49
+ : undefined;
50
+ const base = {
55
51
  stage,
56
52
  startTime: performance.now(),
57
53
  url: redactUrl(url),
58
54
  };
55
+ if (remainingBudgetMs !== undefined && budget) {
56
+ return {
57
+ ...base,
58
+ budgetMs: remainingBudgetMs,
59
+ totalBudgetMs: budget.totalBudgetMs,
60
+ };
61
+ }
62
+ return base;
59
63
  }
60
64
  export function endTransformStage(context, options) {
61
65
  if (!context)
62
- return;
66
+ return 0;
67
+ const durationMs = performance.now() - context.startTime;
63
68
  const requestId = getRequestId();
64
69
  const operationId = getOperationId();
70
+ if (context.totalBudgetMs !== undefined) {
71
+ const warnThresholdMs = context.totalBudgetMs * config.transform.stageWarnRatio;
72
+ if (durationMs > warnThresholdMs) {
73
+ logWarn('Transform stage exceeded warning threshold', {
74
+ stage: context.stage,
75
+ durationMs: Math.round(durationMs),
76
+ thresholdMs: Math.round(warnThresholdMs),
77
+ url: context.url,
78
+ });
79
+ }
80
+ }
65
81
  const event = {
66
82
  v: 1,
67
83
  type: 'stage',
68
84
  stage: context.stage,
69
- durationMs: performance.now() - context.startTime,
85
+ durationMs,
70
86
  url: context.url,
71
87
  ...(requestId ? { requestId } : {}),
72
88
  ...(operationId ? { operationId } : {}),
@@ -75,14 +91,22 @@ export function endTransformStage(context, options) {
75
91
  : {}),
76
92
  };
77
93
  publishTransformEvent(event);
94
+ return durationMs;
78
95
  }
79
- function runTransformStage(url, stage, fn) {
80
- const context = startTransformStage(url, stage);
96
+ function runTransformStage(url, stage, fn, budget) {
97
+ if (budget && budget.elapsedMs >= budget.totalBudgetMs) {
98
+ throw new FetchError('Transform budget exhausted', url, 504, {
99
+ reason: 'timeout',
100
+ stage: `${stage}:budget_exhausted`,
101
+ elapsedMs: budget.elapsedMs,
102
+ totalBudgetMs: budget.totalBudgetMs,
103
+ });
104
+ }
105
+ const context = startTransformStage(url, stage, budget);
81
106
  try {
82
107
  return fn();
83
108
  }
84
109
  finally {
85
- // Emit duration even if the stage throws; callers decide how to handle the error.
86
110
  endTransformStage(context);
87
111
  }
88
112
  }
@@ -340,21 +364,22 @@ function applyBaseUri(document, url) {
340
364
  });
341
365
  }
342
366
  }
343
- // DOM noise removal functions moved to ./dom-noise-removal.ts
344
367
  function buildInlineCode(content) {
345
- const runs = content.match(/`+/g);
346
- let longest = '';
347
- if (runs) {
348
- for (const run of runs) {
349
- if (run.length > longest.length) {
350
- longest = run;
351
- }
368
+ let maxBackticks = 0;
369
+ let currentRun = 0;
370
+ for (const char of content) {
371
+ if (char === '`') {
372
+ currentRun++;
373
+ }
374
+ else {
375
+ if (currentRun > maxBackticks)
376
+ maxBackticks = currentRun;
377
+ currentRun = 0;
352
378
  }
353
379
  }
354
- // Use a fence longer than any run of backticks in the content.
355
- const delimiter = `\`${longest}`;
356
- // Only pad when needed to avoid altering code spans unnecessarily.
357
- // CommonMark recommends padding when the code starts/ends with a backtick.
380
+ if (currentRun > maxBackticks)
381
+ maxBackticks = currentRun;
382
+ const delimiter = '`'.repeat(maxBackticks + 1);
358
383
  const padding = content.startsWith('`') || content.endsWith('`') ? ' ' : '';
359
384
  return `${delimiter}${padding}${content}${padding}${delimiter}`;
360
385
  }
@@ -531,8 +556,7 @@ function translateHtmlToMarkdown(html, url, signal, document, skipNoiseRemoval)
531
556
  throwIfAborted(signal, url, 'markdown:cleaned');
532
557
  const content = runTransformStage(url, 'markdown:translate', () => getMarkdownConverter().translate(cleanedHtml).trim());
533
558
  throwIfAborted(signal, url, 'markdown:translated');
534
- const cleaned = cleanupMarkdownArtifacts(content);
535
- return promoteOrphanHeadings(cleaned);
559
+ return cleanupMarkdownArtifacts(content);
536
560
  }
537
561
  function appendMetadataFooter(content, metadata, url) {
538
562
  const footer = buildMetadataFooter(metadata, url);
@@ -554,223 +578,6 @@ export function htmlToMarkdown(html, metadata, options) {
554
578
  return buildMetadataFooter(metadata, url);
555
579
  }
556
580
  }
557
- // Markdown cleanup functions moved to ./markdown-cleanup.ts
558
- function formatFetchedDate(isoString) {
559
- try {
560
- const date = new Date(isoString);
561
- const day = String(date.getDate()).padStart(2, '0');
562
- const month = String(date.getMonth() + 1).padStart(2, '0');
563
- const year = date.getFullYear();
564
- return `${day}-${month}-${year}`;
565
- }
566
- catch {
567
- return isoString;
568
- }
569
- }
570
- function buildMetadataFooter(metadata, fallbackUrl) {
571
- if (!metadata)
572
- return '';
573
- const lines = ['---', ''];
574
- const url = metadata.url || fallbackUrl;
575
- const parts = [];
576
- if (metadata.title)
577
- parts.push(`_${metadata.title}_`);
578
- if (metadata.author)
579
- parts.push(`_${metadata.author}_`);
580
- if (url)
581
- parts.push(`[_Original Source_](${url})`);
582
- if (metadata.fetchedAt) {
583
- const formattedDate = formatFetchedDate(metadata.fetchedAt);
584
- parts.push(`_${formattedDate}_`);
585
- }
586
- if (parts.length > 0) {
587
- lines.push(` ${parts.join(' | ')}`);
588
- }
589
- if (metadata.description) {
590
- lines.push(` <sub>${metadata.description}</sub>`);
591
- }
592
- return lines.join('\n');
593
- }
594
- const HEADING_PATTERN = /^#{1,6}\s/m;
595
- const LIST_PATTERN = /^(?:[-*+])\s/m;
596
- const HTML_DOCUMENT_PATTERN = /^(<!doctype|<html)/i;
597
- function containsMarkdownHeading(content) {
598
- return HEADING_PATTERN.test(content);
599
- }
600
- function containsMarkdownList(content) {
601
- return LIST_PATTERN.test(content);
602
- }
603
- function containsFencedCodeBlock(content) {
604
- const first = content.indexOf('```');
605
- if (first === -1)
606
- return false;
607
- return content.includes('```', first + 3);
608
- }
609
- function looksLikeMarkdown(content) {
610
- return (containsMarkdownHeading(content) ||
611
- containsMarkdownList(content) ||
612
- containsFencedCodeBlock(content));
613
- }
614
- function detectLineEnding(content) {
615
- return content.includes('\r\n') ? '\r\n' : '\n';
616
- }
617
- const FRONTMATTER_DELIMITER = '---';
618
- function findFrontmatterLines(content) {
619
- const lineEnding = detectLineEnding(content);
620
- const lines = content.split(lineEnding);
621
- if (lines[0] !== FRONTMATTER_DELIMITER)
622
- return null;
623
- const endIndex = lines.indexOf(FRONTMATTER_DELIMITER, 1);
624
- if (endIndex === -1)
625
- return null;
626
- return { lineEnding, lines, endIndex };
627
- }
628
- function stripOptionalQuotes(value) {
629
- const trimmed = value.trim();
630
- if (trimmed.length < 2)
631
- return trimmed;
632
- const first = trimmed[0];
633
- const last = trimmed[trimmed.length - 1];
634
- if ((first === '"' && last === '"') || (first === "'" && last === "'")) {
635
- return trimmed.slice(1, -1).trim();
636
- }
637
- return trimmed;
638
- }
639
- function parseFrontmatterEntry(line) {
640
- const trimmed = line.trim();
641
- if (!trimmed)
642
- return null;
643
- const separatorIndex = trimmed.indexOf(':');
644
- if (separatorIndex <= 0)
645
- return null;
646
- const key = trimmed.slice(0, separatorIndex).trim().toLowerCase();
647
- const value = trimmed.slice(separatorIndex + 1);
648
- return { key, value };
649
- }
650
- function isTitleKey(key) {
651
- return key === 'title' || key === 'name';
652
- }
653
- function extractTitleFromHeading(content) {
654
- const lineEnding = detectLineEnding(content);
655
- const lines = content.split(lineEnding);
656
- for (const line of lines) {
657
- const trimmed = line.trim();
658
- if (!trimmed)
659
- continue;
660
- let index = 0;
661
- while (index < trimmed.length && trimmed[index] === '#') {
662
- index += 1;
663
- }
664
- if (index === 0 || index > 6)
665
- return undefined;
666
- const nextChar = trimmed[index];
667
- if (nextChar !== ' ' && nextChar !== '\t')
668
- return undefined;
669
- const heading = trimmed.slice(index).trim();
670
- return heading.length > 0 ? heading : undefined;
671
- }
672
- return undefined;
673
- }
674
- function extractTitleFromRawMarkdown(content) {
675
- const frontmatter = findFrontmatterLines(content);
676
- if (!frontmatter) {
677
- return extractTitleFromHeading(content);
678
- }
679
- const { lines, endIndex } = frontmatter;
680
- const entry = lines
681
- .slice(1, endIndex)
682
- .map((line) => parseFrontmatterEntry(line))
683
- .find((parsed) => parsed !== null && isTitleKey(parsed.key));
684
- if (!entry)
685
- return undefined;
686
- const value = stripOptionalQuotes(entry.value);
687
- return value || undefined;
688
- }
689
- function hasMarkdownSourceLine(content) {
690
- const lineEnding = detectLineEnding(content);
691
- const lines = content.split(lineEnding);
692
- const limit = Math.min(lines.length, 50);
693
- for (let index = 0; index < limit; index += 1) {
694
- const line = lines[index];
695
- if (!line)
696
- continue;
697
- if (line.trimStart().toLowerCase().startsWith('source:')) {
698
- return true;
699
- }
700
- }
701
- return false;
702
- }
703
- function addSourceToMarkdownMarkdownFormat(content, url) {
704
- if (hasMarkdownSourceLine(content))
705
- return content;
706
- const lineEnding = detectLineEnding(content);
707
- const lines = content.split(lineEnding);
708
- const firstNonEmptyIndex = lines.findIndex((line) => line.trim().length > 0);
709
- if (firstNonEmptyIndex !== -1) {
710
- const firstLine = lines[firstNonEmptyIndex];
711
- if (firstLine && /^#{1,6}\s+/.test(firstLine.trim())) {
712
- const insertAt = firstNonEmptyIndex + 1;
713
- const updated = [
714
- ...lines.slice(0, insertAt),
715
- '',
716
- `Source: ${url}`,
717
- '',
718
- ...lines.slice(insertAt),
719
- ];
720
- return updated.join(lineEnding);
721
- }
722
- }
723
- return [`Source: ${url}`, '', content].join(lineEnding);
724
- }
725
- function addSourceToMarkdown(content, url) {
726
- const frontmatter = findFrontmatterLines(content);
727
- if (config.transform.metadataFormat === 'markdown' && !frontmatter) {
728
- return addSourceToMarkdownMarkdownFormat(content, url);
729
- }
730
- if (!frontmatter) {
731
- return `---\nsource: "${url}"\n---\n\n${content}`;
732
- }
733
- const { lineEnding, lines, endIndex } = frontmatter;
734
- const bodyLines = lines.slice(1, endIndex);
735
- const hasSource = bodyLines.some((line) => line.trimStart().toLowerCase().startsWith('source:'));
736
- if (hasSource)
737
- return content;
738
- const updatedLines = [
739
- lines[0],
740
- ...bodyLines,
741
- `source: "${url}"`,
742
- ...lines.slice(endIndex),
743
- ];
744
- return updatedLines.join(lineEnding);
745
- }
746
- function hasFrontmatter(trimmed) {
747
- return trimmed.startsWith('---\n') || trimmed.startsWith('---\r\n');
748
- }
749
- function looksLikeHtmlDocument(trimmed) {
750
- return HTML_DOCUMENT_PATTERN.test(trimmed);
751
- }
752
- function countCommonHtmlTags(content) {
753
- const matches = content.match(/<(html|head|body|div|span|script|style|meta|link)\b/gi) ??
754
- [];
755
- return matches.length;
756
- }
757
- function isRawTextContent(content) {
758
- const trimmed = content.trim();
759
- const isHtmlDocument = looksLikeHtmlDocument(trimmed);
760
- const hasMarkdownFrontmatter = hasFrontmatter(trimmed);
761
- const hasTooManyHtmlTags = countCommonHtmlTags(content) > 2;
762
- const isMarkdown = looksLikeMarkdown(content);
763
- return (!isHtmlDocument &&
764
- (hasMarkdownFrontmatter || (!hasTooManyHtmlTags && isMarkdown)));
765
- }
766
- function isLikelyHtmlContent(content) {
767
- const trimmed = content.trim();
768
- if (!trimmed)
769
- return false;
770
- if (looksLikeHtmlDocument(trimmed))
771
- return true;
772
- return countCommonHtmlTags(content) > 2;
773
- }
774
581
  function shouldPreserveRawContent(url, content) {
775
582
  if (isRawTextContentUrl(url)) {
776
583
  return !isLikelyHtmlContent(content);
@@ -1189,11 +996,11 @@ const workerMessageSchema = z.discriminatedUnion('type', [
1189
996
  }),
1190
997
  ]);
1191
998
  let pool = null;
999
+ const POOL_MIN_WORKERS = 2;
1000
+ const POOL_MAX_WORKERS = 4;
1001
+ const POOL_SCALE_THRESHOLD = 0.5;
1192
1002
  function resolveDefaultWorkerCount() {
1193
- const parallelism = typeof os.availableParallelism === 'function'
1194
- ? os.availableParallelism()
1195
- : os.cpus().length;
1196
- return Math.min(16, Math.max(1, parallelism - 1));
1003
+ return POOL_MIN_WORKERS;
1197
1004
  }
1198
1005
  const DEFAULT_TIMEOUT_MS = config.transform.timeoutMs;
1199
1006
  function getOrCreateTransformWorkerPool() {
@@ -1206,8 +1013,20 @@ export async function shutdownTransformWorkerPool() {
1206
1013
  await pool.close();
1207
1014
  pool = null;
1208
1015
  }
1016
+ export function getTransformPoolStats() {
1017
+ if (!pool)
1018
+ return null;
1019
+ return {
1020
+ queueDepth: pool.getQueueDepth(),
1021
+ activeWorkers: pool.getActiveWorkers(),
1022
+ capacity: pool.getCapacity(),
1023
+ };
1024
+ }
1209
1025
  class WorkerPool {
1210
1026
  workers = [];
1027
+ capacity;
1028
+ minCapacity;
1029
+ maxCapacity;
1211
1030
  queue = [];
1212
1031
  inflight = new Map();
1213
1032
  timeoutMs;
@@ -1323,12 +1142,11 @@ class WorkerPool {
1323
1142
  });
1324
1143
  }
1325
1144
  constructor(size, timeoutMs) {
1326
- const safeSize = Math.max(1, size);
1145
+ this.minCapacity = POOL_MIN_WORKERS;
1146
+ this.maxCapacity = POOL_MAX_WORKERS;
1147
+ this.capacity = Math.max(this.minCapacity, Math.min(size, this.maxCapacity));
1327
1148
  this.timeoutMs = timeoutMs;
1328
- this.queueMax = safeSize * 2;
1329
- for (let index = 0; index < safeSize; index += 1) {
1330
- this.workers.push(this.spawnWorker(index));
1331
- }
1149
+ this.queueMax = this.maxCapacity * 32;
1332
1150
  }
1333
1151
  spawnWorker(workerIndex) {
1334
1152
  const worker = new Worker(new URL('./workers/transform-worker.js', import.meta.url));
@@ -1426,21 +1244,46 @@ class WorkerPool {
1426
1244
  this.drainQueue();
1427
1245
  });
1428
1246
  }
1247
+ /** Scale capacity up if queue pressure exceeds threshold. */
1248
+ maybeScaleUp() {
1249
+ if (this.queue.length > this.capacity * POOL_SCALE_THRESHOLD &&
1250
+ this.capacity < this.maxCapacity) {
1251
+ this.capacity += 1;
1252
+ }
1253
+ }
1429
1254
  drainQueue() {
1255
+ if (this.closed)
1256
+ return;
1430
1257
  if (this.queue.length === 0)
1431
1258
  return;
1259
+ this.maybeScaleUp();
1260
+ // First pass: try to find an idle existing worker
1432
1261
  for (let workerIndex = 0; workerIndex < this.workers.length; workerIndex += 1) {
1433
1262
  const slot = this.workers[workerIndex];
1434
- if (!slot || slot.busy)
1435
- continue;
1436
- const task = this.queue.shift();
1437
- if (!task)
1438
- return;
1439
- this.dispatch(workerIndex, slot, task);
1440
- if (this.queue.length === 0)
1441
- return;
1263
+ if (slot && !slot.busy) {
1264
+ this.dispatchQueueTask(workerIndex, slot);
1265
+ if (this.queue.length === 0)
1266
+ return;
1267
+ }
1268
+ }
1269
+ if (this.workers.length < this.capacity && this.queue.length > 0) {
1270
+ const workerIndex = this.workers.length;
1271
+ const slot = this.spawnWorker(workerIndex);
1272
+ this.workers.push(slot);
1273
+ this.dispatchQueueTask(workerIndex, slot);
1274
+ if (this.workers.length < this.capacity && this.queue.length > 0) {
1275
+ setImmediate(() => {
1276
+ this.drainQueue();
1277
+ });
1278
+ }
1442
1279
  }
1443
1280
  }
1281
+ dispatchQueueTask(workerIndex, slot) {
1282
+ const task = this.queue.shift();
1283
+ if (!task)
1284
+ return;
1285
+ this.dispatch(workerIndex, slot, task);
1286
+ }
1444
1287
  dispatch(workerIndex, slot, task) {
1445
1288
  if (this.rejectIfAborted(task))
1446
1289
  return;
@@ -1510,11 +1353,23 @@ class WorkerPool {
1510
1353
  task.reject(message);
1511
1354
  this.restartWorker(workerIndex, slot);
1512
1355
  }
1356
+ getQueueDepth() {
1357
+ return this.queue.length;
1358
+ }
1359
+ getActiveWorkers() {
1360
+ return this.workers.filter((s) => s?.busy).length;
1361
+ }
1362
+ getCapacity() {
1363
+ return this.capacity;
1364
+ }
1513
1365
  async close() {
1514
1366
  if (this.closed)
1515
1367
  return;
1516
1368
  this.closed = true;
1517
- const terminations = this.workers.map((slot) => slot.worker.terminate());
1369
+ const terminations = this.workers
1370
+ .map((slot) => slot?.worker.terminate())
1371
+ .filter((p) => p !== undefined);
1372
+ this.workers.fill(undefined);
1518
1373
  this.workers.length = 0;
1519
1374
  for (const [id, inflight] of this.inflight.entries()) {
1520
1375
  clearTimeout(inflight.timer);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@j0hanz/superfetch",
3
- "version": "2.3.0",
3
+ "version": "2.4.0",
4
4
  "mcpName": "io.github.j0hanz/superfetch",
5
5
  "description": "Intelligent web content fetcher MCP server that converts HTML to clean, AI-readable Markdown",
6
6
  "type": "module",
@@ -59,7 +59,6 @@
59
59
  "@modelcontextprotocol/sdk": "^1.25.3",
60
60
  "@mozilla/readability": "^0.6.0",
61
61
  "linkedom": "^0.18.12",
62
- "lru-cache": "^11.2.5",
63
62
  "node-html-markdown": "^2.0.0",
64
63
  "undici": "^7.19.2",
65
64
  "zod": "^4.3.6"