@j0hanz/fetch-url-mcp 1.9.3 → 1.9.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,11 +3,14 @@ import diagnosticsChannel from 'node:diagnostics_channel';
3
3
  import { performance } from 'node:perf_hooks';
4
4
  import { isProbablyReaderable, Readability } from '@mozilla/readability';
5
5
  import { parseHTML } from 'linkedom';
6
- import { addSourceToMarkdown, buildMetadataFooter, cleanupMarkdownArtifacts, detectLanguageFromCode, extractLanguageFromClassName, extractTitleFromRawMarkdown, isRawTextContent, prepareDocumentForMarkdown, removeNoiseFromHtml, serializeDocumentForMarkdown, } from '../lib/content.js';
6
+ import { detectLanguageFromCode, extractLanguageFromClassName, } from '../lib/code-lang.js';
7
7
  import { config } from '../lib/core.js';
8
8
  import { getOperationId, getRequestId, logDebug, logError, logInfo, logWarn, redactUrl, } from '../lib/core.js';
9
+ import { prepareDocumentForMarkdown, removeNoiseFromHtml, serializeDocumentForMarkdown, } from '../lib/dom-prep.js';
9
10
  import { isRawTextContentUrl } from '../lib/http.js';
10
- import { createAbortError, throwIfAborted } from '../lib/utils.js';
11
+ import { cleanupMarkdownArtifacts, processFencedContent, } from '../lib/md-cleanup.js';
12
+ import { addSourceToMarkdown, buildMetadataFooter, extractTitleFromRawMarkdown, isRawTextContent, } from '../lib/md-metadata.js';
13
+ import { throwIfAborted } from '../lib/utils.js';
11
14
  import { FetchError, getErrorMessage, toError } from '../lib/utils.js';
12
15
  import { isObject } from '../lib/utils.js';
13
16
  import { translateHtmlFragmentToMarkdown } from './html-translators.js';
@@ -34,7 +37,6 @@ function decodeInput(input, encoding) {
34
37
  function asError(value) {
35
38
  return value instanceof Error ? value : undefined;
36
39
  }
37
- const abortPolicy = { throwIfAborted, createAbortError };
38
40
  function isWhitespaceChar(code) {
39
41
  return code === 9 || code === 10 || code === 12 || code === 13 || code === 32;
40
42
  }
@@ -306,7 +308,7 @@ function extractArticle(document, url, signal) {
306
308
  return null;
307
309
  }
308
310
  const checkAbort = (stage) => {
309
- abortPolicy.throwIfAborted(signal, url, stage);
311
+ throwIfAborted(signal, url, stage);
310
312
  };
311
313
  try {
312
314
  const doc = document;
@@ -391,29 +393,43 @@ function applyBaseUri(document, url) {
391
393
  });
392
394
  }
393
395
  }
396
+ function createEmptyExtractionContext() {
397
+ const { document } = parseHTML('<html></html>');
398
+ return { article: null, metadata: {}, document };
399
+ }
400
+ function extractEarlyMetadataIfNeeded(html, url) {
401
+ if (!willTruncate(html))
402
+ return null;
403
+ return stageTracker.run(url, 'extract:early-metadata', () => extractMetadataFromHead(html, url));
404
+ }
405
+ function parseExtractionDocument(html, url, inputTruncated) {
406
+ const { html: limitedHtml, truncated } = truncateHtml(html, inputTruncated);
407
+ const { document } = stageTracker.run(url, 'extract:parse', () => parseHTML(limitedHtml));
408
+ return { document, truncated };
409
+ }
410
+ function extractMergedMetadata(html, url, document) {
411
+ const earlyMetadata = extractEarlyMetadataIfNeeded(html, url);
412
+ const lateMetadata = stageTracker.run(url, 'extract:metadata', () => extractMetadata(document, url));
413
+ return mergeMetadata(earlyMetadata, lateMetadata);
414
+ }
415
+ function extractArticleIfRequested(document, url, options) {
416
+ if (!options.extractArticle)
417
+ return null;
418
+ return stageTracker.run(url, 'extract:article', () => extractArticle(document, url, options.signal));
419
+ }
394
420
  function extractContentContext(html, url, options) {
395
421
  if (!isValidInput(html, url)) {
396
- const { document } = parseHTML('<html></html>');
397
- return { article: null, metadata: {}, document };
422
+ return createEmptyExtractionContext();
398
423
  }
399
424
  try {
400
- abortPolicy.throwIfAborted(options.signal, url, 'extract:begin');
401
- // F2: Extract metadata from <head> BEFORE truncation to preserve it
402
- const earlyMetadata = willTruncate(html)
403
- ? stageTracker.run(url, 'extract:early-metadata', () => extractMetadataFromHead(html, url))
404
- : null;
405
- const { html: limitedHtml, truncated } = truncateHtml(html, options.inputTruncated);
406
- const { document } = stageTracker.run(url, 'extract:parse', () => parseHTML(limitedHtml));
407
- abortPolicy.throwIfAborted(options.signal, url, 'extract:parsed');
425
+ throwIfAborted(options.signal, url, 'extract:begin');
426
+ const { document, truncated } = parseExtractionDocument(html, url, options.inputTruncated);
427
+ throwIfAborted(options.signal, url, 'extract:parsed');
408
428
  applyBaseUri(document, url);
409
- const lateMetadata = stageTracker.run(url, 'extract:metadata', () => extractMetadata(document, url));
410
- abortPolicy.throwIfAborted(options.signal, url, 'extract:metadata');
411
- // Merge early (pre-truncation) with late (post-truncation) metadata
412
- const metadata = mergeMetadata(earlyMetadata, lateMetadata);
413
- const article = options.extractArticle
414
- ? stageTracker.run(url, 'extract:article', () => extractArticle(document, url, options.signal))
415
- : null;
416
- abortPolicy.throwIfAborted(options.signal, url, 'extract:article');
429
+ const metadata = extractMergedMetadata(html, url, document);
430
+ throwIfAborted(options.signal, url, 'extract:metadata');
431
+ const article = extractArticleIfRequested(document, url, options);
432
+ throwIfAborted(options.signal, url, 'extract:article');
417
433
  return {
418
434
  article,
419
435
  metadata,
@@ -424,10 +440,9 @@ function extractContentContext(html, url, options) {
424
440
  catch (error) {
425
441
  if (error instanceof FetchError)
426
442
  throw error;
427
- abortPolicy.throwIfAborted(options.signal, url, 'extract:error');
443
+ throwIfAborted(options.signal, url, 'extract:error');
428
444
  logError('Failed to extract content', asError(error));
429
- const { document } = parseHTML('<html></html>');
430
- return { article: null, metadata: {}, document };
445
+ return createEmptyExtractionContext();
431
446
  }
432
447
  }
433
448
  export function extractContent(html, url, options = {
@@ -436,8 +451,6 @@ export function extractContent(html, url, options = {
436
451
  const result = extractContentContext(html, url, options);
437
452
  return { article: result.article, metadata: result.metadata };
438
453
  }
439
- const ABORT_CHECK_LINE_INTERVAL = 500;
440
- const CR_CHAR_CODE = 13;
441
454
  function resolveRelativeHref(href, baseUrl, origin) {
442
455
  const trimmedHref = href.trim();
443
456
  if (!trimmedHref || containsWhitespace(trimmedHref))
@@ -501,7 +514,6 @@ function isAbsoluteOrSpecialUrl(href) {
501
514
  return true;
502
515
  return URL.canParse(trimmedHref);
503
516
  }
504
- const FENCE_LINE_PATTERN = /^\s*(`{3,}|~{3,})/;
505
517
  function resolveRelativeUrlsInSegment(markdown, baseUrl, origin) {
506
518
  let cursor = 0;
507
519
  let output = '';
@@ -527,71 +539,20 @@ function resolveRelativeUrls(markdown, baseUrl, signal) {
527
539
  }
528
540
  if (!markdown)
529
541
  return markdown;
530
- let output = '';
531
- let buffer = '';
532
- let fenceMarker = null;
533
- const len = markdown.length;
534
- let lastIndex = 0;
535
- let lineCount = 0;
536
- while (lastIndex < len) {
537
- if (++lineCount % ABORT_CHECK_LINE_INTERVAL === 0) {
538
- abortPolicy.throwIfAborted(signal, baseUrl, 'markdown:resolve-urls');
539
- }
540
- // Extract next line (handling CR+LF)
541
- let nextIndex = markdown.indexOf('\n', lastIndex);
542
- const isLastLine = nextIndex === -1;
543
- if (isLastLine)
544
- nextIndex = len;
545
- const lineWithNewline = isLastLine
546
- ? markdown.slice(lastIndex)
547
- : markdown.slice(lastIndex, nextIndex + 1);
548
- const lineEnd = !isLastLine &&
549
- nextIndex > lastIndex &&
550
- markdown.charCodeAt(nextIndex - 1) === CR_CHAR_CODE
551
- ? nextIndex - 1
552
- : isLastLine
553
- ? len
554
- : nextIndex;
555
- const trimmed = markdown.slice(lastIndex, lineEnd).trimStart();
556
- if (fenceMarker) {
557
- // Inside a code fence — pass through without URL resolution
558
- output += lineWithNewline;
559
- if (trimmed.startsWith(fenceMarker) &&
560
- trimmed.slice(fenceMarker.length).trim() === '') {
561
- fenceMarker = null;
562
- }
563
- }
564
- else {
565
- const fenceMatch = FENCE_LINE_PATTERN.exec(markdown.slice(lastIndex, lineEnd));
566
- if (fenceMatch?.[1]) {
567
- // Entering a code fence — flush buffered content first
568
- if (buffer) {
569
- output += resolveRelativeUrlsInSegment(buffer, baseUrl, origin);
570
- buffer = '';
571
- }
572
- output += lineWithNewline;
573
- fenceMarker = fenceMatch[1];
574
- }
575
- else {
576
- buffer += lineWithNewline;
577
- }
578
- }
579
- lastIndex = isLastLine ? len : nextIndex + 1;
580
- }
581
- if (buffer) {
582
- output += resolveRelativeUrlsInSegment(buffer, baseUrl, origin);
583
- }
584
- return output;
542
+ return processFencedContent(markdown, (text) => {
543
+ throwIfAborted(signal, baseUrl, 'markdown:resolve-urls');
544
+ return resolveRelativeUrlsInSegment(text, baseUrl, origin);
545
+ });
585
546
  }
586
547
  function translateHtmlToMarkdown(params) {
587
548
  const { html, url, signal, document, skipNoiseRemoval } = params;
588
- abortPolicy.throwIfAborted(signal, url, 'markdown:begin');
549
+ throwIfAborted(signal, url, 'markdown:begin');
589
550
  const cleanedHtml = skipNoiseRemoval
590
551
  ? html
591
552
  : stageTracker.run(url, 'markdown:noise', () => removeNoiseFromHtml(html, document, url, signal));
592
- abortPolicy.throwIfAborted(signal, url, 'markdown:cleaned');
553
+ throwIfAborted(signal, url, 'markdown:cleaned');
593
554
  const content = stageTracker.run(url, 'markdown:translate', () => translateHtmlFragmentToMarkdown(cleanedHtml));
594
- abortPolicy.throwIfAborted(signal, url, 'markdown:translated');
555
+ throwIfAborted(signal, url, 'markdown:translated');
595
556
  const cleaned = cleanupMarkdownArtifacts(content, signal ? { signal, url } : { url });
596
557
  return url ? resolveRelativeUrls(cleaned, url, signal) : cleaned;
597
558
  }
@@ -937,57 +898,74 @@ export const TransformHeuristics = {
937
898
  findPrimaryHeading,
938
899
  isGithubRepositoryRootUrl,
939
900
  };
940
- function shouldUseArticleContent(article, document) {
941
- const articleLength = article.textContent.length;
901
+ const ARTICLE_INTERACTIVE_SELECTOR = 'button,[role="tab"],[role="tabpanel"],[aria-controls]';
902
+ function buildArticleDocument(article) {
903
+ return parseHTML(`<!DOCTYPE html><html><body>${article.content}</body></html>`).document;
904
+ }
905
+ function hasSufficientArticleContentRatio(article, document) {
942
906
  const originalLength = getVisibleTextLength(document);
943
- if (originalLength >= MIN_HTML_LENGTH_FOR_GATE) {
944
- const ratio = articleLength / originalLength;
945
- if (ratio < MIN_CONTENT_RATIO)
946
- return false;
947
- }
948
- const { document: articleDoc } = parseHTML(`<!DOCTYPE html><html><body>${article.content}</body></html>`);
907
+ if (originalLength < MIN_HTML_LENGTH_FOR_GATE)
908
+ return true;
909
+ return article.textContent.length / originalLength >= MIN_CONTENT_RATIO;
910
+ }
911
+ function retainsEnoughHeadings(articleDoc, document) {
949
912
  const originalHeadings = countMatchingElements(document, 'h1,h2,h3,h4,h5,h6');
913
+ if (originalHeadings === 0)
914
+ return true;
950
915
  const articleHeadings = countMatchingElements(articleDoc, 'h1,h2,h3,h4,h5,h6');
951
- if (originalHeadings > 0) {
952
- const retentionRatio = articleHeadings / originalHeadings;
953
- if (retentionRatio < MIN_HEADING_RETENTION_RATIO)
954
- return false;
955
- }
916
+ return articleHeadings / originalHeadings >= MIN_HEADING_RETENTION_RATIO;
917
+ }
918
+ function retainsEnoughCodeBlocks(articleDoc, document) {
956
919
  const originalCodeBlocks = countMatchingElements(document, 'pre');
957
- if (originalCodeBlocks > 0) {
958
- const articleCodeBlocks = countMatchingElements(articleDoc, 'pre');
959
- const codeRetentionRatio = articleCodeBlocks / originalCodeBlocks;
960
- if (codeRetentionRatio < MIN_CODE_BLOCK_RETENTION_RATIO)
961
- return false;
962
- }
920
+ if (originalCodeBlocks === 0)
921
+ return true;
922
+ const articleCodeBlocks = countMatchingElements(articleDoc, 'pre');
923
+ return (articleCodeBlocks / originalCodeBlocks >= MIN_CODE_BLOCK_RETENTION_RATIO);
924
+ }
925
+ function retainsEnoughTables(articleDoc, document) {
963
926
  const originalTables = countMatchingElements(document, 'table');
964
- if (originalTables > 0) {
965
- const articleTables = countMatchingElements(articleDoc, 'table');
966
- const tableRetentionRatio = articleTables / originalTables;
967
- if (tableRetentionRatio < MIN_TABLE_RETENTION_RATIO)
968
- return false;
969
- }
927
+ if (originalTables === 0)
928
+ return true;
929
+ const articleTables = countMatchingElements(articleDoc, 'table');
930
+ return articleTables / originalTables >= MIN_TABLE_RETENTION_RATIO;
931
+ }
932
+ function retainsEnoughImages(articleDoc, document) {
970
933
  const originalImages = countMatchingElements(document, 'img');
971
- if (originalImages >= MIN_IMAGE_ELEMENTS_FOR_GATE) {
972
- const articleImages = countMatchingElements(articleDoc, 'img');
973
- const imageRetentionRatio = articleImages / originalImages;
974
- if (imageRetentionRatio < MIN_IMAGE_RETENTION_RATIO)
975
- return false;
976
- }
977
- const interactiveSelector = 'button,[role="tab"],[role="tabpanel"],[aria-controls]';
978
- const originalInteractive = countMatchingElements(document, interactiveSelector);
979
- if (originalInteractive >= MIN_INTERACTIVE_ELEMENTS_FOR_GATE) {
980
- const articleInteractive = countMatchingElements(articleDoc, interactiveSelector);
981
- const interactiveRetentionRatio = articleInteractive / originalInteractive;
982
- if (interactiveRetentionRatio < MIN_INTERACTIVE_RETENTION_RATIO) {
983
- return false;
984
- }
985
- }
986
- if (articleHeadings >= MIN_HEADINGS_FOR_EMPTY_SECTION_GATE) {
987
- const emptySectionRatio = countEmptyHeadingSections(articleDoc) / articleHeadings;
988
- if (emptySectionRatio > MAX_EMPTY_SECTION_RATIO)
989
- return false;
990
- }
934
+ if (originalImages < MIN_IMAGE_ELEMENTS_FOR_GATE)
935
+ return true;
936
+ const articleImages = countMatchingElements(articleDoc, 'img');
937
+ return articleImages / originalImages >= MIN_IMAGE_RETENTION_RATIO;
938
+ }
939
+ function retainsEnoughInteractiveElements(articleDoc, document) {
940
+ const originalInteractive = countMatchingElements(document, ARTICLE_INTERACTIVE_SELECTOR);
941
+ if (originalInteractive < MIN_INTERACTIVE_ELEMENTS_FOR_GATE)
942
+ return true;
943
+ const articleInteractive = countMatchingElements(articleDoc, ARTICLE_INTERACTIVE_SELECTOR);
944
+ return (articleInteractive / originalInteractive >= MIN_INTERACTIVE_RETENTION_RATIO);
945
+ }
946
+ function hasAcceptableEmptySectionRatio(articleDoc) {
947
+ const articleHeadings = countMatchingElements(articleDoc, 'h1,h2,h3,h4,h5,h6');
948
+ if (articleHeadings < MIN_HEADINGS_FOR_EMPTY_SECTION_GATE)
949
+ return true;
950
+ const emptySectionRatio = countEmptyHeadingSections(articleDoc) / articleHeadings;
951
+ return emptySectionRatio <= MAX_EMPTY_SECTION_RATIO;
952
+ }
953
+ function shouldUseArticleContent(article, document) {
954
+ if (!hasSufficientArticleContentRatio(article, document))
955
+ return false;
956
+ const articleDoc = buildArticleDocument(article);
957
+ if (!retainsEnoughHeadings(articleDoc, document))
958
+ return false;
959
+ if (!retainsEnoughCodeBlocks(articleDoc, document))
960
+ return false;
961
+ if (!retainsEnoughTables(articleDoc, document))
962
+ return false;
963
+ if (!retainsEnoughImages(articleDoc, document))
964
+ return false;
965
+ if (!retainsEnoughInteractiveElements(articleDoc, document))
966
+ return false;
967
+ if (!hasAcceptableEmptySectionRatio(articleDoc))
968
+ return false;
991
969
  return !hasTruncatedSentences(article.textContent);
992
970
  }
993
971
  function buildContentSource(params) {
@@ -1339,6 +1317,33 @@ function resolveContentSource(params) {
1339
1317
  ...(params.signal ? { signal: params.signal } : {}),
1340
1318
  });
1341
1319
  }
1320
+ function shouldStripGithubPrimaryHeading(context, url) {
1321
+ return (context.primaryHeading !== undefined &&
1322
+ TransformHeuristics.isGithubRepositoryRootUrl(url));
1323
+ }
1324
+ function maybeStripGithubPrimaryHeading(markdown, context, url) {
1325
+ if (!shouldStripGithubPrimaryHeading(context, url))
1326
+ return markdown;
1327
+ return stripLeadingHeading(markdown, context.primaryHeading ?? '');
1328
+ }
1329
+ function buildSyntheticTitlePrefix(url, favicon) {
1330
+ if (!favicon)
1331
+ return ' ';
1332
+ let alt = '';
1333
+ try {
1334
+ alt = new URL(url).hostname;
1335
+ }
1336
+ catch {
1337
+ /* skip */
1338
+ }
1339
+ return ` ![${alt}](${favicon}) `;
1340
+ }
1341
+ function maybePrependSyntheticTitle(markdown, context, url) {
1342
+ if (!context.title || /^(#{1,6})\s/.test(markdown.trimStart())) {
1343
+ return markdown;
1344
+ }
1345
+ return `#${buildSyntheticTitlePrefix(url, context.favicon)}${context.title}\n\n${markdown}`;
1346
+ }
1342
1347
  function buildMarkdownFromContext(context, url, signal) {
1343
1348
  let content = stageTracker.run(url, 'transform:markdown', () => htmlToMarkdown(context.sourceHtml, context.metadata, {
1344
1349
  url,
@@ -1346,25 +1351,8 @@ function buildMarkdownFromContext(context, url, signal) {
1346
1351
  ...(context.document ? { document: context.document } : {}),
1347
1352
  ...(context.skipNoiseRemoval ? { skipNoiseRemoval: true } : {}),
1348
1353
  }));
1349
- if (context.primaryHeading &&
1350
- TransformHeuristics.isGithubRepositoryRootUrl(url)) {
1351
- content = stripLeadingHeading(content, context.primaryHeading);
1352
- }
1353
- if (context.title && !/^(#{1,6})\s/.test(content.trimStart())) {
1354
- const icon = context.favicon;
1355
- let prefix = ' ';
1356
- if (icon) {
1357
- let alt = '';
1358
- try {
1359
- alt = new URL(url).hostname;
1360
- }
1361
- catch {
1362
- /* skip */
1363
- }
1364
- prefix = ` ![${alt}](${icon}) `;
1365
- }
1366
- content = `#${prefix}${context.title}\n\n${content}`;
1367
- }
1354
+ content = maybeStripGithubPrimaryHeading(content, context, url);
1355
+ content = maybePrependSyntheticTitle(content, context, url);
1368
1356
  content = supplementMarkdownFromNextFlight(content, context.originalHtml);
1369
1357
  content = cleanupMarkdownArtifacts(content, signal ? { signal, url } : { url });
1370
1358
  return {
@@ -1422,10 +1410,24 @@ export function transformHtmlToMarkdownInProcess(html, url, options) {
1422
1410
  const signal = buildTransformSignal(options.signal);
1423
1411
  const totalStage = stageTracker.start(url, 'transform:total');
1424
1412
  try {
1425
- abortPolicy.throwIfAborted(signal, url, 'transform:begin');
1413
+ throwIfAborted(signal, url, 'transform:begin');
1426
1414
  validateBinaryContent(html, url);
1427
- const result = tryRawContentPipeline(html, url, options) ??
1428
- tryHtmlContentPipeline(html, url, options, signal);
1415
+ const result = stageTracker.run(url, 'transform:raw', () => tryTransformRawContent({
1416
+ html,
1417
+ url,
1418
+ includeMetadata: options.includeMetadata,
1419
+ ...(options.inputTruncated ? { inputTruncated: true } : {}),
1420
+ })) ??
1421
+ (() => {
1422
+ const context = stageTracker.run(url, 'transform:extract', () => resolveContentSource({
1423
+ html,
1424
+ url,
1425
+ includeMetadata: options.includeMetadata,
1426
+ ...(signal ? { signal } : {}),
1427
+ ...(options.inputTruncated ? { inputTruncated: true } : {}),
1428
+ }));
1429
+ return buildMarkdownFromContext(context, url, signal);
1430
+ })();
1429
1431
  stageTracker.end(totalStage, { truncated: result.truncated });
1430
1432
  return result;
1431
1433
  }
@@ -1439,24 +1441,6 @@ function validateBinaryContent(html, url) {
1439
1441
  throw new FetchError('Content appears to be binary data (high replacement character ratio or null bytes)', url, 415, { reason: 'binary_content_detected', stage: 'transform:validate' });
1440
1442
  }
1441
1443
  }
1442
- function tryRawContentPipeline(html, url, options) {
1443
- return stageTracker.run(url, 'transform:raw', () => tryTransformRawContent({
1444
- html,
1445
- url,
1446
- includeMetadata: options.includeMetadata,
1447
- ...(options.inputTruncated ? { inputTruncated: true } : {}),
1448
- }));
1449
- }
1450
- function tryHtmlContentPipeline(html, url, options, signal) {
1451
- const context = stageTracker.run(url, 'transform:extract', () => resolveContentSource({
1452
- html,
1453
- url,
1454
- includeMetadata: options.includeMetadata,
1455
- ...(signal ? { signal } : {}),
1456
- ...(options.inputTruncated ? { inputTruncated: true } : {}),
1457
- }));
1458
- return buildMarkdownFromContext(context, url, signal);
1459
- }
1460
1444
  export function getTransformPoolStats() {
1461
1445
  return getWorkerPoolStats();
1462
1446
  }
@@ -1466,7 +1450,7 @@ export async function shutdownTransformWorkerPool() {
1466
1450
  function transformInputInProcess(htmlOrBuffer, url, options) {
1467
1451
  return transformHtmlToMarkdownInProcess(decodeInput(htmlOrBuffer, options.encoding), url, options);
1468
1452
  }
1469
- function buildWorkerTransformOptions(options) {
1453
+ function workerTransformOptions(options) {
1470
1454
  return {
1471
1455
  includeMetadata: options.includeMetadata,
1472
1456
  ...(options.signal ? { signal: options.signal } : {}),
@@ -1479,10 +1463,10 @@ async function transformWithWorkerPool(htmlOrBuffer, url, options) {
1479
1463
  return transformInputInProcess(htmlOrBuffer, url, options);
1480
1464
  }
1481
1465
  if (typeof htmlOrBuffer === 'string') {
1482
- return pool.transform(htmlOrBuffer, url, buildWorkerTransformOptions(options));
1466
+ return pool.transform(htmlOrBuffer, url, workerTransformOptions(options));
1483
1467
  }
1484
1468
  return pool.transform(htmlOrBuffer, url, {
1485
- ...buildWorkerTransformOptions(options),
1469
+ ...workerTransformOptions(options),
1486
1470
  ...(options.encoding ? { encoding: options.encoding } : {}),
1487
1471
  });
1488
1472
  }
@@ -1494,7 +1478,7 @@ function resolveWorkerFallback(error, htmlOrBuffer, url, options) {
1494
1478
  });
1495
1479
  return transformInputInProcess(htmlOrBuffer, url, options);
1496
1480
  }
1497
- abortPolicy.throwIfAborted(options.signal, url, 'transform:worker-fallback');
1481
+ throwIfAborted(options.signal, url, 'transform:worker-fallback');
1498
1482
  if (error instanceof FetchError)
1499
1483
  throw error;
1500
1484
  if (!(error instanceof Error))
@@ -1521,7 +1505,7 @@ async function runWorkerTransformWithFallback(htmlOrBuffer, url, options) {
1521
1505
  async function transformInputToMarkdown(htmlOrBuffer, url, options) {
1522
1506
  const totalStage = stageTracker.start(url, 'transform:total');
1523
1507
  try {
1524
- abortPolicy.throwIfAborted(options.signal, url, 'transform:begin');
1508
+ throwIfAborted(options.signal, url, 'transform:begin');
1525
1509
  const result = await runWorkerTransformWithFallback(htmlOrBuffer, url, options);
1526
1510
  stageTracker.end(totalStage, { truncated: result.truncated });
1527
1511
  return result;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@j0hanz/fetch-url-mcp",
3
- "version": "1.9.3",
3
+ "version": "1.9.4",
4
4
  "mcpName": "io.github.j0hanz/fetch-url-mcp",
5
5
  "description": "A web content fetcher MCP server that converts HTML to clean, AI and human readable markdown.",
6
6
  "type": "module",
@@ -1,18 +0,0 @@
1
- import { type MetadataBlock } from '../transform/types.js';
2
- export declare function serializeDocumentForMarkdown(document: Document, fallback: string): string;
3
- export declare function prepareDocumentForMarkdown(document: Document, baseUrl?: string, signal?: AbortSignal): void;
4
- export declare function removeNoiseFromHtml(html: string, document?: Document, baseUrl?: string, signal?: AbortSignal): string;
5
- export declare function extractLanguageFromClassName(className: string): string | undefined;
6
- export declare function resolveLanguageFromAttributes(className: string, dataLang: string): string | undefined;
7
- export declare function detectLanguageFromCode(code: string): string | undefined;
8
- interface CleanupOptions {
9
- signal?: AbortSignal;
10
- url?: string;
11
- }
12
- export declare function cleanupMarkdownArtifacts(content: string, options?: CleanupOptions): string;
13
- export declare function extractTitleFromRawMarkdown(content: string): string | undefined;
14
- export declare function addSourceToMarkdown(content: string, url: string): string;
15
- export declare function isRawTextContent(content: string): boolean;
16
- export declare function buildMetadataFooter(metadata?: MetadataBlock, fallbackUrl?: string): string;
17
- export {};
18
- //# sourceMappingURL=content.d.ts.map
@@ -1 +0,0 @@
1
- {"version":3,"file":"content.d.ts","sourceRoot":"","sources":["../../src/lib/content.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,KAAK,aAAa,EAAE,MAAM,uBAAuB,CAAC;AAwjB3D,wBAAgB,4BAA4B,CAC1C,QAAQ,EAAE,QAAQ,EAClB,QAAQ,EAAE,MAAM,GACf,MAAM,CAQR;AA2DD,wBAAgB,0BAA0B,CACxC,QAAQ,EAAE,QAAQ,EAClB,OAAO,CAAC,EAAE,MAAM,EAChB,MAAM,CAAC,EAAE,WAAW,GACnB,IAAI,CAiBN;AA0BD,wBAAgB,mBAAmB,CACjC,IAAI,EAAE,MAAM,EACZ,QAAQ,CAAC,EAAE,QAAQ,EACnB,OAAO,CAAC,EAAE,MAAM,EAChB,MAAM,CAAC,EAAE,WAAW,GACnB,MAAM,CAcR;AAwQD,wBAAgB,4BAA4B,CAC1C,SAAS,EAAE,MAAM,GAChB,MAAM,GAAG,SAAS,CAuBpB;AAqBD,wBAAgB,6BAA6B,CAC3C,SAAS,EAAE,MAAM,EACjB,QAAQ,EAAE,MAAM,GACf,MAAM,GAAG,SAAS,CAKpB;AACD,wBAAgB,sBAAsB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,GAAG,SAAS,CAqBvE;AAsDD,UAAU,cAAc;IACtB,MAAM,CAAC,EAAE,WAAW,CAAC;IACrB,GAAG,CAAC,EAAE,MAAM,CAAC;CACd;AAqTD,wBAAgB,wBAAwB,CACtC,OAAO,EAAE,MAAM,EACf,OAAO,CAAC,EAAE,cAAc,GACvB,MAAM,CA6CR;AAgGD,wBAAgB,2BAA2B,CACzC,OAAO,EAAE,MAAM,GACd,MAAM,GAAG,SAAS,CAOpB;AACD,wBAAgB,mBAAmB,CAAC,OAAO,EAAE,MAAM,EAAE,GAAG,EAAE,MAAM,GAAG,MAAM,CAuCxE;AAmBD,wBAAgB,gBAAgB,CAAC,OAAO,EAAE,MAAM,GAAG,OAAO,CAczD;AAaD,wBAAgB,mBAAmB,CACjC,QAAQ,CAAC,EAAE,aAAa,EACxB,WAAW,CAAC,EAAE,MAAM,GACnB,MAAM,CAmBR"}