soup-chop 1.0.4 → 1.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/dist/index.js +224 -48
  2. package/package.json +1 -1
package/dist/index.js CHANGED
@@ -650,6 +650,18 @@ var DOCS_PATH_HINTS = ["/docs", "/documentation", "/guide", "/guides", "/api", "
650
650
  var DOC_LABEL_PATTERN = /\b(docs?|documentation|guide|guides|api|reference|learn|tutorial|get started)\b/i;
651
651
  var EXCLUDED_DOC_HOST_PATTERN = /(^|\.)(github\.com|npmjs\.com|www\.npmjs\.com|unpkg\.com)$/i;
652
652
  var EXCLUDED_FILE_EXTENSION_PATTERN = /\.(?:png|jpe?g|gif|svg|webp|ico|pdf|zip|gz|tgz|woff2?|ttf|eot|mp4|webm|mov|mp3|wav)$/i;
653
+ function discoveryMethodRank(discoveryMethod) {
654
+ switch (discoveryMethod) {
655
+ case "explicit":
656
+ return 0;
657
+ case "readme_link":
658
+ return 1;
659
+ case "homepage":
660
+ return 2;
661
+ case "derived_pages":
662
+ return 3;
663
+ }
664
+ }
653
665
  var turndown = new TurndownService({
654
666
  bulletListMarker: "-",
655
667
  codeBlockStyle: "fenced",
@@ -706,24 +718,40 @@ function extractMarkdownLinks(markdown) {
706
718
  }
707
719
  return links;
708
720
  }
709
- function detectDocsUrl(readmeContent) {
710
- const best = extractMarkdownLinks(readmeContent).filter((candidate) => {
721
+ function createDocsCandidate(url, discoveryMethod, score) {
722
+ return {
723
+ url,
724
+ discoveryMethod,
725
+ score
726
+ };
727
+ }
728
+ function dedupeDocsCandidates(candidates) {
729
+ const deduped = /* @__PURE__ */ new Map();
730
+ for (const candidate of candidates) {
731
+ const existing = deduped.get(candidate.url);
732
+ if (existing === void 0 || discoveryMethodRank(candidate.discoveryMethod) < discoveryMethodRank(existing.discoveryMethod) || discoveryMethodRank(candidate.discoveryMethod) === discoveryMethodRank(existing.discoveryMethod) && candidate.score > existing.score) {
733
+ deduped.set(candidate.url, candidate);
734
+ }
735
+ }
736
+ return [...deduped.values()].sort((left, right) => {
737
+ if (right.score !== left.score) {
738
+ return right.score - left.score;
739
+ }
740
+ if (left.discoveryMethod !== right.discoveryMethod) {
741
+ return discoveryMethodRank(left.discoveryMethod) - discoveryMethodRank(right.discoveryMethod);
742
+ }
743
+ return left.url.localeCompare(right.url);
744
+ });
745
+ }
746
+ function detectDocsUrls(readmeContent) {
747
+ return dedupeDocsCandidates(extractMarkdownLinks(readmeContent).filter((candidate) => {
711
748
  try {
712
749
  const url = new URL(candidate.url);
713
750
  return url.protocol === "http:" || url.protocol === "https:";
714
751
  } catch {
715
752
  return false;
716
753
  }
717
- }).map((candidate) => ({ candidate, score: scoreDocsCandidate(candidate) })).filter((entry) => entry.score > 0).sort((left, right) => {
718
- if (right.score !== left.score) {
719
- return right.score - left.score;
720
- }
721
- return left.candidate.url.localeCompare(right.candidate.url);
722
- })[0];
723
- if (best === void 0) {
724
- return null;
725
- }
726
- return normalizeUrl(new URL(best.candidate.url));
754
+ }).map((candidate) => ({ candidate, score: scoreDocsCandidate(candidate) })).filter((entry) => entry.score > 0).map((entry) => createDocsCandidate(normalizeUrl(new URL(entry.candidate.url)), "readme_link", entry.score)));
727
755
  }
728
756
  function readRepositoryUrl(repository) {
729
757
  if (typeof repository === "string" && repository.trim().length > 0) {
@@ -780,6 +808,55 @@ function detectPagesUrl(packageJson) {
780
808
  }
781
809
  return null;
782
810
  }
811
+ function scoreHomepageCandidate(homepage) {
812
+ const host = homepage.hostname.toLowerCase();
813
+ const path = normalizePathname(homepage.pathname).toLowerCase();
814
+ let score = homepage.protocol === "https:" ? 1 : 0;
815
+ if (DOC_LABEL_PATTERN.test(path)) {
816
+ score += 6;
817
+ }
818
+ if (DOCS_PATH_HINTS.some((hint) => path === hint || path.startsWith(`${hint}/`))) {
819
+ score += 3;
820
+ }
821
+ if (EXCLUDED_DOC_HOST_PATTERN.test(host)) {
822
+ score -= 10;
823
+ }
824
+ return score;
825
+ }
826
+ function collectDocsCandidates(packageJson, readmeContent, explicitDocsUrl) {
827
+ const candidates = [];
828
+ if (explicitDocsUrl !== void 0) {
829
+ try {
830
+ candidates.push(createDocsCandidate(normalizeUrl(new URL(explicitDocsUrl)), "explicit", 100));
831
+ } catch {
832
+ }
833
+ }
834
+ candidates.push(...detectDocsUrls(readmeContent));
835
+ if (typeof packageJson.homepage === "string" && packageJson.homepage.trim().length > 0) {
836
+ try {
837
+ const homepage = new URL(packageJson.homepage.trim());
838
+ if (!/\.github\.io$/i.test(homepage.hostname) && !/\.gitlab\.io$/i.test(homepage.hostname)) {
839
+ const score = scoreHomepageCandidate(homepage);
840
+ if (score > 0) {
841
+ candidates.push(createDocsCandidate(normalizeUrl(homepage), "homepage", score));
842
+ }
843
+ }
844
+ } catch {
845
+ }
846
+ }
847
+ const pagesUrl = detectPagesUrl(packageJson);
848
+ if (pagesUrl !== null) {
849
+ candidates.push(createDocsCandidate(pagesUrl, "derived_pages", 1));
850
+ }
851
+ return dedupeDocsCandidates(candidates);
852
+ }
853
+ function isLikelySpaShell(html) {
854
+ const markdown = convertHtmlToMarkdown(html);
855
+ if (markdown.replace(/\s+/g, " ").trim().length >= 200) {
856
+ return false;
857
+ }
858
+ return /<div\b[^>]+id=["'](?:app|root|__next)["']/i.test(html);
859
+ }
783
860
  function extractPrimaryHtml(html) {
784
861
  const mainMatch = html.match(/<main\b[^>]*>([\s\S]*?)<\/main>/i);
785
862
  if (mainMatch?.[1]) {
@@ -877,9 +954,10 @@ async function fetchHtmlPage(fetchImpl, url) {
877
954
  if (!/text\/html|application\/xhtml\+xml/i.test(contentType)) {
878
955
  return null;
879
956
  }
957
+ const responseUrl = response.url.length > 0 ? response.url : url;
880
958
  return {
881
959
  html: await response.text(),
882
- pageUrl: new URL(url)
960
+ pageUrl: new URL(responseUrl)
883
961
  };
884
962
  }
885
963
  function createWebsiteSource(html, pageUrl) {
@@ -954,6 +1032,46 @@ async function discoverStructuredRouteUrls(fetchImpl, initialUrl, initialHtml) {
954
1032
  }
955
1033
  return [];
956
1034
  }
1035
+ async function probeDocsCandidate(candidate, options = {}) {
1036
+ const fetchImpl = options.fetchImpl ?? fetch;
1037
+ const page = await fetchHtmlPage(fetchImpl, candidate.url);
1038
+ if (page === null) {
1039
+ return null;
1040
+ }
1041
+ const structuredRouteUrls = await discoverStructuredRouteUrls(fetchImpl, page.pageUrl, page.html);
1042
+ if (isLikelySpaShell(page.html) && structuredRouteUrls.length === 0) {
1043
+ return null;
1044
+ }
1045
+ const links = extractHtmlLinks(page.html, page.pageUrl);
1046
+ const docsPrefix = deriveDocsPrefix(page.pageUrl, links);
1047
+ const docsScopedLinks = links.filter((link) => matchesDocsScope(link, page.pageUrl, docsPrefix));
1048
+ const markdown = convertHtmlToMarkdown(page.html).replace(/\s+/g, " ").trim();
1049
+ let score = candidate.score;
1050
+ if (structuredRouteUrls.length > 0) {
1051
+ score += 6;
1052
+ }
1053
+ if (docsScopedLinks.length > 0) {
1054
+ score += 4;
1055
+ }
1056
+ if (markdown.length >= 200) {
1057
+ score += 2;
1058
+ }
1059
+ return {
1060
+ ...candidate,
1061
+ url: normalizeUrl(page.pageUrl),
1062
+ score
1063
+ };
1064
+ }
1065
+ async function probeDocsCandidates(candidates, options = {}) {
1066
+ const probed = await Promise.all(candidates.map(async (candidate) => {
1067
+ try {
1068
+ return await probeDocsCandidate(candidate, options);
1069
+ } catch {
1070
+ return null;
1071
+ }
1072
+ }));
1073
+ return dedupeDocsCandidates(probed.filter((candidate) => candidate !== null));
1074
+ }
957
1075
  async function crawlDocsSite(entryUrl, options = {}) {
958
1076
  const fetchImpl = options.fetchImpl ?? fetch;
959
1077
  const maxPages = options.maxPages ?? DEFAULT_MAX_WEBSITE_PAGES;
@@ -1008,7 +1126,6 @@ async function crawlDocsSite(entryUrl, options = {}) {
1008
1126
  continue;
1009
1127
  }
1010
1128
  visited.add(current);
1011
- const currentUrl = new URL(current);
1012
1129
  const page = await fetchHtmlPage(fetchImpl, current);
1013
1130
  if (page === null) {
1014
1131
  if (sources.length === 0) {
@@ -1017,7 +1134,7 @@ async function crawlDocsSite(entryUrl, options = {}) {
1017
1134
  continue;
1018
1135
  }
1019
1136
  const html = page.html;
1020
- const links = extractHtmlLinks(html, currentUrl);
1137
+ const links = extractHtmlLinks(html, page.pageUrl);
1021
1138
  docsPrefix ??= deriveDocsPrefix(initialUrl, links);
1022
1139
  for (const link of links) {
1023
1140
  const normalized = normalizeUrl(link);
@@ -1031,10 +1148,10 @@ async function crawlDocsSite(entryUrl, options = {}) {
1031
1148
  queue.push(normalized);
1032
1149
  }
1033
1150
  }
1034
- if (!matchesDocsScope(currentUrl, initialUrl, docsPrefix)) {
1151
+ if (!matchesDocsScope(page.pageUrl, initialUrl, docsPrefix)) {
1035
1152
  continue;
1036
1153
  }
1037
- const source = createWebsiteSource(html, currentUrl);
1154
+ const source = createWebsiteSource(html, page.pageUrl);
1038
1155
  if (source === null) {
1039
1156
  continue;
1040
1157
  }
@@ -1555,18 +1672,16 @@ function extractRepositoryUrl(repository) {
1555
1672
 
1556
1673
  // src/lib/discoverSources.ts
1557
1674
  var UNPKG_BASE2 = "https://unpkg.com";
1558
- var SOURCE_MANIFEST_VERSION = 3;
1675
+ var SOURCE_MANIFEST_VERSION = 4;
1559
1676
  var TOP_LEVEL_DOC_ALLOWLIST = /* @__PURE__ */ new Set(["API.MD", "FAQ.MD", "MIGRATING.MD", "UPGRADING.MD", "CHANGELOG.MD", "CONTRIBUTING.MD"]);
1560
1677
  var LOCAL_TS_DIR_EXCLUDES = /* @__PURE__ */ new Set([".git", ".windsurf", "coverage", "dist", "dist-test", "node_modules", "sandbox"]);
1678
+ var MAX_WEBSITE_CANDIDATES = 2;
1561
1679
  function normalizeOrigin(origin) {
1562
1680
  return origin.replace(/^\//, "");
1563
1681
  }
1564
1682
  function canonicalTopLevelDocName(path) {
1565
1683
  return path.replace(/\.md$/i, ".md").toUpperCase();
1566
1684
  }
1567
- function isSearchSource(source) {
1568
- return source !== null;
1569
- }
1570
1685
  function cacheFileForOrigin(origin) {
1571
1686
  const normalized = normalizeOrigin(origin);
1572
1687
  if (normalized === "README.md") {
@@ -1577,39 +1692,84 @@ function cacheFileForOrigin(origin) {
1577
1692
  function createSourceId(origin) {
1578
1693
  return normalizeOrigin(origin).replace(/[^a-zA-Z0-9]+/g, "__").replace(/^__+|__+$/g, "").toLowerCase();
1579
1694
  }
1580
- function createSource(sourceKind, origin, content) {
1695
+ function createSource(sourceKind, origin, content, discoveryMethod) {
1581
1696
  const normalized = normalizeOrigin(origin);
1582
1697
  return {
1583
1698
  sourceId: createSourceId(normalized),
1584
1699
  sourceKind,
1585
1700
  origin: normalized,
1586
1701
  title: posix2.basename(normalized),
1587
- content
1702
+ content,
1703
+ discoveryMethod
1588
1704
  };
1589
1705
  }
1590
1706
  function mapSourceKind(source, sourceKind, originPrefix) {
1591
1707
  const origin = originPrefix === void 0 ? source.origin : `${originPrefix}/${source.origin}`;
1592
- return createSourceWithTitle(sourceKind, origin, source.title, source.content);
1708
+ return createSourceWithTitle(sourceKind, origin, source.title, source.content, source.discoveryMethod);
1593
1709
  }
1594
- function createSourceWithTitle(sourceKind, origin, title, content) {
1710
+ function createSourceWithTitle(sourceKind, origin, title, content, discoveryMethod) {
1595
1711
  const normalized = normalizeOrigin(origin);
1596
1712
  return {
1597
1713
  sourceId: createSourceId(normalized),
1598
1714
  sourceKind,
1599
1715
  origin: normalized,
1600
1716
  title,
1601
- content
1717
+ content,
1718
+ discoveryMethod
1602
1719
  };
1603
1720
  }
1604
1721
  function countLines(content) {
1605
1722
  return content.length === 0 ? 0 : content.split("\n").length;
1606
1723
  }
1607
1724
  async function discoverWebsiteSourcesFromTarget(target, readme, canCache, options = {}) {
1608
- const docsUrl = await detectResolvedDocsUrl(target, readme, canCache, options.docsUrl);
1609
- if (docsUrl === null) {
1725
+ const packageJson = await readPackageJsonFromTarget(target, canCache);
1726
+ const candidates = await probeDocsCandidates(collectDocsCandidates(packageJson, readme, options.docsUrl));
1727
+ if (candidates.length === 0) {
1610
1728
  return [];
1611
1729
  }
1612
- return crawlDocsSite(docsUrl);
1730
+ return discoverWebsiteSourcesFromCandidates(candidates);
1731
+ }
1732
+ async function discoverWebsiteSourcesFromCandidates(candidates) {
1733
+ const crawled = await Promise.all(candidates.slice(0, MAX_WEBSITE_CANDIDATES).map(async (candidate) => {
1734
+ try {
1735
+ const sources = await crawlDocsSite(candidate.url);
1736
+ return sources.map((source) => ({ ...source, discoveryMethod: candidate.discoveryMethod }));
1737
+ } catch {
1738
+ return [];
1739
+ }
1740
+ }));
1741
+ return dedupeWebsiteSources(crawled.flat());
1742
+ }
1743
+ function dedupeWebsiteSources(sources) {
1744
+ const deduped = /* @__PURE__ */ new Map();
1745
+ for (const source of sources) {
1746
+ const key = source.content.replace(/\s+/g, " ").trim();
1747
+ if (key.length === 0) {
1748
+ continue;
1749
+ }
1750
+ const existing = deduped.get(key);
1751
+ if (existing === void 0 || compareDiscoveryMethod(existing.discoveryMethod, source.discoveryMethod) > 0) {
1752
+ deduped.set(key, source);
1753
+ }
1754
+ }
1755
+ return [...deduped.values()].sort((left, right) => left.origin.localeCompare(right.origin));
1756
+ }
1757
+ function compareDiscoveryMethod(left, right) {
1758
+ const rank = (value) => {
1759
+ switch (value) {
1760
+ case "explicit":
1761
+ return 0;
1762
+ case "readme_link":
1763
+ return 1;
1764
+ case "homepage":
1765
+ return 2;
1766
+ case "derived_pages":
1767
+ return 3;
1768
+ default:
1769
+ return 4;
1770
+ }
1771
+ };
1772
+ return rank(left) - rank(right);
1613
1773
  }
1614
1774
  async function discoverWikiSourcesFromTarget(target, canCache) {
1615
1775
  const packageJson = await readPackageJsonFromTarget(target, canCache);
@@ -1696,17 +1856,6 @@ async function readPackageJsonFromTarget(target, canCache) {
1696
1856
  const parsed = JSON.parse(raw);
1697
1857
  return isPackageJsonLike(parsed) ? parsed : {};
1698
1858
  }
1699
- async function detectResolvedDocsUrl(target, readme, canCache, explicitDocsUrl) {
1700
- if (explicitDocsUrl !== void 0) {
1701
- return explicitDocsUrl;
1702
- }
1703
- const packageJson = await readPackageJsonFromTarget(target, canCache);
1704
- const pagesUrl = detectPagesUrl(packageJson);
1705
- if (pagesUrl !== null) {
1706
- return pagesUrl;
1707
- }
1708
- return detectDocsUrl(readme);
1709
- }
1710
1859
  async function discoverJsDocSourcesFromTarget(target, packageJson, canCache) {
1711
1860
  const fetcher = await discoverSourceCodeFetcher(target, packageJson, { canCache });
1712
1861
  if (fetcher === null) {
@@ -1740,7 +1889,8 @@ function buildSourceCatalogEntries(sources) {
1740
1889
  sourceKind: source.sourceKind,
1741
1890
  origin: source.origin,
1742
1891
  title: source.title,
1743
- lineCount: countLines(source.content)
1892
+ lineCount: countLines(source.content),
1893
+ discoveryMethod: source.discoveryMethod
1744
1894
  }));
1745
1895
  }
1746
1896
  async function collectWorkspacePackages(workspaceRoot) {
@@ -1847,11 +1997,19 @@ async function readManifestSources(pkg, version) {
1847
1997
  sourceKind: entry.sourceKind,
1848
1998
  origin: entry.origin,
1849
1999
  title: entry.title,
1850
- content
2000
+ content,
2001
+ discoveryMethod: entry.discoveryMethod
1851
2002
  };
1852
2003
  })
1853
2004
  );
1854
- return sources.every(isSearchSource) ? sources : null;
2005
+ const resolvedSources = [];
2006
+ for (const source of sources) {
2007
+ if (source === null) {
2008
+ return null;
2009
+ }
2010
+ resolvedSources.push(source);
2011
+ }
2012
+ return resolvedSources;
1855
2013
  }
1856
2014
  async function writeManifestSources(pkg, version, sources) {
1857
2015
  await Promise.all(
@@ -1864,7 +2022,8 @@ async function writeManifestSources(pkg, version, sources) {
1864
2022
  sourceKind: source.sourceKind,
1865
2023
  origin: source.origin,
1866
2024
  title: source.title,
1867
- cacheFile: cacheFileForOrigin(source.origin)
2025
+ cacheFile: cacheFileForOrigin(source.origin),
2026
+ discoveryMethod: source.discoveryMethod
1868
2027
  }))
1869
2028
  };
1870
2029
  await writeCache(pkg, version, "sources-manifest.json", JSON.stringify(manifest, null, 2));
@@ -2465,7 +2624,8 @@ function extractMarkdownChunks(source) {
2465
2624
  path: source.origin,
2466
2625
  startLine: 1,
2467
2626
  endLine: totalLines,
2468
- content
2627
+ content,
2628
+ discoveryMethod: source.discoveryMethod
2469
2629
  }];
2470
2630
  }
2471
2631
  return entries.map((entry, index) => ({
@@ -2478,7 +2638,8 @@ function extractMarkdownChunks(source) {
2478
2638
  path: entry.path,
2479
2639
  startLine: entry.startLine,
2480
2640
  endLine: entry.endLine,
2481
- content: sliceLines(source.content, entry)
2641
+ content: sliceLines(source.content, entry),
2642
+ discoveryMethod: source.discoveryMethod
2482
2643
  }));
2483
2644
  }
2484
2645
 
@@ -2782,7 +2943,7 @@ function stemPorter(word) {
2782
2943
  // src/lib/searchDocs.ts
2783
2944
  var STOPWORDS = /* @__PURE__ */ new Set(["a", "an", "and", "are", "as", "at", "be", "by", "for", "from", "how", "in", "is", "it", "of", "on", "or", "that", "the", "to", "with"]);
2784
2945
  var FIELD_WEIGHTS = { title: 5, path: 3, body: 1 };
2785
- var SEARCH_INDEX_VERSION = 4;
2946
+ var SEARCH_INDEX_VERSION = 5;
2786
2947
  function searchIndexCacheFile() {
2787
2948
  return "search-index.json";
2788
2949
  }
@@ -2927,6 +3088,20 @@ function buildExampleMetadata(chunk) {
2927
3088
  topicId
2928
3089
  };
2929
3090
  }
3091
+ function discoveryMethodScore(discoveryMethod) {
3092
+ switch (discoveryMethod) {
3093
+ case "explicit":
3094
+ return 0.3;
3095
+ case "readme_link":
3096
+ return 0.2;
3097
+ case "homepage":
3098
+ return 0.1;
3099
+ case "derived_pages":
3100
+ return 0;
3101
+ default:
3102
+ return 0;
3103
+ }
3104
+ }
2930
3105
  function rankIndexEntries(index, query) {
2931
3106
  const queryTokens = [...new Set(tokenize(query))];
2932
3107
  if (queryTokens.length === 0) {
@@ -2960,6 +3135,7 @@ function rankIndexEntries(index, query) {
2960
3135
  score += 1.5;
2961
3136
  }
2962
3137
  }
3138
+ score += discoveryMethodScore(entry.chunk.discoveryMethod);
2963
3139
  return {
2964
3140
  chunk: entry.chunk,
2965
3141
  indexPosition,
@@ -3866,7 +4042,7 @@ async function buildCompareVersionsResponse(pkg, v_old, v_new) {
3866
4042
  function createServer() {
3867
4043
  const server = new McpServer({
3868
4044
  name: "soup-chop",
3869
- version: "1.0.4"
4045
+ version: "1.0.5"
3870
4046
  });
3871
4047
  server.registerResource(
3872
4048
  "capabilities",
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "soup-chop",
3
- "version": "1.0.4",
3
+ "version": "1.0.5",
4
4
  "description": "A JIT Documentation Proxy for the Model Context Protocol",
5
5
  "repository": {
6
6
  "type": "git",