npm - docshark - Versions diffs - 0.1.6 → 0.1.8 - Mend

docshark 0.1.6 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10)

package/CHANGELOG.md +8 -0
package/README.md +6 -6
package/dist/cli.d.ts +1 -1
package/dist/cli.js +103 -44
package/dist/scraper/discoverer.d.ts +6 -1
package/dist/scraper/discoverer.js +358 -9
package/dist/server.js +35 -0
package/dist/version.d.ts +1 -1
package/dist/version.js +1 -1
package/package.json +6 -2

package/CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,13 @@
 # Changelog
+## 0.1.8 (2026-03-11)
+**Full Changelog**: https://github.com/Michael-Obele/docshark/compare/v0.1.7...v0.1.8
+## 0.1.7 (2026-03-07)
+**Full Changelog**: https://github.com/Michael-Obele/docshark/compare/v0.1.6...v0.1.7
 ## 0.1.6 (2026-03-07)
 **Full Changelog**: https://github.com/Michael-Obele/docshark/compare/v0.1.5...v0.1.6

package/README.md CHANGED Viewed

@@ -49,14 +49,14 @@ We are actively polishing the integration between the core engine and external M
 ### Quick Start (from npm)
-You can run DocShark directly without installing it globally using `npx`:
+You can run DocShark directly without installing it globally using `bunx`:
 ```bash
 # Add a documentation library to the index
-npx docshark add https://valibot.dev/guides/ --depth 2
+bunx docshark add https://valibot.dev/guides/ --depth 2
 # Search your indexed docs
-npx docshark search "schema validation"
+bunx docshark search "schema validation"
 ```
 ### Installation
@@ -87,7 +87,7 @@ Add DocShark to your `.vscode/settings.json` or global MCP configuration:
 {
   "mcpServers": {
     "docshark": {
-      "command": "npx",
+      "command": "bunx",
       "args": ["-y", "docshark", "start", "--stdio"]
     }
   }
@@ -100,7 +100,7 @@ Add DocShark to your `.vscode/settings.json` or global MCP configuration:
 2. Click **+ Add New MCP Server**.
 3. Name: `docshark`
 4. Type: `command`
-5. Command: `npx -y docshark start --stdio`
+5. Command: `bunx -y docshark start --stdio`
 ### Claude Desktop
@@ -113,7 +113,7 @@ Edit your Claude Desktop configuration file:
 {
   "mcpServers": {
     "docshark": {
-      "command": "npx",
+      "command": "bunx",
       "args": ["-y", "docshark", "start", "--stdio"]
     }
   }

package/dist/cli.d.ts CHANGED Viewed

@@ -1,2 +1,2 @@
-#!/usr/bin/env node
+#!/usr/bin/env bun
 export {};

package/dist/cli.js CHANGED Viewed

@@ -1,20 +1,21 @@
-#!/usr/bin/env node
+#!/usr/bin/env bun
 // src/cli.ts — DocShark CLI entry point
-import { Command } from 'commander';
-import { startHttpServer } from './http.js';
-import { StdioTransport } from '@tmcp/transport-stdio';
-import { server, db, searchEngine, libraryService } from './server.js';
-import { VERSION } from './version.js';
+import { Command } from "commander";
+import { startHttpServer } from "./http.js";
+import { StdioTransport } from "@tmcp/transport-stdio";
+import { server, db, searchEngine, libraryService } from "./server.js";
+import { VERSION } from "./version.js";
 const program = new Command()
-    .name('docshark')
-    .description('🦈 Documentation MCP Server — scrape, index, and search any doc website')
-    .version(VERSION, '-v, --version', 'output the current version');
+    .name("docshark")
+    .description("🦈 Documentation MCP Server — scrape, index, and search any doc website")
+    .version(VERSION, "-v, --version", "output the current version");
 program
-    .command('start', { isDefault: true })
-    .description('Start the MCP server')
-    .option('-p, --port <port>', 'HTTP server port', '6380')
-    .option('--stdio', 'Run in STDIO mode (for Claude Desktop, Cursor, etc.)')
-    .option('--data-dir <path>', 'Data directory', '')
+    .command("start", { isDefault: true })
+    .alias("s")
+    .description("Start the MCP server (aliases: s, -s)")
+    .option("-p, --port <port>", "HTTP server port", "6380")
+    .option("-S, --stdio", "Run in STDIO mode (for Claude Desktop, Cursor, etc.)")
+    .option("-D, --data-dir <path>", "Data directory", "")
     .action(async (opts) => {
     if (opts.dataDir) {
         process.env.DOCSHARK_DATA_DIR = opts.dataDir;
@@ -30,11 +31,12 @@ program
     }
 });
 program
-    .command('add <url>')
-    .description('Add a documentation library and start crawling')
-    .option('-n, --name <name>', 'Library name (auto-generated from URL if omitted)')
-    .option('-d, --depth <n>', 'Max crawl depth', '3')
-    .option('--lib-version <version>', 'Library version')
+    .command("add <url>")
+    .alias("a")
+    .description("Add a documentation library and start crawling (aliases: a, -a)")
+    .option("-n, --name <name>", "Library name (auto-generated from URL if omitted)")
+    .option("-d, --depth <n>", "Max crawl depth", "3")
+    .option("-V, --lib-version <version>", "Library version")
     .action(async (url, opts) => {
     db.init();
     try {
@@ -56,10 +58,11 @@ program
     }
 });
 program
-    .command('search <query>')
-    .description('Search indexed documentation')
-    .option('-l, --library <name>', 'Filter by library')
-    .option('--limit <n>', 'Max results', '5')
+    .command("search <query>")
+    .alias("f")
+    .description("Search indexed documentation (aliases: f, -f)")
+    .option("-l, --library <name>", "Filter by library")
+    .option("-m, --limit <n>", "Max results", "5")
     .action(async (query, opts) => {
     db.init();
     const results = searchEngine.search(query, {
@@ -78,11 +81,13 @@ program
     }
 });
 program
-    .command('list')
-    .description('List indexed libraries')
-    .action(() => {
+    .command("list")
+    .alias("l")
+    .description("List indexed libraries (aliases: l, -l)")
+    .option("-s, --status <status>", "Filter by status (indexed, crawling, error, all)", "all")
+    .action((opts) => {
     db.init();
-    const libs = db.listLibraries();
+    const libs = db.listLibraries(opts.status);
     if (libs.length === 0) {
         console.log('\nNo libraries indexed. Use "docshark add <url>" to add one.\n');
         return;
@@ -93,19 +98,20 @@ program
         Pages: l.page_count,
         Chunks: l.chunk_count,
         Status: l.status,
-        'Last Crawled': l.last_crawled_at || 'never',
+        "Last Crawled": l.last_crawled_at || "never",
     })));
 });
 program
-    .command('refresh <name>')
-    .description('Refresh an existing documentation library')
+    .command("refresh <name>")
+    .alias("r")
+    .description("Refresh an existing documentation library (aliases: r, -r)")
     .action(async (name) => {
     db.init();
     try {
         const lib = db.getLibraryByName(name);
         if (!lib)
             throw new Error(`Library "${name}" not found.`);
-        const { jobManager } = await import('./server.js');
+        const { jobManager } = await import("./server.js");
         const job = jobManager.startCrawl(lib.id, { incremental: true });
         console.log(`\n🔄 Refreshing "${lib.display_name}" — crawling ${lib.url}...`);
         console.log(`   Job ID: ${job.id}`);
@@ -117,8 +123,9 @@ program
     }
 });
 program
-    .command('remove <name>')
-    .description('Remove a documentation library and its index')
+    .command("remove <name>")
+    .alias("rm")
+    .description("Remove a documentation library and its index (aliases: rm, -rm)")
     .action((name) => {
     db.init();
     try {
@@ -134,35 +141,87 @@ program
     }
 });
 program
-    .command('get <url>')
-    .description('Get the full markdown content of a specific indexed page')
-    .action((url) => {
+    .command("get [url]")
+    .alias("g")
+    .description("Get the full markdown content of a specific indexed page (aliases: g, -g)")
+    .option("-l, --library <name>", "Library name to search within")
+    .option("-p, --path <path>", "Relative path within the library")
+    .action((url, opts) => {
+    if (!url && (!opts.library || !opts.path)) {
+        console.error(`\n❌ Please provide either a URL, or both --library and --path\n`);
+        process.exit(1);
+    }
     db.init();
-    const page = db.getPage({ url });
+    const page = db.getPage({ url, library: opts.library, path: opts.path });
     if (!page) {
-        console.error(`\n❌ Page not found in index: ${url}\n`);
+        console.error(`\n❌ Page not found in index.\n`);
         process.exit(1);
     }
     console.log(`\n--- ${page.title} ---`);
     console.log(`Source: ${page.url}\n\n`);
     console.log(page.content_markdown);
-    console.log('\n');
+    console.log("\n");
+});
+// Intercept manual short flags (e.g., -l instead of l) so they act as command aliases
+const args = process.argv;
+const cmdAliases = {
+    "-s": "start",
+    "-a": "add",
+    "-f": "search",
+    "-l": "list",
+    "-r": "refresh",
+    "-rm": "remove",
+    "-g": "get",
+    "-i": "info",
+};
+if (args[2] && cmdAliases[args[2]]) {
+    args[2] = cmdAliases[args[2]];
+}
+program
+    .command("info <name>")
+    .alias("i")
+    .description("Get information about a library and list its pages (aliases: i, -i)")
+    .action((name) => {
+    db.init();
+    const lib = db.getLibraryByName(name);
+    if (!lib) {
+        console.error(`\n❌ Library not found: ${name}\n`);
+        process.exit(1);
+    }
+    console.log(`\n--- Library: ${lib.display_name} (${lib.name}) ---`);
+    console.log(`URL: ${lib.url}`);
+    console.log(`Status: ${lib.status}`);
+    console.log(`Pages: ${lib.page_count}`);
+    console.log(`Chunks: ${lib.chunk_count}`);
+    console.log(`Last Crawled: ${lib.last_crawled_at || "never"}`);
+    const pages = db.getPagesByLibrary(lib.id);
+    if (pages.length > 0) {
+        console.log(`\n--- Pages (${pages.length}) ---`);
+        console.table(pages.map((p) => ({
+            Title: p.title || "Untitled",
+            Path: p.path,
+            URL: p.url,
+        })));
+    }
+    else {
+        console.log(`\nNo pages found for this library.\n`);
+    }
 });
-program.parse();
+program.parse(args);
 /** Helper to wait for a crawl job to finish (CLI blocking mode) */
 async function waitForCrawl(jobId) {
-    const { jobManager } = await import('./server.js');
+    const { jobManager } = await import("./server.js");
     return new Promise((resolve) => {
         const check = () => {
             const job = jobManager.getJob(jobId);
-            if (!job || job.status === 'completed' || job.status === 'failed') {
-                if (job?.status === 'completed') {
+            if (!job || job.status === "completed" || job.status === "failed") {
+                if (job?.status === "completed") {
                     console.log(`\n🦈 Crawl complete: ${job.pages_crawled} pages, ${job.chunks_created} chunks indexed.`);
                     if (job.pages_failed > 0) {
                         console.log(`   ⚠️  ${job.pages_failed} pages failed.`);
                     }
                 }
-                else if (job?.status === 'failed') {
+                else if (job?.status === "failed") {
                     console.error(`\n❌ Crawl failed: ${job.error_message}`);
                 }
                 resolve();

package/dist/scraper/discoverer.d.ts CHANGED Viewed

@@ -1,6 +1,11 @@
 import type { CrawlConfig } from '../types.js';
 /**
  * Discover all documentation page URLs from a base URL.
- * Strategy: sitemap.xml → link crawl fallback
+ *
+ * Strategy cascade (stops at first strategy that yields >=1 URLs):
+ *   A. sitemap.xml
+ *   B. llms.txt (AI-friendly link manifest)
+ *   C. Navigation-aware HTML link extraction (nav/sidebar elements)
+ *   D. BFS link crawl (follows all same-origin links)
  */
 export declare function discoverPages(baseUrl: string, config?: CrawlConfig): Promise<string[]>;

package/dist/scraper/discoverer.js CHANGED Viewed

@@ -1,26 +1,76 @@
-// src/scraper/discoverer.ts — Page URL discovery via sitemap + link crawl
+// src/scraper/discoverer.ts — Page URL discovery via sitemap + llms.txt + nav-aware crawl + BFS fallback
 import * as cheerio from 'cheerio';
 import { getRobotsParser, isAllowed } from './robots.js';
 import { RateLimiter } from './rate-limiter.js';
 const USER_AGENT = 'DocShark/1.0';
+/**
+ * Well-known entry points that doc sites commonly use.
+ * When the root page yields no links (JS-rendered SPA landing pages),
+ * we probe these paths to find a server-rendered doc page with navigation.
+ */
+const COMMON_DOC_ENTRY_PATHS = [
+    '/docs',
+    '/docs/',
+    '/documentation',
+    '/guide',
+    '/guides',
+    '/reference',
+    '/api',
+    '/getting-started',
+    '/docs/getting-started',
+    '/docs/introduction',
+    '/docs/installation',
+    '/docs/overview',
+];
 /**
  * Discover all documentation page URLs from a base URL.
- * Strategy: sitemap.xml → link crawl fallback
+ *
+ * Strategy cascade (stops at first strategy that yields >=1 URLs):
+ *   A. sitemap.xml
+ *   B. llms.txt (AI-friendly link manifest)
+ *   C. Navigation-aware HTML link extraction (nav/sidebar elements)
+ *   D. BFS link crawl (follows all same-origin links)
  */
 export async function discoverPages(baseUrl, config = {}) {
     const maxDepth = config.maxDepth ?? 3;
     const robots = await getRobotsParser(baseUrl);
+    // ────────────────────────────────────────────
     // Strategy A: Try sitemap first
+    // ────────────────────────────────────────────
     const sitemapUrls = await discoverFromSitemap(baseUrl, robots);
     if (sitemapUrls.length > 0) {
-        console.log(`[DocShark] Found ${sitemapUrls.length} URLs from sitemap`);
+        console.log(`[DocShark] ✅ Found ${sitemapUrls.length} URLs from sitemap`);
         return filterUrls(sitemapUrls, baseUrl, config, robots);
     }
-    // Strategy B: BFS link crawl
-    console.log(`[DocShark] No sitemap found, crawling links (depth=${maxDepth})`);
+    // ────────────────────────────────────────────
+    // Strategy B: Try llms.txt / llms-full.txt
+    // ────────────────────────────────────────────
+    const llmsUrls = await discoverFromLlmsTxt(baseUrl);
+    if (llmsUrls.length > 0) {
+        console.log(`[DocShark] ✅ Found ${llmsUrls.length} URLs from llms.txt`);
+        return filterUrls(llmsUrls, baseUrl, config, robots);
+    }
+    // ────────────────────────────────────────────
+    // Strategy C: Navigation-aware link extraction
+    // ────────────────────────────────────────────
+    console.log(`[DocShark] No sitemap or llms.txt. Trying navigation-aware discovery...`);
+    const navUrls = await discoverFromNavigation(baseUrl, config, robots);
+    if (navUrls.length > 0) {
+        console.log(`[DocShark] ✅ Found ${navUrls.length} URLs from page navigation`);
+        // Enrich: BFS crawl from discovered nav URLs to find nested pages
+        const enrichedUrls = await enrichWithBfsCrawl(baseUrl, navUrls, maxDepth, config, robots);
+        return enrichedUrls;
+    }
+    // ────────────────────────────────────────────
+    // Strategy D: Full BFS link crawl (legacy fallback)
+    // ────────────────────────────────────────────
+    console.log(`[DocShark] No navigation links found, full BFS crawl (depth=${maxDepth})`);
     const crawledUrls = await discoverByLinkCrawl(baseUrl, maxDepth, config, robots);
     return crawledUrls;
 }
+// ═══════════════════════════════════════════════
+// Strategy A: Sitemap
+// ═══════════════════════════════════════════════
 /** Parse sitemap.xml for page URLs */
 async function discoverFromSitemap(baseUrl, robots) {
     // Check for sitemap in robots.txt
@@ -82,13 +132,308 @@ async function fetchSitemapUrls(sitemapUrl) {
         return [];
     }
 }
+// ═══════════════════════════════════════════════
+// Strategy B: llms.txt
+// ═══════════════════════════════════════════════
+/**
+ * Parse llms.txt / llms-full.txt for documentation URLs.
+ * The llms.txt standard uses markdown-style `[title](url)` links.
+ * @see https://llmstxt.org
+ */
+async function discoverFromLlmsTxt(baseUrl) {
+    const candidates = [
+        new URL('/llms-full.txt', baseUrl).href,
+        new URL('/llms.txt', baseUrl).href,
+    ];
+    for (const llmsUrl of candidates) {
+        try {
+            const response = await fetch(llmsUrl, {
+                headers: { 'User-Agent': USER_AGENT },
+                signal: AbortSignal.timeout(15_000),
+            });
+            if (!response.ok)
+                continue;
+            const text = await response.text();
+            // Extract markdown-style links: [text](url)
+            const linkRegex = /\[([^\]]*)\]\(([^)]+)\)/g;
+            const urls = [];
+            let match;
+            while ((match = linkRegex.exec(text)) !== null) {
+                const href = match[2].trim();
+                try {
+                    const resolved = new URL(href, baseUrl);
+                    // Only same-origin, strip .md extension if present
+                    if (resolved.origin === new URL(baseUrl).origin) {
+                        let pathname = resolved.pathname;
+                        // Strip .md extension — llms.txt often uses .md paths
+                        // but the actual page URL doesn't have .md
+                        if (pathname.endsWith('.md')) {
+                            pathname = pathname.slice(0, -3);
+                        }
+                        resolved.pathname = pathname;
+                        resolved.hash = '';
+                        resolved.search = '';
+                        urls.push(resolved.href);
+                    }
+                }
+                catch {
+                    // Invalid URL, skip
+                }
+            }
+            if (urls.length > 0) {
+                // Deduplicate
+                return [...new Set(urls)];
+            }
+        }
+        catch {
+            // Fetch failed, try next candidate
+        }
+    }
+    return [];
+}
+// ═══════════════════════════════════════════════
+// Strategy C: Navigation-aware link extraction
+// ═══════════════════════════════════════════════
+/**
+ * CSS selectors for navigation/sidebar elements in common doc site frameworks.
+ * These target areas where documentation sites list their page links.
+ */
+const NAV_SELECTORS = [
+    'nav a[href]',
+    '[role="navigation"] a[href]',
+    'aside a[href]',
+    '.sidebar a[href]',
+    '[class*="sidebar"] a[href]',
+    '[class*="nav"] a[href]',
+    '[class*="menu"] a[href]',
+    '[class*="toc"] a[href]',
+    '[data-sidebar] a[href]',
+    '[id*="sidebar"] a[href]',
+    '[id*="nav"] a[href]',
+];
+/**
+ * Extract links specifically from navigation elements (sidebar, nav, etc.)
+ * of a doc page. If the root page yields nothing (SPA), we try common
+ * doc entry points that are likely server-rendered.
+ */
+async function discoverFromNavigation(baseUrl, config, robots) {
+    const baseOrigin = new URL(baseUrl).origin;
+    // Step 1: Try extracting from the base URL first
+    let navLinks = await extractNavLinks(baseUrl, baseOrigin);
+    // Step 2: If root page yields very few links (likely JS-rendered landing),
+    // probe common doc entry paths
+    if (navLinks.length < 3) {
+        console.log(`[DocShark] Root page has only ${navLinks.length} nav links. Probing doc entry points...`);
+        for (const entryPath of COMMON_DOC_ENTRY_PATHS) {
+            const entryUrl = new URL(entryPath, baseUrl).href;
+            // Skip if robots disallow
+            if (!isAllowed(robots, entryUrl))
+                continue;
+            const entryLinks = await extractNavLinks(entryUrl, baseOrigin);
+            if (entryLinks.length > navLinks.length) {
+                console.log(`[DocShark] Found ${entryLinks.length} nav links at ${entryPath}`);
+                navLinks = entryLinks;
+            }
+            // If we found a rich source, stop probing
+            if (navLinks.length >= 10)
+                break;
+        }
+    }
+    // Step 3: If static fetch still yields nothing, try puppeteer on root
+    if (navLinks.length < 3) {
+        console.log(`[DocShark] Static fetch yielded few links. Trying headless browser...`);
+        const puppeteerLinks = await extractNavLinksWithPuppeteer(baseUrl, baseOrigin);
+        if (puppeteerLinks.length > navLinks.length) {
+            navLinks = puppeteerLinks;
+        }
+    }
+    return filterUrls(navLinks, baseUrl, config, robots);
+}
+/**
+ * Fetch a page and extract links from navigation elements.
+ * Uses targeted CSS selectors to find sidebar/nav links.
+ */
+async function extractNavLinks(url, baseOrigin) {
+    try {
+        const response = await fetch(url, {
+            headers: { 'User-Agent': USER_AGENT },
+            signal: AbortSignal.timeout(15_000),
+            redirect: 'follow',
+        });
+        if (!response.ok)
+            return [];
+        const contentType = response.headers.get('content-type') || '';
+        if (!contentType.includes('text/html'))
+            return [];
+        const html = await response.text();
+        return extractLinksFromHtml(html, url, baseOrigin, true);
+    }
+    catch {
+        return [];
+    }
+}
+/**
+ * Extract links from HTML content.
+ *
+ * @param navOnly - If true, only extract links from nav-like elements.
+ *                  If false, extract all `a[href]` links.
+ */
+function extractLinksFromHtml(html, pageUrl, baseOrigin, navOnly) {
+    const $ = cheerio.load(html);
+    const urls = new Set();
+    const selector = navOnly ? NAV_SELECTORS.join(', ') : 'a[href]';
+    $(selector).each((_, el) => {
+        try {
+            const href = $(el).attr('href');
+            if (!href)
+                return;
+            const resolved = new URL(href, pageUrl);
+            resolved.hash = '';
+            resolved.search = '';
+            if (resolved.origin === baseOrigin &&
+                !isNonDocUrl(resolved.href)) {
+                urls.add(resolved.href);
+            }
+        }
+        catch {
+            // Invalid URL, skip
+        }
+    });
+    return [...urls];
+}
+/**
+ * Use puppeteer-core to render a JS SPA and extract navigation links.
+ * Falls back silently if puppeteer is not installed.
+ */
+async function extractNavLinksWithPuppeteer(url, baseOrigin) {
+    try {
+        // @ts-ignore — puppeteer-core is an optional dependency
+        const puppeteer = await import('puppeteer-core');
+        const { existsSync } = await import('fs');
+        const executablePath = findChrome(existsSync);
+        if (!executablePath) {
+            console.warn(`[DocShark] Chrome not found for headless navigation discovery. ` +
+                `Install Chrome or set CHROME_PATH env var.`);
+            return [];
+        }
+        const browser = await puppeteer.default.launch({
+            headless: true,
+            executablePath,
+            args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', '--disable-gpu'],
+        });
+        try {
+            const page = await browser.newPage();
+            // Block heavy resources for speed
+            await page.setRequestInterception(true);
+            page.on('request', (req) => {
+                const type = req.resourceType();
+                if (['image', 'stylesheet', 'font', 'media'].includes(type)) {
+                    req.abort();
+                }
+                else {
+                    req.continue();
+                }
+            });
+            await page.goto(url, { waitUntil: 'networkidle2', timeout: 30_000 });
+            const html = await page.content();
+            await page.close();
+            return extractLinksFromHtml(html, url, baseOrigin, true);
+        }
+        finally {
+            await browser.close();
+        }
+    }
+    catch (err) {
+        console.warn(`[DocShark] Puppeteer navigation discovery failed: ${err.message}`);
+        return [];
+    }
+}
+function findChrome(existsSync) {
+    const candidates = [
+        process.env.CHROME_PATH,
+        process.env.PUPPETEER_EXECUTABLE_PATH,
+        '/usr/bin/google-chrome',
+        '/usr/bin/google-chrome-stable',
+        '/usr/bin/chromium-browser',
+        '/usr/bin/chromium',
+        '/snap/bin/chromium',
+        '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
+    ];
+    for (const path of candidates) {
+        if (path && existsSync(path))
+            return path;
+    }
+    return undefined;
+}
+// ═══════════════════════════════════════════════
+// Strategy D: BFS Link Crawl
+// ═══════════════════════════════════════════════
+/**
+ * Enrich an initial set of discovered URLs by BFS-crawling each page
+ * for additional same-origin links. Useful after nav extraction to
+ * find nested pages that aren't in the top-level navigation.
+ */
+async function enrichWithBfsCrawl(baseUrl, seedUrls, maxDepth, config, robots) {
+    const visited = new Set(seedUrls);
+    const queue = seedUrls.map((url) => ({
+        url,
+        depth: 1, // Seed URLs are depth 1, their children are depth 2+
+    }));
+    const rateLimiter = new RateLimiter(config.rateLimit ?? 500);
+    const baseOrigin = new URL(baseUrl).origin;
+    while (queue.length > 0) {
+        const item = queue.shift();
+        // Only follow links from nav-discovered pages to find sub-pages
+        // e.g. /docs/data-table might link to /docs/data-table/sorting
+        if (item.depth > maxDepth)
+            continue;
+        if (!isAllowed(robots, item.url))
+            continue;
+        // We already have this URL in our set; only crawl to find *new* links
+        try {
+            await rateLimiter.wait();
+            const response = await fetch(item.url, {
+                headers: { 'User-Agent': USER_AGENT },
+                signal: AbortSignal.timeout(15_000),
+            });
+            if (!response.ok)
+                continue;
+            const contentType = response.headers.get('content-type') || '';
+            if (!contentType.includes('text/html'))
+                continue;
+            const html = await response.text();
+            // Extract ALL links from the page (not just nav) for BFS enrichment
+            const pageLinks = extractLinksFromHtml(html, item.url, baseOrigin, false);
+            for (const link of pageLinks) {
+                if (!visited.has(link) && !isNonDocUrl(link)) {
+                    visited.add(link);
+                    queue.push({ url: link, depth: item.depth + 1 });
+                }
+            }
+        }
+        catch {
+            // Fetch failed, skip
+        }
+    }
+    return filterUrls([...visited], baseUrl, config, robots);
+}
 /** BFS link crawl from the base URL */
 async function discoverByLinkCrawl(baseUrl, maxDepth, config, robots) {
     const visited = new Set();
-    const queue = [{ url: baseUrl, depth: 0 }];
-    const rateLimiter = new RateLimiter(config.rateLimit ?? 500);
     const baseOrigin = new URL(baseUrl).origin;
     const basePath = new URL(baseUrl).pathname;
+    // Seed queue: start with base URL + common doc entry points
+    const queue = [{ url: baseUrl, depth: 0 }];
+    // Also seed common doc entry points if base is root
+    if (basePath === '/' || basePath === '') {
+        for (const entryPath of COMMON_DOC_ENTRY_PATHS) {
+            const entryUrl = new URL(entryPath, baseUrl).href;
+            if (isAllowed(robots, entryUrl)) {
+                queue.push({ url: entryUrl, depth: 0 });
+            }
+        }
+    }
+    const rateLimiter = new RateLimiter(config.rateLimit ?? 500);
     while (queue.length > 0) {
         const item = queue.shift();
         if (visited.has(item.url) || item.depth > maxDepth)
@@ -138,6 +483,9 @@ async function discoverByLinkCrawl(baseUrl, maxDepth, config, robots) {
     }
     return filterUrls([...visited], baseUrl, config, robots);
 }
+// ═══════════════════════════════════════════════
+// Shared Utilities
+// ═══════════════════════════════════════════════
 /** Filter URLs based on config patterns */
 function filterUrls(urls, baseUrl, config, robots) {
     const baseOrigin = new URL(baseUrl).origin;
@@ -145,10 +493,11 @@ function filterUrls(urls, baseUrl, config, robots) {
     return urls.filter((url) => {
         try {
             const parsed = new URL(url);
-            // Must be same origin and under base path
+            // Must be same origin
             if (parsed.origin !== baseOrigin)
                 return false;
-            if (!parsed.pathname.startsWith(basePath))
+            // Must be under base path (unless base is root)
+            if (basePath !== '/' && !parsed.pathname.startsWith(basePath))
                 return false;
             // Check robots.txt
             if (!isAllowed(robots, url))

package/dist/server.js CHANGED Viewed

@@ -152,3 +152,38 @@ server.tool({
     db.removeLibrary(lib.id);
     return tool.text(`🗑️ Library "${lib.display_name}" removed.\nDeleted ${lib.page_count} pages and ${lib.chunk_count} chunks.`);
 });
+// ──────────────────────────────────────
+// Tool 7: library_info — detailed stats and pages
+// ──────────────────────────────────────
+server.tool({
+    name: 'library_info',
+    description: 'Get detailed information about a specific documentation library, including a list of all its indexed pages and their paths. ' +
+        'Use this to see what pages are available in a library before retrieving them.',
+    schema: v.object({
+        library: v.pipe(v.string(), v.description('The library name to get information for.')),
+    }),
+}, async ({ library }) => {
+    const lib = db.getLibraryByName(library);
+    if (!lib)
+        return tool.text(`Library "${library}" not found. Use list_libraries to see available libraries.`);
+    const pages = db.getPagesByLibrary(lib.id);
+    let output = `## Library: ${lib.display_name} (${lib.name})\n`;
+    output += `- **URL:** ${lib.url}\n`;
+    output += `- **Status:** ${lib.status}\n`;
+    output += `- **Pages:** ${lib.page_count}\n`;
+    output += `- **Chunks:** ${lib.chunk_count}\n`;
+    output += `- **Last Crawled:** ${lib.last_crawled_at || 'never'}\n\n`;
+    if (pages.length > 0) {
+        output += `### Pages (${pages.length})\n\n`;
+        output += '| Title | Path | URL |\n';
+        output += '| ----- | ---- | --- |\n';
+        for (const p of pages) {
+            const title = p.title?.replace(/\|/g, '-') || 'Untitled';
+            output += `| ${title} | \`${p.path}\` | ${p.url} |\n`;
+        }
+    }
+    else {
+        output += `*No pages indexed yet for this library.*\n`;
+    }
+    return tool.text(output);
+});

package/dist/version.d.ts CHANGED Viewed

@@ -1 +1 @@
-export declare const VERSION = "0.1.6";
+export declare const VERSION = "0.1.8";

package/dist/version.js CHANGED Viewed

@@ -1,2 +1,2 @@
 // This file is automatically updated by release-please
-export const VERSION = '0.1.6'; // x-release-please-version
+export const VERSION = '0.1.8'; // x-release-please-version

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "docshark",
-  "version": "0.1.6",
+  "version": "0.1.8",
   "description": "🦈 Documentation MCP Server — scrape, index, and search any doc website",
   "type": "module",
   "main": "./dist/index.js",
@@ -26,10 +26,14 @@
     "dev": "bun run --watch src/cli.ts start",
     "cli": "bun run src/cli.ts",
     "check": "tsc --noEmit",
-    "build": "rm -rf dist && tsc",
+    "build": "rm -rf dist && tsc && chmod +x dist/cli.js",
     "prepublishOnly": "bun run build",
     "test:crawl": "bun run src/cli.ts add https://svelte.dev/docs/svelte/overview"
   },
+  "engines": {
+    "node": ">=20",
+    "bun": ">=1.1.0"
+  },
   "keywords": [
     "tmcp",
     "mcp",