docshark 0.1.6 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,13 @@
1
1
  # Changelog
2
2
 
3
+ ## 0.1.8 (2026-03-11)
4
+
5
+ **Full Changelog**: https://github.com/Michael-Obele/docshark/compare/v0.1.7...v0.1.8
6
+
7
+ ## 0.1.7 (2026-03-07)
8
+
9
+ **Full Changelog**: https://github.com/Michael-Obele/docshark/compare/v0.1.6...v0.1.7
10
+
3
11
  ## 0.1.6 (2026-03-07)
4
12
 
5
13
  **Full Changelog**: https://github.com/Michael-Obele/docshark/compare/v0.1.5...v0.1.6
package/README.md CHANGED
@@ -49,14 +49,14 @@ We are actively polishing the integration between the core engine and external M
49
49
 
50
50
  ### Quick Start (from npm)
51
51
 
52
- You can run DocShark directly without installing it globally using `npx`:
52
+ You can run DocShark directly without installing it globally using `bunx`:
53
53
 
54
54
  ```bash
55
55
  # Add a documentation library to the index
56
- npx docshark add https://valibot.dev/guides/ --depth 2
56
+ bunx docshark add https://valibot.dev/guides/ --depth 2
57
57
 
58
58
  # Search your indexed docs
59
- npx docshark search "schema validation"
59
+ bunx docshark search "schema validation"
60
60
  ```
61
61
 
62
62
  ### Installation
@@ -87,7 +87,7 @@ Add DocShark to your `.vscode/settings.json` or global MCP configuration:
87
87
  {
88
88
  "mcpServers": {
89
89
  "docshark": {
90
- "command": "npx",
90
+ "command": "bunx",
91
91
  "args": ["-y", "docshark", "start", "--stdio"]
92
92
  }
93
93
  }
@@ -100,7 +100,7 @@ Add DocShark to your `.vscode/settings.json` or global MCP configuration:
100
100
  2. Click **+ Add New MCP Server**.
101
101
  3. Name: `docshark`
102
102
  4. Type: `command`
103
- 5. Command: `npx -y docshark start --stdio`
103
+ 5. Command: `bunx -y docshark start --stdio`
104
104
 
105
105
  ### Claude Desktop
106
106
 
@@ -113,7 +113,7 @@ Edit your Claude Desktop configuration file:
113
113
  {
114
114
  "mcpServers": {
115
115
  "docshark": {
116
- "command": "npx",
116
+ "command": "bunx",
117
117
  "args": ["-y", "docshark", "start", "--stdio"]
118
118
  }
119
119
  }
package/dist/cli.d.ts CHANGED
@@ -1,2 +1,2 @@
1
- #!/usr/bin/env node
1
+ #!/usr/bin/env bun
2
2
  export {};
package/dist/cli.js CHANGED
@@ -1,20 +1,21 @@
1
- #!/usr/bin/env node
1
+ #!/usr/bin/env bun
2
2
  // src/cli.ts — DocShark CLI entry point
3
- import { Command } from 'commander';
4
- import { startHttpServer } from './http.js';
5
- import { StdioTransport } from '@tmcp/transport-stdio';
6
- import { server, db, searchEngine, libraryService } from './server.js';
7
- import { VERSION } from './version.js';
3
+ import { Command } from "commander";
4
+ import { startHttpServer } from "./http.js";
5
+ import { StdioTransport } from "@tmcp/transport-stdio";
6
+ import { server, db, searchEngine, libraryService } from "./server.js";
7
+ import { VERSION } from "./version.js";
8
8
  const program = new Command()
9
- .name('docshark')
10
- .description('🦈 Documentation MCP Server — scrape, index, and search any doc website')
11
- .version(VERSION, '-v, --version', 'output the current version');
9
+ .name("docshark")
10
+ .description("🦈 Documentation MCP Server — scrape, index, and search any doc website")
11
+ .version(VERSION, "-v, --version", "output the current version");
12
12
  program
13
- .command('start', { isDefault: true })
14
- .description('Start the MCP server')
15
- .option('-p, --port <port>', 'HTTP server port', '6380')
16
- .option('--stdio', 'Run in STDIO mode (for Claude Desktop, Cursor, etc.)')
17
- .option('--data-dir <path>', 'Data directory', '')
13
+ .command("start", { isDefault: true })
14
+ .alias("s")
15
+ .description("Start the MCP server (aliases: s, -s)")
16
+ .option("-p, --port <port>", "HTTP server port", "6380")
17
+ .option("-S, --stdio", "Run in STDIO mode (for Claude Desktop, Cursor, etc.)")
18
+ .option("-D, --data-dir <path>", "Data directory", "")
18
19
  .action(async (opts) => {
19
20
  if (opts.dataDir) {
20
21
  process.env.DOCSHARK_DATA_DIR = opts.dataDir;
@@ -30,11 +31,12 @@ program
30
31
  }
31
32
  });
32
33
  program
33
- .command('add <url>')
34
- .description('Add a documentation library and start crawling')
35
- .option('-n, --name <name>', 'Library name (auto-generated from URL if omitted)')
36
- .option('-d, --depth <n>', 'Max crawl depth', '3')
37
- .option('--lib-version <version>', 'Library version')
34
+ .command("add <url>")
35
+ .alias("a")
36
+ .description("Add a documentation library and start crawling (aliases: a, -a)")
37
+ .option("-n, --name <name>", "Library name (auto-generated from URL if omitted)")
38
+ .option("-d, --depth <n>", "Max crawl depth", "3")
39
+ .option("-V, --lib-version <version>", "Library version")
38
40
  .action(async (url, opts) => {
39
41
  db.init();
40
42
  try {
@@ -56,10 +58,11 @@ program
56
58
  }
57
59
  });
58
60
  program
59
- .command('search <query>')
60
- .description('Search indexed documentation')
61
- .option('-l, --library <name>', 'Filter by library')
62
- .option('--limit <n>', 'Max results', '5')
61
+ .command("search <query>")
62
+ .alias("f")
63
+ .description("Search indexed documentation (aliases: f, -f)")
64
+ .option("-l, --library <name>", "Filter by library")
65
+ .option("-m, --limit <n>", "Max results", "5")
63
66
  .action(async (query, opts) => {
64
67
  db.init();
65
68
  const results = searchEngine.search(query, {
@@ -78,11 +81,13 @@ program
78
81
  }
79
82
  });
80
83
  program
81
- .command('list')
82
- .description('List indexed libraries')
83
- .action(() => {
84
+ .command("list")
85
+ .alias("l")
86
+ .description("List indexed libraries (aliases: l, -l)")
87
+ .option("-s, --status <status>", "Filter by status (indexed, crawling, error, all)", "all")
88
+ .action((opts) => {
84
89
  db.init();
85
- const libs = db.listLibraries();
90
+ const libs = db.listLibraries(opts.status);
86
91
  if (libs.length === 0) {
87
92
  console.log('\nNo libraries indexed. Use "docshark add <url>" to add one.\n');
88
93
  return;
@@ -93,19 +98,20 @@ program
93
98
  Pages: l.page_count,
94
99
  Chunks: l.chunk_count,
95
100
  Status: l.status,
96
- 'Last Crawled': l.last_crawled_at || 'never',
101
+ "Last Crawled": l.last_crawled_at || "never",
97
102
  })));
98
103
  });
99
104
  program
100
- .command('refresh <name>')
101
- .description('Refresh an existing documentation library')
105
+ .command("refresh <name>")
106
+ .alias("r")
107
+ .description("Refresh an existing documentation library (aliases: r, -r)")
102
108
  .action(async (name) => {
103
109
  db.init();
104
110
  try {
105
111
  const lib = db.getLibraryByName(name);
106
112
  if (!lib)
107
113
  throw new Error(`Library "${name}" not found.`);
108
- const { jobManager } = await import('./server.js');
114
+ const { jobManager } = await import("./server.js");
109
115
  const job = jobManager.startCrawl(lib.id, { incremental: true });
110
116
  console.log(`\n🔄 Refreshing "${lib.display_name}" — crawling ${lib.url}...`);
111
117
  console.log(` Job ID: ${job.id}`);
@@ -117,8 +123,9 @@ program
117
123
  }
118
124
  });
119
125
  program
120
- .command('remove <name>')
121
- .description('Remove a documentation library and its index')
126
+ .command("remove <name>")
127
+ .alias("rm")
128
+ .description("Remove a documentation library and its index (aliases: rm, -rm)")
122
129
  .action((name) => {
123
130
  db.init();
124
131
  try {
@@ -134,35 +141,87 @@ program
134
141
  }
135
142
  });
136
143
  program
137
- .command('get <url>')
138
- .description('Get the full markdown content of a specific indexed page')
139
- .action((url) => {
144
+ .command("get [url]")
145
+ .alias("g")
146
+ .description("Get the full markdown content of a specific indexed page (aliases: g, -g)")
147
+ .option("-l, --library <name>", "Library name to search within")
148
+ .option("-p, --path <path>", "Relative path within the library")
149
+ .action((url, opts) => {
150
+ if (!url && (!opts.library || !opts.path)) {
151
+ console.error(`\n❌ Please provide either a URL, or both --library and --path\n`);
152
+ process.exit(1);
153
+ }
140
154
  db.init();
141
- const page = db.getPage({ url });
155
+ const page = db.getPage({ url, library: opts.library, path: opts.path });
142
156
  if (!page) {
143
- console.error(`\n❌ Page not found in index: ${url}\n`);
157
+ console.error(`\n❌ Page not found in index.\n`);
144
158
  process.exit(1);
145
159
  }
146
160
  console.log(`\n--- ${page.title} ---`);
147
161
  console.log(`Source: ${page.url}\n\n`);
148
162
  console.log(page.content_markdown);
149
- console.log('\n');
163
+ console.log("\n");
164
+ });
165
+ // Intercept a leading short flag in the first CLI argument (e.g., -l instead of l) so it acts as a command alias
166
+ const args = process.argv;
167
+ const cmdAliases = {
168
+ "-s": "start",
169
+ "-a": "add",
170
+ "-f": "search",
171
+ "-l": "list",
172
+ "-r": "refresh",
173
+ "-rm": "remove",
174
+ "-g": "get",
175
+ "-i": "info",
176
+ };
177
+ if (args[2] && cmdAliases[args[2]]) {
178
+ args[2] = cmdAliases[args[2]];
179
+ }
180
+ program
181
+ .command("info <name>")
182
+ .alias("i")
183
+ .description("Get information about a library and list its pages (aliases: i, -i)")
184
+ .action((name) => {
185
+ db.init();
186
+ const lib = db.getLibraryByName(name);
187
+ if (!lib) {
188
+ console.error(`\n❌ Library not found: ${name}\n`);
189
+ process.exit(1);
190
+ }
191
+ console.log(`\n--- Library: ${lib.display_name} (${lib.name}) ---`);
192
+ console.log(`URL: ${lib.url}`);
193
+ console.log(`Status: ${lib.status}`);
194
+ console.log(`Pages: ${lib.page_count}`);
195
+ console.log(`Chunks: ${lib.chunk_count}`);
196
+ console.log(`Last Crawled: ${lib.last_crawled_at || "never"}`);
197
+ const pages = db.getPagesByLibrary(lib.id);
198
+ if (pages.length > 0) {
199
+ console.log(`\n--- Pages (${pages.length}) ---`);
200
+ console.table(pages.map((p) => ({
201
+ Title: p.title || "Untitled",
202
+ Path: p.path,
203
+ URL: p.url,
204
+ })));
205
+ }
206
+ else {
207
+ console.log(`\nNo pages found for this library.\n`);
208
+ }
150
209
  });
151
- program.parse();
210
+ program.parse(args);
152
211
  /** Helper to wait for a crawl job to finish (CLI blocking mode) */
153
212
  async function waitForCrawl(jobId) {
154
- const { jobManager } = await import('./server.js');
213
+ const { jobManager } = await import("./server.js");
155
214
  return new Promise((resolve) => {
156
215
  const check = () => {
157
216
  const job = jobManager.getJob(jobId);
158
- if (!job || job.status === 'completed' || job.status === 'failed') {
159
- if (job?.status === 'completed') {
217
+ if (!job || job.status === "completed" || job.status === "failed") {
218
+ if (job?.status === "completed") {
160
219
  console.log(`\n🦈 Crawl complete: ${job.pages_crawled} pages, ${job.chunks_created} chunks indexed.`);
161
220
  if (job.pages_failed > 0) {
162
221
  console.log(` ⚠️ ${job.pages_failed} pages failed.`);
163
222
  }
164
223
  }
165
- else if (job?.status === 'failed') {
224
+ else if (job?.status === "failed") {
166
225
  console.error(`\n❌ Crawl failed: ${job.error_message}`);
167
226
  }
168
227
  resolve();
@@ -1,6 +1,11 @@
1
1
  import type { CrawlConfig } from '../types.js';
2
2
  /**
3
3
  * Discover all documentation page URLs from a base URL.
4
- * Strategy: sitemap.xml → link crawl fallback
4
+ *
5
+ * Strategy cascade (stops at first strategy that yields >=1 URLs):
6
+ * A. sitemap.xml
7
+ * B. llms.txt (AI-friendly link manifest)
8
+ * C. Navigation-aware HTML link extraction (nav/sidebar elements)
9
+ * D. BFS link crawl (follows all same-origin links)
5
10
  */
6
11
  export declare function discoverPages(baseUrl: string, config?: CrawlConfig): Promise<string[]>;
@@ -1,26 +1,76 @@
1
- // src/scraper/discoverer.ts — Page URL discovery via sitemap + link crawl
1
+ // src/scraper/discoverer.ts — Page URL discovery via sitemap + llms.txt + nav-aware crawl + BFS fallback
2
2
  import * as cheerio from 'cheerio';
3
3
  import { getRobotsParser, isAllowed } from './robots.js';
4
4
  import { RateLimiter } from './rate-limiter.js';
5
5
  const USER_AGENT = 'DocShark/1.0';
6
+ /**
7
+ * Well-known entry points that doc sites commonly use.
8
+ * When the root page yields few or no nav links (e.g. JS-rendered SPA landing pages),
9
+ * we probe these paths to find a server-rendered doc page with navigation.
10
+ */
11
+ const COMMON_DOC_ENTRY_PATHS = [
12
+ '/docs',
13
+ '/docs/',
14
+ '/documentation',
15
+ '/guide',
16
+ '/guides',
17
+ '/reference',
18
+ '/api',
19
+ '/getting-started',
20
+ '/docs/getting-started',
21
+ '/docs/introduction',
22
+ '/docs/installation',
23
+ '/docs/overview',
24
+ ];
6
25
  /**
7
26
  * Discover all documentation page URLs from a base URL.
8
- * Strategy: sitemap.xml → link crawl fallback
27
+ *
28
+ * Strategy cascade (stops at first strategy that yields >=1 URLs):
29
+ * A. sitemap.xml
30
+ * B. llms.txt (AI-friendly link manifest)
31
+ * C. Navigation-aware HTML link extraction (nav/sidebar elements)
32
+ * D. BFS link crawl (follows all same-origin links)
9
33
  */
10
34
  export async function discoverPages(baseUrl, config = {}) {
11
35
  const maxDepth = config.maxDepth ?? 3;
12
36
  const robots = await getRobotsParser(baseUrl);
37
+ // ────────────────────────────────────────────
13
38
  // Strategy A: Try sitemap first
39
+ // ────────────────────────────────────────────
14
40
  const sitemapUrls = await discoverFromSitemap(baseUrl, robots);
15
41
  if (sitemapUrls.length > 0) {
16
- console.log(`[DocShark] Found ${sitemapUrls.length} URLs from sitemap`);
42
+ console.log(`[DocShark] Found ${sitemapUrls.length} URLs from sitemap`);
17
43
  return filterUrls(sitemapUrls, baseUrl, config, robots);
18
44
  }
19
- // Strategy B: BFS link crawl
20
- console.log(`[DocShark] No sitemap found, crawling links (depth=${maxDepth})`);
45
+ // ────────────────────────────────────────────
46
+ // Strategy B: Try llms.txt / llms-full.txt
47
+ // ────────────────────────────────────────────
48
+ const llmsUrls = await discoverFromLlmsTxt(baseUrl);
49
+ if (llmsUrls.length > 0) {
50
+ console.log(`[DocShark] ✅ Found ${llmsUrls.length} URLs from llms.txt`);
51
+ return filterUrls(llmsUrls, baseUrl, config, robots);
52
+ }
53
+ // ────────────────────────────────────────────
54
+ // Strategy C: Navigation-aware link extraction
55
+ // ────────────────────────────────────────────
56
+ console.log(`[DocShark] No sitemap or llms.txt. Trying navigation-aware discovery...`);
57
+ const navUrls = await discoverFromNavigation(baseUrl, config, robots);
58
+ if (navUrls.length > 0) {
59
+ console.log(`[DocShark] ✅ Found ${navUrls.length} URLs from page navigation`);
60
+ // Enrich: BFS crawl from discovered nav URLs to find nested pages
61
+ const enrichedUrls = await enrichWithBfsCrawl(baseUrl, navUrls, maxDepth, config, robots);
62
+ return enrichedUrls;
63
+ }
64
+ // ────────────────────────────────────────────
65
+ // Strategy D: Full BFS link crawl (legacy fallback)
66
+ // ────────────────────────────────────────────
67
+ console.log(`[DocShark] No navigation links found, full BFS crawl (depth=${maxDepth})`);
21
68
  const crawledUrls = await discoverByLinkCrawl(baseUrl, maxDepth, config, robots);
22
69
  return crawledUrls;
23
70
  }
71
+ // ═══════════════════════════════════════════════
72
+ // Strategy A: Sitemap
73
+ // ═══════════════════════════════════════════════
24
74
  /** Parse sitemap.xml for page URLs */
25
75
  async function discoverFromSitemap(baseUrl, robots) {
26
76
  // Check for sitemap in robots.txt
@@ -82,13 +132,308 @@ async function fetchSitemapUrls(sitemapUrl) {
82
132
  return [];
83
133
  }
84
134
  }
135
+ // ═══════════════════════════════════════════════
136
+ // Strategy B: llms.txt
137
+ // ═══════════════════════════════════════════════
138
+ /**
139
+ * Parse llms.txt / llms-full.txt for documentation URLs.
140
+ * The llms.txt standard uses markdown-style `[title](url)` links.
141
+ * @see https://llmstxt.org
142
+ */
143
+ async function discoverFromLlmsTxt(baseUrl) {
144
+ const candidates = [
145
+ new URL('/llms-full.txt', baseUrl).href,
146
+ new URL('/llms.txt', baseUrl).href,
147
+ ];
148
+ for (const llmsUrl of candidates) {
149
+ try {
150
+ const response = await fetch(llmsUrl, {
151
+ headers: { 'User-Agent': USER_AGENT },
152
+ signal: AbortSignal.timeout(15_000),
153
+ });
154
+ if (!response.ok)
155
+ continue;
156
+ const text = await response.text();
157
+ // Extract markdown-style links: [text](url)
158
+ const linkRegex = /\[([^\]]*)\]\(([^)]+)\)/g;
159
+ const urls = [];
160
+ let match;
161
+ while ((match = linkRegex.exec(text)) !== null) {
162
+ const href = match[2].trim();
163
+ try {
164
+ const resolved = new URL(href, baseUrl);
165
+ // Only same-origin, strip .md extension if present
166
+ if (resolved.origin === new URL(baseUrl).origin) {
167
+ let pathname = resolved.pathname;
168
+ // Strip .md extension — llms.txt often uses .md paths
169
+ // but the actual page URL doesn't have .md
170
+ if (pathname.endsWith('.md')) {
171
+ pathname = pathname.slice(0, -3);
172
+ }
173
+ resolved.pathname = pathname;
174
+ resolved.hash = '';
175
+ resolved.search = '';
176
+ urls.push(resolved.href);
177
+ }
178
+ }
179
+ catch {
180
+ // Invalid URL, skip
181
+ }
182
+ }
183
+ if (urls.length > 0) {
184
+ // Deduplicate
185
+ return [...new Set(urls)];
186
+ }
187
+ }
188
+ catch {
189
+ // Fetch failed, try next candidate
190
+ }
191
+ }
192
+ return [];
193
+ }
194
+ // ═══════════════════════════════════════════════
195
+ // Strategy C: Navigation-aware link extraction
196
+ // ═══════════════════════════════════════════════
197
+ /**
198
+ * CSS selectors for navigation/sidebar elements in common doc site frameworks.
199
+ * These target areas where documentation sites list their page links.
200
+ */
201
+ const NAV_SELECTORS = [
202
+ 'nav a[href]',
203
+ '[role="navigation"] a[href]',
204
+ 'aside a[href]',
205
+ '.sidebar a[href]',
206
+ '[class*="sidebar"] a[href]',
207
+ '[class*="nav"] a[href]',
208
+ '[class*="menu"] a[href]',
209
+ '[class*="toc"] a[href]',
210
+ '[data-sidebar] a[href]',
211
+ '[id*="sidebar"] a[href]',
212
+ '[id*="nav"] a[href]',
213
+ ];
214
+ /**
215
+ * Extract links specifically from navigation elements (sidebar, nav, etc.)
216
+ * of a doc page. If the root page yields nothing (SPA), we try common
217
+ * doc entry points that are likely server-rendered.
218
+ */
219
+ async function discoverFromNavigation(baseUrl, config, robots) {
220
+ const baseOrigin = new URL(baseUrl).origin;
221
+ // Step 1: Try extracting from the base URL first
222
+ let navLinks = await extractNavLinks(baseUrl, baseOrigin);
223
+ // Step 2: If root page yields fewer than 3 nav links (likely JS-rendered landing),
224
+ // probe common doc entry paths
225
+ if (navLinks.length < 3) {
226
+ console.log(`[DocShark] Root page has only ${navLinks.length} nav links. Probing doc entry points...`);
227
+ for (const entryPath of COMMON_DOC_ENTRY_PATHS) {
228
+ const entryUrl = new URL(entryPath, baseUrl).href;
229
+ // Skip if robots disallow
230
+ if (!isAllowed(robots, entryUrl))
231
+ continue;
232
+ const entryLinks = await extractNavLinks(entryUrl, baseOrigin);
233
+ if (entryLinks.length > navLinks.length) {
234
+ console.log(`[DocShark] Found ${entryLinks.length} nav links at ${entryPath}`);
235
+ navLinks = entryLinks;
236
+ }
237
+ // If we found a rich source, stop probing
238
+ if (navLinks.length >= 10)
239
+ break;
240
+ }
241
+ }
242
+ // Step 3: If static fetch still yields fewer than 3 links, try puppeteer on root
243
+ if (navLinks.length < 3) {
244
+ console.log(`[DocShark] Static fetch yielded few links. Trying headless browser...`);
245
+ const puppeteerLinks = await extractNavLinksWithPuppeteer(baseUrl, baseOrigin);
246
+ if (puppeteerLinks.length > navLinks.length) {
247
+ navLinks = puppeteerLinks;
248
+ }
249
+ }
250
+ return filterUrls(navLinks, baseUrl, config, robots);
251
+ }
252
+ /**
253
+ * Fetch a page and extract links from navigation elements.
254
+ * Uses targeted CSS selectors to find sidebar/nav links.
255
+ */
256
+ async function extractNavLinks(url, baseOrigin) {
257
+ try {
258
+ const response = await fetch(url, {
259
+ headers: { 'User-Agent': USER_AGENT },
260
+ signal: AbortSignal.timeout(15_000),
261
+ redirect: 'follow',
262
+ });
263
+ if (!response.ok)
264
+ return [];
265
+ const contentType = response.headers.get('content-type') || '';
266
+ if (!contentType.includes('text/html'))
267
+ return [];
268
+ const html = await response.text();
269
+ return extractLinksFromHtml(html, url, baseOrigin, true);
270
+ }
271
+ catch {
272
+ return [];
273
+ }
274
+ }
275
+ /**
276
+ * Extract links from HTML content.
277
+ *
278
+ * @param navOnly - If true, only extract links from nav-like elements.
279
+ * If false, extract all `a[href]` links.
280
+ */
281
+ function extractLinksFromHtml(html, pageUrl, baseOrigin, navOnly) {
282
+ const $ = cheerio.load(html);
283
+ const urls = new Set();
284
+ const selector = navOnly ? NAV_SELECTORS.join(', ') : 'a[href]';
285
+ $(selector).each((_, el) => {
286
+ try {
287
+ const href = $(el).attr('href');
288
+ if (!href)
289
+ return;
290
+ const resolved = new URL(href, pageUrl);
291
+ resolved.hash = '';
292
+ resolved.search = '';
293
+ if (resolved.origin === baseOrigin &&
294
+ !isNonDocUrl(resolved.href)) {
295
+ urls.add(resolved.href);
296
+ }
297
+ }
298
+ catch {
299
+ // Invalid URL, skip
300
+ }
301
+ });
302
+ return [...urls];
303
+ }
304
+ /**
305
+ * Use puppeteer-core to render a JS SPA and extract navigation links.
306
+ * Logs a warning and returns an empty list if puppeteer-core or Chrome is unavailable, or if rendering fails.
307
+ */
308
+ async function extractNavLinksWithPuppeteer(url, baseOrigin) {
309
+ try {
310
+ // @ts-ignore — puppeteer-core is an optional dependency
311
+ const puppeteer = await import('puppeteer-core');
312
+ const { existsSync } = await import('fs');
313
+ const executablePath = findChrome(existsSync);
314
+ if (!executablePath) {
315
+ console.warn(`[DocShark] Chrome not found for headless navigation discovery. ` +
316
+ `Install Chrome or set CHROME_PATH env var.`);
317
+ return [];
318
+ }
319
+ const browser = await puppeteer.default.launch({
320
+ headless: true,
321
+ executablePath,
322
+ args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', '--disable-gpu'],
323
+ });
324
+ try {
325
+ const page = await browser.newPage();
326
+ // Block heavy resources for speed
327
+ await page.setRequestInterception(true);
328
+ page.on('request', (req) => {
329
+ const type = req.resourceType();
330
+ if (['image', 'stylesheet', 'font', 'media'].includes(type)) {
331
+ req.abort();
332
+ }
333
+ else {
334
+ req.continue();
335
+ }
336
+ });
337
+ await page.goto(url, { waitUntil: 'networkidle2', timeout: 30_000 });
338
+ const html = await page.content();
339
+ await page.close();
340
+ return extractLinksFromHtml(html, url, baseOrigin, true);
341
+ }
342
+ finally {
343
+ await browser.close();
344
+ }
345
+ }
346
+ catch (err) {
347
+ console.warn(`[DocShark] Puppeteer navigation discovery failed: ${err.message}`);
348
+ return [];
349
+ }
350
+ }
351
+ function findChrome(existsSync) {
352
+ const candidates = [
353
+ process.env.CHROME_PATH,
354
+ process.env.PUPPETEER_EXECUTABLE_PATH,
355
+ '/usr/bin/google-chrome',
356
+ '/usr/bin/google-chrome-stable',
357
+ '/usr/bin/chromium-browser',
358
+ '/usr/bin/chromium',
359
+ '/snap/bin/chromium',
360
+ '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
361
+ ];
362
+ for (const path of candidates) {
363
+ if (path && existsSync(path))
364
+ return path;
365
+ }
366
+ return undefined;
367
+ }
368
+ // ═══════════════════════════════════════════════
369
+ // BFS Link Crawl: Strategy C enrichment + Strategy D fallback
370
+ // ═══════════════════════════════════════════════
371
+ /**
372
+ * Enrich an initial set of discovered URLs by BFS-crawling each page
373
+ * for additional same-origin links. Useful after nav extraction to
374
+ * find nested pages that aren't in the top-level navigation.
375
+ */
376
+ async function enrichWithBfsCrawl(baseUrl, seedUrls, maxDepth, config, robots) {
377
+ const visited = new Set(seedUrls);
378
+ const queue = seedUrls.map((url) => ({
379
+ url,
380
+ depth: 1, // Seed URLs are depth 1, their children are depth 2+
381
+ }));
382
+ const rateLimiter = new RateLimiter(config.rateLimit ?? 500);
383
+ const baseOrigin = new URL(baseUrl).origin;
384
+ while (queue.length > 0) {
385
+ const item = queue.shift();
386
+ // Only follow links from nav-discovered pages to find sub-pages
387
+ // e.g. /docs/data-table might link to /docs/data-table/sorting
388
+ if (item.depth > maxDepth)
389
+ continue;
390
+ if (!isAllowed(robots, item.url))
391
+ continue;
392
+ // We already have this URL in our set; only crawl to find *new* links
393
+ try {
394
+ await rateLimiter.wait();
395
+ const response = await fetch(item.url, {
396
+ headers: { 'User-Agent': USER_AGENT },
397
+ signal: AbortSignal.timeout(15_000),
398
+ });
399
+ if (!response.ok)
400
+ continue;
401
+ const contentType = response.headers.get('content-type') || '';
402
+ if (!contentType.includes('text/html'))
403
+ continue;
404
+ const html = await response.text();
405
+ // Extract ALL links from the page (not just nav) for BFS enrichment
406
+ const pageLinks = extractLinksFromHtml(html, item.url, baseOrigin, false);
407
+ for (const link of pageLinks) {
408
+ if (!visited.has(link) && !isNonDocUrl(link)) {
409
+ visited.add(link);
410
+ queue.push({ url: link, depth: item.depth + 1 });
411
+ }
412
+ }
413
+ }
414
+ catch {
415
+ // Fetch failed, skip
416
+ }
417
+ }
418
+ return filterUrls([...visited], baseUrl, config, robots);
419
+ }
85
420
  /** BFS link crawl from the base URL */
86
421
  async function discoverByLinkCrawl(baseUrl, maxDepth, config, robots) {
87
422
  const visited = new Set();
88
- const queue = [{ url: baseUrl, depth: 0 }];
89
- const rateLimiter = new RateLimiter(config.rateLimit ?? 500);
90
423
  const baseOrigin = new URL(baseUrl).origin;
91
424
  const basePath = new URL(baseUrl).pathname;
425
+ // Seed queue: always start with the base URL
426
+ const queue = [{ url: baseUrl, depth: 0 }];
427
+ // Also seed common doc entry points if base is root
428
+ if (basePath === '/' || basePath === '') {
429
+ for (const entryPath of COMMON_DOC_ENTRY_PATHS) {
430
+ const entryUrl = new URL(entryPath, baseUrl).href;
431
+ if (isAllowed(robots, entryUrl)) {
432
+ queue.push({ url: entryUrl, depth: 0 });
433
+ }
434
+ }
435
+ }
436
+ const rateLimiter = new RateLimiter(config.rateLimit ?? 500);
92
437
  while (queue.length > 0) {
93
438
  const item = queue.shift();
94
439
  if (visited.has(item.url) || item.depth > maxDepth)
@@ -138,6 +483,9 @@ async function discoverByLinkCrawl(baseUrl, maxDepth, config, robots) {
138
483
  }
139
484
  return filterUrls([...visited], baseUrl, config, robots);
140
485
  }
486
+ // ═══════════════════════════════════════════════
487
+ // Shared Utilities
488
+ // ═══════════════════════════════════════════════
141
489
  /** Filter URLs based on config patterns */
142
490
  function filterUrls(urls, baseUrl, config, robots) {
143
491
  const baseOrigin = new URL(baseUrl).origin;
@@ -145,10 +493,11 @@ function filterUrls(urls, baseUrl, config, robots) {
145
493
  return urls.filter((url) => {
146
494
  try {
147
495
  const parsed = new URL(url);
148
- // Must be same origin and under base path
496
+ // Must be same origin
149
497
  if (parsed.origin !== baseOrigin)
150
498
  return false;
151
- if (!parsed.pathname.startsWith(basePath))
499
+ // Must be under base path (unless base is root)
500
+ if (basePath !== '/' && !parsed.pathname.startsWith(basePath))
152
501
  return false;
153
502
  // Check robots.txt
154
503
  if (!isAllowed(robots, url))
package/dist/server.js CHANGED
@@ -152,3 +152,38 @@ server.tool({
152
152
  db.removeLibrary(lib.id);
153
153
  return tool.text(`🗑️ Library "${lib.display_name}" removed.\nDeleted ${lib.page_count} pages and ${lib.chunk_count} chunks.`);
154
154
  });
155
+ // ──────────────────────────────────────
156
+ // Tool 7: library_info — detailed stats and pages
157
+ // ──────────────────────────────────────
158
+ server.tool({
159
+ name: 'library_info',
160
+ description: 'Get detailed information about a specific documentation library, including a list of all its indexed pages and their paths. ' +
161
+ 'Use this to see what pages are available in a library before retrieving them.',
162
+ schema: v.object({
163
+ library: v.pipe(v.string(), v.description('The library name to get information for.')),
164
+ }),
165
+ }, async ({ library }) => {
166
+ const lib = db.getLibraryByName(library);
167
+ if (!lib)
168
+ return tool.text(`Library "${library}" not found. Use list_libraries to see available libraries.`);
169
+ const pages = db.getPagesByLibrary(lib.id);
170
+ let output = `## Library: ${lib.display_name} (${lib.name})\n`;
171
+ output += `- **URL:** ${lib.url}\n`;
172
+ output += `- **Status:** ${lib.status}\n`;
173
+ output += `- **Pages:** ${lib.page_count}\n`;
174
+ output += `- **Chunks:** ${lib.chunk_count}\n`;
175
+ output += `- **Last Crawled:** ${lib.last_crawled_at || 'never'}\n\n`;
176
+ if (pages.length > 0) {
177
+ output += `### Pages (${pages.length})\n\n`;
178
+ output += '| Title | Path | URL |\n';
179
+ output += '| ----- | ---- | --- |\n';
180
+ for (const p of pages) {
181
+ const title = p.title?.replace(/\|/g, '-') || 'Untitled';
182
+ output += `| ${title} | \`${p.path}\` | ${p.url} |\n`;
183
+ }
184
+ }
185
+ else {
186
+ output += `*No pages indexed yet for this library.*\n`;
187
+ }
188
+ return tool.text(output);
189
+ });
package/dist/version.d.ts CHANGED
@@ -1 +1 @@
1
- export declare const VERSION = "0.1.6";
1
+ export declare const VERSION = "0.1.8";
package/dist/version.js CHANGED
@@ -1,2 +1,2 @@
1
1
  // This file is automatically updated by release-please
2
- export const VERSION = '0.1.6'; // x-release-please-version
2
+ export const VERSION = '0.1.8'; // x-release-please-version
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "docshark",
3
- "version": "0.1.6",
3
+ "version": "0.1.8",
4
4
  "description": "🦈 Documentation MCP Server — scrape, index, and search any doc website",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",
@@ -26,10 +26,14 @@
26
26
  "dev": "bun run --watch src/cli.ts start",
27
27
  "cli": "bun run src/cli.ts",
28
28
  "check": "tsc --noEmit",
29
- "build": "rm -rf dist && tsc",
29
+ "build": "rm -rf dist && tsc && chmod +x dist/cli.js",
30
30
  "prepublishOnly": "bun run build",
31
31
  "test:crawl": "bun run src/cli.ts add https://svelte.dev/docs/svelte/overview"
32
32
  },
33
+ "engines": {
34
+ "node": ">=20",
35
+ "bun": ">=1.1.0"
36
+ },
33
37
  "keywords": [
34
38
  "tmcp",
35
39
  "mcp",