docshark 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md ADDED
@@ -0,0 +1,21 @@
+ # Changelog
+
+ ## 0.1.5 (2026-03-02)
+
+ **Full Changelog**: https://github.com/Michael-Obele/docshark/compare/v0.1.4...v0.1.5
+
+ ## 0.1.4 (2026-03-02)
+
+ **Full Changelog**: https://github.com/Michael-Obele/docshark/compare/v0.1.3...v0.1.4
+
+ ## 0.1.3 (2026-03-02)
+
+ **Full Changelog**: https://github.com/Michael-Obele/docshark/compare/v0.1.2...v0.1.3
+
+ ## 0.1.2 (2026-03-02)
+
+ **Full Changelog**: https://github.com/Michael-Obele/docshark/compare/v0.1.1...v0.1.2
+
+ ## 0.1.1 (2026-03-02)
+
+ **Full Changelog**: https://github.com/Michael-Obele/docshark/compare/v0.1.0...v0.1.1
package/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2026 Michael-Obele
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,113 @@
+ # 🦈 DocShark
+
+ [![Built with Bun](https://img.shields.io/badge/Bun-%23000000.svg?style=flat&logo=bun&logoColor=white)](https://bun.sh/)
+ [![MCP Compatible](https://img.shields.io/badge/MCP-Ready-0D1117.svg?style=flat&logo=github&logoColor=white)](https://modelcontextprotocol.io/)
+ [![GitHub Release](https://img.shields.io/github/v/release/Michael-Obele/docshark?color=success)](https://github.com/Michael-Obele/docshark/releases)
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
+
+ **DocShark** is an MCP (Model Context Protocol) server that scrapes, indexes, and searches any documentation website. It builds a local, highly searchable knowledge base from public documentation pages using SQLite FTS5 (full-text search) with BM25 ranking, so AI assistants can query up-to-date docs effortlessly.
+
+ ---
+
+ ## 🚀 Features
+
+ - **Automated Crawling**: Discovers pages via `sitemap.xml`, with a fallback to BFS link crawling.
+ - **Smart Extraction**: Uses Readability and Turndown to extract the main content and convert it to clean Markdown, filtering out navbars and sidebars.
+ - **Semantic Chunking**: Splits content at headings, preserving contextual headers for better AI understanding.
+ - **High-Performance Search**: Built-in SQLite + FTS5 indexing with BM25 ranking for accurate, fast search results.
+ - **JS-Rendered Site Support**: A tiered fetching strategy detects React/Vue SPAs (empty HTML shells) and upgrades to `puppeteer-core` if you have it installed (zero-config, automatic fallback).
+ - **Polite Crawling**: Respects `robots.txt` and rate-limits requests to avoid overloading documentation servers.
+ - **Standard MCP Tooling**: Works with Claude Desktop, VS Code, Cursor, and any other MCP-compatible client via standard `stdio` or `http`/`sse` transports.
+
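The heading-based chunking feature above can be sketched in a few lines. This is a simplified illustration, not DocShark's actual implementation (the real chunker also tracks token counts and code blocks):

```typescript
// Split markdown at H1–H3 headings, carrying the nearest heading as context.
// Simplified sketch — DocShark's real chunker does more.
interface Chunk {
  content: string;
  headingContext: string;
}

function chunkByHeadings(markdown: string): Chunk[] {
  const chunks: Chunk[] = [];
  let heading = "";
  let buf: string[] = [];
  const flush = () => {
    const content = buf.join("\n").trim();
    if (content) chunks.push({ content, headingContext: heading });
    buf = [];
  };
  for (const line of markdown.split("\n")) {
    const m = /^(#{1,3})\s+(.*)$/.exec(line);
    if (m) {
      flush();         // close the previous section
      heading = m[2];  // the new heading becomes the context
    } else {
      buf.push(line);
    }
  }
  flush();
  return chunks;
}
```

Keeping the heading with each chunk means a search hit can be shown to the AI as "Section: Setup" rather than as an anonymous paragraph.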
+ ## 📦 What We Have Done (Phase 1)
+
+ **Phase 1: Core Engine** is fully implemented and tested.
+ - ✅ Custom SQLite database with FTS5 virtual tables and auto-sync triggers.
+ - ✅ Web scraping engine supporting standard `fetch()` and `puppeteer-core`.
+ - ✅ Markdown processor built on Readability + Turndown.
+ - ✅ Heading-based semantic chunker (500–1200 tokens per chunk).
+ - ✅ Asynchronous job manager and queue system.
+ - ✅ Complete HTTP API (REST endpoints + SSE event streams).
+ - ✅ Six MCP tools: `add_library`, `search_docs`, `list_libraries`, `get_doc_page`, `refresh_library`, and `remove_library`.
+ - ✅ Robust CLI interface (`start`, `add`, `search`, `list`).
+
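The BM25 ranking behind the FTS5 search above can be illustrated with a toy in-memory scorer. This is a sketch of the ranking idea only: SQLite's built-in `bm25()` differs in details such as sign convention and tokenization, and the constants `k1` and `b` here are the textbook defaults, not necessarily DocShark's settings:

```typescript
// Toy BM25: score each document against a query. Higher = more relevant.
// Uses the classic formula: idf(t) * tf*(k1+1) / (tf + k1*(1 - b + b*dl/avgdl))
function bm25Scores(docs: string[], query: string, k1 = 1.2, b = 0.75): number[] {
  const toks = (s: string) => s.toLowerCase().split(/\W+/).filter(Boolean);
  const docToks = docs.map(toks);
  const avgdl = docToks.reduce((a, d) => a + d.length, 0) / docs.length;
  return docToks.map((d) =>
    toks(query).reduce((score, term) => {
      // document frequency: how many docs contain the term
      const df = docToks.filter((x) => x.includes(term)).length;
      const idf = Math.log((docs.length - df + 0.5) / (df + 0.5) + 1);
      // term frequency within this document
      const tf = d.filter((w) => w === term).length;
      return score + (idf * tf * (k1 + 1)) / (tf + k1 * (1 - b + (b * d.length) / avgdl));
    }, 0)
  );
}
```

FTS5 precomputes these statistics in its index, which is why the same ranking stays fast at scale.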
+ ## 🏗️ What We Are Doing
+
+ We are actively polishing the integration between the core engine and external MCP clients (such as VS Code agents and Claude Desktop).
+
+ ## 🔮 What We Plan To Do (Phase 2 & Beyond)
+
+ - **Web Dashboard**: An intuitive SvelteKit dashboard to manage your synced libraries, view crawl progress in real time (via SSE), and test searches manually.
+ - **Incremental Crawling**: Smarter `refresh` jobs that compare `ETag` and `Last-Modified` headers to re-scrape only updated pages.
+ - **Vector Search (RAG)**: Lightweight vector embeddings for semantic similarity search alongside the existing FTS5 keyword search.
+ - **Advanced Scraping Setup**: Custom CSS selectors to define exactly where content lives on non-standard documentation websites.
+
+ ---
+
+ ## 🛠️ Usage
+
+ ### Installing & Running Locally
+
+ Ensure you have [Bun](https://bun.sh/) installed.
+
+ ```bash
+ # Install dependencies
+ bun install
+
+ # (Optional) Enable auto-detection & scraping of JavaScript-rendered React/Vue single-page apps
+ bun add puppeteer-core
+
+ # Start the DocShark MCP server in HTTP mode
+ bun run src/cli.ts start --port 6380
+ ```
+
+ ### Important CLI Commands
+
+ ```bash
+ # Add a documentation library to the index
+ bun run src/cli.ts add https://valibot.dev/guides/ --depth 2
+
+ # Search your indexed docs
+ bun run src/cli.ts search "schema validation"
+
+ # List all crawled libraries
+ bun run src/cli.ts list
+ ```
+
+ ### Using in VS Code (Copilot Agent Mode)
+
+ To use DocShark as an MCP server in VS Code:
+ 1. Enable MCP discovery in your VS Code settings.
+ 2. Create `.vscode/mcp.json` in your workspace:
+ ```json
+ {
+   "servers": {
+     "docshark": {
+       "type": "stdio",
+       "command": "bun",
+       "args": [
+         "run",
+         "/absolute/path/to/docshark/src/cli.ts",
+         "start",
+         "--stdio"
+       ]
+     }
+   }
+ }
+ ```
+ 3. Reload VS Code (or restart the MCP server), and your Copilot agent will have access to the DocShark tools.
+
+ ---
+
+ ## 🔄 Versioning & Changelog
+
+ This project uses [Google's Release Please](https://github.com/googleapis/release-please) to automate versioning and changelog generation.
+ - **Semantic Versioning**: Versions bump automatically (e.g. `0.0.1` -> `0.0.2` or `0.1.0`) based on standard Conventional Commits (`feat:`, `fix:`, `chore:`, etc.).
+ - **Automated**: A release PR is created automatically on `master` as conventional commits are merged, generating a standard `CHANGELOG.md`.
+
+ ## 📜 License
+
+ This project is open-source and available under the [MIT License](LICENSE).
+
+ ---
+ *Built to empower AI agents with the latest knowledge.*
@@ -0,0 +1,16 @@
+ import type { Database } from '../storage/db.js';
+ import type { SearchEngine } from '../storage/search.js';
+ import type { JobManager } from '../jobs/manager.js';
+ import type { LibraryService } from '../services/library.js';
+ import type { EventBus } from '../jobs/events.js';
+ interface ApiDeps {
+     db: Database;
+     searchEngine: SearchEngine;
+     jobManager: JobManager;
+     libraryService: LibraryService;
+     eventBus: EventBus;
+ }
+ export declare function createApiRouter(deps: ApiDeps): {
+     handle(request: Request): Promise<Response>;
+ };
+ export {};
package/dist/cli.d.ts ADDED
@@ -0,0 +1,2 @@
+ #!/usr/bin/env node
+ export {};
package/dist/cli.js ADDED
@@ -0,0 +1,179 @@
+ #!/usr/bin/env node
+ import { createRequire } from "node:module";
+ var __create = Object.create;
+ var __getProtoOf = Object.getPrototypeOf;
+ var __defProp = Object.defineProperty;
+ var __getOwnPropNames = Object.getOwnPropertyNames;
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
+ var __toESM = (mod, isNodeMode, target) => {
+   target = mod != null ? __create(__getProtoOf(mod)) : {};
+   const to = isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target;
+   for (let key of __getOwnPropNames(mod))
+     if (!__hasOwnProp.call(to, key))
+       __defProp(to, key, {
+         get: () => mod[key],
+         enumerable: true
+       });
+   return to;
+ };
+ var __require = /* @__PURE__ */ createRequire(import.meta.url);
+
+ // src/cli.ts
+ import { Command } from "commander";
+ import { startHttpServer } from "./http.js";
+ import { StdioTransport } from "@tmcp/transport-stdio";
+ import { server, db, searchEngine, libraryService } from "./server.js";
+ import { VERSION } from "./version.js";
+ var program = new Command().name("docshark").description("\uD83E\uDD88 Documentation MCP Server — scrape, index, and search any doc website").version(VERSION, "-v, --version", "output the current version");
+ program.command("start", { isDefault: true }).description("Start the MCP server").option("-p, --port <port>", "HTTP server port", "6380").option("--stdio", "Run in STDIO mode (for Claude Desktop, Cursor, etc.)").option("--data-dir <path>", "Data directory", "").action(async (opts) => {
+   if (opts.dataDir) {
+     process.env.DOCSHARK_DATA_DIR = opts.dataDir;
+   }
+   db.init();
+   if (opts.stdio) {
+     const stdio = new StdioTransport(server);
+     stdio.listen();
+   } else {
+     await startHttpServer(parseInt(opts.port));
+   }
+ });
+ program.command("add <url>").description("Add a documentation library and start crawling").option("-n, --name <name>", "Library name (auto-generated from URL if omitted)").option("-d, --depth <n>", "Max crawl depth", "3").option("--lib-version <version>", "Library version").action(async (url, opts) => {
+   db.init();
+   try {
+     const lib = await libraryService.add({
+       url,
+       name: opts.name,
+       version: opts.libVersion,
+       maxDepth: parseInt(opts.depth)
+     });
+     console.log(`
+ ✅ Added "${lib.display_name}" — crawling ${lib.url}...`);
+     console.log(` Job ID: ${lib.jobId}`);
+     console.log(` Use "docshark list" to check progress.
+ `);
+     await waitForCrawl(lib.jobId);
+   } catch (err) {
+     console.error(`
+ ❌ ${err.message}
+ `);
+     process.exit(1);
+   }
+ });
+ program.command("search <query>").description("Search indexed documentation").option("-l, --library <name>", "Filter by library").option("--limit <n>", "Max results", "5").action(async (query, opts) => {
+   db.init();
+   const results = searchEngine.search(query, {
+     library: opts.library,
+     limit: parseInt(opts.limit)
+   });
+   if (results.length === 0) {
+     console.log(`
+ No results found for "${query}".
+ `);
+     return;
+   }
+   for (const r of results) {
+     console.log(`
+ --- ${r.page_title} (${r.library_display_name}) ---`);
+     console.log(`Section: ${r.heading_context}`);
+     console.log(r.content.slice(0, 300));
+     console.log(`Source: ${r.page_url}
+ `);
+   }
+ });
+ program.command("list").description("List indexed libraries").action(() => {
+   db.init();
+   const libs = db.listLibraries();
+   if (libs.length === 0) {
+     console.log(`
+ No libraries indexed. Use "docshark add <url>" to add one.
+ `);
+     return;
+   }
+   console.table(libs.map((l) => ({
+     Name: l.name,
+     URL: l.url,
+     Pages: l.page_count,
+     Chunks: l.chunk_count,
+     Status: l.status,
+     "Last Crawled": l.last_crawled_at || "never"
+   })));
+ });
+ program.command("refresh <name>").description("Refresh an existing documentation library").action(async (name) => {
+   db.init();
+   try {
+     const lib = db.getLibraryByName(name);
+     if (!lib)
+       throw new Error(`Library "${name}" not found.`);
+     const { jobManager } = await import("./server.js");
+     const job = jobManager.startCrawl(lib.id, { incremental: true });
+     console.log(`
+ \uD83D\uDD04 Refreshing "${lib.display_name}" — crawling ${lib.url}...`);
+     console.log(` Job ID: ${job.id}`);
+     await waitForCrawl(job.id);
+   } catch (err) {
+     console.error(`
+ ❌ ${err.message}
+ `);
+     process.exit(1);
+   }
+ });
+ program.command("remove <name>").description("Remove a documentation library and its index").action((name) => {
+   db.init();
+   try {
+     const lib = db.getLibraryByName(name);
+     if (!lib)
+       throw new Error(`Library "${name}" not found.`);
+     db.removeLibrary(lib.id);
+     console.log(`
+ \uD83D\uDDD1️ Removed library "${lib.display_name}". Deleted ${lib.page_count} pages.
+ `);
+   } catch (err) {
+     console.error(`
+ ❌ ${err.message}
+ `);
+     process.exit(1);
+   }
+ });
+ program.command("get <url>").description("Get the full markdown content of a specific indexed page").action((url) => {
+   db.init();
+   const page = db.getPage({ url });
+   if (!page) {
+     console.error(`
+ ❌ Page not found in index: ${url}
+ `);
+     process.exit(1);
+   }
+   console.log(`
+ --- ${page.title} ---`);
+   console.log(`Source: ${page.url}
+
+ `);
+   console.log(page.content_markdown);
+   console.log(`
+ `);
+ });
+ program.parse();
+ async function waitForCrawl(jobId) {
+   const { jobManager } = await import("./server.js");
+   return new Promise((resolve) => {
+     const check = () => {
+       const job = jobManager.getJob(jobId);
+       if (!job || job.status === "completed" || job.status === "failed") {
+         if (job?.status === "completed") {
+           console.log(`
+ \uD83E\uDD88 Crawl complete: ${job.pages_crawled} pages, ${job.chunks_created} chunks indexed.`);
+           if (job.pages_failed > 0) {
+             console.log(` ⚠️ ${job.pages_failed} pages failed.`);
+           }
+         } else if (job?.status === "failed") {
+           console.error(`
+ ❌ Crawl failed: ${job.error_message}`);
+         }
+         resolve();
+         return;
+       }
+       setTimeout(check, 1000);
+     };
+     check();
+   });
+ }
package/dist/http.d.ts ADDED
@@ -0,0 +1 @@
+ export declare function startHttpServer(port: number): Promise<import("srvx").Server<import("srvx").ServerHandler>>;
@@ -0,0 +1,4 @@
+ export * from "./server.js";
+ export * from "./types.js";
+ export * from "./version.js";
+ export * from "./http.js";
package/dist/index.js ADDED
@@ -0,0 +1,5 @@
+ // src/index.ts
+ export * from "./server.js";
+ export * from "./types.js";
+ export * from "./version.js";
+ export * from "./http.js";
@@ -0,0 +1,8 @@
+ type Listener = (data: any) => void;
+ export declare class EventBus {
+     private listeners;
+     on(event: string, listener: Listener): void;
+     off(event: string, listener: Listener): void;
+     emit(event: string, data: any): void;
+ }
+ export {};
@@ -0,0 +1,19 @@
+ import type { Database } from '../storage/db.js';
+ import type { EventBus } from './events.js';
+ import type { CrawlJob } from '../types.js';
+ export declare class JobManager {
+     private db;
+     private eventBus;
+     private activeJobs;
+     constructor(db: Database, eventBus: EventBus);
+     /** Start a crawl job for a library */
+     startCrawl(libraryId: string, opts?: {
+         incremental?: boolean;
+     }): CrawlJob;
+     /** Get status of a specific job */
+     getJob(jobId: string): CrawlJob | undefined;
+     /** List all jobs, optionally filtered by library */
+     listJobs(libraryId?: string): CrawlJob[];
+     /** Check if a crawl is currently running for a library */
+     isRunning(libraryId: string): boolean;
+ }
@@ -0,0 +1,8 @@
+ import type { Database } from '../storage/db.js';
+ import type { EventBus } from './events.js';
+ export declare class CrawlWorker {
+     private db;
+     private eventBus;
+     constructor(db: Database, eventBus: EventBus);
+     crawl(libraryId: string, jobId: string): Promise<void>;
+ }
@@ -0,0 +1,10 @@
+ export interface Chunk {
+     content: string;
+     headingContext: string;
+     tokenCount: number;
+     hasCodeBlock: boolean;
+ }
+ export declare function chunkMarkdown(markdown: string, _headings: Array<{
+     level: number;
+     text: string;
+ }>): Chunk[];
@@ -0,0 +1,8 @@
+ export declare function extractAndConvert(html: string, url: string): {
+     markdown: string;
+     title: string;
+     headings: Array<{
+         level: number;
+         text: string;
+     }>;
+ };
@@ -0,0 +1,6 @@
+ import type { CrawlConfig } from '../types.js';
+ /**
+  * Discover all documentation page URLs from a base URL.
+  * Strategy: sitemap.xml → link crawl fallback
+  */
+ export declare function discoverPages(baseUrl: string, config?: CrawlConfig): Promise<string[]>;
@@ -0,0 +1,6 @@
+ import type { FetchResult } from '../types.js';
+ /**
+  * Fetch a page and return its HTML.
+  * Supports auto-detection of JS-rendered sites (falls back to puppeteer-core if installed).
+  */
+ export declare function fetchPage(url: string, renderer?: 'auto' | 'fetch' | 'puppeteer'): Promise<FetchResult>;
@@ -0,0 +1,7 @@
+ export declare class RateLimiter {
+     private delayMs;
+     private lastRequest;
+     constructor(delayMs?: number);
+     wait(): Promise<void>;
+     setDelay(ms: number): void;
+ }
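The published package ships only the declaration above. An implementation matching that shape could look like the following sketch (an illustration of the rate-limiting idea, not the actual DocShark source):

```typescript
// Ensure at least `delayMs` elapses between consecutive requests.
// Sketch matching the declared RateLimiter interface — not the real source.
class RateLimiter {
  private lastRequest = 0;

  constructor(private delayMs = 1000) {}

  async wait(): Promise<void> {
    const elapsed = Date.now() - this.lastRequest;
    if (elapsed < this.delayMs) {
      // sleep for the remainder of the window
      await new Promise((r) => setTimeout(r, this.delayMs - elapsed));
    }
    this.lastRequest = Date.now();
  }

  setDelay(ms: number): void {
    this.delayMs = ms;
  }
}
```

A crawler would simply `await limiter.wait()` before each `fetchPage` call.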
@@ -0,0 +1,5 @@
+ import robotsParser from 'robots-parser';
+ /** Fetch and parse robots.txt for a given base URL */
+ export declare function getRobotsParser(baseUrl: string): Promise<import("robots-parser").Robot | null>;
+ /** Check if a URL is allowed by robots.txt */
+ export declare function isAllowed(robots: ReturnType<typeof robotsParser> | null, url: string): boolean;
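The real code delegates to the `robots-parser` package, but the core of an `isAllowed` check is simple prefix matching. A greatly simplified sketch that only handles `User-agent: *` blocks with plain `Disallow` prefixes (no wildcards, no `Allow` overrides):

```typescript
// Toy robots.txt check: true unless the path matches a Disallow rule
// in the `User-agent: *` group. Assumption: no wildcard/Allow support.
function isAllowedSimple(robotsTxt: string, url: string): boolean {
  const path = new URL(url).pathname;
  let inStarGroup = false;
  for (const raw of robotsTxt.split("\n")) {
    const line = raw.split("#")[0].trim(); // strip comments
    const [key, ...rest] = line.split(":");
    const value = rest.join(":").trim();
    if (/^user-agent$/i.test(key)) {
      inStarGroup = value === "*";
    } else if (inStarGroup && /^disallow$/i.test(key) && value && path.startsWith(value)) {
      return false;
    }
  }
  return true;
}
```

Libraries like `robots-parser` add the parts this sketch omits: per-agent groups, `Allow` precedence, wildcards, and `Crawl-delay`.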
@@ -0,0 +1,13 @@
+ import { McpServer } from 'tmcp';
+ import * as v from 'valibot';
+ import { Database } from './storage/db.js';
+ import { SearchEngine } from './storage/search.js';
+ import { LibraryService } from './services/library.js';
+ import { JobManager } from './jobs/manager.js';
+ import { EventBus } from './jobs/events.js';
+ export declare const db: Database;
+ export declare const eventBus: EventBus;
+ export declare const searchEngine: SearchEngine;
+ export declare const jobManager: JobManager;
+ export declare const libraryService: LibraryService;
+ export declare const server: McpServer<v.GenericSchema, undefined>;
@@ -0,0 +1,17 @@
+ import type { Database } from '../storage/db.js';
+ import type { JobManager } from '../jobs/manager.js';
+ import type { Library } from '../types.js';
+ export declare class LibraryService {
+     private db;
+     private jobManager;
+     constructor(db: Database, jobManager: JobManager);
+     /** Add a new documentation library and start crawling */
+     add(opts: {
+         url: string;
+         name?: string;
+         version?: string;
+         maxDepth?: number;
+     }): Promise<Library & {
+         jobId: string;
+     }>;
+ }
@@ -0,0 +1,57 @@
+ import { Database as BunDatabase } from 'bun:sqlite';
+ import type { Library, Page, CrawlJob } from '../types.js';
+ export declare class Database {
+     private db;
+     init(): void;
+     /** Expose raw DB for search engine direct queries */
+     raw(): BunDatabase;
+     private migrate;
+     addLibrary(lib: {
+         id: string;
+         name: string;
+         displayName: string;
+         url: string;
+         version?: string;
+         crawlConfig?: object;
+     }): import("bun:sqlite").Changes;
+     listLibraries(status?: string): Library[];
+     getLibraryByName(name: string): Library | undefined;
+     getLibraryById(id: string): Library | undefined;
+     removeLibrary(id: string): import("bun:sqlite").Changes;
+     updateLibraryStatus(id: string, status: string): import("bun:sqlite").Changes;
+     updateLibraryStats(id: string, pageCount: number, chunkCount: number): import("bun:sqlite").Changes;
+     upsertPage(page: {
+         id: string;
+         libraryId: string;
+         url: string;
+         path: string;
+         title: string;
+         contentMarkdown: string;
+         contentHash: string;
+         headings: object[];
+     }): string;
+     getPage(opts: {
+         url?: string;
+         library?: string;
+         path?: string;
+     }): Page | undefined;
+     getPagesByLibrary(libraryId: string): Page[];
+     insertChunks(chunks: Array<{
+         id: string;
+         pageId: string;
+         libraryId: string;
+         content: string;
+         headingContext: string;
+         chunkIndex: number;
+         tokenCount: number;
+         hasCodeBlock: boolean;
+     }>): void;
+     deleteChunksByPage(pageId: string): void;
+     createJob(job: {
+         id: string;
+         libraryId: string;
+     }): CrawlJob;
+     getJob(id: string): CrawlJob | undefined;
+     updateJob(id: string, updates: Partial<Pick<CrawlJob, 'status' | 'pages_discovered' | 'pages_crawled' | 'pages_failed' | 'chunks_created' | 'error_message' | 'started_at' | 'completed_at'>>): void;
+     listJobs(libraryId?: string): CrawlJob[];
+ }
@@ -0,0 +1,21 @@
+ import type { Database } from './db.js';
+ export interface SearchResult {
+     content: string;
+     heading_context: string;
+     page_url: string;
+     page_title: string;
+     library_name: string;
+     library_display_name: string;
+     relevance_score: number;
+     has_code_block: boolean;
+     token_count: number;
+ }
+ export declare class SearchEngine {
+     private db;
+     constructor(db: Database);
+     search(query: string, opts?: {
+         library?: string;
+         limit?: number;
+     }): SearchResult[];
+     private sanitizeQuery;
+ }
@@ -0,0 +1,25 @@
+ import * as v from 'valibot';
+ import type { LibraryService } from '../services/library.js';
+ export declare function createAddLibraryTool(libraryService: LibraryService): {
+     definition: {
+         name: "add_library";
+         description: string;
+         schema: v.ObjectSchema<{
+             readonly url: v.SchemaWithPipe<readonly [v.StringSchema<undefined>, v.UrlAction<string, undefined>, v.DescriptionAction<string, "The base URL of the documentation website to crawl.">]>;
+             readonly name: v.OptionalSchema<v.SchemaWithPipe<readonly [v.StringSchema<undefined>, v.DescriptionAction<string, "A short identifier for the library (e.g., \"svelte-5\"). Auto-generated from URL if omitted.">]>, undefined>;
+             readonly version: v.OptionalSchema<v.SchemaWithPipe<readonly [v.StringSchema<undefined>, v.DescriptionAction<string, "Version string (e.g., \"5.0.0\", \"v4\").">]>, undefined>;
+             readonly max_depth: v.OptionalSchema<v.SchemaWithPipe<readonly [v.NumberSchema<undefined>, v.IntegerAction<number, undefined>, v.MinValueAction<number, 1, undefined>, v.MaxValueAction<number, 10, undefined>, v.DescriptionAction<number, "Maximum link depth to crawl. Default: 3.">]>, 3>;
+         }, undefined>;
+     };
+     handler: ({ url, name, version, max_depth, }: {
+         url: string;
+         name?: string;
+         version?: string;
+         max_depth?: number;
+     }) => Promise<{
+         content: {
+             type: "text";
+             text: string;
+         }[];
+     }>;
+ };
@@ -0,0 +1,23 @@
+ import * as v from 'valibot';
+ import type { Database } from '../storage/db.js';
+ export declare function createGetDocPageTool(db: Database): {
+     definition: {
+         name: "get_doc_page";
+         description: string;
+         schema: v.ObjectSchema<{
+             readonly url: v.OptionalSchema<v.SchemaWithPipe<readonly [v.StringSchema<undefined>, v.DescriptionAction<string, "The full URL of the documentation page.">]>, undefined>;
+             readonly library: v.OptionalSchema<v.SchemaWithPipe<readonly [v.StringSchema<undefined>, v.DescriptionAction<string, "Library name to search within.">]>, undefined>;
+             readonly path: v.OptionalSchema<v.SchemaWithPipe<readonly [v.StringSchema<undefined>, v.DescriptionAction<string, "Relative path within the library (e.g., \"/getting-started\").">]>, undefined>;
+         }, undefined>;
+     };
+     handler: ({ url, library, path }: {
+         url?: string;
+         library?: string;
+         path?: string;
+     }) => Promise<{
+         content: {
+             type: "text";
+             text: string;
+         }[];
+     }>;
+ };
@@ -0,0 +1,19 @@
+ import * as v from 'valibot';
+ import type { Database } from '../storage/db.js';
+ export declare function createListLibrariesTool(db: Database): {
+     definition: {
+         name: "list_libraries";
+         description: string;
+         schema: v.ObjectSchema<{
+             readonly status: v.OptionalSchema<v.SchemaWithPipe<readonly [v.PicklistSchema<["indexed", "crawling", "error", "all"], undefined>, v.DescriptionAction<"crawling" | "indexed" | "error" | "all", "Filter by indexing status. Default: \"all\".">]>, "all">;
+         }, undefined>;
+     };
+     handler: ({ status }: {
+         status?: string;
+     }) => Promise<{
+         content: {
+             type: "text";
+             text: string;
+         }[];
+     }>;
+ };
@@ -0,0 +1,20 @@
+ import * as v from 'valibot';
+ import type { JobManager } from '../jobs/manager.js';
+ import type { Database } from '../storage/db.js';
+ export declare function createRefreshLibraryTool(jobManager: JobManager, db: Database): {
+     definition: {
+         name: "refresh_library";
+         description: string;
+         schema: v.ObjectSchema<{
+             readonly library: v.SchemaWithPipe<readonly [v.StringSchema<undefined>, v.DescriptionAction<string, "The library name to refresh (e.g., \"svelte-5\").">]>;
+         }, undefined>;
+     };
+     handler: ({ library }: {
+         library: string;
+     }) => Promise<{
+         content: {
+             type: "text";
+             text: string;
+         }[];
+     }>;
+ };
@@ -0,0 +1,19 @@
+ import * as v from 'valibot';
+ import type { Database } from '../storage/db.js';
+ export declare function createRemoveLibraryTool(db: Database): {
+     definition: {
+         name: "remove_library";
+         description: string;
+         schema: v.ObjectSchema<{
+             readonly library: v.SchemaWithPipe<readonly [v.StringSchema<undefined>, v.DescriptionAction<string, "The library name to remove (e.g., \"svelte-5\").">]>;
+         }, undefined>;
+     };
+     handler: ({ library }: {
+         library: string;
+     }) => Promise<{
+         content: {
+             type: "text";
+             text: string;
+         }[];
+     }>;
+ };
@@ -0,0 +1,23 @@
+ import * as v from 'valibot';
+ import type { SearchEngine } from '../storage/search.js';
+ export declare function createSearchDocsTool(searchEngine: SearchEngine): {
+     definition: {
+         name: "search_docs";
+         description: string;
+         schema: v.ObjectSchema<{
+             readonly query: v.SchemaWithPipe<readonly [v.StringSchema<undefined>, v.DescriptionAction<string, "The search query. Use natural language or specific terms.">]>;
+             readonly library: v.OptionalSchema<v.SchemaWithPipe<readonly [v.StringSchema<undefined>, v.DescriptionAction<string, "Filter results to a specific library name.">]>, undefined>;
+             readonly limit: v.OptionalSchema<v.SchemaWithPipe<readonly [v.NumberSchema<undefined>, v.IntegerAction<number, undefined>, v.MinValueAction<number, 1, undefined>, v.MaxValueAction<number, 20, undefined>, v.DescriptionAction<number, "Max results to return. Default: 5.">]>, 5>;
+         }, undefined>;
+     };
+     handler: ({ query, library, limit }: {
+         query: string;
+         library?: string;
+         limit?: number;
+     }) => Promise<{
+         content: {
+             type: "text";
+             text: string;
+         }[];
+     }>;
+ };
@@ -0,0 +1,71 @@
+ export interface Library {
+     id: string;
+     name: string;
+     display_name: string;
+     url: string;
+     version: string | null;
+     description: string | null;
+     status: 'pending' | 'crawling' | 'indexed' | 'error';
+     page_count: number;
+     chunk_count: number;
+     crawl_config: string | null;
+     last_crawled_at: string | null;
+     created_at: string;
+     updated_at: string;
+ }
+ export interface Page {
+     id: string;
+     library_id: string;
+     url: string;
+     path: string;
+     title: string | null;
+     content_markdown: string | null;
+     content_hash: string | null;
+     headings: string | null;
+     http_status: number | null;
+     last_modified: string | null;
+     etag: string | null;
+     created_at: string;
+     updated_at: string;
+ }
+ export interface ChunkRecord {
+     id: string;
+     page_id: string;
+     library_id: string;
+     content: string;
+     heading_context: string;
+     chunk_index: number;
+     token_count: number;
+     has_code_block: number;
+     created_at: string;
+ }
+ export interface CrawlJob {
+     id: string;
+     library_id: string;
+     status: 'queued' | 'running' | 'completed' | 'failed' | 'cancelled';
+     pages_discovered: number;
+     pages_crawled: number;
+     pages_failed: number;
+     chunks_created: number;
+     error_message: string | null;
+     started_at: string | null;
+     completed_at: string | null;
+     created_at: string;
+ }
+ export interface FetchResult {
+     html: string;
+     renderer: 'fetch' | 'puppeteer';
+     status: number;
+     etag?: string | null;
+     lastModified?: string | null;
+     unchanged?: boolean;
+ }
+ export interface CrawlConfig {
+     renderer?: 'auto' | 'fetch' | 'puppeteer';
+     maxDepth?: number;
+     includePatterns?: string[];
+     excludePatterns?: string[];
+     rateLimit?: number;
+     waitForSelector?: string;
+     waitTimeout?: number;
+ }
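To make the `CrawlConfig` shape concrete, here is a hypothetical configuration literal. Field meanings are inferred from the names in the declaration above, and the values are purely illustrative, not the package's defaults:

```typescript
// Hypothetical crawl configuration for a JS-rendered docs site.
// Values are illustrative; field semantics inferred from the CrawlConfig declaration.
const crawlConfig = {
  renderer: "auto",              // try plain fetch first, upgrade to puppeteer for SPA shells
  maxDepth: 3,                   // follow links up to three hops from the base URL
  includePatterns: ["/docs/*"],  // only crawl documentation paths
  excludePatterns: ["/blog/*"],  // skip non-documentation sections
  rateLimit: 500,                // presumably the delay between requests, in ms
};
```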
@@ -0,0 +1 @@
+ export declare const VERSION = "0.1.5";
package/package.json ADDED
@@ -0,0 +1,65 @@
+ {
+   "name": "docshark",
+   "version": "0.1.5",
+   "description": "🦈 Documentation MCP Server — scrape, index, and search any doc website",
+   "type": "module",
+   "main": "./dist/index.js",
+   "module": "./dist/index.js",
+   "types": "./dist/index.d.ts",
+   "exports": {
+     ".": {
+       "import": "./dist/index.js",
+       "types": "./dist/index.d.ts"
+     }
+   },
+   "bin": {
+     "docshark": "./dist/cli.js"
+   },
+   "files": [
+     "dist",
+     "README.md",
+     "LICENSE",
+     "CHANGELOG.md"
+   ],
+   "scripts": {
+     "start": "bun run src/cli.ts start",
+     "dev": "bun run --watch src/cli.ts start",
+     "cli": "bun run src/cli.ts",
+     "check": "tsc --noEmit",
+     "build": "rm -rf dist && bun build ./src/cli.ts ./src/index.ts --outdir ./dist --target node --external '*' && tsc --emitDeclarationOnly",
+     "prepublishOnly": "bun run build",
+     "test:crawl": "bun run src/cli.ts add https://svelte.dev/docs/svelte/overview"
+   },
+   "keywords": [
+     "tmcp",
+     "mcp",
+     "documentation",
+     "search",
+     "ai",
+     "scraper"
+   ],
+   "dependencies": {
+     "@mozilla/readability": "^0.6.0",
+     "@tmcp/adapter-valibot": "^0.1.5",
+     "@tmcp/transport-http": "^0.8.4",
+     "@tmcp/transport-sse": "^0.5.3",
+     "@tmcp/transport-stdio": "^0.4.1",
+     "cheerio": "^1.2.0",
+     "commander": "^14.0.3",
+     "linkedom": "^0.18.12",
+     "nanoid": "^5.1.6",
+     "puppeteer-core": "^24.37.5",
+     "robots-parser": "^3.0.1",
+     "srvx": "^0.11.8",
+     "tmcp": "^1.19.2",
+     "turndown": "^7.2.2",
+     "turndown-plugin-gfm": "^1.0.2",
+     "valibot": "^1.2.0"
+   },
+   "devDependencies": {
+     "@types/bun": "^1.3.9",
+     "@types/node": "^25.3.3",
+     "@types/turndown": "^5.0.6",
+     "typescript": "^5.9.3"
+   }
+ }