docshark 0.1.17 → 0.1.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +176 -0
- package/dist/cli.js +2 -6
- package/dist/jobs/manager.d.ts +1 -0
- package/dist/jobs/manager.js +1 -1
- package/dist/scraper/rate-limiter.js +1 -1
- package/dist/scripts/sync-version.js +16 -4
- package/dist/search/format-results.d.ts +2 -0
- package/dist/search/format-results.js +23 -0
- package/dist/search/query-planner.d.ts +7 -0
- package/dist/search/query-planner.js +88 -0
- package/dist/search/sanitize.d.ts +10 -0
- package/dist/search/sanitize.js +40 -0
- package/dist/search/types.d.ts +33 -0
- package/dist/search/types.js +1 -0
- package/dist/server.js +73 -42
- package/dist/storage/db.d.ts +1 -0
- package/dist/storage/db.js +3 -2
- package/dist/storage/search.d.ts +19 -17
- package/dist/storage/search.js +305 -31
- package/dist/tools/search-docs.js +2 -10
- package/dist/types.d.ts +1 -0
- package/dist/version.d.ts +1 -1
- package/dist/version.js +1 -1
- package/package.json +3 -3
package/README.md
ADDED
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
# 🦈 DocShark
|
|
2
|
+
|
|
3
|
+
[Bun](https://bun.sh/)
|
|
4
|
+
[npm package](https://www.npmjs.com/package/docshark)
|
|
5
|
+
[Model Context Protocol](https://modelcontextprotocol.io/)
|
|
6
|
+
[Releases](https://github.com/Michael-Obele/docshark/releases)
|
|
7
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
8
|
+
|
|
9
|
+
**DocShark** is a powerful MCP (Model Context Protocol) server designed to scrape, index, and search any documentation website. It creates a local, highly-searchable knowledge base from public documentation pages using FTS5 (Full-Text Search) and BM25 ranking, allowing AI assistants to query the latest docs effortlessly.
|
|
10
|
+
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
## 🚀 Features
|
|
14
|
+
|
|
15
|
+
- **Automated Crawling**: Discovers pages via `sitemap.xml` with fallback to BFS link crawling.
|
|
16
|
+
- **Smart Extraction**: Uses Readability and Turndown to extract main content and convert it to clean Markdown, filtering out navbars and sidebars.
|
|
17
|
+
- **Semantic Chunking**: Splits content based on headings, preserving contextual headers for better AI understanding.
|
|
18
|
+
- **High-Performance Search**: Built-in SQLite + FTS5 indexing with BM25 ranking for accurate and lightning-fast search results.
|
|
19
|
+
- **JS-Rendered Site Support**: Tiered fetching strategy automatically detects React/Vue SPAs (empty shells) and upgrades to `puppeteer-core` if you have it installed (zero-config, auto-fallback).
|
|
20
|
+
- **Polite Crawling**: Respects `robots.txt` and implements rate limiting to prevent overloading documentation servers.
|
|
21
|
+
- **Standard MCP Tooling**: Works with Claude Desktop, VS Code, Cursor, and any other MCP-compatible client via the standard `stdio` or `http`/`sse` transports.
|
|
22
|
+
|
|
23
|
+
## 📦 What We Have Done (Phase 1)
|
|
24
|
+
|
|
25
|
+
**Phase 1: Core Engine** is fully implemented and tested.
|
|
26
|
+
|
|
27
|
+
- ✅ Custom SQLite Database with FTS5 virtual tables and auto-sync triggers.
|
|
28
|
+
- ✅ Web scraping engine supporting standard `fetch()` and `puppeteer-core`.
|
|
29
|
+
- ✅ Markdown processor utilizing Readability + Turndown.
|
|
30
|
+
- ✅ Heading-based semantic chunker (500-1200 tokens per chunk).
|
|
31
|
+
- ✅ Asynchronous job manager and queue system.
|
|
32
|
+
- ✅ Complete HTTP API (REST endpoints + SSE event streams).
|
|
33
|
+
- ✅ Seamless integration of 4 MCP tools: `manage_library`, `search_docs`, `list_libraries`, and `get_doc_page`.
|
|
34
|
+
- ✅ Robust CLI interface (`start`, `add`, `rename`, `search`, `list`).
|
|
35
|
+
|
|
36
|
+
## 🏗️ What We Are Doing
|
|
37
|
+
|
|
38
|
+
We are actively polishing the integration between the core engine and external MCP clients (like VS Code Agents and Claude Desktop).
|
|
39
|
+
|
|
40
|
+
## 🔮 What We Plan To Do (Phase 2 & Beyond)
|
|
41
|
+
|
|
42
|
+
- **Web Dashboard**: An intuitive SvelteKit dashboard to manage your synced libraries, view crawl progress in real-time (via SSE), and test searches manually.
|
|
43
|
+
- **Incremental Crawling**: Smarter `refresh` jobs that compare `ETag` and `Last-Modified` headers to only re-scrape updated pages.
|
|
44
|
+
- **Vector Search (RAG)**: Integration of lightweight vector embeddings for semantic similarity search alongside the existing FTS5 keyword search.
|
|
45
|
+
- **Advanced Scraping Setup**: Support for custom CSS selectors to define exactly where content lives in non-standard documentation websites.
|
|
46
|
+
|
|
47
|
+
---
|
|
48
|
+
|
|
49
|
+
## 🛠️ Usage
|
|
50
|
+
|
|
51
|
+
### Quick Start (from npm)
|
|
52
|
+
|
|
53
|
+
You can run DocShark directly without installing it globally using `bunx`:
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
# Add a documentation library to the index
|
|
57
|
+
bunx docshark add https://valibot.dev/guides/ --depth 2
|
|
58
|
+
|
|
59
|
+
# Search your indexed docs
|
|
60
|
+
bunx docshark search "schema validation"
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
### Installation
|
|
64
|
+
|
|
65
|
+
To install DocShark globally as a CLI tool:
|
|
66
|
+
|
|
67
|
+
DocShark is intended to be installed and run with Bun.
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
# Global Bun installation
|
|
71
|
+
bun add -g docshark
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
After installation, you can use the `docshark` command:
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
docshark list
|
|
78
|
+
|
|
79
|
+
# Update the global Bun installation when a new release is published
|
|
80
|
+
docshark update
|
|
81
|
+
|
|
82
|
+
# Script-friendly update check
|
|
83
|
+
docshark update --check --quiet
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
Interactive CLI runs will also let you know when a newer version is available. Update notices are intentionally skipped for MCP `stdio` mode so they never interfere with protocol output.
|
|
87
|
+
|
|
88
|
+
For scripts, `docshark update --check` exits `0` when current, `10` when a newer version is available, and `1` when the version check could not be completed.
|
|
89
|
+
|
|
90
|
+
## 🔌 MCP Integration
|
|
91
|
+
|
|
92
|
+
### VS Code (GitHub Copilot / MCP Extension)
|
|
93
|
+
|
|
94
|
+
Add DocShark to your `.vscode/settings.json` or global MCP configuration:
|
|
95
|
+
|
|
96
|
+
```json
|
|
97
|
+
{
|
|
98
|
+
"mcpServers": {
|
|
99
|
+
"docshark": {
|
|
100
|
+
"command": "bunx",
|
|
101
|
+
"args": ["-y", "docshark", "start", "--stdio"]
|
|
102
|
+
}
|
|
103
|
+
}
|
|
104
|
+
}
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
### Cursor
|
|
108
|
+
|
|
109
|
+
1. Open **Cursor Settings** > **Models** > **MCP**.
|
|
110
|
+
2. Click **+ Add New MCP Server**.
|
|
111
|
+
3. Name: `docshark`
|
|
112
|
+
4. Type: `command`
|
|
113
|
+
5. Command: `bunx -y docshark start --stdio`
|
|
114
|
+
|
|
115
|
+
### Claude Desktop
|
|
116
|
+
|
|
117
|
+
Edit your Claude Desktop configuration file:
|
|
118
|
+
|
|
119
|
+
- **macOS**: `~/Library/Application Support/Claude/claude_desktop_config.json`
|
|
120
|
+
- **Windows**: `%APPDATA%\Claude\claude_desktop_config.json`
|
|
121
|
+
|
|
122
|
+
```json
|
|
123
|
+
{
|
|
124
|
+
"mcpServers": {
|
|
125
|
+
"docshark": {
|
|
126
|
+
"command": "bunx",
|
|
127
|
+
"args": ["-y", "docshark", "start", "--stdio"]
|
|
128
|
+
}
|
|
129
|
+
}
|
|
130
|
+
}
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
---
|
|
134
|
+
|
|
135
|
+
## 🛠️ Development
|
|
136
|
+
|
|
137
|
+
### Local Setup
|
|
138
|
+
|
|
139
|
+
Ensure you have [Bun](https://bun.sh/) installed.
|
|
140
|
+
|
|
141
|
+
```bash
|
|
142
|
+
# Clone the repository
|
|
143
|
+
git clone https://github.com/Michael-Obele/docshark.git
|
|
144
|
+
cd docshark
|
|
145
|
+
|
|
146
|
+
# Install dependencies
|
|
147
|
+
bun install
|
|
148
|
+
|
|
149
|
+
# (Optional) Enable auto-detection and scraping of JavaScript-rendered (React/Vue) single-page apps
|
|
150
|
+
bun add puppeteer-core
|
|
151
|
+
|
|
152
|
+
# Start the DocShark MCP server in HTTP mode for local testing
|
|
153
|
+
bun run src/cli.ts start --port 6380
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
### Local CLI Debugging
|
|
157
|
+
|
|
158
|
+
```bash
|
|
159
|
+
# Run CLI directly while developing
|
|
160
|
+
bun run src/cli.ts list
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
## 🔄 Versioning & Changelog
|
|
164
|
+
|
|
165
|
+
This project uses [Google's Release Please](https://github.com/googleapis/release-please) to automate versioning and changelog generation.
|
|
166
|
+
|
|
167
|
+
- **Semantic Versioning**: Our versions automatically bump (e.g. `0.0.1` -> `0.0.2` or `0.1.0`) based on standard Conventional Commits (`feat:`, `fix:`, `chore:`, etc.).
|
|
168
|
+
- **Automated**: When Conventional Commits are merged into `master`, a release PR is created automatically and `CHANGELOG.md` is generated from those commits.
|
|
169
|
+
|
|
170
|
+
## 📜 License
|
|
171
|
+
|
|
172
|
+
This project is open-source and available under the [MIT License](LICENSE).
|
|
173
|
+
|
|
174
|
+
---
|
|
175
|
+
|
|
176
|
+
_Built to empower AI agents with the latest knowledge._
|
package/dist/cli.js
CHANGED
|
@@ -5,6 +5,7 @@ import { startHttpServer } from "./http.js";
|
|
|
5
5
|
import { StdioTransport } from "@tmcp/transport-stdio";
|
|
6
6
|
import { server, db, searchEngine, libraryService } from "./server.js";
|
|
7
7
|
import { maybeNotifyAboutUpdate, runUpdateCommand } from "./cli-update.js";
|
|
8
|
+
import { formatSearchResults } from "./search/format-results.js";
|
|
8
9
|
import { VERSION } from "./version.js";
|
|
9
10
|
const useColor = process.stdout.isTTY;
|
|
10
11
|
const color = {
|
|
@@ -160,12 +161,7 @@ cli
|
|
|
160
161
|
console.log(`\nNo results found for "${query}".\n`);
|
|
161
162
|
return;
|
|
162
163
|
}
|
|
163
|
-
|
|
164
|
-
console.log(`\n--- ${r.page_title} (${r.library_display_name}) ---`);
|
|
165
|
-
console.log(`Section: ${r.heading_context}`);
|
|
166
|
-
console.log(r.content.slice(0, 300));
|
|
167
|
-
console.log(`Source: ${r.page_url}\n`);
|
|
168
|
-
}
|
|
164
|
+
console.log(`\n${formatSearchResults(query, results)}\n`);
|
|
169
165
|
});
|
|
170
166
|
cli
|
|
171
167
|
.command("list", "List indexed libraries")
|
package/dist/jobs/manager.d.ts
CHANGED
|
@@ -9,6 +9,7 @@ export declare class JobManager {
|
|
|
9
9
|
/** Start a crawl job for a library */
|
|
10
10
|
startCrawl(libraryId: string, opts?: {
|
|
11
11
|
incremental?: boolean;
|
|
12
|
+
sessionId?: string;
|
|
12
13
|
}): CrawlJob;
|
|
13
14
|
/** Get status of a specific job */
|
|
14
15
|
getJob(jobId: string): CrawlJob | undefined;
|
package/dist/jobs/manager.js
CHANGED
|
@@ -12,7 +12,7 @@ export class JobManager {
|
|
|
12
12
|
/** Start a crawl job for a library */
|
|
13
13
|
startCrawl(libraryId, opts) {
|
|
14
14
|
const jobId = nanoid();
|
|
15
|
-
const job = this.db.createJob({ id: jobId, libraryId });
|
|
15
|
+
const job = this.db.createJob({ id: jobId, libraryId, sessionId: opts?.sessionId });
|
|
16
16
|
// Run crawl async (non-blocking)
|
|
17
17
|
const worker = new CrawlWorker(this.db, this.eventBus);
|
|
18
18
|
this.activeJobs.set(jobId, worker);
|
|
@@ -4,20 +4,32 @@ import { fileURLToPath } from "node:url";
|
|
|
4
4
|
const scriptDir = dirname(fileURLToPath(import.meta.url));
|
|
5
5
|
const packageJsonPath = resolve(scriptDir, "../../package.json");
|
|
6
6
|
const versionFilePath = resolve(scriptDir, "../version.ts");
|
|
7
|
+
const releaseVersion = process.env.DOCSHARK_VERSION?.trim();
|
|
7
8
|
const packageJson = JSON.parse(readFileSync(packageJsonPath, "utf8"));
|
|
8
|
-
|
|
9
|
+
const targetVersion = releaseVersion && releaseVersion.length > 0
|
|
10
|
+
? releaseVersion
|
|
11
|
+
: packageJson.version;
|
|
12
|
+
if (typeof targetVersion !== "string") {
|
|
9
13
|
throw new Error("packages/core/package.json is missing a valid version.");
|
|
10
14
|
}
|
|
11
15
|
const nextVersionFile = [
|
|
12
16
|
"// This file is automatically updated by the version sync script.",
|
|
13
|
-
`export const VERSION = '${
|
|
17
|
+
`export const VERSION = '${targetVersion}';`,
|
|
14
18
|
"",
|
|
15
19
|
].join("\n");
|
|
16
20
|
const currentVersionFile = readFileSync(versionFilePath, "utf8");
|
|
17
21
|
if (currentVersionFile !== nextVersionFile) {
|
|
18
22
|
writeFileSync(versionFilePath, nextVersionFile, "utf8");
|
|
19
|
-
console.log(`Updated src/version.ts to ${
|
|
23
|
+
console.log(`Updated src/version.ts to ${targetVersion}`);
|
|
20
24
|
}
|
|
21
25
|
else {
|
|
22
|
-
console.log(`src/version.ts already matches ${
|
|
26
|
+
console.log(`src/version.ts already matches ${targetVersion}`);
|
|
27
|
+
}
|
|
28
|
+
if (releaseVersion && packageJson.version !== targetVersion) {
|
|
29
|
+
const nextPackageJson = {
|
|
30
|
+
...packageJson,
|
|
31
|
+
version: targetVersion,
|
|
32
|
+
};
|
|
33
|
+
writeFileSync(packageJsonPath, `${JSON.stringify(nextPackageJson, null, 2)}\n`, "utf8");
|
|
34
|
+
console.log(`Updated package.json to ${targetVersion}`);
|
|
23
35
|
}
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
import { sanitizeDocContent } from './sanitize.js';
|
|
2
|
+
/**
 * Render a result's ranking explanations as one bold Markdown line.
 *
 * @param {string[]} reasons - Explanations attached to a search result.
 * @returns {string} The "Why this ranked highly" line followed by a blank
 *   line, or an empty string when there is nothing to explain.
 */
function formatReasons(reasons) {
    const hasReasons = reasons.length !== 0;
    return hasReasons
        ? `**Why this ranked highly:** ${reasons.join(', ')}\n\n`
        : '';
}
|
|
8
|
+
/**
 * Format ranked search results as a Markdown report.
 *
 * Each result becomes a numbered "###" section with its source URL, an
 * optional section line (skipped when the heading context is blank), the
 * ranking reasons, and the sanitized chunk content. Sections are separated
 * by a horizontal rule.
 *
 * @param {string} query - The query as typed by the caller; echoed in the title.
 * @param {Array<object>} results - Results exposing page_title,
 *   library_display_name, page_url, heading_context, content and reasons.
 * @returns {string} The complete Markdown document.
 */
export function formatSearchResults(query, results) {
    const sections = [];
    for (const [position, hit] of results.entries()) {
        const parts = [
            `### ${position + 1}. ${hit.page_title} — ${hit.library_display_name}\n`,
            `**Source:** ${hit.page_url}\n`,
        ];
        if (hit.heading_context.trim().length > 0) {
            parts.push(`**Section:** ${hit.heading_context}\n`);
        }
        // Content comes from scraped pages, so it is filtered before display.
        const safeContent = sanitizeDocContent(hit.content);
        parts.push(`${formatReasons(hit.reasons)}${safeContent}`);
        sections.push(parts.join(''));
    }
    return `## Results for "${query}"\n\n${sections.join('\n\n---\n\n')}`;
}
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
// Filler words that QueryPlanner.build drops when extracting keywords.
const STOP_WORDS = new Set([
    "a", "an", "and", "are", "at", "do",
    "for", "how", "i", "in", "is", "of",
    "on", "or", "the", "to", "what", "with",
]);
// Phrases looked up verbatim (substring match) in the normalized query.
const PHRASE_HINTS = [
    "getting started", "quickstart", "overview",
    "reference", "api", "troubleshooting",
];
|
|
29
|
+
/**
 * Canonicalize text for matching.
 *
 * Lowercases the input, turns every run of characters other than a-z, 0-9,
 * "@", "/", ".", "_", "-" and whitespace into a single space, collapses
 * whitespace runs, and trims both ends.
 *
 * @param {string} value - Raw text (for example a user query).
 * @returns {string} The normalized, single-spaced, lowercase text.
 */
export function normalizeSearchText(value) {
    const lowered = value.toLowerCase();
    const withoutSymbols = lowered.replace(/[^a-z0-9@/._\-\s]+/g, " ");
    const singleSpaced = withoutSymbols.replace(/\s+/g, " ");
    return singleSpaced.trim();
}
|
|
36
|
+
/**
 * Strip disallowed characters from both ends of a token and lowercase it.
 *
 * Characters kept at the edges are letters, digits, "@", "/", ".", "_" and
 * "-"; anything else is removed from the start and the end only.
 *
 * @param {string} value - A single whitespace-delimited token.
 * @returns {string} The cleaned token; may be empty.
 */
function sanitizeToken(value) {
    const edgeJunk = /^[^a-z0-9@/._-]+|[^a-z0-9@/._-]+$/gi;
    const stripped = value.replace(edgeJunk, "");
    return stripped.toLowerCase();
}
|
|
39
|
+
/**
 * Turns a raw search string into a structured plan: normalized text,
 * detected intent, keywords, matched phrase hints and an optional version.
 */
export class QueryPlanner {
    /**
     * Build the plan for one search request.
     *
     * @param {string} query - The query exactly as the caller supplied it.
     * @param {string} [library] - Library filter, copied into the plan as-is.
     * @returns {{original_query: string, normalized_query: string, intent: string,
     *   keywords: string[], phrases: string[], requested_library: (string|undefined),
     *   requested_version: (string|undefined)}} The search plan.
     */
    build(query, library) {
        const normalizedQuery = normalizeSearchText(query);
        const rawTokens = normalizedQuery
            .split(/\s+/)
            .map((token) => sanitizeToken(token))
            .filter(Boolean);
        // Deduplicated tokens without stop words and single-character tokens.
        const filteredKeywords = Array.from(new Set(rawTokens.filter((token) => token.length > 1 && !STOP_WORDS.has(token))));
        return {
            original_query: query,
            normalized_query: normalizedQuery,
            // The raw query is passed too: normalization lowercases the text and
            // strips "(", which the call-syntax check in detectIntent relies on.
            intent: this.detectIntent(normalizedQuery, query),
            // If filtering removed everything, fall back to every token so the
            // plan still carries something to search for.
            keywords: filteredKeywords.length > 0
                ? filteredKeywords
                : Array.from(new Set(rawTokens)),
            phrases: PHRASE_HINTS.filter((phrase) => normalizedQuery.includes(phrase)),
            requested_library: library,
            requested_version: this.extractVersion(normalizedQuery),
        };
    }
    /**
     * Classify a query. Checks run in a fixed order and the first match wins:
     * getting_started, overview, api_lookup, troubleshooting, then general.
     *
     * @param {string} query - Normalized (lowercase) query text.
     * @param {string} [originalQuery=query] - The query before normalization.
     *   Only the call-syntax check (a capitalized identifier followed by "(")
     *   uses it; that check cannot match normalized text.
     * @returns {"getting_started"|"overview"|"api_lookup"|"troubleshooting"|"general"}
     */
    detectIntent(query, originalQuery = query) {
        if (query.includes("getting started") ||
            query.includes("quickstart") ||
            query.startsWith("install ") ||
            query.startsWith("setup ")) {
            return "getting_started";
        }
        if (query.includes("overview") ||
            query.startsWith("what is ") ||
            query.startsWith("about ")) {
            return "overview";
        }
        if (/[a-z]+\.[a-z]+/.test(query) ||
            /[A-Z][a-zA-Z]+\(/.test(originalQuery) ||
            query.includes(" api") ||
            query.includes("reference") ||
            query.includes("@")) {
            return "api_lookup";
        }
        if (/error|fail|issue|problem|broken|debug|fix|troubleshoot/.test(query)) {
            return "troubleshooting";
        }
        return "general";
    }
    /**
     * Pull a major version number out of the query.
     *
     * @param {string} query - Normalized query text.
     * @returns {string|undefined} The digits following a word-initial "v" or
     *   "version" (for example "5" from "v5"), or undefined when absent.
     */
    extractVersion(query) {
        const match = query.match(/\bv(?:ersion)?\s*(\d+)\b/);
        return match?.[1];
    }
}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/**
 * Best-effort output filter against prompt injection.
 * Removes a fixed set of patterns (template directives, [SYSTEM]/[ADMIN]
 * marker pairs, "ignore above ... instructions" and
 * "forget previous ... context" phrases) and trims the result.
 */
|
|
5
|
+
export declare function sanitizeOutput(text: string): string;
|
|
6
|
+
/**
 * Sanitize a single chunk of documentation content.
 * Applies sanitizeOutput to the chunk, then filters the text outside fenced
 * code blocks a second time while the fences are swapped for placeholders.
 * NOTE(review): the first pass covers the whole chunk, so matching patterns
 * inside code fences are removed as well.
 */
|
|
10
|
+
export declare function sanitizeDocContent(content: string): string;
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
/**
 * Best-effort output filter against prompt injection.
 *
 * Deletes every match of a fixed pattern list, in order, and trims the
 * result. None of the patterns span line breaks.
 *
 * @param {string} text - Text about to be shown to an agent.
 * @returns {string} The text with all matches removed, trimmed.
 */
export function sanitizeOutput(text) {
    const blockedPatterns = [
        // Template directives such as {% ... %} and {# ... #}
        /\{[#%].*?[#%]\}/g,
        // System prompt markers
        /\[SYSTEM[\]:].*?\[\/SYSTEM\]/gi,
        /\[ADMIN[\]:].*?\[\/ADMIN\]/gi,
        // Common prompt injection phrasings
        /ignore\s+above.*?instructions/gi,
        /forget\s+previous.*?context/gi,
    ];
    let cleaned = text;
    for (const pattern of blockedPatterns) {
        cleaned = cleaned.replace(pattern, '');
    }
    return cleaned.trim();
}
|
|
17
|
+
/**
 * Sanitize a single chunk of documentation content.
 *
 * The chunk is filtered with sanitizeOutput, then fenced code blocks are
 * swapped for placeholders, the remaining text is filtered again, and the
 * code blocks are put back unchanged.
 *
 * NOTE(review): the first pass runs over the whole chunk, so matching
 * patterns inside code fences are removed too; only the second pass skips
 * them.
 *
 * @param {string} content - Markdown content of one documentation chunk.
 * @returns {string} The filtered content.
 */
export function sanitizeDocContent(content) {
    // First sanitize for injection patterns
    const sanitized = sanitizeOutput(content);
    // Stash fenced code blocks (between triple backticks) behind placeholders
    const codeBlockPattern = /```[\s\S]*?```/g;
    const codeBlocks = [];
    let temp = sanitized.replace(codeBlockPattern, (block) => {
        codeBlocks.push(block);
        return `__CODE_BLOCK_${codeBlocks.length - 1}__`;
    });
    // Sanitize outside code blocks
    temp = sanitizeOutput(temp);
    // Restore code blocks. A function replacer is required here: with a string
    // replacement, "$$", "$&", "$`" and "$'" inside a code sample would be
    // interpreted as replacement patterns and corrupt the restored block.
    codeBlocks.forEach((block, i) => {
        temp = temp.replace(`__CODE_BLOCK_${i}__`, () => block);
    });
    return temp;
}
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
/** Intent labels that QueryPlanner.detectIntent can assign to a query. */
export type SearchIntent = "general" | "overview" | "getting_started" | "api_lookup" | "troubleshooting";
|
|
2
|
+
/** Optional settings accepted by SearchEngine.search. */
export interface SearchOptions {
|
|
3
|
+
/** Library name to filter by; forwarded to the query planner and the candidate lookup. */
library?: string;
|
|
4
|
+
/** Maximum number of results; SearchEngine.search uses 5 when omitted. */
limit?: number;
|
|
5
|
+
}
|
|
6
|
+
/** Structured form of a query; same shape as the object QueryPlanner.build returns. */
export interface SearchPlan {
|
|
7
|
+
/** The query exactly as supplied by the caller. */
original_query: string;
|
|
8
|
+
/** The query after normalizeSearchText (lowercased, symbols and extra spaces removed). */
normalized_query: string;
|
|
9
|
+
/** Intent label detected for the query. */
intent: SearchIntent;
|
|
10
|
+
/** Deduplicated tokens without stop words and single-character tokens (all tokens if none remain). */
keywords: string[];
|
|
11
|
+
/** Known phrase hints found verbatim in the normalized query. */
phrases: string[];
|
|
12
|
+
/** Digits following "v"/"version" in the query, when present. */
requested_version?: string;
|
|
13
|
+
/** Library filter passed by the caller, when present. */
requested_library?: string;
|
|
14
|
+
}
|
|
15
|
+
/**
 * One matching chunk together with its page and library metadata.
 * NOTE(review): presumably the row shape produced by SearchEngine.fetchCandidates
 * before reranking — confirm against storage/search.js.
 */
export interface SearchCandidate {
|
|
16
|
+
content: string;
|
|
17
|
+
heading_context: string;
|
|
18
|
+
page_url: string;
|
|
19
|
+
page_path: string;
|
|
20
|
+
page_title: string;
|
|
21
|
+
library_name: string;
|
|
22
|
+
library_display_name: string;
|
|
23
|
+
lexical_score: number;
|
|
24
|
+
has_code_block: boolean;
|
|
25
|
+
token_count: number;
|
|
26
|
+
chunk_index: number;
|
|
27
|
+
}
|
|
28
|
+
/** A candidate plus ranking metadata; the element type returned by SearchEngine.search. */
export interface SearchResult extends SearchCandidate {
|
|
29
|
+
rerank_score: number;
|
|
30
|
+
/** Explanations rendered by formatSearchResults under "Why this ranked highly". */
reasons: string[];
|
|
31
|
+
path_type: string;
|
|
32
|
+
version_tag: string | null;
|
|
33
|
+
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
package/dist/server.js
CHANGED
|
@@ -5,6 +5,7 @@ import * as v from "valibot";
|
|
|
5
5
|
import { tool } from "tmcp/utils";
|
|
6
6
|
import { Database } from "./storage/db.js";
|
|
7
7
|
import { SearchEngine } from "./storage/search.js";
|
|
8
|
+
import { formatSearchResults } from "./search/format-results.js";
|
|
8
9
|
import { LibraryService } from "./services/library.js";
|
|
9
10
|
import { JobManager } from "./jobs/manager.js";
|
|
10
11
|
import { VERSION } from "./version.js";
|
|
@@ -32,29 +33,27 @@ export const server = new McpServer({
|
|
|
32
33
|
// ──────────────────────────────────────
|
|
33
34
|
server.tool({
|
|
34
35
|
name: "search_docs",
|
|
35
|
-
description: "Search
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
36
|
+
description: "Search indexed docs by keyword or library. Returns ranked sections with URLs.",
|
|
37
|
+
annotations: {
|
|
38
|
+
readOnlyHint: true,
|
|
39
|
+
idempotentHint: true,
|
|
40
|
+
},
|
|
39
41
|
schema: v.object({
|
|
40
42
|
query: v.pipe(v.string(), v.description("Search query. Use natural language.")),
|
|
41
43
|
library: v.optional(v.pipe(v.string(), v.description("Filter to a specific library."))),
|
|
42
44
|
limit: v.optional(v.pipe(v.number(), v.integer(), v.minValue(1), v.maxValue(20)), 5),
|
|
43
45
|
}),
|
|
44
46
|
}, async ({ query, library, limit }) => {
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
.
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
})
|
|
56
|
-
.join("\n\n---\n\n");
|
|
57
|
-
return tool.text(`## Results for "${query}"\n\n${formatted}`);
|
|
47
|
+
try {
|
|
48
|
+
const results = searchEngine.search(query, { library, limit });
|
|
49
|
+
if (results.length === 0)
|
|
50
|
+
return tool.text(`No results found for "${query}".`);
|
|
51
|
+
return tool.text(formatSearchResults(query, results));
|
|
52
|
+
}
|
|
53
|
+
catch (err) {
|
|
54
|
+
const message = err instanceof Error ? err.message : "Search failed";
|
|
55
|
+
return tool.text(`❌ Error: ${message}`);
|
|
56
|
+
}
|
|
58
57
|
});
|
|
59
58
|
function requireValue(value, message) {
|
|
60
59
|
if (value === undefined || value === null || value === "") {
|
|
@@ -89,52 +88,84 @@ function formatLibraryInfo(libraryId) {
|
|
|
89
88
|
return output;
|
|
90
89
|
}
|
|
91
90
|
// ──────────────────────────────────────
|
|
92
|
-
// Tool 2: list_libraries — Discovery tool
|
|
91
|
+
// Tool 2: list_libraries — Discovery tool with pagination
|
|
93
92
|
// ──────────────────────────────────────
|
|
94
93
|
server.tool({
|
|
95
94
|
name: "list_libraries",
|
|
96
|
-
description: "List
|
|
97
|
-
|
|
95
|
+
description: "List indexed documentation libraries. Paginated results.",
|
|
96
|
+
annotations: {
|
|
97
|
+
readOnlyHint: true,
|
|
98
|
+
idempotentHint: true,
|
|
99
|
+
},
|
|
98
100
|
schema: v.object({
|
|
99
101
|
status: v.optional(v.pipe(v.picklist(["indexed", "crawling", "error", "all"]), v.description('Filter by status. Default: "all".')), "all"),
|
|
102
|
+
page: v.optional(v.pipe(v.number(), v.integer(), v.minValue(1)), 1),
|
|
103
|
+
limit: v.optional(v.pipe(v.number(), v.integer(), v.minValue(1), v.maxValue(50)), 20),
|
|
100
104
|
}),
|
|
101
|
-
}, async ({ status }) => {
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
+
}, async ({ status, page = 1, limit = 20 }) => {
|
|
106
|
+
try {
|
|
107
|
+
const libraries = db.listLibraries(status);
|
|
108
|
+
if (libraries.length === 0) {
|
|
109
|
+
return tool.text("No libraries indexed yet. Use manage_library with action=add to add a documentation website.");
|
|
110
|
+
}
|
|
111
|
+
// Paginate results
|
|
112
|
+
const start = (page - 1) * limit;
|
|
113
|
+
const end = start + limit;
|
|
114
|
+
const paginated = libraries.slice(start, end);
|
|
115
|
+
const hasMore = end < libraries.length;
|
|
116
|
+
// Minified response (no pretty-printing)
|
|
117
|
+
let output = `## Libraries (${start + 1}-${Math.min(end, libraries.length)} of ${libraries.length})\n\n`;
|
|
118
|
+
output += "| Library | URL | Pages | Chunks | Status |\n";
|
|
119
|
+
output += "| ------- | --- | ----- | ------ | ------ |\n";
|
|
120
|
+
for (const lib of paginated) {
|
|
121
|
+
output += `|${lib.name}|${lib.url}|${lib.page_count}|${lib.chunk_count}|${lib.status}|\n`;
|
|
122
|
+
}
|
|
123
|
+
if (hasMore) {
|
|
124
|
+
output += `\n**More available.** Use page=${page + 1} to fetch next page.`;
|
|
125
|
+
}
|
|
126
|
+
return tool.text(output);
|
|
105
127
|
}
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
for (const lib of libraries) {
|
|
110
|
-
output += `| ${lib.name} | ${lib.url} | ${lib.page_count} | ${lib.chunk_count} | ${lib.status} |\n`;
|
|
128
|
+
catch (err) {
|
|
129
|
+
const message = err instanceof Error ? err.message : "Failed to list libraries";
|
|
130
|
+
return tool.text(`❌ Error: ${message}`);
|
|
111
131
|
}
|
|
112
|
-
return tool.text(output);
|
|
113
132
|
});
|
|
114
133
|
// ──────────────────────────────────────
|
|
115
134
|
// Tool 3: get_doc_page — Full page read
|
|
116
135
|
// ──────────────────────────────────────
|
|
117
136
|
server.tool({
|
|
118
137
|
name: "get_doc_page",
|
|
119
|
-
description: "Retrieve
|
|
120
|
-
|
|
138
|
+
description: "Retrieve complete documentation page as markdown.",
|
|
139
|
+
annotations: {
|
|
140
|
+
readOnlyHint: true,
|
|
141
|
+
idempotentHint: true,
|
|
142
|
+
},
|
|
121
143
|
schema: v.object({
|
|
122
144
|
url: v.optional(v.pipe(v.string(), v.description("The full URL of the documentation page."))),
|
|
123
145
|
library: v.optional(v.pipe(v.string(), v.description("Library name to search within."))),
|
|
124
146
|
path: v.optional(v.pipe(v.string(), v.description("Relative path within the library."))),
|
|
125
147
|
}),
|
|
126
148
|
}, async ({ url, library, path }) => {
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
149
|
+
try {
|
|
150
|
+
const page = db.getPage({ url, library, path });
|
|
151
|
+
if (!page)
|
|
152
|
+
return tool.text("Page not found. Use search_docs to find the correct page.");
|
|
153
|
+
return tool.text(`# ${page.title}\n**Source:** ${page.url}\n\n${page.content_markdown}`);
|
|
154
|
+
}
|
|
155
|
+
catch (err) {
|
|
156
|
+
const message = err instanceof Error ? err.message : "Failed to fetch page";
|
|
157
|
+
return tool.text(`❌ Error: ${message}`);
|
|
158
|
+
}
|
|
131
159
|
});
|
|
132
160
|
// ──────────────────────────────────────
|
|
133
161
|
// Tool 4: manage_library — Create, rename, refresh, remove, inspect
|
|
134
162
|
// ──────────────────────────────────────
|
|
135
163
|
server.tool({
|
|
136
164
|
name: "manage_library",
|
|
137
|
-
description: "Manage
|
|
165
|
+
description: "Manage library lifecycle: add/rename/refresh/remove/info. Destructive actions require confirmation.",
|
|
166
|
+
annotations: {
|
|
167
|
+
destructiveHint: true,
|
|
168
|
+
},
|
|
138
169
|
schema: v.object({
|
|
139
170
|
action: v.pipe(v.picklist(["add", "rename", "refresh", "remove", "info"]), v.description("The management action to perform.")),
|
|
140
171
|
url: v.optional(v.pipe(v.string(), v.url(), v.description("Base URL of the documentation website."))),
|
|
@@ -170,7 +201,7 @@ server.tool({
|
|
|
170
201
|
const libraryName = requireValue(input.library, "library is required for action=refresh.");
|
|
171
202
|
const lib = db.getLibraryByName(libraryName);
|
|
172
203
|
if (!lib)
|
|
173
|
-
return tool.text(
|
|
204
|
+
return tool.text(`❌ Library "${libraryName}" not found. Use list_libraries to see available.`);
|
|
174
205
|
const job = jobManager.startCrawl(lib.id, { incremental: true });
|
|
175
206
|
return tool.text(`🔄 Refresh started for "${lib.display_name}".\nJob ${job.id}: checking for updated pages...`);
|
|
176
207
|
}
|
|
@@ -178,7 +209,7 @@ server.tool({
|
|
|
178
209
|
const libraryName = requireValue(input.library, "library is required for action=remove.");
|
|
179
210
|
const lib = db.getLibraryByName(libraryName);
|
|
180
211
|
if (!lib)
|
|
181
|
-
return tool.text(
|
|
212
|
+
return tool.text(`❌ Library "${libraryName}" not found.`);
|
|
182
213
|
db.removeLibrary(lib.id);
|
|
183
214
|
return tool.text(`🗑️ Library "${lib.display_name}" removed.\nDeleted ${lib.page_count} pages and ${lib.chunk_count} chunks.`);
|
|
184
215
|
}
|
|
@@ -186,14 +217,14 @@ server.tool({
|
|
|
186
217
|
const libraryName = requireValue(input.library, "library is required for action=info.");
|
|
187
218
|
const lib = db.getLibraryByName(libraryName);
|
|
188
219
|
if (!lib)
|
|
189
|
-
return tool.text(
|
|
220
|
+
return tool.text(`❌ Library "${libraryName}" not found. Use list_libraries to see available libraries.`);
|
|
190
221
|
return tool.text(formatLibraryInfo(lib.id));
|
|
191
222
|
}
|
|
192
223
|
}
|
|
193
224
|
}
|
|
194
225
|
catch (err) {
|
|
195
226
|
const message = err instanceof Error ? err.message : "Unknown error";
|
|
196
|
-
return tool.text(`❌
|
|
227
|
+
return tool.text(`❌ Error: ${message}`);
|
|
197
228
|
}
|
|
198
|
-
return tool.text(`❌
|
|
229
|
+
return tool.text(`❌ Error: Unsupported action.`);
|
|
199
230
|
});
|
package/dist/storage/db.d.ts
CHANGED
|
@@ -51,6 +51,7 @@ export declare class Database {
|
|
|
51
51
|
createJob(job: {
|
|
52
52
|
id: string;
|
|
53
53
|
libraryId: string;
|
|
54
|
+
sessionId?: string;
|
|
54
55
|
}): CrawlJob;
|
|
55
56
|
getJob(id: string): CrawlJob | undefined;
|
|
56
57
|
updateJob(id: string, updates: Partial<Pick<CrawlJob, "status" | "pages_discovered" | "pages_crawled" | "pages_failed" | "chunks_created" | "error_message" | "started_at" | "completed_at">>): void;
|
package/dist/storage/db.js
CHANGED
|
@@ -92,6 +92,7 @@ export class Database {
|
|
|
92
92
|
CREATE TABLE IF NOT EXISTS crawl_jobs (
|
|
93
93
|
id TEXT PRIMARY KEY,
|
|
94
94
|
library_id TEXT NOT NULL REFERENCES libraries(id) ON DELETE CASCADE,
|
|
95
|
+
session_id TEXT,
|
|
95
96
|
status TEXT NOT NULL DEFAULT 'queued',
|
|
96
97
|
pages_discovered INTEGER NOT NULL DEFAULT 0,
|
|
97
98
|
pages_crawled INTEGER NOT NULL DEFAULT 0,
|
|
@@ -213,8 +214,8 @@ export class Database {
|
|
|
213
214
|
// ──────────────────────────────────────
|
|
214
215
|
createJob(job) {
|
|
215
216
|
this.db
|
|
216
|
-
.prepare("INSERT INTO crawl_jobs (id, library_id) VALUES (?, ?)")
|
|
217
|
-
.run(job.id, job.libraryId);
|
|
217
|
+
.prepare("INSERT INTO crawl_jobs (id, library_id, session_id) VALUES (?, ?, ?)")
|
|
218
|
+
.run(job.id, job.libraryId, job.sessionId ?? null);
|
|
218
219
|
return this.db
|
|
219
220
|
.prepare("SELECT * FROM crawl_jobs WHERE id = ?")
|
|
220
221
|
.get(job.id);
|
package/dist/storage/search.d.ts
CHANGED
|
@@ -1,21 +1,23 @@
|
|
|
1
|
-
import type { Database } from
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
heading_context: string;
|
|
5
|
-
page_url: string;
|
|
6
|
-
page_title: string;
|
|
7
|
-
library_name: string;
|
|
8
|
-
library_display_name: string;
|
|
9
|
-
relevance_score: number;
|
|
10
|
-
has_code_block: boolean;
|
|
11
|
-
token_count: number;
|
|
12
|
-
}
|
|
1
|
+
import type { Database } from "./db.js";
|
|
2
|
+
import type { SearchOptions, SearchResult } from "../search/types.js";
|
|
3
|
+
export type { SearchOptions, SearchResult } from "../search/types.js";
|
|
13
4
|
/**
 * Full-text search front end over the DocShark database.
 *
 * Only the constructor and `search` are public. Every other member is an
 * implementation detail; it is declared `private` without a type, so it is
 * not callable by consumers of this declaration file.
 */
export declare class SearchEngine {
    /** Database handle supplied to the constructor. */
    private db;
    /** Query planner instance owned by the engine. */
    private planner;

    /**
     * @param db Database wrapper the engine queries.
     */
    constructor(db: Database);

    /**
     * Runs a search and returns the matching results.
     *
     * @param query Free-text query string.
     * @param opts  Optional search options (see `SearchOptions`).
     * @returns The matching results as a `SearchResult` array.
     */
    search(query: string, opts?: SearchOptions): SearchResult[];

    // Candidate retrieval and FTS query construction.
    private fetchCandidates;
    private buildFtsQuery;
    private quoteTerm;

    // Re-ranking and duplicate collapsing.
    private rerank;
    private scoreCandidate;
    private collapseDuplicates;
    private preferenceScore;
    private canonicalPageKey;

    // Page classification and text-matching helpers.
    private inferPathType;
    private pathTypeScore;
    private keywordOverlap;
    private hasPhraseMatch;
    private primaryTitle;
    private extractVersionTag;
}
|
package/dist/storage/search.js
CHANGED
|
@@ -1,49 +1,323 @@
|
|
|
1
|
+
import { QueryPlanner, normalizeSearchText } from "../search/query-planner.js";
|
|
1
2
|
/**
 * Search pipeline over the FTS5 index:
 *
 *   1. build a query plan (`QueryPlanner`) and turn it into an FTS5 MATCH string;
 *   2. fetch an over-sized candidate set ordered by bm25;
 *   3. re-rank the candidates with title/heading/path/library/version heuristics;
 *   4. collapse duplicates (same URL, then same page across `/vN/` versions);
 *   5. return the top `limit` results.
 */
export class SearchEngine {
    db;
    planner = new QueryPlanner();

    /**
     * @param db Database wrapper; must expose `raw()` returning a handle with `prepare()`.
     */
    constructor(db) {
        this.db = db;
    }

    /**
     * Runs a search and returns at most `opts.limit` results (default 5).
     *
     * Any error raised while querying or ranking is logged with
     * `console.warn` and turned into an empty result list; this method does
     * not throw for those failures.
     *
     * @param query Free-text query.
     * @param opts  Optional `{ limit, library }`.
     * @returns Re-ranked, de-duplicated results, best first.
     */
    search(query, opts = {}) {
        // Normalise the limit: a negative value would otherwise reach
        // `slice(0, limit)` and return "all but the last N" results, and a
        // non-numeric value would be forwarded to SQL LIMIT.
        const requestedLimit = Number(opts.limit ?? 5);
        const limit = Number.isFinite(requestedLimit)
            ? Math.max(0, Math.trunc(requestedLimit))
            : 5;
        const plan = this.planner.build(query, opts.library);
        const ftsQuery = this.buildFtsQuery(plan);
        if (!ftsQuery)
            return [];
        try {
            const candidates = this.fetchCandidates(ftsQuery, opts.library, limit);
            if (candidates.length === 0) {
                return [];
            }
            const reranked = this.rerank(plan, candidates);
            return this.collapseDuplicates(plan, reranked).slice(0, limit);
        }
        catch (err) {
            // A thrown value is not guaranteed to be an Error instance.
            const reason = err instanceof Error ? err.message : String(err);
            console.warn(`[DocShark] Search failed:`, reason);
            return [];
        }
    }

    /**
     * Fetches bm25-ordered candidate chunks for an FTS5 MATCH expression.
     *
     * More rows than `limit` are fetched (12x, clamped to 25..80) so that
     * re-ranking and duplicate collapsing have room to work.
     *
     * @param ftsQuery FTS5 MATCH expression.
     * @param library  Optional library name filter; nullish means "all libraries".
     * @param limit    Number of results the caller ultimately wants.
     * @returns Candidate rows with `has_code_block` converted to a boolean.
     */
    fetchCandidates(ftsQuery, library, limit) {
        const candidateLimit = Math.min(Math.max(limit * 12, 25), 80);
        const stmt = this.db.raw().prepare(`
      SELECT
        c.content,
        COALESCE(c.heading_context, '') AS heading_context,
        c.has_code_block,
        COALESCE(c.token_count, 0) AS token_count,
        c.chunk_index,
        p.url AS page_url,
        p.path AS page_path,
        COALESCE(p.title, 'Untitled') AS page_title,
        l.name AS library_name,
        l.display_name AS library_display_name,
        bm25(chunks_fts, 1.0, 0.7) AS lexical_score
      FROM chunks_fts
      JOIN chunks c ON chunks_fts.rowid = c.rowid
      JOIN pages p ON c.page_id = p.id
      JOIN libraries l ON c.library_id = l.id
      WHERE chunks_fts MATCH ?
        AND (? IS NULL OR l.name = ?)
      ORDER BY lexical_score
      LIMIT ?
    `);
        // The library filter is bound twice: once for the IS NULL test and
        // once for the equality comparison.
        const rows = stmt.all(ftsQuery, library ?? null, library ?? null, candidateLimit);
        return rows.map((row) => ({
            ...row,
            has_code_block: row.has_code_block === 1,
        }));
    }

    /**
     * Builds the FTS5 MATCH expression for a plan.
     *
     * The expression is an OR of: the whole normalized query as a phrase,
     * each planned phrase, each keyword, and (for 2..6 keywords) an AND group
     * of all keywords. Duplicate clauses are removed.
     *
     * @param plan Query plan with `normalized_query`, `phrases`, `keywords`.
     * @returns The MATCH expression, or "" when the plan yields no clause.
     */
    buildFtsQuery(plan) {
        const clauses = new Set();
        if (plan.normalized_query.length > 0) {
            clauses.add(this.quoteTerm(plan.normalized_query));
        }
        for (const phrase of plan.phrases) {
            clauses.add(this.quoteTerm(phrase));
        }
        for (const keyword of plan.keywords) {
            clauses.add(this.quoteTerm(keyword));
        }
        if (plan.keywords.length > 1 && plan.keywords.length <= 6) {
            clauses.add(`(${plan.keywords.map((keyword) => this.quoteTerm(keyword)).join(" AND ")})`);
        }
        return Array.from(clauses).join(" OR ");
    }

    /**
     * Wraps a value in double quotes so it is used as a quoted FTS5 string.
     * Single and double quotes inside the value are removed rather than escaped.
     *
     * @param value Raw term or phrase.
     * @returns The quoted term.
     */
    quoteTerm(value) {
        return `"${value.replace(/["']/g, "").trim()}"`;
    }

    /**
     * Scores every candidate and sorts by descending `rerank_score`.
     * Ties are broken by ascending `lexical_score`.
     *
     * @param plan       Query plan.
     * @param candidates Candidates in bm25 order (their index feeds the score).
     * @returns Scored candidates, best first.
     */
    rerank(plan, candidates) {
        const total = Math.max(candidates.length, 1);
        return candidates
            .map((candidate, index) => this.scoreCandidate(plan, candidate, index, total))
            .sort((left, right) => {
            if (right.rerank_score !== left.rerank_score) {
                return right.rerank_score - left.rerank_score;
            }
            return left.lexical_score - right.lexical_score;
        });
    }

    /**
     * Computes the heuristic score for a single candidate.
     *
     * Signals, in order: position in the bm25 ordering, exact/partial title
     * match, heading overlap, path overlap, library-name overlap, phrase
     * match, page-type prior for the query intent, presence of a code block,
     * chunk size, and version preference.
     *
     * @param plan      Query plan.
     * @param candidate Candidate row from `fetchCandidates`.
     * @param index     Position of the candidate in the bm25 ordering.
     * @param total     Number of candidates (at least 1).
     * @returns The candidate extended with `path_type`, `version_tag`,
     *          `rerank_score` (6 decimals) and up to four `reasons`.
     */
    scoreCandidate(plan, candidate, index, total) {
        const title = normalizeSearchText(candidate.page_title);
        const primaryTitle = normalizeSearchText(this.primaryTitle(candidate.page_title));
        const heading = normalizeSearchText(candidate.heading_context);
        const path = normalizeSearchText(candidate.page_path);
        const libraryText = normalizeSearchText(`${candidate.library_name} ${candidate.library_display_name}`);
        // Only the start of the chunk is inspected for phrase matches.
        const contentPreview = normalizeSearchText(candidate.content.slice(0, 800));
        const pathType = this.inferPathType(candidate.page_path, candidate.page_title);
        const versionTag = this.extractVersionTag(candidate.page_path);
        const reasons = [];
        let score = 0;

        // Earlier bm25 positions contribute more, up to 0.35 for the first row.
        const lexicalRankScore = 0.35 * (1 - index / total);
        score += lexicalRankScore;

        const titleExact = primaryTitle.includes(plan.normalized_query) &&
            plan.normalized_query.length > 0;
        if (titleExact) {
            score += 0.22;
            reasons.push("exact title match");
        }

        const titleOverlap = this.keywordOverlap(plan.keywords, primaryTitle || title);
        if (titleOverlap > 0) {
            score += 0.14 * titleOverlap;
            if (titleOverlap === 1 && !titleExact) {
                reasons.push("all query keywords appear in title");
            }
        }

        const headingOverlap = this.keywordOverlap(plan.keywords, heading);
        if (headingOverlap > 0) {
            score += 0.1 * headingOverlap;
            if (headingOverlap >= 0.6) {
                reasons.push("heading context aligns with the query");
            }
        }

        const pathOverlap = this.keywordOverlap(plan.keywords, path);
        if (pathOverlap > 0) {
            score += 0.08 * pathOverlap;
        }

        const libraryOverlap = this.keywordOverlap(plan.keywords, libraryText);
        if (libraryOverlap > 0) {
            score += 0.08 * libraryOverlap;
            if (libraryOverlap === 1) {
                reasons.push("library name aligns with the query");
            }
        }
        else if (plan.keywords.length === 1) {
            // Single-keyword query that does not name this library: small penalty.
            score -= 0.03;
        }

        const phraseMatch = this.hasPhraseMatch(plan, title, heading, contentPreview);
        if (phraseMatch) {
            score += 0.08;
            reasons.push("exact phrase match");
        }

        const pathPrior = this.pathTypeScore(plan.intent, pathType);
        if (pathPrior > 0) {
            score += pathPrior;
            if (pathPrior >= 0.1) {
                reasons.push(`matched ${pathType.replace(/_/g, "-")} page type`);
            }
        }

        if (candidate.has_code_block) {
            const codeSignal = plan.intent === "api_lookup" || plan.intent === "troubleshooting"
                ? 0.07
                : plan.intent === "getting_started"
                    ? 0.03
                    : 0.015;
            score += codeSignal;
            if (codeSignal >= 0.03) {
                reasons.push("includes code sample");
            }
        }

        // Mid-sized chunks get a small bonus.
        if (candidate.token_count >= 60 && candidate.token_count <= 260) {
            score += 0.03;
        }

        if (plan.requested_version) {
            if (versionTag === plan.requested_version) {
                score += 0.12;
                reasons.push(`matches requested version v${versionTag}`);
            }
        }
        else if (!versionTag) {
            score += 0.08;
            reasons.push("canonical unversioned page");
        }
        else {
            // No version requested: higher versions get a slightly larger
            // bonus, capped at v20 so it stays below the unversioned bonus.
            score += Math.min(parseInt(versionTag, 10), 20) / 300;
        }

        const uniqueReasons = Array.from(new Set(reasons)).slice(0, 4);
        return {
            ...candidate,
            path_type: pathType,
            version_tag: versionTag,
            rerank_score: Number(score.toFixed(6)),
            reasons: uniqueReasons,
        };
    }

    /**
     * Removes duplicate results in two passes:
     *   1. keep the best-scoring chunk per `page_url`;
     *   2. keep the preferred page per canonical key (same library, same path
     *      with `/vN` segments removed, same title with `vN` removed).
     *
     * @param plan       Query plan (drives the version/intent preference).
     * @param candidates Scored candidates.
     * @returns Surviving results sorted by descending `rerank_score`, ties
     *          broken by ascending `lexical_score`.
     */
    collapseDuplicates(plan, candidates) {
        const bestByPage = new Map();
        for (const candidate of candidates) {
            const existing = bestByPage.get(candidate.page_url);
            if (!existing || candidate.rerank_score > existing.rerank_score) {
                bestByPage.set(candidate.page_url, candidate);
            }
        }
        const bestByCanonicalPage = new Map();
        const pageResults = Array.from(bestByPage.values()).sort((left, right) => right.rerank_score - left.rerank_score);
        for (const candidate of pageResults) {
            const canonicalKey = this.canonicalPageKey(candidate);
            const existing = bestByCanonicalPage.get(canonicalKey);
            if (!existing ||
                this.preferenceScore(plan, candidate) >
                    this.preferenceScore(plan, existing)) {
                bestByCanonicalPage.set(canonicalKey, candidate);
            }
        }
        return Array.from(bestByCanonicalPage.values()).sort((left, right) => {
            if (right.rerank_score !== left.rerank_score) {
                return right.rerank_score - left.rerank_score;
            }
            return left.lexical_score - right.lexical_score;
        });
    }

    /**
     * Score used only to choose between results that share a canonical page
     * key. Starts from `rerank_score` and adds a version preference plus a
     * small bonus for overview/getting-started pages when the intent asks
     * for them.
     *
     * @param plan   Query plan.
     * @param result Scored result.
     * @returns Preference score; higher wins.
     */
    preferenceScore(plan, result) {
        let score = result.rerank_score;
        if (plan.requested_version) {
            if (result.version_tag === plan.requested_version) {
                score += 0.2;
            }
        }
        else if (!result.version_tag) {
            score += 0.12;
        }
        else {
            score += parseInt(result.version_tag, 10) / 500;
        }
        if (plan.intent === "overview" || plan.intent === "getting_started") {
            if (result.path_type === "getting_started" ||
                result.path_type === "overview") {
                score += 0.02;
            }
        }
        return score;
    }

    /**
     * Builds the key that identifies "the same page" across versions:
     * `library:path-without-/vN:title-without-vN`.
     *
     * @param result Scored result.
     * @returns Canonical key string.
     */
    canonicalPageKey(result) {
        const canonicalPath = result.page_path
            .toLowerCase()
            .replace(/\/v\d+(?=\/|$)/g, "")
            .replace(/\/+/g, "/")
            .replace(/\/$/, "") || "/";
        const canonicalTitle = normalizeSearchText(result.page_title)
            .replace(/\bv\d+\b/g, "")
            .trim();
        return `${result.library_name}:${canonicalPath}:${canonicalTitle}`;
    }

    /**
     * Classifies a page from its path and title.
     *
     * The title is part of the matched text, so the getting-started pattern
     * accepts space/underscore separated spellings ("Getting Started",
     * "Quick Start") in addition to the hyphenated URL forms.
     *
     * @param pagePath  Page path.
     * @param pageTitle Page title.
     * @returns One of "getting_started", "overview", "api", "troubleshooting", "guide".
     */
    inferPathType(pagePath, pageTitle) {
        const value = `${pagePath} ${pageTitle}`.toLowerCase();
        if (/getting[-_ ]started|quick[-_ ]?start|install|installation|setup/.test(value)) {
            return "getting_started";
        }
        if (/overview|introduction|what is|basics/.test(value)) {
            return "overview";
        }
        if (/\/api|\bapi\b|reference|\/apis\//.test(value)) {
            return "api";
        }
        if (/troubleshoot|troubleshooting|errors?|faq|debug/.test(value)) {
            return "troubleshooting";
        }
        return "guide";
    }

    /**
     * Prior added to a candidate's score for how well its page type fits the
     * query intent.
     *
     * @param intent   Query intent from the plan.
     * @param pathType Page type from `inferPathType`.
     * @returns A bonus in the range 0..0.18.
     */
    pathTypeScore(intent, pathType) {
        switch (intent) {
            case "overview":
                if (pathType === "overview")
                    return 0.16;
                if (pathType === "getting_started")
                    return 0.1;
                if (pathType === "guide")
                    return 0.06;
                return 0;
            case "getting_started":
                if (pathType === "getting_started")
                    return 0.18;
                if (pathType === "guide")
                    return 0.08;
                if (pathType === "overview")
                    return 0.06;
                return 0;
            case "api_lookup":
                if (pathType === "api")
                    return 0.18;
                if (pathType === "guide")
                    return 0.04;
                return 0;
            case "troubleshooting":
                if (pathType === "troubleshooting")
                    return 0.16;
                if (pathType === "guide")
                    return 0.07;
                return 0;
            default:
                if (pathType === "getting_started")
                    return 0.08;
                if (pathType === "overview")
                    return 0.07;
                if (pathType === "api")
                    return 0.05;
                if (pathType === "guide")
                    return 0.03;
                return 0;
        }
    }

    /**
     * Fraction of keywords that occur as substrings of `haystack`.
     *
     * @param keywords Query keywords.
     * @param haystack Text to search in.
     * @returns A value in 0..1; 0 when either input is empty.
     */
    keywordOverlap(keywords, haystack) {
        if (keywords.length === 0 || haystack.length === 0) {
            return 0;
        }
        const matches = keywords.filter((keyword) => haystack.includes(keyword)).length;
        return matches / keywords.length;
    }

    /**
     * Tells whether the query matches as a phrase.
     *
     * True when a multi-word normalized query occurs in the title or heading,
     * or when any planned phrase occurs in the title, heading or content.
     *
     * @param plan    Query plan.
     * @param title   Normalized page title.
     * @param heading Normalized heading context.
     * @param content Normalized content preview.
     * @returns Whether a phrase match was found.
     */
    hasPhraseMatch(plan, title, heading, content) {
        if (plan.normalized_query.includes(" ") &&
            (title.includes(plan.normalized_query) ||
                heading.includes(plan.normalized_query))) {
            return true;
        }
        return plan.phrases.some((phrase) => title.includes(phrase) ||
            heading.includes(phrase) ||
            content.includes(phrase));
    }

    /**
     * Returns the leading segment of a page title, i.e. the part before a
     * site-name suffix such as " | Docs", " — Docs" or " - Docs".
     *
     * An ASCII hyphen only counts as a separator when it has whitespace on
     * both sides, so hyphenated words ("Server-Side Rendering") stay intact.
     * If the leading segment is empty, the full title is returned.
     *
     * @param title Page title.
     * @returns The primary title segment, trimmed.
     */
    primaryTitle(title) {
        const leading = title.split(/\||—|\s+-\s+/)[0].trim();
        return leading.length > 0 ? leading : title;
    }

    /**
     * Extracts the major version from a `/vN` path segment.
     *
     * @param path Page path.
     * @returns The digits of the first `/vN` segment as a string, or null
     *          when the path has none.
     */
    extractVersionTag(path) {
        const match = path.toLowerCase().match(/\/v(\d+)(?=\/|$)/);
        return match?.[1] ?? null;
    }
}
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
// src/tools/search-docs.ts — Primary search tool (80% of usage)
|
|
2
2
|
import * as v from 'valibot';
|
|
3
3
|
import { tool } from 'tmcp/utils';
|
|
4
|
+
import { formatSearchResults } from '../search/format-results.js';
|
|
4
5
|
export function createSearchDocsTool(searchEngine) {
|
|
5
6
|
return {
|
|
6
7
|
definition: {
|
|
@@ -20,16 +21,7 @@ export function createSearchDocsTool(searchEngine) {
|
|
|
20
21
|
if (results.length === 0) {
|
|
21
22
|
return tool.text(`No results found for "${query}".`);
|
|
22
23
|
}
|
|
23
|
-
|
|
24
|
-
.map((r, i) => {
|
|
25
|
-
let block = `### ${i + 1}. ${r.page_title} — ${r.library_display_name}\n`;
|
|
26
|
-
block += `**Source:** ${r.page_url}\n`;
|
|
27
|
-
block += `**Section:** ${r.heading_context}\n\n`;
|
|
28
|
-
block += r.content;
|
|
29
|
-
return block;
|
|
30
|
-
})
|
|
31
|
-
.join('\n\n---\n\n');
|
|
32
|
-
return tool.text(`## Results for "${query}"\n\n${formatted}`);
|
|
24
|
+
return tool.text(formatSearchResults(query, results));
|
|
33
25
|
},
|
|
34
26
|
};
|
|
35
27
|
}
|
package/dist/types.d.ts
CHANGED
package/dist/version.d.ts
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
export declare const VERSION = "0.1.
|
|
1
|
+
export declare const VERSION = "0.1.21";
|
package/dist/version.js
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
1
|
// This file is automatically updated by the version sync script.
|
|
2
|
-
export const VERSION = '0.1.
|
|
2
|
+
export const VERSION = '0.1.21';
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "docshark",
|
|
3
|
-
"version": "0.1.
|
|
3
|
+
"version": "0.1.21",
|
|
4
4
|
"description": "🦈 Documentation MCP Server — scrape, index, and search any doc website",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./dist/index.js",
|
|
@@ -56,7 +56,7 @@
|
|
|
56
56
|
"dependencies": {
|
|
57
57
|
"@mozilla/readability": "^0.6.0",
|
|
58
58
|
"@tmcp/adapter-valibot": "^0.1.5",
|
|
59
|
-
"@tmcp/transport-http": "^0.8.
|
|
59
|
+
"@tmcp/transport-http": "^0.8.5",
|
|
60
60
|
"@tmcp/transport-sse": "^0.5.3",
|
|
61
61
|
"@tmcp/transport-stdio": "^0.4.1",
|
|
62
62
|
"cac": "^7.0.0",
|
|
@@ -66,7 +66,7 @@
|
|
|
66
66
|
"puppeteer-core": "^24.37.5",
|
|
67
67
|
"robots-parser": "^3.0.1",
|
|
68
68
|
"srvx": "^0.11.8",
|
|
69
|
-
"tmcp": "^1.19.
|
|
69
|
+
"tmcp": "^1.19.3",
|
|
70
70
|
"turndown": "^7.2.2",
|
|
71
71
|
"turndown-plugin-gfm": "^1.0.2",
|
|
72
72
|
"valibot": "^1.2.0"
|