docshark 0.1.20 → 0.1.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -5,6 +5,7 @@ import { startHttpServer } from "./http.js";
5
5
  import { StdioTransport } from "@tmcp/transport-stdio";
6
6
  import { server, db, searchEngine, libraryService } from "./server.js";
7
7
  import { maybeNotifyAboutUpdate, runUpdateCommand } from "./cli-update.js";
8
+ import { formatSearchResults } from "./search/format-results.js";
8
9
  import { VERSION } from "./version.js";
9
10
  const useColor = process.stdout.isTTY;
10
11
  const color = {
@@ -160,12 +161,7 @@ cli
160
161
  console.log(`\nNo results found for "${query}".\n`);
161
162
  return;
162
163
  }
163
- for (const r of results) {
164
- console.log(`\n--- ${r.page_title} (${r.library_display_name}) ---`);
165
- console.log(`Section: ${r.heading_context}`);
166
- console.log(r.content.slice(0, 300));
167
- console.log(`Source: ${r.page_url}\n`);
168
- }
164
+ console.log(`\n${formatSearchResults(query, results)}\n`);
169
165
  });
170
166
  cli
171
167
  .command("list", "List indexed libraries")
@@ -9,6 +9,7 @@ export declare class JobManager {
9
9
  /** Start a crawl job for a library */
10
10
  startCrawl(libraryId: string, opts?: {
11
11
  incremental?: boolean;
12
+ sessionId?: string;
12
13
  }): CrawlJob;
13
14
  /** Get status of a specific job */
14
15
  getJob(jobId: string): CrawlJob | undefined;
@@ -12,7 +12,7 @@ export class JobManager {
12
12
  /** Start a crawl job for a library */
13
13
  startCrawl(libraryId, opts) {
14
14
  const jobId = nanoid();
15
- const job = this.db.createJob({ id: jobId, libraryId });
15
+ const job = this.db.createJob({ id: jobId, libraryId, sessionId: opts?.sessionId });
16
16
  // Run crawl async (non-blocking)
17
17
  const worker = new CrawlWorker(this.db, this.eventBus);
18
18
  this.activeJobs.set(jobId, worker);
@@ -2,7 +2,7 @@
2
2
  export class RateLimiter {
3
3
  delayMs;
4
4
  lastRequest = 0;
5
- constructor(delayMs = 500) {
5
+ constructor(delayMs = 200) {
6
6
  this.delayMs = delayMs;
7
7
  }
8
8
  async wait() {
@@ -0,0 +1,2 @@
1
+ import type { SearchResult } from './types.js';
2
+ export declare function formatSearchResults(query: string, results: SearchResult[]): string;
@@ -0,0 +1,23 @@
1
+ import { sanitizeDocContent } from './sanitize.js';
2
/**
 * Render the ranking explanations for one search result.
 *
 * @param {string[]} reasons Human-readable ranking reasons.
 * @returns {string} A bold markdown line followed by a blank line, or an
 *   empty string when there is nothing to explain.
 */
function formatReasons(reasons) {
    return reasons.length === 0
        ? ''
        : '**Why this ranked highly:** ' + reasons.join(', ') + '\n\n';
}
8
/**
 * Format ranked search results as a markdown document.
 *
 * Each result becomes a `### N. Title — Library` section with a Source line,
 * an optional Section line, a blank line, the optional ranking reasons and
 * finally the sanitized chunk content. Results are separated by `---` rules.
 *
 * @param {string} query The query as typed by the user (echoed in the header).
 * @param {Array<object>} results Ranked search results.
 * @returns {string} Markdown text.
 */
export function formatSearchResults(query, results) {
    const formatted = results
        .map((result, index) => {
            const headerLines = [
                `### ${index + 1}. ${result.page_title} — ${result.library_display_name}`,
                `**Source:** ${result.page_url}`,
            ];
            // heading_context may be missing on results produced by older callers.
            const heading = result.heading_context ?? '';
            if (heading.trim().length > 0) {
                headerLines.push(`**Section:** ${heading}`);
            }
            // Sanitize content to prevent prompt injection
            const sanitizedContent = sanitizeDocContent(result.content);
            // A blank line must separate the metadata lines from what follows;
            // otherwise markdown folds the content into the Source/Section paragraph.
            return `${headerLines.join('\n')}\n\n${formatReasons(result.reasons ?? [])}${sanitizedContent}`;
        })
        .join('\n\n---\n\n');
    return `## Results for "${query}"\n\n${formatted}`;
}
@@ -0,0 +1,7 @@
1
+ import type { SearchPlan } from "./types.js";
2
+ export declare function normalizeSearchText(value: string): string;
3
+ export declare class QueryPlanner {
4
+ build(query: string, library?: string): SearchPlan;
5
+ private detectIntent;
6
+ private extractVersion;
7
+ }
@@ -0,0 +1,88 @@
1
// Common English function words that are dropped from the keyword list
// when a query also contains more specific terms.
const STOP_WORDS = new Set([
    "a", "an", "and", "are", "at", "do",
    "for", "how", "i", "in", "is", "of",
    "on", "or", "the", "to", "what", "with",
]);
// Phrases that are searched as exact phrases when they occur in the query.
const PHRASE_HINTS = [
    "getting started", "quickstart", "overview",
    "reference", "api", "troubleshooting",
];
29
/**
 * Canonicalize free text for matching: lower-case it, turn every run of
 * characters other than a-z, 0-9, `@`, `/`, `.`, `_`, `-` and whitespace
 * into a single space, collapse whitespace runs and trim the ends.
 *
 * @param {string} value Raw text.
 * @returns {string} Normalized text.
 */
export function normalizeSearchText(value) {
    const lowered = value.toLowerCase();
    const symbolsRemoved = lowered.replace(/[^a-z0-9@/._\-\s]+/g, " ");
    const singleSpaced = symbolsRemoved.replace(/\s+/g, " ");
    return singleSpaced.trim();
}
36
/**
 * Strip leading and trailing characters that are not allowed in a token
 * (anything other than letters, digits, `@`, `/`, `.`, `_`, `-`) and
 * lower-case the remainder. Interior characters are left untouched.
 *
 * @param {string} value Candidate token.
 * @returns {string} Cleaned token, possibly empty.
 */
function sanitizeToken(value) {
    const edgesTrimmed = value.replace(/^[^a-z0-9@/._-]+|[^a-z0-9@/._-]+$/gi, "");
    return edgesTrimmed.toLowerCase();
}
39
/**
 * Turns a raw user query into a SearchPlan: normalized text, keywords,
 * recognised phrases, a coarse intent and an optional requested version.
 */
export class QueryPlanner {
    /**
     * Build the search plan for a query.
     *
     * @param {string} query Query as typed by the user.
     * @param {string} [library] Optional library filter, copied into the plan.
     * @returns {object} The plan (see SearchPlan in ./types).
     */
    build(query, library) {
        const normalizedQuery = normalizeSearchText(query);
        const rawTokens = normalizedQuery
            .split(/\s+/)
            .map((token) => sanitizeToken(token))
            .filter(Boolean);
        const uniqueTokens = Array.from(new Set(rawTokens));
        const filteredKeywords = uniqueTokens.filter((token) => token.length > 1 && !STOP_WORDS.has(token));
        return {
            original_query: query,
            normalized_query: normalizedQuery,
            intent: this.detectIntent(normalizedQuery, query),
            // If every token was a stop word or a single character, fall back
            // to the unfiltered tokens so the plan still has something to match.
            keywords: filteredKeywords.length > 0 ? filteredKeywords : uniqueTokens,
            phrases: PHRASE_HINTS.filter((phrase) => normalizedQuery.includes(phrase)),
            requested_library: library,
            requested_version: this.extractVersion(normalizedQuery),
        };
    }
    /**
     * Classify the query. Checks run in order and the first match wins:
     * getting_started, overview, api_lookup, troubleshooting, then general.
     *
     * @param {string} query Normalized query (lower-case, punctuation stripped).
     * @param {string} [originalQuery] Query as typed. Needed for the call-syntax
     *   check, because normalization lower-cases the text and removes `(`.
     *   Defaults to `query`.
     * @returns {string} One of the SearchIntent values.
     */
    detectIntent(query, originalQuery = query) {
        if (query.includes("getting started") ||
            query.includes("quickstart") ||
            query.startsWith("install ") ||
            query.startsWith("setup ")) {
            return "getting_started";
        }
        if (query.includes("overview") ||
            query.startsWith("what is ") ||
            query.startsWith("about ")) {
            return "overview";
        }
        if (/[a-z]+\.[a-z]+/.test(query) ||
            // Call syntax such as "useEffect(" only survives in the original text.
            /[A-Z][a-zA-Z]+\(/.test(originalQuery) ||
            query === "api" ||
            query.startsWith("api ") ||
            query.includes(" api") ||
            query.includes("reference") ||
            query.includes("@")) {
            return "api_lookup";
        }
        if (/error|fail|issue|problem|broken|debug|fix|troubleshoot/.test(query)) {
            return "troubleshooting";
        }
        return "general";
    }
    /**
     * Extract a major version such as "v18" or "version 5" from the query.
     *
     * @param {string} query Normalized query.
     * @returns {string | undefined} The digits of the version, if present.
     */
    extractVersion(query) {
        const match = query.match(/\bv(?:ersion)?\s*(\d+)\b/);
        return match?.[1];
    }
}
@@ -0,0 +1,10 @@
1
+ /**
2
+ * Output sanitization to prevent prompt injection attacks
3
+ * Removes suspicious patterns that could escape agent context
4
+ */
5
+ export declare function sanitizeOutput(text: string): string;
6
+ /**
7
+ * Sanitize a single chunk of documentation content
8
+ * Removes malicious content while preserving code blocks and formatting
9
+ */
10
+ export declare function sanitizeDocContent(content: string): string;
@@ -0,0 +1,40 @@
1
/**
 * Output sanitization to prevent prompt injection attacks.
 * Removes suspicious patterns that could escape agent context.
 *
 * All patterns are matched within a single line (`.` does not cross
 * newlines). The result is trimmed.
 *
 * @param {string} text Text to clean.
 * @returns {string} Text with the matched spans removed.
 */
export function sanitizeOutput(text) {
    return text
        // Remove template directives
        .replace(/\{[#%].*?[#%]\}/g, '')
        // Remove system prompt markers
        .replace(/\[SYSTEM[\]:].*?\[\/SYSTEM\]/gi, '')
        .replace(/\[ADMIN[\]:].*?\[\/ADMIN\]/gi, '')
        // Remove potential prompt injection patterns
        .replace(/ignore\s+above.*?instructions/gi, '')
        .replace(/forget\s+previous.*?context/gi, '')
        .trim();
}
/**
 * Sanitize a single chunk of documentation content.
 * Removes injection patterns from the prose while leaving fenced code blocks
 * (between triple backticks) byte-for-byte intact.
 *
 * @param {string} content Raw chunk content.
 * @returns {string} Sanitized content.
 */
export function sanitizeDocContent(content) {
    const codeBlocks = [];
    // Swap each fenced block for a placeholder so the sanitizer never sees it.
    // NUL delimiters keep the placeholder from colliding with ordinary text.
    const withPlaceholders = content.replace(/```[\s\S]*?```/g, (block) => {
        codeBlocks.push(block);
        return `\u0000CODE_BLOCK_${codeBlocks.length - 1}\u0000`;
    });
    const sanitized = sanitizeOutput(withPlaceholders);
    // Restore through a replacer function: a replacement *string* would
    // interpret `$$`, `$&`, `$1`, ... inside the code and corrupt it.
    return sanitized.replace(/\u0000CODE_BLOCK_(\d+)\u0000/g, (placeholder, index) => codeBlocks[Number(index)] ?? placeholder);
}
@@ -0,0 +1,33 @@
1
+ export type SearchIntent = "general" | "overview" | "getting_started" | "api_lookup" | "troubleshooting";
2
+ export interface SearchOptions {
3
+ library?: string;
4
+ limit?: number;
5
+ }
6
+ export interface SearchPlan {
7
+ original_query: string;
8
+ normalized_query: string;
9
+ intent: SearchIntent;
10
+ keywords: string[];
11
+ phrases: string[];
12
+ requested_version?: string;
13
+ requested_library?: string;
14
+ }
15
+ export interface SearchCandidate {
16
+ content: string;
17
+ heading_context: string;
18
+ page_url: string;
19
+ page_path: string;
20
+ page_title: string;
21
+ library_name: string;
22
+ library_display_name: string;
23
+ lexical_score: number;
24
+ has_code_block: boolean;
25
+ token_count: number;
26
+ chunk_index: number;
27
+ }
28
+ export interface SearchResult extends SearchCandidate {
29
+ rerank_score: number;
30
+ reasons: string[];
31
+ path_type: string;
32
+ version_tag: string | null;
33
+ }
@@ -0,0 +1 @@
1
+ export {};
package/dist/server.js CHANGED
@@ -5,6 +5,7 @@ import * as v from "valibot";
5
5
  import { tool } from "tmcp/utils";
6
6
  import { Database } from "./storage/db.js";
7
7
  import { SearchEngine } from "./storage/search.js";
8
+ import { formatSearchResults } from "./search/format-results.js";
8
9
  import { LibraryService } from "./services/library.js";
9
10
  import { JobManager } from "./jobs/manager.js";
10
11
  import { VERSION } from "./version.js";
@@ -32,29 +33,27 @@ export const server = new McpServer({
32
33
  // ──────────────────────────────────────
33
34
  server.tool({
34
35
  name: "search_docs",
35
- description: "Search through indexed documentation libraries for relevant information. " +
36
- "Returns ranked documentation sections with code examples and source URLs. " +
37
- "Use this when you need to find information about a library, framework, API, " +
38
- "or any technical concept.",
36
+ description: "Search indexed docs by keyword or library. Returns ranked sections with URLs.",
37
+ annotations: {
38
+ readOnlyHint: true,
39
+ idempotentHint: true,
40
+ },
39
41
  schema: v.object({
40
42
  query: v.pipe(v.string(), v.description("Search query. Use natural language.")),
41
43
  library: v.optional(v.pipe(v.string(), v.description("Filter to a specific library."))),
42
44
  limit: v.optional(v.pipe(v.number(), v.integer(), v.minValue(1), v.maxValue(20)), 5),
43
45
  }),
44
46
  }, async ({ query, library, limit }) => {
45
- const results = searchEngine.search(query, { library, limit });
46
- if (results.length === 0)
47
- return tool.text(`No results found for "${query}".`);
48
- const formatted = results
49
- .map((r, i) => {
50
- let block = `### ${i + 1}. ${r.page_title} — ${r.library_display_name}\n`;
51
- block += `**Source:** ${r.page_url}\n`;
52
- block += `**Section:** ${r.heading_context}\n\n`;
53
- block += r.content;
54
- return block;
55
- })
56
- .join("\n\n---\n\n");
57
- return tool.text(`## Results for "${query}"\n\n${formatted}`);
47
+ try {
48
+ const results = searchEngine.search(query, { library, limit });
49
+ if (results.length === 0)
50
+ return tool.text(`No results found for "${query}".`);
51
+ return tool.text(formatSearchResults(query, results));
52
+ }
53
+ catch (err) {
54
+ const message = err instanceof Error ? err.message : "Search failed";
55
+ return tool.text(`❌ Error: ${message}`);
56
+ }
58
57
  });
59
58
  function requireValue(value, message) {
60
59
  if (value === undefined || value === null || value === "") {
@@ -89,52 +88,84 @@ function formatLibraryInfo(libraryId) {
89
88
  return output;
90
89
  }
91
90
  // ──────────────────────────────────────
92
- // Tool 2: list_libraries — Discovery tool
91
+ // Tool 2: list_libraries — Discovery tool with pagination
93
92
  // ──────────────────────────────────────
94
93
  server.tool({
95
94
  name: "list_libraries",
96
- description: "List all documentation libraries currently indexed and available for searching. " +
97
- "Use this to discover what docs are available before running search_docs.",
95
+ description: "List indexed documentation libraries. Paginated results.",
96
+ annotations: {
97
+ readOnlyHint: true,
98
+ idempotentHint: true,
99
+ },
98
100
  schema: v.object({
99
101
  status: v.optional(v.pipe(v.picklist(["indexed", "crawling", "error", "all"]), v.description('Filter by status. Default: "all".')), "all"),
102
+ page: v.optional(v.pipe(v.number(), v.integer(), v.minValue(1)), 1),
103
+ limit: v.optional(v.pipe(v.number(), v.integer(), v.minValue(1), v.maxValue(50)), 20),
100
104
  }),
101
- }, async ({ status }) => {
102
- const libraries = db.listLibraries(status);
103
- if (libraries.length === 0) {
104
- return tool.text("No libraries indexed yet. Use manage_library with action=add to add a documentation website.");
105
+ }, async ({ status, page = 1, limit = 20 }) => {
106
+ try {
107
+ const libraries = db.listLibraries(status);
108
+ if (libraries.length === 0) {
109
+ return tool.text("No libraries indexed yet. Use manage_library with action=add to add a documentation website.");
110
+ }
111
+ // Paginate results
112
+ const start = (page - 1) * limit;
113
+ const end = start + limit;
114
+ const paginated = libraries.slice(start, end);
115
+ const hasMore = end < libraries.length;
116
+ // Minified response (no pretty-printing)
117
+ let output = `## Libraries (${start + 1}-${Math.min(end, libraries.length)} of ${libraries.length})\n\n`;
118
+ output += "| Library | URL | Pages | Chunks | Status |\n";
119
+ output += "| ------- | --- | ----- | ------ | ------ |\n";
120
+ for (const lib of paginated) {
121
+ output += `|${lib.name}|${lib.url}|${lib.page_count}|${lib.chunk_count}|${lib.status}|\n`;
122
+ }
123
+ if (hasMore) {
124
+ output += `\n**More available.** Use page=${page + 1} to fetch next page.`;
125
+ }
126
+ return tool.text(output);
105
127
  }
106
- let output = `## Indexed Libraries (${libraries.length} total)\n\n`;
107
- output += "| Library | URL | Pages | Chunks | Status |\n";
108
- output += "| ------- | --- | ----- | ------ | ------ |\n";
109
- for (const lib of libraries) {
110
- output += `| ${lib.name} | ${lib.url} | ${lib.page_count} | ${lib.chunk_count} | ${lib.status} |\n`;
128
+ catch (err) {
129
+ const message = err instanceof Error ? err.message : "Failed to list libraries";
130
+ return tool.text(`❌ Error: ${message}`);
111
131
  }
112
- return tool.text(output);
113
132
  });
114
133
  // ──────────────────────────────────────
115
134
  // Tool 3: get_doc_page — Full page read
116
135
  // ──────────────────────────────────────
117
136
  server.tool({
118
137
  name: "get_doc_page",
119
- description: "Retrieve the complete content of a specific documentation page as markdown. " +
120
- "Use when search results reference a page and you need full context.",
138
+ description: "Retrieve complete documentation page as markdown.",
139
+ annotations: {
140
+ readOnlyHint: true,
141
+ idempotentHint: true,
142
+ },
121
143
  schema: v.object({
122
144
  url: v.optional(v.pipe(v.string(), v.description("The full URL of the documentation page."))),
123
145
  library: v.optional(v.pipe(v.string(), v.description("Library name to search within."))),
124
146
  path: v.optional(v.pipe(v.string(), v.description("Relative path within the library."))),
125
147
  }),
126
148
  }, async ({ url, library, path }) => {
127
- const page = db.getPage({ url, library, path });
128
- if (!page)
129
- return tool.text("Page not found. Use search_docs to find the correct page.");
130
- return tool.text(`# ${page.title}\n**Source:** ${page.url}\n\n${page.content_markdown}`);
149
+ try {
150
+ const page = db.getPage({ url, library, path });
151
+ if (!page)
152
+ return tool.text("Page not found. Use search_docs to find the correct page.");
153
+ return tool.text(`# ${page.title}\n**Source:** ${page.url}\n\n${page.content_markdown}`);
154
+ }
155
+ catch (err) {
156
+ const message = err instanceof Error ? err.message : "Failed to fetch page";
157
+ return tool.text(`❌ Error: ${message}`);
158
+ }
131
159
  });
132
160
  // ──────────────────────────────────────
133
161
  // Tool 4: manage_library — Create, rename, refresh, remove, inspect
134
162
  // ──────────────────────────────────────
135
163
  server.tool({
136
164
  name: "manage_library",
137
- description: "Manage a documentation library lifecycle. Use action=add to crawl a new source, action=rename to change the library name, action=refresh to re-crawl, action=remove to delete it, or action=info to inspect its pages and stats.",
165
+ description: "Manage library lifecycle: add/rename/refresh/remove/info. Destructive actions require confirmation.",
166
+ annotations: {
167
+ destructiveHint: true,
168
+ },
138
169
  schema: v.object({
139
170
  action: v.pipe(v.picklist(["add", "rename", "refresh", "remove", "info"]), v.description("The management action to perform.")),
140
171
  url: v.optional(v.pipe(v.string(), v.url(), v.description("Base URL of the documentation website."))),
@@ -170,7 +201,7 @@ server.tool({
170
201
  const libraryName = requireValue(input.library, "library is required for action=refresh.");
171
202
  const lib = db.getLibraryByName(libraryName);
172
203
  if (!lib)
173
- return tool.text(`Library "${libraryName}" not found. Use list_libraries to see available.`);
204
+ return tool.text(`❌ Library "${libraryName}" not found. Use list_libraries to see available.`);
174
205
  const job = jobManager.startCrawl(lib.id, { incremental: true });
175
206
  return tool.text(`🔄 Refresh started for "${lib.display_name}".\nJob ${job.id}: checking for updated pages...`);
176
207
  }
@@ -178,7 +209,7 @@ server.tool({
178
209
  const libraryName = requireValue(input.library, "library is required for action=remove.");
179
210
  const lib = db.getLibraryByName(libraryName);
180
211
  if (!lib)
181
- return tool.text(`Library "${libraryName}" not found.`);
212
+ return tool.text(`❌ Library "${libraryName}" not found.`);
182
213
  db.removeLibrary(lib.id);
183
214
  return tool.text(`🗑️ Library "${lib.display_name}" removed.\nDeleted ${lib.page_count} pages and ${lib.chunk_count} chunks.`);
184
215
  }
@@ -186,14 +217,14 @@ server.tool({
186
217
  const libraryName = requireValue(input.library, "library is required for action=info.");
187
218
  const lib = db.getLibraryByName(libraryName);
188
219
  if (!lib)
189
- return tool.text(`Library "${libraryName}" not found. Use list_libraries to see available libraries.`);
220
+ return tool.text(`❌ Library "${libraryName}" not found. Use list_libraries to see available libraries.`);
190
221
  return tool.text(formatLibraryInfo(lib.id));
191
222
  }
192
223
  }
193
224
  }
194
225
  catch (err) {
195
226
  const message = err instanceof Error ? err.message : "Unknown error";
196
- return tool.text(`❌ Failed: ${message}`);
227
+ return tool.text(`❌ Error: ${message}`);
197
228
  }
198
- return tool.text(`❌ Failed: Unsupported action.`);
229
+ return tool.text(`❌ Error: Unsupported action.`);
199
230
  });
@@ -51,6 +51,7 @@ export declare class Database {
51
51
  createJob(job: {
52
52
  id: string;
53
53
  libraryId: string;
54
+ sessionId?: string;
54
55
  }): CrawlJob;
55
56
  getJob(id: string): CrawlJob | undefined;
56
57
  updateJob(id: string, updates: Partial<Pick<CrawlJob, "status" | "pages_discovered" | "pages_crawled" | "pages_failed" | "chunks_created" | "error_message" | "started_at" | "completed_at">>): void;
@@ -92,6 +92,7 @@ export class Database {
92
92
  CREATE TABLE IF NOT EXISTS crawl_jobs (
93
93
  id TEXT PRIMARY KEY,
94
94
  library_id TEXT NOT NULL REFERENCES libraries(id) ON DELETE CASCADE,
95
+ session_id TEXT,
95
96
  status TEXT NOT NULL DEFAULT 'queued',
96
97
  pages_discovered INTEGER NOT NULL DEFAULT 0,
97
98
  pages_crawled INTEGER NOT NULL DEFAULT 0,
@@ -213,8 +214,8 @@ export class Database {
213
214
  // ──────────────────────────────────────
214
215
  createJob(job) {
215
216
  this.db
216
- .prepare("INSERT INTO crawl_jobs (id, library_id) VALUES (?, ?)")
217
- .run(job.id, job.libraryId);
217
+ .prepare("INSERT INTO crawl_jobs (id, library_id, session_id) VALUES (?, ?, ?)")
218
+ .run(job.id, job.libraryId, job.sessionId ?? null);
218
219
  return this.db
219
220
  .prepare("SELECT * FROM crawl_jobs WHERE id = ?")
220
221
  .get(job.id);
@@ -1,21 +1,23 @@
1
- import type { Database } from './db.js';
2
- export interface SearchResult {
3
- content: string;
4
- heading_context: string;
5
- page_url: string;
6
- page_title: string;
7
- library_name: string;
8
- library_display_name: string;
9
- relevance_score: number;
10
- has_code_block: boolean;
11
- token_count: number;
12
- }
1
+ import type { Database } from "./db.js";
2
+ import type { SearchOptions, SearchResult } from "../search/types.js";
3
+ export type { SearchOptions, SearchResult } from "../search/types.js";
13
4
  export declare class SearchEngine {
14
5
  private db;
6
+ private planner;
15
7
  constructor(db: Database);
16
- search(query: string, opts?: {
17
- library?: string;
18
- limit?: number;
19
- }): SearchResult[];
20
- private sanitizeQuery;
8
+ search(query: string, opts?: SearchOptions): SearchResult[];
9
+ private fetchCandidates;
10
+ private buildFtsQuery;
11
+ private quoteTerm;
12
+ private rerank;
13
+ private scoreCandidate;
14
+ private collapseDuplicates;
15
+ private preferenceScore;
16
+ private canonicalPageKey;
17
+ private inferPathType;
18
+ private pathTypeScore;
19
+ private keywordOverlap;
20
+ private hasPhraseMatch;
21
+ private primaryTitle;
22
+ private extractVersionTag;
21
23
  }
@@ -1,49 +1,323 @@
1
+ import { QueryPlanner, normalizeSearchText } from "../search/query-planner.js";
1
2
  export class SearchEngine {
2
3
  db;
4
+ planner = new QueryPlanner();
3
5
  constructor(db) {
4
6
  this.db = db;
5
7
  }
6
8
  search(query, opts = {}) {
7
9
  const limit = opts.limit ?? 5;
8
- const ftsQuery = this.sanitizeQuery(query);
10
+ const plan = this.planner.build(query, opts.library);
11
+ const ftsQuery = this.buildFtsQuery(plan);
9
12
  if (!ftsQuery)
10
13
  return [];
11
14
  try {
12
- const stmt = this.db.raw().prepare(`
13
- SELECT
14
- c.content,
15
- c.heading_context,
16
- c.has_code_block,
17
- c.token_count,
18
- p.url AS page_url,
19
- p.title AS page_title,
20
- l.name AS library_name,
21
- l.display_name AS library_display_name,
22
- bm25(chunks_fts, 1.0, 0.5) AS relevance_score
23
- FROM chunks_fts
24
- JOIN chunks c ON chunks_fts.rowid = c.rowid
25
- JOIN pages p ON c.page_id = p.id
26
- JOIN libraries l ON c.library_id = l.id
27
- WHERE chunks_fts MATCH ?
28
- AND (? IS NULL OR l.name = ?)
29
- ORDER BY relevance_score
30
- LIMIT ?
31
- `);
32
- return stmt.all(ftsQuery, opts.library ?? null, opts.library ?? null, limit);
15
+ const candidates = this.fetchCandidates(ftsQuery, opts.library, limit);
16
+ if (candidates.length === 0) {
17
+ return [];
18
+ }
19
+ const reranked = this.rerank(plan, candidates);
20
+ return this.collapseDuplicates(plan, reranked).slice(0, limit);
33
21
  }
34
22
  catch (err) {
35
- // FTS5 query might fail with bad syntax — return empty
36
23
  console.warn(`[DocShark] Search failed:`, err.message);
37
24
  return [];
38
25
  }
39
26
  }
40
- sanitizeQuery(query) {
41
- // Remove FTS5 special operators for safety, wrap terms in quotes
42
- return query
43
- .replace(/['"]/g, '')
44
- .split(/\s+/)
45
- .filter(Boolean)
46
- .map((term) => `"${term}"`)
47
- .join(' OR ');
27
+ fetchCandidates(ftsQuery, library, limit) {
28
+ const candidateLimit = Math.min(Math.max(limit * 12, 25), 80);
29
+ const stmt = this.db.raw().prepare(`
30
+ SELECT
31
+ c.content,
32
+ COALESCE(c.heading_context, '') AS heading_context,
33
+ c.has_code_block,
34
+ COALESCE(c.token_count, 0) AS token_count,
35
+ c.chunk_index,
36
+ p.url AS page_url,
37
+ p.path AS page_path,
38
+ COALESCE(p.title, 'Untitled') AS page_title,
39
+ l.name AS library_name,
40
+ l.display_name AS library_display_name,
41
+ bm25(chunks_fts, 1.0, 0.7) AS lexical_score
42
+ FROM chunks_fts
43
+ JOIN chunks c ON chunks_fts.rowid = c.rowid
44
+ JOIN pages p ON c.page_id = p.id
45
+ JOIN libraries l ON c.library_id = l.id
46
+ WHERE chunks_fts MATCH ?
47
+ AND (? IS NULL OR l.name = ?)
48
+ ORDER BY lexical_score
49
+ LIMIT ?
50
+ `);
51
+ const rows = stmt.all(ftsQuery, library ?? null, library ?? null, candidateLimit);
52
+ return rows.map((row) => ({
53
+ ...row,
54
+ has_code_block: row.has_code_block === 1,
55
+ }));
56
+ }
57
+ buildFtsQuery(plan) {
58
+ const clauses = new Set();
59
+ const exactQuery = this.quoteTerm(plan.normalized_query);
60
+ if (plan.normalized_query.length > 0) {
61
+ clauses.add(exactQuery);
62
+ }
63
+ for (const phrase of plan.phrases) {
64
+ clauses.add(this.quoteTerm(phrase));
65
+ }
66
+ for (const keyword of plan.keywords) {
67
+ clauses.add(this.quoteTerm(keyword));
68
+ }
69
+ if (plan.keywords.length > 1 && plan.keywords.length <= 6) {
70
+ clauses.add(`(${plan.keywords.map((keyword) => this.quoteTerm(keyword)).join(" AND ")})`);
71
+ }
72
+ return Array.from(clauses).join(" OR ");
73
+ }
74
+ quoteTerm(value) {
75
+ return `"${value.replace(/["']/g, "").trim()}"`;
76
+ }
77
+ rerank(plan, candidates) {
78
+ const total = Math.max(candidates.length, 1);
79
+ return candidates
80
+ .map((candidate, index) => this.scoreCandidate(plan, candidate, index, total))
81
+ .sort((left, right) => {
82
+ if (right.rerank_score !== left.rerank_score) {
83
+ return right.rerank_score - left.rerank_score;
84
+ }
85
+ return left.lexical_score - right.lexical_score;
86
+ });
87
+ }
88
+ scoreCandidate(plan, candidate, index, total) {
89
+ const title = normalizeSearchText(candidate.page_title);
90
+ const primaryTitle = normalizeSearchText(this.primaryTitle(candidate.page_title));
91
+ const heading = normalizeSearchText(candidate.heading_context);
92
+ const path = normalizeSearchText(candidate.page_path);
93
+ const libraryText = normalizeSearchText(`${candidate.library_name} ${candidate.library_display_name}`);
94
+ const contentPreview = normalizeSearchText(candidate.content.slice(0, 800));
95
+ const pathType = this.inferPathType(candidate.page_path, candidate.page_title);
96
+ const versionTag = this.extractVersionTag(candidate.page_path);
97
+ const reasons = [];
98
+ let score = 0;
99
+ const lexicalRankScore = 0.35 * (1 - index / total);
100
+ score += lexicalRankScore;
101
+ const titleExact = primaryTitle.includes(plan.normalized_query) &&
102
+ plan.normalized_query.length > 0;
103
+ if (titleExact) {
104
+ score += 0.22;
105
+ reasons.push("exact title match");
106
+ }
107
+ const titleOverlap = this.keywordOverlap(plan.keywords, primaryTitle || title);
108
+ if (titleOverlap > 0) {
109
+ score += 0.14 * titleOverlap;
110
+ if (titleOverlap === 1 && !titleExact) {
111
+ reasons.push("all query keywords appear in title");
112
+ }
113
+ }
114
+ const headingOverlap = this.keywordOverlap(plan.keywords, heading);
115
+ if (headingOverlap > 0) {
116
+ score += 0.1 * headingOverlap;
117
+ if (headingOverlap >= 0.6) {
118
+ reasons.push("heading context aligns with the query");
119
+ }
120
+ }
121
+ const pathOverlap = this.keywordOverlap(plan.keywords, path);
122
+ if (pathOverlap > 0) {
123
+ score += 0.08 * pathOverlap;
124
+ }
125
+ const libraryOverlap = this.keywordOverlap(plan.keywords, libraryText);
126
+ if (libraryOverlap > 0) {
127
+ score += 0.08 * libraryOverlap;
128
+ if (libraryOverlap === 1) {
129
+ reasons.push("library name aligns with the query");
130
+ }
131
+ }
132
+ else if (plan.keywords.length === 1) {
133
+ score -= 0.03;
134
+ }
135
+ const phraseMatch = this.hasPhraseMatch(plan, title, heading, contentPreview);
136
+ if (phraseMatch) {
137
+ score += 0.08;
138
+ reasons.push("exact phrase match");
139
+ }
140
+ const pathPrior = this.pathTypeScore(plan.intent, pathType);
141
+ if (pathPrior > 0) {
142
+ score += pathPrior;
143
+ if (pathPrior >= 0.1) {
144
+ reasons.push(`matched ${pathType.replace(/_/g, "-")} page type`);
145
+ }
146
+ }
147
+ if (candidate.has_code_block) {
148
+ const codeSignal = plan.intent === "api_lookup" || plan.intent === "troubleshooting"
149
+ ? 0.07
150
+ : plan.intent === "getting_started"
151
+ ? 0.03
152
+ : 0.015;
153
+ score += codeSignal;
154
+ if (codeSignal >= 0.03) {
155
+ reasons.push("includes code sample");
156
+ }
157
+ }
158
+ if (candidate.token_count >= 60 && candidate.token_count <= 260) {
159
+ score += 0.03;
160
+ }
161
+ if (plan.requested_version) {
162
+ if (versionTag === plan.requested_version) {
163
+ score += 0.12;
164
+ reasons.push(`matches requested version v${versionTag}`);
165
+ }
166
+ }
167
+ else if (!versionTag) {
168
+ score += 0.08;
169
+ reasons.push("canonical unversioned page");
170
+ }
171
+ else {
172
+ score += Math.min(parseInt(versionTag, 10), 20) / 300;
173
+ }
174
+ const uniqueReasons = Array.from(new Set(reasons)).slice(0, 4);
175
+ return {
176
+ ...candidate,
177
+ path_type: pathType,
178
+ version_tag: versionTag,
179
+ rerank_score: Number(score.toFixed(6)),
180
+ reasons: uniqueReasons,
181
+ };
182
+ }
183
+ collapseDuplicates(plan, candidates) {
184
+ const bestByPage = new Map();
185
+ for (const candidate of candidates) {
186
+ const existing = bestByPage.get(candidate.page_url);
187
+ if (!existing || candidate.rerank_score > existing.rerank_score) {
188
+ bestByPage.set(candidate.page_url, candidate);
189
+ }
190
+ }
191
+ const bestByCanonicalPage = new Map();
192
+ const pageResults = Array.from(bestByPage.values()).sort((left, right) => right.rerank_score - left.rerank_score);
193
+ for (const candidate of pageResults) {
194
+ const canonicalKey = this.canonicalPageKey(candidate);
195
+ const existing = bestByCanonicalPage.get(canonicalKey);
196
+ if (!existing ||
197
+ this.preferenceScore(plan, candidate) >
198
+ this.preferenceScore(plan, existing)) {
199
+ bestByCanonicalPage.set(canonicalKey, candidate);
200
+ }
201
+ }
202
+ return Array.from(bestByCanonicalPage.values()).sort((left, right) => {
203
+ if (right.rerank_score !== left.rerank_score) {
204
+ return right.rerank_score - left.rerank_score;
205
+ }
206
+ return left.lexical_score - right.lexical_score;
207
+ });
208
+ }
209
+ preferenceScore(plan, result) {
210
+ let score = result.rerank_score;
211
+ if (plan.requested_version) {
212
+ if (result.version_tag === plan.requested_version) {
213
+ score += 0.2;
214
+ }
215
+ }
216
+ else if (!result.version_tag) {
217
+ score += 0.12;
218
+ }
219
+ else {
220
+ score += parseInt(result.version_tag, 10) / 500;
221
+ }
222
+ if (plan.intent === "overview" || plan.intent === "getting_started") {
223
+ if (result.path_type === "getting_started" ||
224
+ result.path_type === "overview") {
225
+ score += 0.02;
226
+ }
227
+ }
228
+ return score;
229
+ }
230
+ canonicalPageKey(result) {
231
+ const canonicalPath = result.page_path
232
+ .toLowerCase()
233
+ .replace(/\/v\d+(?=\/|$)/g, "")
234
+ .replace(/\/+/g, "/")
235
+ .replace(/\/$/, "") || "/";
236
+ const canonicalTitle = normalizeSearchText(result.page_title)
237
+ .replace(/\bv\d+\b/g, "")
238
+ .trim();
239
+ return `${result.library_name}:${canonicalPath}:${canonicalTitle}`;
240
+ }
241
+ inferPathType(pagePath, pageTitle) {
242
+ const value = `${pagePath} ${pageTitle}`.toLowerCase();
243
+ if (/getting-started|quickstart|install|installation|setup/.test(value)) {
244
+ return "getting_started";
245
+ }
246
+ if (/overview|introduction|what is|basics/.test(value)) {
247
+ return "overview";
248
+ }
249
+ if (/\/api|\bapi\b|reference|\/apis\//.test(value)) {
250
+ return "api";
251
+ }
252
+ if (/troubleshoot|troubleshooting|errors?|faq|debug/.test(value)) {
253
+ return "troubleshooting";
254
+ }
255
+ return "guide";
256
+ }
257
+ pathTypeScore(intent, pathType) {
258
+ switch (intent) {
259
+ case "overview":
260
+ if (pathType === "overview")
261
+ return 0.16;
262
+ if (pathType === "getting_started")
263
+ return 0.1;
264
+ if (pathType === "guide")
265
+ return 0.06;
266
+ return 0;
267
+ case "getting_started":
268
+ if (pathType === "getting_started")
269
+ return 0.18;
270
+ if (pathType === "guide")
271
+ return 0.08;
272
+ if (pathType === "overview")
273
+ return 0.06;
274
+ return 0;
275
+ case "api_lookup":
276
+ if (pathType === "api")
277
+ return 0.18;
278
+ if (pathType === "guide")
279
+ return 0.04;
280
+ return 0;
281
+ case "troubleshooting":
282
+ if (pathType === "troubleshooting")
283
+ return 0.16;
284
+ if (pathType === "guide")
285
+ return 0.07;
286
+ return 0;
287
+ default:
288
+ if (pathType === "getting_started")
289
+ return 0.08;
290
+ if (pathType === "overview")
291
+ return 0.07;
292
+ if (pathType === "api")
293
+ return 0.05;
294
+ if (pathType === "guide")
295
+ return 0.03;
296
+ return 0;
297
+ }
298
+ }
299
+ keywordOverlap(keywords, haystack) {
300
+ if (keywords.length === 0 || haystack.length === 0) {
301
+ return 0;
302
+ }
303
+ const matches = keywords.filter((keyword) => haystack.includes(keyword)).length;
304
+ return matches / keywords.length;
305
+ }
306
+ hasPhraseMatch(plan, title, heading, content) {
307
+ if (plan.normalized_query.includes(" ") &&
308
+ (title.includes(plan.normalized_query) ||
309
+ heading.includes(plan.normalized_query))) {
310
+ return true;
311
+ }
312
+ return plan.phrases.some((phrase) => title.includes(phrase) ||
313
+ heading.includes(phrase) ||
314
+ content.includes(phrase));
315
+ }
316
+ primaryTitle(title) {
317
+ return title.split(/\||—|-/)[0]?.trim() ?? title;
318
+ }
319
+ extractVersionTag(path) {
320
+ const match = path.toLowerCase().match(/\/v(\d+)(?=\/|$)/);
321
+ return match?.[1] ?? null;
48
322
  }
49
323
  }
@@ -1,6 +1,7 @@
1
1
  // src/tools/search-docs.ts — Primary search tool (80% of usage)
2
2
  import * as v from 'valibot';
3
3
  import { tool } from 'tmcp/utils';
4
+ import { formatSearchResults } from '../search/format-results.js';
4
5
  export function createSearchDocsTool(searchEngine) {
5
6
  return {
6
7
  definition: {
@@ -20,16 +21,7 @@ export function createSearchDocsTool(searchEngine) {
20
21
  if (results.length === 0) {
21
22
  return tool.text(`No results found for "${query}".`);
22
23
  }
23
- const formatted = results
24
- .map((r, i) => {
25
- let block = `### ${i + 1}. ${r.page_title} — ${r.library_display_name}\n`;
26
- block += `**Source:** ${r.page_url}\n`;
27
- block += `**Section:** ${r.heading_context}\n\n`;
28
- block += r.content;
29
- return block;
30
- })
31
- .join('\n\n---\n\n');
32
- return tool.text(`## Results for "${query}"\n\n${formatted}`);
24
+ return tool.text(formatSearchResults(query, results));
33
25
  },
34
26
  };
35
27
  }
package/dist/types.d.ts CHANGED
@@ -42,6 +42,7 @@ export interface ChunkRecord {
42
42
  export interface CrawlJob {
43
43
  id: string;
44
44
  library_id: string;
45
+ session_id?: string;
45
46
  status: 'queued' | 'running' | 'completed' | 'failed' | 'cancelled';
46
47
  pages_discovered: number;
47
48
  pages_crawled: number;
package/dist/version.d.ts CHANGED
@@ -1 +1 @@
1
- export declare const VERSION = "0.1.20";
1
+ export declare const VERSION = "0.1.21";
package/dist/version.js CHANGED
@@ -1,2 +1,2 @@
1
1
  // This file is automatically updated by the version sync script.
2
- export const VERSION = '0.1.20';
2
+ export const VERSION = '0.1.21';
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "docshark",
3
- "version": "0.1.20",
3
+ "version": "0.1.21",
4
4
  "description": "🦈 Documentation MCP Server — scrape, index, and search any doc website",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",
@@ -56,7 +56,7 @@
56
56
  "dependencies": {
57
57
  "@mozilla/readability": "^0.6.0",
58
58
  "@tmcp/adapter-valibot": "^0.1.5",
59
- "@tmcp/transport-http": "^0.8.4",
59
+ "@tmcp/transport-http": "^0.8.5",
60
60
  "@tmcp/transport-sse": "^0.5.3",
61
61
  "@tmcp/transport-stdio": "^0.4.1",
62
62
  "cac": "^7.0.0",
@@ -66,7 +66,7 @@
66
66
  "puppeteer-core": "^24.37.5",
67
67
  "robots-parser": "^3.0.1",
68
68
  "srvx": "^0.11.8",
69
- "tmcp": "^1.19.2",
69
+ "tmcp": "^1.19.3",
70
70
  "turndown": "^7.2.2",
71
71
  "turndown-plugin-gfm": "^1.0.2",
72
72
  "valibot": "^1.2.0"