docshark 0.1.5 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,129 @@
1
+ import { extractAndConvert } from '../processor/extractor.js';
2
+ const USER_AGENT = 'DocShark/1.0';
3
+ const MIN_CONTENT_LENGTH = 500;
4
+ const MAX_RETRIES = 3;
5
/**
 * Fetch a page and return its HTML.
 *
 * Renderer modes:
 *  - 'puppeteer': always render through puppeteer-core; errors propagate.
 *  - 'fetch':     plain HTTP fetch only, result returned as-is.
 *  - 'auto':      plain HTTP fetch first. When the extracted markdown is
 *                 shorter than MIN_CONTENT_LENGTH and the HTML contains a
 *                 `<script` tag, the page is assumed to be JS-rendered and a
 *                 puppeteer render is attempted if puppeteer-core can be imported.
 *
 * In auto mode the puppeteer render is an optional upgrade: if it fails
 * (Chrome missing, navigation timeout, ...) the plain fetch result is
 * returned instead of failing the whole page.
 *
 * @param {string} url - Page URL to fetch.
 * @param {'auto'|'fetch'|'puppeteer'} [renderer='auto'] - Rendering strategy.
 * @returns {Promise<object>} The result object produced by fetchWithRetry or fetchWithPuppeteer.
 * @throws Whatever fetchWithRetry throws, or fetchWithPuppeteer when renderer is 'puppeteer'.
 */
export async function fetchPage(url, renderer = 'auto') {
    // Forced puppeteer: no fallback, the caller asked for it explicitly.
    if (renderer === 'puppeteer') {
        return fetchWithPuppeteer(url);
    }
    // Tier 1: standard fetch
    const result = await fetchWithRetry(url);
    if (renderer === 'fetch') {
        return result;
    }
    // Auto mode: enough extracted content means the static HTML is good.
    const { markdown } = extractAndConvert(result.html, url);
    if (markdown.length >= MIN_CONTENT_LENGTH) {
        return result;
    }
    // Short content without any script tag is just a short page.
    if (!result.html.includes('<script')) {
        return result;
    }
    // Tier 2: short content + <script> tags → likely JS-rendered
    console.warn(`[DocShark] ${url} appears JS-rendered (${markdown.length} chars). Trying puppeteer...`);
    if (!(await canUsePuppeteer())) {
        console.warn(`[DocShark] puppeteer-core not installed. Run: bun add puppeteer-core\n` +
            `Or set renderer: "fetch" in the library config to suppress this warning.`);
        return result;
    }
    try {
        return await fetchWithPuppeteer(url);
    }
    catch (err) {
        // Keep the tier-1 result rather than losing the page entirely.
        const reason = err instanceof Error ? err.message : String(err);
        console.warn(`[DocShark] puppeteer render failed for ${url}: ${reason}. Using plain fetch result.`);
        return result;
    }
}
36
/**
 * Fetch a URL with the global `fetch`, retrying when the request throws.
 *
 * Only thrown errors (network failure, timeout) trigger a retry; any HTTP
 * response, whatever its status, is returned to the caller. The pause before
 * retry k is 1000 * 2^(k-1) ms, i.e. 1s then 2s with the default of 3 attempts.
 *
 * @param {string} url - URL to request.
 * @param {number} [retries=MAX_RETRIES] - Maximum number of attempts.
 * @returns {Promise<{html: string, renderer: 'fetch', status: number, etag: (string|null), lastModified: (string|null)}>}
 * @throws The last fetch error once all attempts are used up.
 */
async function fetchWithRetry(url, retries = MAX_RETRIES) {
    let attempt = 1;
    while (attempt <= retries) {
        try {
            const requestOptions = {
                headers: {
                    'User-Agent': USER_AGENT,
                    Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                },
                // Each attempt gets its own 30 second budget.
                signal: AbortSignal.timeout(30_000),
                redirect: 'follow',
            };
            const response = await fetch(url, requestOptions);
            const html = await response.text();
            const etag = response.headers.get('etag');
            const lastModified = response.headers.get('last-modified');
            return { html, renderer: 'fetch', status: response.status, etag, lastModified };
        }
        catch (err) {
            const isLastAttempt = attempt === retries;
            if (isLastAttempt) {
                throw err;
            }
            const backoffMs = 1000 * 2 ** (attempt - 1);
            await new Promise((resume) => setTimeout(resume, backoffMs));
        }
        attempt += 1;
    }
    // Reached when the loop never produced a result or rethrew (e.g. retries < 1).
    throw new Error(`Failed to fetch ${url} after ${retries} attempts`);
}
66
/**
 * Report whether the optional `puppeteer-core` package can be imported.
 *
 * @returns {Promise<boolean>} true when the dynamic import resolves, false when it rejects.
 */
async function canUsePuppeteer() {
    // @ts-ignore — puppeteer-core is an optional dependency
    const probe = import(/* webpackIgnore: true */ 'puppeteer-core');
    return probe.then(() => true, () => false);
}
77
/**
 * Render a page with puppeteer-core, driving a locally installed Chrome,
 * and return the resulting HTML.
 *
 * Image, stylesheet, font and media requests are aborted; only the DOM
 * content is needed.
 *
 * @param {string} url - Page URL to render.
 * @returns {Promise<{html: string, renderer: 'puppeteer', status: number}>}
 *   `status` is always reported as 200; the real HTTP status is not inspected.
 * @throws {Error} When no Chrome executable is found, or when launch/navigation fails.
 */
async function fetchWithPuppeteer(url) {
    // @ts-ignore — puppeteer-core is an optional dependency
    const puppeteer = await import('puppeteer-core');
    const { existsSync } = await import('fs');
    const executablePath = findChrome(existsSync);
    if (!executablePath) {
        throw new Error('Chrome not found. Set CHROME_PATH env var or install Chrome.\n' +
            'Alternatively: npx puppeteer browsers install chrome');
    }
    const browser = await puppeteer.default.launch({
        headless: true,
        executablePath,
        args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', '--disable-gpu'],
    });
    // Everything after launch runs inside the try so the browser process is
    // always shut down, including when newPage() itself fails.
    try {
        const page = await browser.newPage();
        await page.setRequestInterception(true);
        page.on('request', (req) => {
            const type = req.resourceType();
            if (['image', 'stylesheet', 'font', 'media'].includes(type)) {
                req.abort();
            }
            else {
                req.continue();
            }
        });
        await page.goto(url, { waitUntil: 'networkidle2', timeout: 30_000 });
        const html = await page.content();
        return { html, renderer: 'puppeteer', status: 200 };
    }
    finally {
        // Closing the browser also closes its pages, so no separate
        // page.close() is needed (and it can no longer block this call).
        await browser.close();
    }
}
113
/**
 * Locate a Chrome/Chromium executable.
 *
 * Candidates are probed in order: the CHROME_PATH and
 * PUPPETEER_EXECUTABLE_PATH environment variables first, then a fixed list
 * of Linux and macOS install locations.
 *
 * @param {(path: string) => boolean} existsSync - Existence check (e.g. fs.existsSync).
 * @returns {string|undefined} The first candidate that exists, or undefined.
 */
function findChrome(existsSync) {
    const { CHROME_PATH, PUPPETEER_EXECUTABLE_PATH } = process.env;
    const wellKnownLocations = [
        '/usr/bin/google-chrome',
        '/usr/bin/google-chrome-stable',
        '/usr/bin/chromium-browser',
        '/usr/bin/chromium',
        '/snap/bin/chromium',
        '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
    ];
    // Unset environment variables are skipped without touching the filesystem.
    return [CHROME_PATH, PUPPETEER_EXECUTABLE_PATH, ...wellKnownLocations]
        .find((candidate) => Boolean(candidate) && existsSync(candidate));
}
@@ -0,0 +1,18 @@
1
+ // src/scraper/rate-limiter.ts — Configurable rate limiter for polite crawling
2
/**
 * Spaces requests at least `delayMs` milliseconds apart.
 *
 * Callers `await limiter.wait()` before each request. Each call reserves its
 * time slot synchronously, before any awaiting happens, so several callers
 * waiting at the same time are released one after another rather than all
 * at once.
 */
export class RateLimiter {
    /** Minimum gap between two released requests, in milliseconds. */
    delayMs;
    /**
     * Timestamp (ms since epoch) of the most recently released or reserved
     * request slot. May lie in the future while callers are still queued.
     */
    lastRequest = 0;
    /**
     * @param {number} [delayMs=500] - Minimum gap between requests in milliseconds.
     */
    constructor(delayMs = 500) {
        this.delayMs = delayMs;
    }
    /**
     * Resolve once the caller is allowed to issue its request.
     * @returns {Promise<void>}
     */
    async wait() {
        const now = Date.now();
        // Claim the next free slot before yielding, so a concurrent caller
        // sees it and queues behind us instead of sharing the same slot.
        const slot = Math.max(now, this.lastRequest + this.delayMs);
        this.lastRequest = slot;
        const pause = slot - now;
        if (pause > 0) {
            await new Promise((resolve) => setTimeout(resolve, pause));
        }
        // If the timer fired late, measure the next gap from the real release
        // time, but never move an already reserved later slot backwards.
        this.lastRequest = Math.max(this.lastRequest, Date.now());
    }
    /**
     * Change the minimum gap used for subsequent wait() calls.
     * @param {number} ms - New delay in milliseconds.
     */
    setDelay(ms) {
        this.delayMs = ms;
    }
}
@@ -0,0 +1,26 @@
1
+ // src/scraper/robots.ts — robots.txt parser
2
+ import robotsParser from 'robots-parser';
3
+ const USER_AGENT = 'DocShark/1.0';
4
/**
 * Download and parse the robots.txt that belongs to `baseUrl`.
 *
 * @param {string} baseUrl - Any URL on the target site; robots.txt is resolved against its origin.
 * @returns {Promise<object|null>} A robots-parser instance, or null when the
 *   file cannot be fetched (request error, 10 second timeout, or non-OK status).
 * @throws {TypeError} When `baseUrl` is not a valid URL (thrown before the request is made).
 */
export async function getRobotsParser(baseUrl) {
    const { href: robotsUrl } = new URL('/robots.txt', baseUrl);
    try {
        const requestOptions = {
            headers: { 'User-Agent': USER_AGENT },
            signal: AbortSignal.timeout(10_000),
        };
        const response = await fetch(robotsUrl, requestOptions);
        return response.ok ? robotsParser(robotsUrl, await response.text()) : null;
    }
    catch {
        // Best effort: an unreachable robots.txt is treated as "no rules".
        return null;
    }
}
21
/**
 * Decide whether `url` may be crawled according to a parsed robots.txt.
 *
 * @param {object|null|undefined} robots - Parser from getRobotsParser, or a falsy value when none is available.
 * @param {string} url - URL to check.
 * @returns {boolean} false only when the parser explicitly answers `false`;
 *   a missing parser or any other answer counts as allowed.
 */
export function isAllowed(robots, url) {
    return !robots || robots.isAllowed(url, USER_AGENT) !== false;
}
package/dist/server.js ADDED
@@ -0,0 +1,154 @@
1
+ // src/server.ts — TMCP McpServer setup + tool registration
2
+ import { McpServer } from 'tmcp';
3
+ import { ValibotJsonSchemaAdapter } from '@tmcp/adapter-valibot';
4
+ import * as v from 'valibot';
5
+ import { tool } from 'tmcp/utils';
6
+ import { Database } from './storage/db.js';
7
+ import { SearchEngine } from './storage/search.js';
8
+ import { LibraryService } from './services/library.js';
9
+ import { JobManager } from './jobs/manager.js';
10
+ import { VERSION } from './version.js';
11
+ import { EventBus } from './jobs/events.js';
12
// Core service singletons, created once when this module is imported.
// NOTE(review): Database#init() is not called in this module — presumably the
// entry point invokes it before any tool handler runs; confirm.
export const db = new Database();
export const eventBus = new EventBus();
export const searchEngine = new SearchEngine(db);
export const jobManager = new JobManager(db, eventBus);
export const libraryService = new LibraryService(db, jobManager);
// TMCP server: identity first, then adapter and advertised capabilities.
const serverInfo = {
    name: 'docshark',
    version: VERSION,
    description: '🦈 Documentation MCP Server — scrape, index, and search any doc website',
};
const serverOptions = {
    adapter: new ValibotJsonSchemaAdapter(),
    capabilities: {
        tools: { listChanged: true },
        resources: {},
    },
};
export const server = new McpServer(serverInfo, serverOptions);
30
// ──────────────────────────────────────
// Tool 1: search_docs — Primary search tool
// ──────────────────────────────────────
// Runs a search through searchEngine and renders every hit as a markdown
// block (title, source URL, optional section, chunk content) separated by
// horizontal rules.
server.tool({
    name: 'search_docs',
    description: 'Search through indexed documentation libraries for relevant information. ' +
        'Returns ranked documentation sections with code examples and source URLs. ' +
        'Use this when you need to find information about a library, framework, API, ' +
        'or any technical concept.',
    schema: v.object({
        query: v.pipe(v.string(), v.description('Search query. Use natural language.')),
        library: v.optional(v.pipe(v.string(), v.description('Filter to a specific library.'))),
        limit: v.optional(v.pipe(v.number(), v.integer(), v.minValue(1), v.maxValue(20)), 5),
    }),
}, async ({ query, library, limit }) => {
    const results = searchEngine.search(query, { library, limit });
    if (results.length === 0)
        return tool.text(`No results found for "${query}".`);
    const formatted = results
        .map((r, i) => {
            // A page may have been stored without a title; show its URL
            // instead of printing "null".
            const title = r.page_title || r.page_url;
            const header = [
                `### ${i + 1}. ${title} — ${r.library_display_name}`,
                `**Source:** ${r.page_url}`,
            ];
            // Chunks without a heading context get no Section line at all.
            if (r.heading_context) {
                header.push(`**Section:** ${r.heading_context}`);
            }
            return `${header.join('\n')}\n\n${r.content}`;
        })
        .join('\n\n---\n\n');
    return tool.text(`## Results for "${query}"\n\n${formatted}`);
});
59
// ──────────────────────────────────────
// Tool 2: list_libraries — Discovery tool
// ──────────────────────────────────────
// Renders the libraries returned by db.listLibraries(status) as a markdown table.
server.tool({
    name: 'list_libraries',
    description: 'List all documentation libraries currently indexed and available for searching. ' +
        'Use this to discover what docs are available before running search_docs.',
    schema: v.object({
        status: v.optional(v.pipe(v.picklist(['indexed', 'crawling', 'error', 'all']), v.description('Filter by status. Default: "all".')), 'all'),
    }),
}, async ({ status }) => {
    const libraries = db.listLibraries(status);
    if (libraries.length === 0) {
        return tool.text('No libraries indexed yet. Use add_library to add a documentation website.');
    }
    const tableHead = [
        `## Indexed Libraries (${libraries.length} total)\n\n`,
        '| Library | URL | Pages | Chunks | Status |\n',
        '| ------- | --- | ----- | ------ | ------ |\n',
    ];
    // One table row per library, each terminated by a newline.
    const tableRows = libraries.map((lib) => `| ${lib.name} | ${lib.url} | ${lib.page_count} | ${lib.chunk_count} | ${lib.status} |\n`);
    return tool.text([...tableHead, ...tableRows].join(''));
});
82
// ──────────────────────────────────────
// Tool 3: get_doc_page — Full page read
// ──────────────────────────────────────
// Looks a stored page up either by its full URL or by library name plus
// relative path, and returns its markdown content.
server.tool({
    name: 'get_doc_page',
    description: 'Retrieve the complete content of a specific documentation page as markdown. ' +
        'Use when search results reference a page and you need full context.',
    schema: v.object({
        url: v.optional(v.pipe(v.string(), v.description('The full URL of the documentation page.'))),
        library: v.optional(v.pipe(v.string(), v.description('Library name to search within.'))),
        path: v.optional(v.pipe(v.string(), v.description('Relative path within the library.'))),
    }),
}, async ({ url, library, path }) => {
    // All three inputs are optional in the schema, but a lookup needs either
    // a URL or a library+path pair. Say so, instead of reporting a missing page.
    const hasLocator = Boolean(url) || (Boolean(library) && Boolean(path));
    if (!hasLocator) {
        return tool.text('Provide either "url", or both "library" and "path", to identify the page.');
    }
    const page = db.getPage({ url, library, path });
    if (!page)
        return tool.text('Page not found. Use search_docs to find the correct page.');
    // A page may have been stored without a title; fall back to its URL.
    const title = page.title || page.url;
    return tool.text(`# ${title}\n**Source:** ${page.url}\n\n${page.content_markdown}`);
});
100
// ──────────────────────────────────────
// Tool 4: add_library — Add new doc source
// ──────────────────────────────────────
// Registers a library through libraryService.add and reports the crawl job
// it started; any error is turned into a failure message for the client.
server.tool({
    name: 'add_library',
    description: 'Add a new documentation library to be crawled and indexed. ' +
        'Provide the URL and an optional name. Crawl runs in the background.',
    schema: v.object({
        url: v.pipe(v.string(), v.url(), v.description('Base URL of the documentation website.')),
        name: v.optional(v.pipe(v.string(), v.description('Short identifier (auto-generated if omitted).'))),
        version: v.optional(v.pipe(v.string(), v.description('Version string.'))),
        max_depth: v.optional(v.pipe(v.number(), v.integer(), v.minValue(1), v.maxValue(10)), 3),
    }),
}, async ({ url, name, version, max_depth }) => {
    try {
        // The tool exposes snake_case `max_depth`; the service expects `maxDepth`.
        const request = { url, name, version, maxDepth: max_depth };
        const added = await libraryService.add(request);
        const confirmation = `✅ Library "${added.display_name}" added.\n` +
            `Crawl job ${added.jobId} started. Use list_libraries to check progress.`;
        return tool.text(confirmation);
    }
    catch (err) {
        return tool.text(`❌ Failed: ${err.message}`);
    }
});
123
// ──────────────────────────────────────
// Tool 5: refresh_library — Re-crawl
// ──────────────────────────────────────
// Starts an incremental crawl job for an already registered library.
server.tool({
    name: 'refresh_library',
    description: 'Re-crawl and re-index an existing documentation library to get the latest content.',
    schema: v.object({
        library: v.pipe(v.string(), v.description('The library name to refresh.')),
    }),
}, async ({ library }) => {
    const lib = db.getLibraryByName(library);
    if (!lib)
        return tool.text(`Library "${library}" not found. Use list_libraries to see available.`);
    try {
        const job = jobManager.startCrawl(lib.id, { incremental: true });
        return tool.text(`🔄 Refresh started for "${lib.display_name}".\nJob ${job.id}: checking for updated pages...`);
    }
    catch (err) {
        // Same reporting style as add_library: a failure to start the crawl
        // becomes a readable tool response instead of an unhandled rejection.
        const reason = err instanceof Error ? err.message : String(err);
        return tool.text(`❌ Failed: ${reason}`);
    }
});
139
// ──────────────────────────────────────
// Tool 6: remove_library — Delete
// ──────────────────────────────────────
// Deletes a library by name and reports the page/chunk counts that were
// recorded on the library row before deletion.
server.tool({
    name: 'remove_library',
    description: 'Remove a documentation library and all its indexed content permanently.',
    schema: v.object({
        library: v.pipe(v.string(), v.description('The library name to remove.')),
    }),
}, async ({ library }) => {
    const target = db.getLibraryByName(library);
    if (!target) {
        return tool.text(`Library "${library}" not found.`);
    }
    db.removeLibrary(target.id);
    const summary = `🗑️ Library "${target.display_name}" removed.\nDeleted ${target.page_count} pages and ${target.chunk_count} chunks.`;
    return tool.text(summary);
});
@@ -0,0 +1,66 @@
1
+ // src/services/library.ts — Library management service
2
+ import { nanoid } from 'nanoid';
3
/**
 * Registers new documentation libraries and kicks off their first crawl.
 */
export class LibraryService {
    /** Storage layer used to look up and insert library rows. */
    db;
    /** Job manager used to start crawl jobs. */
    jobManager;
    /**
     * @param {object} db - Storage layer exposing getLibraryByName, addLibrary and getLibraryById.
     * @param {object} jobManager - Job manager exposing startCrawl.
     */
    constructor(db, jobManager) {
        this.db = db;
        this.jobManager = jobManager;
    }
    /**
     * Add a new documentation library and start crawling it.
     *
     * @param {{url: string, name?: string, version?: string, maxDepth?: number}} opts
     *   `name` is derived from the URL when omitted; `maxDepth` defaults to 3.
     * @returns {Promise<object>} The stored library row plus `jobId` of the started crawl.
     * @throws {Error} When a library with the resolved name already exists.
     */
    async add(opts) {
        const url = normalizeUrl(opts.url);
        const name = opts.name || generateName(url);
        const displayName = generateDisplayName(name);
        // Names are unique: refuse duplicates and point at refresh instead.
        if (this.db.getLibraryByName(name)) {
            throw new Error(`Library "${name}" already exists. Use refresh_library to re-crawl.`);
        }
        const id = nanoid();
        const record = {
            id,
            name,
            displayName,
            url,
            version: opts.version,
            crawlConfig: {
                maxDepth: opts.maxDepth ?? 3,
                renderer: 'auto',
            },
        };
        this.db.addLibrary(record);
        // Start the crawl first, then re-read the stored row for the response.
        const { id: jobId } = this.jobManager.startCrawl(id);
        const stored = this.db.getLibraryById(id);
        return Object.assign({}, stored, { jobId });
    }
}
39
/**
 * Normalize a base documentation URL.
 *
 * The URL is parsed with the WHATWG URL parser and its fragment is removed.
 * The query string is left untouched, and no trailing slash is added beyond
 * what URL parsing itself produces (a bare origin gains "/").
 *
 * @param {string} url - URL to normalize.
 * @returns {string} The serialized URL without its hash.
 * @throws {TypeError} When `url` cannot be parsed.
 */
function normalizeUrl(url) {
    const base = new URL(url);
    base.hash = '';
    return base.toString();
}
46
/**
 * Derive a slug-style library name from a URL.
 *
 * With a path, the slug is the first hostname label (after dropping a
 * leading "www.") followed by up to two path segments, e.g.
 * svelte.dev/docs → "svelte-docs". Without a path, the whole hostname is
 * used with dots turned into dashes, e.g. example.com → "example-com".
 *
 * @param {string} url - Absolute URL.
 * @returns {string} Lower-case slug.
 */
function generateName(url) {
    const { hostname, pathname } = new URL(url);
    const host = hostname.replace(/^www\./, '');
    // Strip one trailing and one leading slash from the path.
    const trimmedPath = pathname.replace(/\/$/, '').replace(/^\//, '');
    if (!trimmedPath) {
        // Just the hostname
        return host.replace(/\./g, '-').toLowerCase();
    }
    const [hostLabel] = host.split('.');
    const leadingSegments = trimmedPath.split('/').slice(0, 2).join('-');
    const slug = `${hostLabel}-${leadingSegments}`.toLowerCase();
    // Anything outside [a-z0-9-] becomes a dash.
    return slug.replace(/[^a-z0-9-]/g, '-');
}
60
/**
 * Turn a dash-separated slug into a human-readable title,
 * e.g. "svelte-docs" → "Svelte Docs".
 *
 * @param {string} name - Slug to convert.
 * @returns {string} The dash-separated parts, each with its first character upper-cased, joined by spaces.
 */
function generateDisplayName(name) {
    const words = [];
    for (const part of name.split('-')) {
        words.push(part.slice(0, 1).toUpperCase() + part.slice(1));
    }
    return words.join(' ');
}
@@ -0,0 +1,228 @@
1
+ // src/storage/db.ts — SQLite + FTS5 storage layer (bun:sqlite)
2
+ import { Database as BunDatabase } from 'bun:sqlite';
3
+ import { resolve } from 'path';
4
+ import { mkdirSync } from 'fs';
5
+ import { homedir } from 'os';
6
/**
 * SQLite + FTS5 storage layer (bun:sqlite) for libraries, pages, chunks and
 * crawl jobs.
 *
 * The connection is opened by init(); every other method assumes init() has
 * already been called.
 */
export class Database {
    /** Underlying bun:sqlite connection; undefined until init() has run. */
    db;
    /**
     * Columns of crawl_jobs that updateJob() is allowed to write. Column
     * names cannot be bound as SQL parameters, so they are checked against
     * this list before being placed into the statement text.
     */
    static #JOB_COLUMNS = new Set([
        'library_id',
        'status',
        'pages_discovered',
        'pages_crawled',
        'pages_failed',
        'chunks_created',
        'error_message',
        'started_at',
        'completed_at',
        'created_at',
    ]);
    /**
     * Open (creating if needed) the database file and make sure the schema exists.
     * The data directory is DOCSHARK_DATA_DIR, or ~/.docshark when unset.
     */
    init() {
        const dir = process.env.DOCSHARK_DATA_DIR || resolve(homedir(), '.docshark');
        mkdirSync(dir, { recursive: true });
        this.db = new BunDatabase(resolve(dir, 'docshark.db'));
        this.db.run('PRAGMA journal_mode = WAL');
        // Required for the ON DELETE CASCADE clauses in the schema to take effect.
        this.db.run('PRAGMA foreign_keys = ON');
        this.migrate();
    }
    /** Expose raw DB for search engine direct queries */
    raw() {
        return this.db;
    }
    /** Create all tables, the FTS5 index and its sync triggers if they do not exist yet. */
    migrate() {
        this.db.run(`
            CREATE TABLE IF NOT EXISTS libraries (
                id TEXT PRIMARY KEY,
                name TEXT NOT NULL UNIQUE,
                display_name TEXT NOT NULL,
                url TEXT NOT NULL,
                version TEXT,
                description TEXT,
                status TEXT NOT NULL DEFAULT 'pending',
                page_count INTEGER NOT NULL DEFAULT 0,
                chunk_count INTEGER NOT NULL DEFAULT 0,
                crawl_config TEXT,
                last_crawled_at TEXT,
                created_at TEXT NOT NULL DEFAULT (datetime('now')),
                updated_at TEXT NOT NULL DEFAULT (datetime('now'))
            )
        `);
        this.db.run(`
            CREATE TABLE IF NOT EXISTS pages (
                id TEXT PRIMARY KEY,
                library_id TEXT NOT NULL REFERENCES libraries(id) ON DELETE CASCADE,
                url TEXT NOT NULL,
                path TEXT NOT NULL,
                title TEXT,
                content_markdown TEXT,
                content_hash TEXT,
                headings TEXT,
                http_status INTEGER,
                last_modified TEXT,
                etag TEXT,
                created_at TEXT NOT NULL DEFAULT (datetime('now')),
                updated_at TEXT NOT NULL DEFAULT (datetime('now')),
                UNIQUE(library_id, url)
            )
        `);
        this.db.run(`
            CREATE TABLE IF NOT EXISTS chunks (
                id TEXT PRIMARY KEY,
                page_id TEXT NOT NULL REFERENCES pages(id) ON DELETE CASCADE,
                library_id TEXT NOT NULL REFERENCES libraries(id) ON DELETE CASCADE,
                content TEXT NOT NULL,
                heading_context TEXT,
                chunk_index INTEGER NOT NULL,
                token_count INTEGER,
                has_code_block INTEGER NOT NULL DEFAULT 0,
                created_at TEXT NOT NULL DEFAULT (datetime('now'))
            )
        `);
        // External-content FTS5 index over chunks, keyed by the chunks rowid.
        this.db.run(`
            CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts USING fts5(
                content,
                heading_context,
                content=chunks,
                content_rowid=rowid,
                tokenize='porter unicode61 remove_diacritics 2'
            )
        `);
        // FTS5 sync triggers (insert and delete only; there is no update trigger).
        this.db.run(`
            CREATE TRIGGER IF NOT EXISTS chunks_ai AFTER INSERT ON chunks BEGIN
                INSERT INTO chunks_fts(rowid, content, heading_context)
                VALUES (NEW.rowid, NEW.content, NEW.heading_context);
            END
        `);
        this.db.run(`
            CREATE TRIGGER IF NOT EXISTS chunks_ad AFTER DELETE ON chunks BEGIN
                INSERT INTO chunks_fts(chunks_fts, rowid, content, heading_context)
                VALUES ('delete', OLD.rowid, OLD.content, OLD.heading_context);
            END
        `);
        this.db.run(`
            CREATE TABLE IF NOT EXISTS crawl_jobs (
                id TEXT PRIMARY KEY,
                library_id TEXT NOT NULL REFERENCES libraries(id) ON DELETE CASCADE,
                status TEXT NOT NULL DEFAULT 'queued',
                pages_discovered INTEGER NOT NULL DEFAULT 0,
                pages_crawled INTEGER NOT NULL DEFAULT 0,
                pages_failed INTEGER NOT NULL DEFAULT 0,
                chunks_created INTEGER NOT NULL DEFAULT 0,
                error_message TEXT,
                started_at TEXT,
                completed_at TEXT,
                created_at TEXT NOT NULL DEFAULT (datetime('now'))
            )
        `);
    }
    // ──────────────────────────────────────
    // Library CRUD
    // ──────────────────────────────────────
    /**
     * Insert a library row.
     * @param {{id: string, name: string, displayName: string, url: string, version?: string, crawlConfig?: object}} lib
     *   `crawlConfig` is stored as JSON text; missing version/crawlConfig are stored as NULL.
     */
    addLibrary(lib) {
        return this.db
            .prepare(`INSERT INTO libraries (id, name, display_name, url, version, crawl_config)
                VALUES (?, ?, ?, ?, ?, ?)`)
            .run(lib.id, lib.name, lib.displayName, lib.url, lib.version ?? null, lib.crawlConfig ? JSON.stringify(lib.crawlConfig) : null);
    }
    /**
     * List libraries ordered by name.
     * @param {string} [status] - When given and not 'all', only libraries with this status.
     */
    listLibraries(status) {
        if (status && status !== 'all') {
            // Same ordering as the unfiltered listing below.
            return this.db.prepare('SELECT * FROM libraries WHERE status = ? ORDER BY name').all(status);
        }
        return this.db.prepare('SELECT * FROM libraries ORDER BY name').all();
    }
    /** Look a library up by its unique name. */
    getLibraryByName(name) {
        return this.db.prepare('SELECT * FROM libraries WHERE name = ?').get(name);
    }
    /** Look a library up by its primary key. */
    getLibraryById(id) {
        return this.db.prepare('SELECT * FROM libraries WHERE id = ?').get(id);
    }
    /** Delete a library row; pages, chunks and jobs reference it with ON DELETE CASCADE. */
    removeLibrary(id) {
        return this.db.prepare('DELETE FROM libraries WHERE id = ?').run(id);
    }
    /** Set a library's status and bump updated_at. */
    updateLibraryStatus(id, status) {
        // 'now' must be single-quoted: in SQL a double-quoted token is an
        // identifier, and only works as a string through a legacy fallback.
        return this.db
            .prepare("UPDATE libraries SET status = ?, updated_at = datetime('now') WHERE id = ?")
            .run(status, id);
    }
    /** Store page/chunk totals and stamp last_crawled_at and updated_at. */
    updateLibraryStats(id, pageCount, chunkCount) {
        return this.db
            .prepare(`UPDATE libraries
                SET page_count = ?, chunk_count = ?, last_crawled_at = datetime('now'), updated_at = datetime('now')
                WHERE id = ?`)
            .run(pageCount, chunkCount, id);
    }
    // ──────────────────────────────────────
    // Page CRUD
    // ──────────────────────────────────────
    /**
     * Insert a page, or update title/content/hash/headings when the
     * (library_id, url) pair already exists.
     * @returns {string} The id of the stored row. On update this is the
     *   existing row's id, which may differ from `page.id`.
     */
    upsertPage(page) {
        this.db
            .prepare(`INSERT INTO pages (id, library_id, url, path, title, content_markdown, content_hash, headings)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                ON CONFLICT(library_id, url) DO UPDATE SET
                    title = excluded.title,
                    content_markdown = excluded.content_markdown,
                    content_hash = excluded.content_hash,
                    headings = excluded.headings,
                    updated_at = datetime('now')`)
            .run(page.id, page.libraryId, page.url, page.path, page.title, page.contentMarkdown, page.contentHash, JSON.stringify(page.headings));
        const row = this.db.prepare('SELECT id FROM pages WHERE library_id = ? AND url = ?').get(page.libraryId, page.url);
        return row.id;
    }
    /**
     * Find a page by URL, or by library name plus path.
     * @param {{url?: string, library?: string, path?: string}} opts - `url` wins when present.
     * @returns The page row, or undefined when nothing matches or no usable locator was given.
     */
    getPage(opts) {
        if (opts.url) {
            // NOTE(review): url is only unique per library; if two libraries
            // index the same URL this returns whichever row SQLite finds first.
            return this.db.prepare('SELECT * FROM pages WHERE url = ?').get(opts.url);
        }
        if (opts.library && opts.path) {
            return this.db
                .prepare(`SELECT p.* FROM pages p
                    JOIN libraries l ON p.library_id = l.id
                    WHERE l.name = ? AND p.path = ?`)
                .get(opts.library, opts.path);
        }
        return undefined;
    }
    /** All pages of a library, ordered by path. */
    getPagesByLibrary(libraryId) {
        return this.db
            .prepare('SELECT * FROM pages WHERE library_id = ? ORDER BY path')
            .all(libraryId);
    }
    // ──────────────────────────────────────
    // Chunk CRUD
    // ──────────────────────────────────────
    /**
     * Insert a batch of chunks inside a single transaction.
     * @param {Array<object>} chunks - Items with id, pageId, libraryId, content,
     *   headingContext, chunkIndex, tokenCount and hasCodeBlock (stored as 0/1).
     */
    insertChunks(chunks) {
        const insert = this.db.prepare(`INSERT INTO chunks (id, page_id, library_id, content, heading_context, chunk_index, token_count, has_code_block)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?)`);
        const tx = this.db.transaction(() => {
            for (const c of chunks) {
                insert.run(c.id, c.pageId, c.libraryId, c.content, c.headingContext, c.chunkIndex, c.tokenCount, c.hasCodeBlock ? 1 : 0);
            }
        });
        tx();
    }
    /** Delete every chunk that belongs to the given page. */
    deleteChunksByPage(pageId) {
        this.db.prepare('DELETE FROM chunks WHERE page_id = ?').run(pageId);
    }
    // ──────────────────────────────────────
    // Crawl Jobs
    // ──────────────────────────────────────
    /**
     * Insert a crawl job and return the stored row (including column defaults).
     * @param {{id: string, libraryId: string}} job
     */
    createJob(job) {
        this.db
            .prepare('INSERT INTO crawl_jobs (id, library_id) VALUES (?, ?)')
            .run(job.id, job.libraryId);
        return this.db.prepare('SELECT * FROM crawl_jobs WHERE id = ?').get(job.id);
    }
    /** Fetch a crawl job by id. */
    getJob(id) {
        return this.db.prepare('SELECT * FROM crawl_jobs WHERE id = ?').get(id);
    }
    /**
     * Update selected columns of a crawl job.
     * @param {string} id - Job id.
     * @param {Object<string, *>} updates - Map of crawl_jobs column name → new value.
     *   An empty map is a no-op.
     * @throws {Error} When a key is not a known crawl_jobs column.
     */
    updateJob(id, updates) {
        const sets = [];
        const values = [];
        for (const [key, value] of Object.entries(updates)) {
            // Keys become part of the SQL text, so only known columns may pass.
            if (!Database.#JOB_COLUMNS.has(key)) {
                throw new Error(`Unknown crawl_jobs column: ${key}`);
            }
            sets.push(`${key} = ?`);
            values.push(value);
        }
        if (sets.length === 0)
            return;
        values.push(id);
        this.db.prepare(`UPDATE crawl_jobs SET ${sets.join(', ')} WHERE id = ?`).run(...values);
    }
    /**
     * List crawl jobs, newest first.
     * @param {string} [libraryId] - Restrict to one library when given.
     */
    listJobs(libraryId) {
        if (libraryId) {
            return this.db
                .prepare('SELECT * FROM crawl_jobs WHERE library_id = ? ORDER BY created_at DESC')
                .all(libraryId);
        }
        return this.db
            .prepare('SELECT * FROM crawl_jobs ORDER BY created_at DESC')
            .all();
    }
}
@@ -0,0 +1,49 @@
1
/**
 * Full-text search over indexed documentation chunks, backed by the
 * chunks_fts FTS5 table.
 */
export class SearchEngine {
    /** Storage wrapper; its raw() method must return the SQLite connection. */
    db;
    /**
     * @param {object} db - Storage layer exposing raw().
     */
    constructor(db) {
        this.db = db;
    }
    /**
     * Search chunks matching `query`, ordered by bm25 score.
     *
     * @param {string} query - Free-text query.
     * @param {{library?: string, limit?: number}} [opts] - Optional library
     *   name filter and maximum row count (default 5).
     * @returns {Array<object>} Matching rows; an empty array when the query
     *   has no usable terms or the statement fails.
     */
    search(query, opts = {}) {
        const maxRows = opts.limit ?? 5;
        const matchExpr = this.sanitizeQuery(query);
        if (!matchExpr) {
            return [];
        }
        // The library filter is bound twice: once for the NULL test and
        // once for the comparison.
        const libraryFilter = opts.library ?? null;
        try {
            const sql = `
                SELECT
                    c.content,
                    c.heading_context,
                    c.has_code_block,
                    c.token_count,
                    p.url AS page_url,
                    p.title AS page_title,
                    l.name AS library_name,
                    l.display_name AS library_display_name,
                    bm25(chunks_fts, 1.0, 0.5) AS relevance_score
                FROM chunks_fts
                JOIN chunks c ON chunks_fts.rowid = c.rowid
                JOIN pages p ON c.page_id = p.id
                JOIN libraries l ON c.library_id = l.id
                WHERE chunks_fts MATCH ?
                    AND (? IS NULL OR l.name = ?)
                ORDER BY relevance_score
                LIMIT ?
            `;
            return this.db.raw().prepare(sql).all(matchExpr, libraryFilter, libraryFilter, maxRows);
        }
        catch (err) {
            // FTS5 query might fail with bad syntax — return empty
            console.warn(`[DocShark] Search failed:`, err.message);
            return [];
        }
    }
    /**
     * Build an FTS5 MATCH expression from free text: quote characters are
     * dropped, and each whitespace-separated term is wrapped in double
     * quotes and OR-ed with the others.
     *
     * @param {string} query - Raw user query.
     * @returns {string} e.g. `"foo" OR "bar"`; an empty string when no terms remain.
     */
    sanitizeQuery(query) {
        const quotedTerms = [];
        for (const piece of query.replace(/['"]/g, '').split(/\s+/)) {
            if (piece) {
                quotedTerms.push(`"${piece}"`);
            }
        }
        return quotedTerms.join(' OR ');
    }
}