inkdex 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/settings.local.json +15 -0
- package/.github/workflows/ci.yml +73 -0
- package/.github/workflows/release.yml +65 -0
- package/AGENTS.md +32 -0
- package/LICENSE +190 -0
- package/README.md +40 -0
- package/biome.json +43 -0
- package/dist/cli.d.ts +2 -0
- package/dist/cli.js +38 -0
- package/dist/embedder/embedder.d.ts +9 -0
- package/dist/embedder/embedder.js +39 -0
- package/dist/ingest/chunker.d.ts +7 -0
- package/dist/ingest/chunker.js +114 -0
- package/dist/ingest/index-docs.d.ts +2 -0
- package/dist/ingest/index-docs.js +78 -0
- package/dist/logger.d.ts +6 -0
- package/dist/logger.js +28 -0
- package/dist/search/search.d.ts +7 -0
- package/dist/search/search.js +70 -0
- package/dist/server.d.ts +2 -0
- package/dist/server.js +66 -0
- package/dist/store/db.d.ts +13 -0
- package/dist/store/db.js +149 -0
- package/dist/types.d.ts +14 -0
- package/dist/types.js +1 -0
- package/dist/version.d.ts +1 -0
- package/dist/version.js +13 -0
- package/inkdex-0.0.1.tgz +0 -0
- package/package.json +46 -0
- package/release.sh +33 -0
- package/src/cli.ts +45 -0
- package/src/embedder/embedder.ts +52 -0
- package/src/ingest/chunker.ts +158 -0
- package/src/ingest/index-docs.ts +120 -0
- package/src/logger.ts +39 -0
- package/src/search/search.ts +93 -0
- package/src/server.ts +96 -0
- package/src/store/db.ts +217 -0
- package/src/types.ts +16 -0
- package/src/version.ts +16 -0
- package/test/fixtures/docs/api.md +26 -0
- package/test/fixtures/docs/getting-started.md +13 -0
- package/test/helpers/index.ts +14 -0
- package/test/integration/embedder.test.ts +52 -0
- package/test/integration/server.test.ts +125 -0
- package/test/unit/chunker.test.ts +193 -0
- package/test/unit/db.test.ts +190 -0
- package/test/unit/index-docs.test.ts +120 -0
- package/test/unit/logger.test.ts +11 -0
- package/test/unit/search.test.ts +93 -0
- package/test/unit/version.test.ts +16 -0
- package/test-docs/api-reference.md +76 -0
- package/test-docs/deployment.md +55 -0
- package/test-docs/getting-started.md +52 -0
- package/tsconfig.json +18 -0
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
import { createHash } from "node:crypto";
|
|
2
|
+
import { glob, readFile } from "node:fs/promises";
|
|
3
|
+
import { relative } from "node:path";
|
|
4
|
+
import { logger } from "../logger.js";
|
|
5
|
+
import { getAllDocumentHashes, insertChunk, removeDocument, runInTransaction, setDocumentHash, } from "../store/db.js";
|
|
6
|
+
import { chunkMarkdown } from "./chunker.js";
|
|
7
|
+
const MAX_CHUNK_FILL = 0.8;
|
|
8
|
+
async function findMarkdownFiles(docsPath) {
|
|
9
|
+
const files = [];
|
|
10
|
+
for await (const entry of glob("**/*.md", { cwd: docsPath })) {
|
|
11
|
+
files.push(`${docsPath}/${entry}`);
|
|
12
|
+
}
|
|
13
|
+
return files.sort();
|
|
14
|
+
}
|
|
15
|
+
/**
 * Computes the SHA-256 digest of `content`.
 *
 * @param content Data to hash.
 * @returns The digest as a lowercase hex string.
 */
function hashContent(content) {
    const hasher = createHash("sha256");
    hasher.update(content);
    return hasher.digest("hex");
}
|
|
18
|
+
/**
 * Brings the stored index in sync with the markdown files under `docsPath`.
 *
 * Files are keyed by their path relative to `docsPath` and compared against
 * the stored content hashes. Files whose hash differs (or that are new) are
 * re-chunked, embedded and rewritten; stored documents whose file no longer
 * exists are removed.
 *
 * @param embedder Provides `maxTokens`, `tokenize(text)` and `embedBatch(texts)`.
 * @param docsPath Root directory to scan for `*.md` files.
 */
export async function indexDocs(embedder, docsPath) {
    const files = await findMarkdownFiles(docsPath);
    if (files.length === 0) {
        // Do not return here: documents indexed by an earlier run must still be
        // purged below, otherwise search keeps serving chunks of deleted files.
        logger.warn({ path: docsPath }, "No markdown files found");
    }
    // key -> { content, hash }; the hash is computed once and reused when storing.
    const current = new Map();
    for (const file of files) {
        const content = await readFile(file, "utf-8");
        current.set(relative(docsPath, file), { content, hash: hashContent(content) });
    }
    const storedHashes = getAllDocumentHashes();
    const changedKeys = [];
    for (const [key, { hash }] of current) {
        if (storedHashes[key] !== hash) {
            changedKeys.push(key);
        }
    }
    const removedKeys = Object.keys(storedHashes).filter((key) => !current.has(key));
    if (changedKeys.length === 0 && removedKeys.length === 0) {
        if (files.length > 0) {
            logger.info({ files: files.length }, "Index up to date");
        }
        return;
    }
    const start = performance.now();
    logger.info({ changed: changedKeys.length, removed: removedKeys.length }, "Indexing changed files");
    if (removedKeys.length > 0) {
        runInTransaction(() => {
            for (const key of removedKeys) {
                removeDocument(key);
            }
        });
    }
    const chunkOptions = {
        // Leave headroom below the model limit so chunks are not truncated.
        maxTokens: Math.floor(embedder.maxTokens * MAX_CHUNK_FILL),
        countTokens: (text) => embedder.tokenize(text).length,
    };
    let totalChunks = 0;
    for (const key of changedKeys) {
        const { content, hash } = current.get(key);
        const chunks = chunkMarkdown(content, key, chunkOptions);
        logger.debug({ path: key, chunks: chunks.length }, "Embedding chunks");
        // Embed before opening the transaction so the write itself stays synchronous.
        const embeddings = await embedder.embedBatch(chunks.map((c) => c.text));
        runInTransaction(() => {
            // Replace the document wholesale: old chunks out, new hash and chunks in.
            removeDocument(key);
            setDocumentHash(key, hash);
            for (let i = 0; i < chunks.length; i++) {
                const chunk = chunks[i];
                insertChunk(chunk.path, chunk.fileHeading, chunk.heading, chunk.text, chunk.metadata, embeddings[i]);
            }
        });
        totalChunks += chunks.length;
    }
    const duration = ((performance.now() - start) / 1000).toFixed(1);
    logger.info({ duration: `${duration}s`, chunks: totalChunks }, "Indexing complete");
}
|
package/dist/logger.d.ts
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
export declare const logger: {
|
|
2
|
+
debug: (msgOrObj: string | Record<string, unknown>, msg?: string) => void;
|
|
3
|
+
info: (msgOrObj: string | Record<string, unknown>, msg?: string) => void;
|
|
4
|
+
warn: (msgOrObj: string | Record<string, unknown>, msg?: string) => void;
|
|
5
|
+
error: (msgOrObj: string | Record<string, unknown>, msg?: string) => void;
|
|
6
|
+
};
|
package/dist/logger.js
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
// Numeric severity of each supported log level; higher means more severe.
const levels = {
    debug: 0,
    info: 1,
    warn: 2,
    error: 3,
};
// LOG_LEVEL is matched case-insensitively ("DEBUG" works like "debug").
const level = (process.env.LOG_LEVEL ?? "info").trim().toLowerCase();
// Only own keys count, so values such as "constructor" cannot resolve to an
// inherited non-numeric property; anything unrecognised falls back to "info".
const threshold = Object.prototype.hasOwnProperty.call(levels, level)
    ? levels[level]
    : levels.info;
|
|
9
|
+
// All levels go to stderr to keep stdout free for the MCP stdio transport
|
|
10
|
+
/**
 * Writes `msg` to stderr, prefixed with the upper-cased level, when the
 * level's severity reaches the configured threshold.
 *
 * @param lvl Level name; names missing from `levels` are treated as severity 0.
 * @param msg Already formatted message text.
 */
function log(lvl, msg) {
    const severity = levels[lvl] ?? 0;
    if (!(severity >= threshold)) {
        return;
    }
    console.error(`[${lvl.toUpperCase()}] ${msg}`);
}
|
|
15
|
+
// Builds the logging method for one level: format the arguments, then emit.
const logAt = (lvl) => (msgOrObj, msg) => log(lvl, formatMsg(msgOrObj, msg));
/**
 * Logger with one method per level. Each method accepts either a plain
 * message string, or a data object followed by an optional message.
 */
export const logger = {
    debug: logAt("debug"),
    info: logAt("info"),
    warn: logAt("warn"),
    error: logAt("error"),
};
|
|
21
|
+
/**
 * Renders one logged value as text.
 *
 * Errors keep their stack (or `name: message` when no stack exists), other
 * objects are serialised as JSON instead of collapsing to "[object Object]",
 * and everything else goes through `String()` so Symbols do not throw.
 */
function formatValue(value) {
    if (value instanceof Error) {
        return value.stack ?? `${value.name}: ${value.message}`;
    }
    if (typeof value === "object" && value !== null) {
        try {
            return JSON.stringify(value) ?? String(value);
        }
        catch {
            // Circular structures or throwing toJSON(): fall back to the default text.
            return String(value);
        }
    }
    return String(value);
}
/**
 * Builds the text of a log line.
 *
 * @param msgOrObj Either the complete message, or an object whose entries are
 *                 rendered as space-separated `key=value` pairs.
 * @param msg Optional message placed in front of the `key=value` pairs.
 * @returns `msgOrObj` itself when it is a string, otherwise `"<msg> <pairs>"`
 *          (or just the pairs when `msg` is empty or missing).
 */
function formatMsg(msgOrObj, msg) {
    if (typeof msgOrObj === "string")
        return msgOrObj;
    const data = Object.entries(msgOrObj)
        .map(([k, v]) => `${k}=${formatValue(v)}`)
        .join(" ");
    return msg ? `${msg} ${data}` : data;
}
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
import type { Embedder } from "../embedder/embedder.js";
|
|
2
|
+
import type { ChunkRow, SearchResult } from "../types.js";
|
|
3
|
+
/** @package */
|
|
4
|
+
export declare function cosineSimilarity(a: number[], b: number[]): number;
|
|
5
|
+
/** @package */
|
|
6
|
+
export declare function rankChunksHybrid(chunks: ChunkRow[], queryEmbedding: number[], ftsRankedIds: number[], limit: number): SearchResult[];
|
|
7
|
+
export declare function search(embedder: Embedder, query: string, limit: number): Promise<SearchResult[]>;
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
import { getAllChunks, searchFts } from "../store/db.js";
|
|
2
|
+
/**
 * Cosine similarity of two numeric vectors.
 *
 * @param a First vector; its length drives the iteration.
 * @param b Second vector, expected to have the same length as `a`.
 * @returns The cosine of the angle between `a` and `b`, or 0 when the value
 *          is undefined (a zero-magnitude or empty vector, or mismatched
 *          lengths). Returning 0 instead of NaN keeps callers that sort by
 *          similarity well-defined.
 * @package
 */
export function cosineSimilarity(a, b) {
    let dot = 0;
    let normA = 0;
    let normB = 0;
    for (let i = 0; i < a.length; i++) {
        dot += a[i] * b[i];
        normA += a[i] * a[i];
        normB += b[i] * b[i];
    }
    const similarity = dot / (Math.sqrt(normA) * Math.sqrt(normB));
    // 0/0 (zero vector) and undefined components both surface as NaN here.
    return Number.isFinite(similarity) ? similarity : 0;
}
|
|
14
|
+
// Constant added to every rank in the reciprocal-rank term 1 / (RRF_K + rank).
const RRF_K = 60;
/**
 * Ranks chunks by fusing two orderings: cosine similarity to the query
 * embedding, and the order of `ftsRankedIds`.
 *
 * Each ordering contributes `1 / (RRF_K + rank)` (ranks are 1-based) to a
 * chunk's score; an ordering the chunk is missing from contributes nothing.
 * Ids in `ftsRankedIds` that match no chunk are ignored.
 *
 * @param chunks Candidate chunks, each with `id` and `embedding`.
 * @param queryEmbedding Embedding of the search query.
 * @param ftsRankedIds Chunk ids ordered best-first by full-text search.
 * @param limit Maximum number of results.
 * @returns The top `limit` chunks by descending fused score, without `id`
 *          and `embedding`, each carrying its `score`.
 * @package
 */
export function rankChunksHybrid(chunks, queryEmbedding, ftsRankedIds, limit) {
    const bySimilarity = chunks
        .map((chunk) => ({
            chunk,
            similarity: cosineSimilarity(queryEmbedding, chunk.embedding),
        }))
        .sort((left, right) => right.similarity - left.similarity);
    const vectorRankMap = new Map();
    bySimilarity.forEach(({ chunk }, position) => {
        vectorRankMap.set(chunk.id, position + 1);
    });
    const bm25RankMap = new Map();
    ftsRankedIds.forEach((id, position) => {
        bm25RankMap.set(id, position + 1);
    });
    const chunkById = new Map();
    chunks.forEach((chunk) => {
        chunkById.set(chunk.id, chunk);
    });
    // Contribution of a single ordering; absent rank means no contribution.
    const rrfTerm = (rank) => (rank === undefined ? 0 : 1 / (RRF_K + rank));
    const candidateIds = new Set([...vectorRankMap.keys(), ...bm25RankMap.keys()]);
    const scored = [];
    for (const id of candidateIds) {
        const chunk = chunkById.get(id);
        if (chunk) {
            const score = rrfTerm(vectorRankMap.get(id)) + rrfTerm(bm25RankMap.get(id));
            scored.push({ chunk, score });
        }
    }
    scored.sort((left, right) => right.score - left.score);
    return scored.slice(0, limit).map(({ chunk, score }) => {
        const { path, fileHeading, heading, text, metadata } = chunk;
        return { path, fileHeading, heading, text, metadata, score };
    });
}
|
|
65
|
+
/**
 * Runs a hybrid search: embeds the query, loads every stored chunk, fetches
 * the full-text ranking and fuses both rankings.
 *
 * @param embedder Provides `embed(text)` for the query embedding.
 * @param query Search text.
 * @param limit Maximum number of results.
 * @returns The ranked results produced by `rankChunksHybrid`.
 */
export async function search(embedder, query, limit) {
    const queryEmbedding = await embedder.embed(query);
    const chunks = getAllChunks();
    // Ask the full-text index for as many ids as there are chunks.
    const ftsRankedIds = searchFts(query, chunks.length);
    const ranked = rankChunksHybrid(chunks, queryEmbedding, ftsRankedIds, limit);
    return ranked;
}
|
package/dist/server.d.ts
ADDED
package/dist/server.js
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
import { Server } from "@modelcontextprotocol/sdk/server/index.js";
|
|
2
|
+
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
|
|
3
|
+
import { CallToolRequestSchema, ListToolsRequestSchema, } from "@modelcontextprotocol/sdk/types.js";
|
|
4
|
+
import { logger } from "./logger.js";
|
|
5
|
+
import { search } from "./search/search.js";
|
|
6
|
+
import { getChunkCount } from "./store/db.js";
|
|
7
|
+
import { getVersion } from "./version.js";
|
|
8
|
+
/**
 * Creates the MCP server and registers its single tool, `search_docs`.
 *
 * @param embedder Embedder handed to `search` for every tool call.
 * @returns The configured (not yet connected) server.
 */
async function createServer(embedder) {
    const server = new Server({
        name: "inkdex",
        version: getVersion(),
    }, {
        capabilities: {
            tools: {},
        },
    });
    server.setRequestHandler(ListToolsRequestSchema, async () => {
        return {
            tools: [
                {
                    name: "search_docs",
                    description: "Search markdown documentation for relevant information. Returns chunks of content that match the query semantically.",
                    inputSchema: {
                        type: "object",
                        properties: {
                            query: {
                                type: "string",
                                description: "Search query - natural language question or keywords",
                            },
                            limit: {
                                type: "number",
                                description: "Maximum number of results to return (1-20)",
                                default: 5,
                                minimum: 1,
                                maximum: 20,
                            },
                        },
                        required: ["query"],
                    },
                },
            ],
        };
    });
    server.setRequestHandler(CallToolRequestSchema, async (request) => {
        if (request.params.name !== "search_docs") {
            throw new Error(`Unknown tool: ${request.params.name}`);
        }
        const query = String(request.params.arguments?.query || "");
        if (query.trim() === "") {
            // An empty query has no meaningful embedding; ranking against it
            // would return arbitrary chunks, so report the problem instead.
            return {
                content: [{ type: "text", text: "Query must not be empty." }],
                isError: true,
            };
        }
        // Missing or non-numeric limits fall back to 5; the result is clamped to 1-20.
        const limit = Math.min(Math.max(Number(request.params.arguments?.limit) || 5, 1), 20);
        logger.debug({ query, limit }, "Searching docs");
        const results = await search(embedder, query, limit);
        const text = results
            .map((r) => `## ${r.fileHeading} > ${r.heading}\n_Source: ${r.path} (score: ${r.score.toFixed(3)})_\n\n${r.text}`)
            .join("\n\n---\n\n");
        return {
            content: [{ type: "text", text: text || "No results found." }],
        };
    });
    return server;
}
|
|
61
|
+
/**
 * Creates the server, connects it to a stdio transport and logs the version
 * and the number of indexed chunks.
 *
 * @param embedder Embedder passed on to `createServer`.
 */
export async function startServer(embedder) {
    const server = await createServer(embedder);
    await server.connect(new StdioServerTransport());
    const startupInfo = { version: getVersion(), chunks: getChunkCount() };
    logger.info(startupInfo, "Server started");
}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import type { ChunkRow } from "../types.js";
|
|
2
|
+
/** @package */
|
|
3
|
+
export declare function dbPath(docsPath: string): string;
|
|
4
|
+
export declare function openDb(docsPath: string): void;
|
|
5
|
+
export declare function closeDb(): void;
|
|
6
|
+
export declare function getAllDocumentHashes(): Record<string, string>;
|
|
7
|
+
export declare function setDocumentHash(path: string, hash: string): void;
|
|
8
|
+
export declare function removeDocument(path: string): void;
|
|
9
|
+
export declare function insertChunk(documentPath: string, fileHeading: string, heading: string, text: string, metadata: Record<string, unknown>, embedding: number[]): void;
|
|
10
|
+
export declare function getAllChunks(): ChunkRow[];
|
|
11
|
+
export declare function getChunkCount(): number;
|
|
12
|
+
export declare function searchFts(query: string, limit: number): number[];
|
|
13
|
+
export declare function runInTransaction(fn: () => void): void;
|
package/dist/store/db.js
ADDED
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
import { createHash } from "node:crypto";
|
|
2
|
+
import { mkdirSync } from "node:fs";
|
|
3
|
+
import { homedir } from "node:os";
|
|
4
|
+
import { join } from "node:path";
|
|
5
|
+
import { DatabaseSync } from "node:sqlite";
|
|
6
|
+
// Directory below the user's home that holds all index databases.
const STORE_DIR = join(homedir(), ".inkdex");
/**
 * Maps a docs directory to its database file.
 *
 * @param docsPath Path of the docs directory; used verbatim as hash input.
 * @returns `<STORE_DIR>/<first 12 hex chars of sha256(docsPath)>.db`.
 * @package
 */
export function dbPath(docsPath) {
    const digest = createHash("sha256").update(docsPath).digest("hex");
    const fileName = `${digest.slice(0, 12)}.db`;
    return join(STORE_DIR, fileName);
}
|
|
12
|
+
const SCHEMA_VERSION = 2;
|
|
13
|
+
const CHUNK_COLUMNS = "id, document_path, file_heading, heading, text, metadata, embedding";
|
|
14
|
+
let db;
|
|
15
|
+
let stmts;
|
|
16
|
+
/**
 * Creates the tables, index, full-text table and sync triggers if they do not
 * exist yet, then stamps the database with the current schema version.
 */
function createSchema() {
    // `chunks_fts` is an FTS5 table whose content lives in `chunks`; the two
    // triggers mirror inserts and deletes on `chunks` into it.
    const ddl = `
        CREATE TABLE IF NOT EXISTS documents (
            path TEXT PRIMARY KEY,
            hash TEXT NOT NULL
        );
        CREATE TABLE IF NOT EXISTS chunks (
            id INTEGER PRIMARY KEY,
            document_path TEXT NOT NULL REFERENCES documents(path),
            file_heading TEXT NOT NULL,
            heading TEXT NOT NULL,
            text TEXT NOT NULL,
            metadata TEXT NOT NULL,
            embedding BLOB NOT NULL
        );
        CREATE INDEX IF NOT EXISTS idx_chunks_document ON chunks(document_path);

        CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts
        USING fts5(text, content=chunks, content_rowid=id);

        CREATE TRIGGER IF NOT EXISTS chunks_fts_insert AFTER INSERT ON chunks BEGIN
            INSERT INTO chunks_fts(rowid, text) VALUES (new.id, new.text);
        END;

        CREATE TRIGGER IF NOT EXISTS chunks_fts_delete AFTER DELETE ON chunks BEGIN
            INSERT INTO chunks_fts(chunks_fts, rowid, text)
            VALUES('delete', old.id, old.text);
        END;
    `;
    db.exec(ddl);
    const stampVersion = `PRAGMA user_version = ${SCHEMA_VERSION}`;
    db.exec(stampVersion);
}
|
|
47
|
+
/**
 * Prepares every SQL statement the store uses and publishes them via the
 * module-level `stmts` object, keyed by purpose.
 */
function prepareStatements() {
    const sqlByName = {
        getAllDocs: "SELECT path, hash FROM documents",
        upsertDoc: "INSERT OR REPLACE INTO documents (path, hash) VALUES (?, ?)",
        deleteChunksByDoc: "DELETE FROM chunks WHERE document_path = ?",
        deleteDoc: "DELETE FROM documents WHERE path = ?",
        insertChunk: "INSERT INTO chunks (document_path, file_heading, heading, text, metadata, embedding) VALUES (?, ?, ?, ?, ?, ?)",
        getAllChunks: `SELECT ${CHUNK_COLUMNS} FROM chunks`,
        countChunks: "SELECT COUNT(*) as count FROM chunks",
        searchFts: "SELECT rowid FROM chunks_fts WHERE chunks_fts MATCH ? ORDER BY bm25(chunks_fts) LIMIT ?",
    };
    const prepared = {};
    for (const [name, sql] of Object.entries(sqlByName)) {
        prepared[name] = db.prepare(sql);
    }
    // Publish only after every statement prepared successfully.
    stmts = prepared;
}
|
|
59
|
+
/**
 * Opens (creating if necessary) the database that belongs to `docsPath` and
 * prepares it for use.
 *
 * When the stored `user_version` differs from `SCHEMA_VERSION` the existing
 * index is discarded and rebuilt from an empty schema.
 *
 * @param docsPath Docs directory whose database should be opened.
 */
export function openDb(docsPath) {
    mkdirSync(STORE_DIR, { recursive: true });
    db = new DatabaseSync(dbPath(docsPath));
    db.exec("PRAGMA journal_mode = WAL");
    db.exec("PRAGMA foreign_keys = ON");
    const { user_version } = db.prepare("PRAGMA user_version").get();
    if (user_version !== SCHEMA_VERSION) {
        // The full-text table keeps its own index of `chunks` rows. It has to
        // go as well, otherwise it would keep matching rowids of dropped rows.
        db.exec("DROP TABLE IF EXISTS chunks_fts");
        db.exec("DROP TABLE IF EXISTS chunks");
        db.exec("DROP TABLE IF EXISTS documents");
    }
    // All statements use IF NOT EXISTS, so this is a no-op on a current schema.
    createSchema();
    prepareStatements();
}
|
|
75
|
+
/**
 * Closes the database if one is open. Safe to call when no database was
 * opened and safe to call repeatedly.
 */
export function closeDb() {
    if (!db)
        return;
    try {
        db.close();
    }
    finally {
        // Drop the handles so a second call is a no-op instead of closing an
        // already closed connection.
        db = undefined;
        stmts = undefined;
    }
}
|
|
78
|
+
/**
 * Reads every stored document.
 *
 * @returns An object mapping each document path to its stored hash.
 */
export function getAllDocumentHashes() {
    const hashesByPath = {};
    stmts.getAllDocs.all().forEach((row) => {
        hashesByPath[row.path] = row.hash;
    });
    return hashesByPath;
}
|
|
86
|
+
/**
 * Stores the hash for a document, replacing any existing row for that path.
 *
 * @param path Document path (primary key).
 * @param hash Content hash to store.
 */
export function setDocumentHash(path, hash) {
    const { upsertDoc } = stmts;
    upsertDoc.run(path, hash);
}
|
|
89
|
+
/**
 * Deletes a document: first its chunks, then the document row itself.
 *
 * @param path Path of the document to remove.
 */
export function removeDocument(path) {
    const { deleteChunksByDoc, deleteDoc } = stmts;
    // Chunks reference the document row, so they have to go first.
    deleteChunksByDoc.run(path);
    deleteDoc.run(path);
}
|
|
93
|
+
/**
 * Packs an embedding into a binary blob of 32-bit floats.
 *
 * @param embedding Vector components.
 * @returns A Buffer over the Float32Array's memory (4 bytes per component).
 */
function embeddingToBlob(embedding) {
    const floats = new Float32Array(embedding);
    const { buffer } = floats;
    return Buffer.from(buffer);
}
|
|
96
|
+
/**
 * Unpacks a blob of 32-bit floats into a plain number array.
 *
 * @param blob Binary data whose byte length is a multiple of 4.
 * @returns The decoded components.
 */
function blobToEmbedding(blob) {
    // Build a fresh Uint8Array from the blob, then read its buffer as floats.
    const bytes = new Uint8Array(blob);
    const floats = new Float32Array(bytes.buffer);
    return [...floats];
}
|
|
99
|
+
/**
 * Converts a raw `chunks` table row into the in-memory chunk shape:
 * snake_case columns become camelCase fields, `metadata` is parsed from JSON
 * and `embedding` is decoded from its blob.
 *
 * @param row Row with the columns listed in `CHUNK_COLUMNS`.
 * @returns The converted chunk.
 */
function toChunkRow(row) {
    const { id, document_path, file_heading, heading, text, metadata, embedding } = row;
    return {
        id,
        path: document_path,
        fileHeading: file_heading,
        heading,
        text,
        metadata: JSON.parse(metadata),
        embedding: blobToEmbedding(embedding),
    };
}
|
|
110
|
+
/**
 * Inserts one chunk row.
 *
 * @param documentPath Path of the owning document.
 * @param fileHeading Heading of the file the chunk comes from.
 * @param heading Heading of the chunk itself.
 * @param text Chunk text.
 * @param metadata Stored as its JSON serialisation.
 * @param embedding Stored as a blob of 32-bit floats.
 */
export function insertChunk(documentPath, fileHeading, heading, text, metadata, embedding) {
    const metadataJson = JSON.stringify(metadata);
    const embeddingBlob = embeddingToBlob(embedding);
    stmts.insertChunk.run(documentPath, fileHeading, heading, text, metadataJson, embeddingBlob);
}
|
|
113
|
+
/**
 * Loads every chunk from the database.
 *
 * @returns All chunks, converted with `toChunkRow`.
 */
export function getAllChunks() {
    return stmts.getAllChunks.all().map(toChunkRow);
}
|
|
117
|
+
/**
 * Counts the rows of the `chunks` table.
 *
 * @returns The number of stored chunks.
 */
export function getChunkCount() {
    const { count } = stmts.countChunks.get();
    return count;
}
|
|
121
|
+
/**
 * Full-text search over chunk text.
 *
 * The query is split on whitespace and every word is wrapped in double
 * quotes (embedded quotes doubled) so FTS5 operators in user input are
 * treated as literal text.
 *
 * @param query Raw user query.
 * @param limit Maximum number of row ids to return.
 * @returns Matching chunk row ids ordered by bm25; empty when the query has
 *          no words or the MATCH fails.
 */
export function searchFts(query, limit) {
    const quotedWords = [];
    for (const word of query.split(/\s+/)) {
        if (word.length === 0) {
            continue;
        }
        quotedWords.push(`"${word.replace(/"/g, '""')}"`);
    }
    if (quotedWords.length === 0) {
        return [];
    }
    const matchExpression = quotedWords.join(" ");
    try {
        return stmts.searchFts.all(matchExpression, limit).map((row) => row.rowid);
    }
    catch {
        // FTS5 MATCH can fail on edge-case inputs; fall back to empty
        return [];
    }
}
|
|
139
|
+
/**
 * Runs `fn` inside a transaction: commits when it returns, rolls back and
 * rethrows when it (or the commit) throws.
 *
 * @param fn Synchronous callback performing the writes.
 * @throws Whatever `fn` or the COMMIT threw.
 */
export function runInTransaction(fn) {
    db.exec("BEGIN");
    try {
        fn();
        db.exec("COMMIT");
    }
    catch (error) {
        try {
            db.exec("ROLLBACK");
        }
        catch {
            // SQLite may already have rolled the transaction back on its own,
            // in which case ROLLBACK fails. The original error is the useful one.
        }
        throw error;
    }
}
|
package/dist/types.d.ts
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
export interface BaseChunk {
|
|
2
|
+
path: string;
|
|
3
|
+
fileHeading: string;
|
|
4
|
+
heading: string;
|
|
5
|
+
text: string;
|
|
6
|
+
metadata: Record<string, unknown>;
|
|
7
|
+
}
|
|
8
|
+
export interface ChunkRow extends BaseChunk {
|
|
9
|
+
id: number;
|
|
10
|
+
embedding: number[];
|
|
11
|
+
}
|
|
12
|
+
export interface SearchResult extends Omit<ChunkRow, "id" | "embedding"> {
|
|
13
|
+
score: number;
|
|
14
|
+
}
|
package/dist/types.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export declare function getVersion(): string;
|
package/dist/version.js
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import { readFileSync } from "node:fs";
|
|
2
|
+
import { dirname, join } from "node:path";
|
|
3
|
+
import { fileURLToPath } from "node:url";
|
|
4
|
+
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
5
|
+
/**
 * Reads the package version from the `package.json` one directory above this
 * module.
 *
 * @returns The `version` field, or "unknown" when the file cannot be read or
 *          parsed, or the field is missing or empty.
 */
export function getVersion() {
    const fallback = "unknown";
    try {
        const manifestPath = join(__dirname, "..", "package.json");
        const manifest = JSON.parse(readFileSync(manifestPath, "utf-8"));
        return manifest.version || fallback;
    }
    catch {
        return fallback;
    }
}
|
package/inkdex-0.0.1.tgz
ADDED
|
Binary file
|
package/package.json
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "inkdex",
|
|
3
|
+
"version": "0.0.1",
|
|
4
|
+
"description": "MCP server that makes your markdown docs searchable",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"main": "dist/cli.js",
|
|
7
|
+
"bin": {
|
|
8
|
+
"inkdex": "dist/cli.js"
|
|
9
|
+
},
|
|
10
|
+
"engines": {
|
|
11
|
+
"node": ">=22.5.0"
|
|
12
|
+
},
|
|
13
|
+
"scripts": {
|
|
14
|
+
"build": "tsc",
|
|
15
|
+
"dev": "tsx src/cli.ts",
|
|
16
|
+
"start": "node dist/cli.js",
|
|
17
|
+
"test": "npm run test:unit && npm run test:integration",
|
|
18
|
+
"test:unit": "tsx --test test/unit/**/*.test.ts",
|
|
19
|
+
"test:integration": "tsx --test --test-concurrency=1 test/integration/**/*.test.ts",
|
|
20
|
+
"check": "biome check .",
|
|
21
|
+
"format": "biome format --write ."
|
|
22
|
+
},
|
|
23
|
+
"keywords": [
|
|
24
|
+
"mcp",
|
|
25
|
+
"mcp-server",
|
|
26
|
+
"model-context-protocol",
|
|
27
|
+
"documentation",
|
|
28
|
+
"semantic-search",
|
|
29
|
+
"embeddings",
|
|
30
|
+
"markdown",
|
|
31
|
+
"rag"
|
|
32
|
+
],
|
|
33
|
+
"author": "Anton Lundén",
|
|
34
|
+
"license": "Apache-2.0",
|
|
35
|
+
"dependencies": {
|
|
36
|
+
"@huggingface/transformers": "^3.8.1",
|
|
37
|
+
"@modelcontextprotocol/sdk": "^1.25.3",
|
|
38
|
+
"gray-matter": "^4.0.3"
|
|
39
|
+
},
|
|
40
|
+
"devDependencies": {
|
|
41
|
+
"@biomejs/biome": "^2.3.14",
|
|
42
|
+
"@types/node": "^25.1.0",
|
|
43
|
+
"tsx": "^4.21.0",
|
|
44
|
+
"typescript": "^5.9.3"
|
|
45
|
+
}
|
|
46
|
+
}
|
package/release.sh
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
#!/usr/bin/env bash
# Cuts a release: bumps the package version, runs the audit, lint, test and
# build gates, then commits and tags the result. Pushing is left to the caller.
set -euo pipefail

usage() {
  echo "Usage: ./release.sh <version>"
  echo " version: X.Y.Z (e.g., 0.1.0)"
}

# An explicit help request succeeds.
if [[ "${1:-}" =~ ^(-h|--help)$ ]]; then
  usage
  exit 0
fi

# A missing version is a usage error: report it on stderr with a failing
# status so calling scripts and CI do not mistake it for a successful release.
if [[ -z "${1:-}" ]]; then
  usage >&2
  exit 1
fi

VERSION="$1"

if ! [[ "$VERSION" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
  echo "Error: Version must be in format X.Y.Z (e.g., 0.1.0)" >&2
  exit 1
fi

echo "Releasing v$VERSION"

# Bump package.json/package-lock.json only; the tag is created further down.
npm version "$VERSION" --no-git-tag-version
npm install

npm audit --audit-level=critical
npm run check
npm run test:unit
npm run test:integration

npm run build

git add package.json package-lock.json
git commit -m "v$VERSION"
git tag "v$VERSION"

echo "Done. Push with: git push origin main v$VERSION"
|
package/src/cli.ts
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
import { stat } from "node:fs/promises";
|
|
4
|
+
import { resolve } from "node:path";
|
|
5
|
+
import { Embedder } from "./embedder/embedder.js";
|
|
6
|
+
import { indexDocs } from "./ingest/index-docs.js";
|
|
7
|
+
import { logger } from "./logger.js";
|
|
8
|
+
import { startServer } from "./server.js";
|
|
9
|
+
import { closeDb, openDb } from "./store/db.js";
|
|
10
|
+
|
|
11
|
+
/** Logs the failure details at error level and terminates with exit code 1. */
function exitWithError(details: Record<string, unknown>, message: string): never {
	logger.error(details, message);
	process.exit(1);
}

// Last-resort handlers: anything that escapes normal error handling is logged
// and ends the process with a failing status.
process.on("uncaughtException", (error) =>
	exitWithError({ error }, "Uncaught exception"),
);

process.on("unhandledRejection", (reason) =>
	exitWithError({ reason }, "Unhandled rejection"),
);
|
|
20
|
+
|
|
21
|
+
/**
 * Entry point: validates `DOCS_PATH`, loads the embedder, opens the database,
 * indexes the docs and starts the server.
 *
 * Exits with code 1 when `DOCS_PATH` is unset or is not a directory.
 */
async function main(): Promise<void> {
	const { DOCS_PATH: docsPath } = process.env;
	if (!docsPath) {
		logger.error("DOCS_PATH environment variable is required");
		process.exit(1);
	}

	const resolved = resolve(docsPath);
	// A failing stat (e.g. missing path) is treated the same as "not a directory".
	let isDirectory = false;
	try {
		isDirectory = (await stat(resolved)).isDirectory();
	} catch {
		isDirectory = false;
	}
	if (!isDirectory) {
		logger.error({ path: resolved }, "DOCS_PATH is not a directory");
		process.exit(1);
	}

	const embedder = await Embedder.load();
	openDb(resolved);
	await indexDocs(embedder, resolved);
	await startServer(embedder);
}
|
|
40
|
+
|
|
41
|
+
/** Startup failed: close the database, log the error and exit with code 1. */
function handleStartupFailure(error: unknown): never {
	closeDb();
	logger.error({ error }, "Failed to start server");
	process.exit(1);
}

main().catch(handleStartupFailure);
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
import type { FeatureExtractionPipeline } from "@huggingface/transformers";
|
|
2
|
+
import { pipeline } from "@huggingface/transformers";
|
|
3
|
+
|
|
4
|
+
const MODEL = "Xenova/all-MiniLM-L6-v2";
|
|
5
|
+
const BATCH_SIZE = 32;
|
|
6
|
+
|
|
7
|
+
/**
 * Wraps a feature-extraction pipeline and exposes tokenisation plus
 * mean-pooled, normalised embeddings. Instances are created via `load()`.
 */
export class Embedder {
	/** Token limit reported by the tokenizer, or 256 when it reports none. */
	readonly maxTokens: number;
	private readonly pipeline: FeatureExtractionPipeline;

	private constructor(pipe: FeatureExtractionPipeline) {
		this.pipeline = pipe;
		this.maxTokens = (pipe.tokenizer.model_max_length as number) ?? 256;
	}

	/** Loads the feature-extraction pipeline for `MODEL` and wraps it. */
	static async load(): Promise<Embedder> {
		const extractor = await pipeline<"feature-extraction">(
			"feature-extraction",
			MODEL,
		);
		return new Embedder(extractor);
	}

	/** Encodes `text` with the pipeline's tokenizer and returns the token ids. */
	tokenize(text: string): number[] {
		const tokenIds = this.pipeline.tokenizer.encode(text);
		return tokenIds;
	}

	/** Embeds a single text and returns its vector. */
	async embed(text: string): Promise<number[]> {
		const output = await this.pipeline(text, {
			pooling: "mean",
			normalize: true,
		});
		const vectors = output.tolist() as number[][];
		return vectors[0];
	}

	/**
	 * Embeds many texts, `BATCH_SIZE` at a time, one batch after another.
	 *
	 * @returns One vector per input text, in input order; empty for no input.
	 */
	async embedBatch(texts: string[]): Promise<number[][]> {
		const vectors: number[][] = [];

		let offset = 0;
		while (offset < texts.length) {
			const output = await this.pipeline(
				texts.slice(offset, offset + BATCH_SIZE),
				{ pooling: "mean", normalize: true },
			);
			for (const vector of output.tolist() as number[][]) {
				vectors.push(vector);
			}
			offset += BATCH_SIZE;
		}

		return vectors;
	}
}
|