inkdex 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. package/.claude/settings.local.json +15 -0
  2. package/.github/workflows/ci.yml +73 -0
  3. package/.github/workflows/release.yml +65 -0
  4. package/AGENTS.md +32 -0
  5. package/LICENSE +190 -0
  6. package/README.md +40 -0
  7. package/biome.json +43 -0
  8. package/dist/cli.d.ts +2 -0
  9. package/dist/cli.js +38 -0
  10. package/dist/embedder/embedder.d.ts +9 -0
  11. package/dist/embedder/embedder.js +39 -0
  12. package/dist/ingest/chunker.d.ts +7 -0
  13. package/dist/ingest/chunker.js +114 -0
  14. package/dist/ingest/index-docs.d.ts +2 -0
  15. package/dist/ingest/index-docs.js +78 -0
  16. package/dist/logger.d.ts +6 -0
  17. package/dist/logger.js +28 -0
  18. package/dist/search/search.d.ts +7 -0
  19. package/dist/search/search.js +70 -0
  20. package/dist/server.d.ts +2 -0
  21. package/dist/server.js +66 -0
  22. package/dist/store/db.d.ts +13 -0
  23. package/dist/store/db.js +149 -0
  24. package/dist/types.d.ts +14 -0
  25. package/dist/types.js +1 -0
  26. package/dist/version.d.ts +1 -0
  27. package/dist/version.js +13 -0
  28. package/inkdex-0.0.1.tgz +0 -0
  29. package/package.json +46 -0
  30. package/release.sh +33 -0
  31. package/src/cli.ts +45 -0
  32. package/src/embedder/embedder.ts +52 -0
  33. package/src/ingest/chunker.ts +158 -0
  34. package/src/ingest/index-docs.ts +120 -0
  35. package/src/logger.ts +39 -0
  36. package/src/search/search.ts +93 -0
  37. package/src/server.ts +96 -0
  38. package/src/store/db.ts +217 -0
  39. package/src/types.ts +16 -0
  40. package/src/version.ts +16 -0
  41. package/test/fixtures/docs/api.md +26 -0
  42. package/test/fixtures/docs/getting-started.md +13 -0
  43. package/test/helpers/index.ts +14 -0
  44. package/test/integration/embedder.test.ts +52 -0
  45. package/test/integration/server.test.ts +125 -0
  46. package/test/unit/chunker.test.ts +193 -0
  47. package/test/unit/db.test.ts +190 -0
  48. package/test/unit/index-docs.test.ts +120 -0
  49. package/test/unit/logger.test.ts +11 -0
  50. package/test/unit/search.test.ts +93 -0
  51. package/test/unit/version.test.ts +16 -0
  52. package/test-docs/api-reference.md +76 -0
  53. package/test-docs/deployment.md +55 -0
  54. package/test-docs/getting-started.md +52 -0
  55. package/tsconfig.json +18 -0
@@ -0,0 +1,158 @@
1
+ import { basename } from "node:path";
2
+ import matter from "gray-matter";
3
+ import type { BaseChunk } from "../types.js";
4
+
5
// Fraction of maxTokens carried over between adjacent chunks; note the
// overlap is applied as a CHARACTER slice (see splitWithOverlap/hardSplit).
const OVERLAP_RATIO = 0.1;
// Sub-section split points, tried coarsest-first: H3 headings, then blank
// lines (paragraphs), then ". " sentence boundaries.
const SUB_SEPARATORS = [/^### /m, /\n\n/, /\. /];
7
+
8
/** Configuration for splitting markdown into embedding-sized chunks. */
export interface ChunkOptions {
  /** Maximum number of tokens allowed in a single chunk. */
  readonly maxTokens: number;
  /** Returns the token count for a piece of text (typically the embedder's tokenizer). */
  readonly countTokens: (text: string) => number;
}
12
+
13
+ function extractH1(body: string): string | null {
14
+ const match = body.match(/^# (.+)$/m);
15
+ return match ? match[1].trim() : null;
16
+ }
17
+
18
+ function clean(text: string): string {
19
+ return text
20
+ .replace(/<!--.*?-->/gs, "")
21
+ .replace(/\n{3,}/g, "\n\n")
22
+ .trim();
23
+ }
24
+
25
/**
 * Recursively splits `text` into pieces of at most `maxTokens` tokens.
 *
 * Separators are tried in order (coarsest first). When a separator yields
 * multiple parts, parts are greedily packed into chunks; when a chunk is
 * closed, the last `overlap` CHARACTERS (not tokens) of it are prepended to
 * the next chunk. Oversized chunks are re-split with the remaining
 * separators, falling back to a word-level hard split when none helps.
 *
 * NOTE(review): `split` discards the separator itself, so "### " prefixes
 * and ". " sentence periods are not restored when parts are rejoined with
 * "\n\n" — confirm that loss is acceptable for retrieval quality. Also, the
 * character-based overlap slice may start mid-word and is concatenated to
 * the next part without a separator.
 */
function splitWithOverlap(
  text: string,
  separators: RegExp[],
  maxTokens: number,
  overlap: number,
  countTokens: (text: string) => number,
): string[] {
  // Fast path: already within budget.
  if (countTokens(text) <= maxTokens) return [text];

  const separator = separators[0];
  const remaining = separators.slice(1);

  const parts = text.split(separator).filter((p) => p.trim());
  if (parts.length <= 1) {
    // Separator didn't help — try the next one
    if (remaining.length > 0) {
      return splitWithOverlap(text, remaining, maxTokens, overlap, countTokens);
    }
    // Last resort: hard split
    return hardSplit(text, maxTokens, overlap, countTokens);
  }

  // Greedily pack parts until adding another would exceed the token budget.
  const chunks: string[] = [];
  let current = "";

  for (const part of parts) {
    const combined = current ? `${current}\n\n${part}` : part;
    if (current && countTokens(combined) > maxTokens) {
      chunks.push(current.trim());
      // Start next chunk with overlap from the end of the previous
      const overlapText = current.slice(-overlap);
      current = overlapText + part;
    } else {
      current = combined;
    }
  }
  if (current.trim()) chunks.push(current.trim());

  // Recursively split any chunks that are still too large
  return chunks.flatMap((chunk) => {
    if (countTokens(chunk) <= maxTokens) return [chunk];
    if (remaining.length > 0) {
      return splitWithOverlap(
        chunk,
        remaining,
        maxTokens,
        overlap,
        countTokens,
      );
    }
    return hardSplit(chunk, maxTokens, overlap, countTokens);
  });
}
78
+
79
+ function hardSplit(
80
+ text: string,
81
+ maxTokens: number,
82
+ overlap: number,
83
+ countTokens: (text: string) => number,
84
+ ): string[] {
85
+ const chunks: string[] = [];
86
+ const words = text.split(/\s+/);
87
+ let current = "";
88
+
89
+ for (const word of words) {
90
+ const next = current ? `${current} ${word}` : word;
91
+ if (countTokens(next) > maxTokens && current) {
92
+ chunks.push(current.trim());
93
+ // Keep overlap from end of current chunk
94
+ const overlapText = current.slice(-overlap);
95
+ current = overlapText + word;
96
+ } else {
97
+ current = next;
98
+ }
99
+ }
100
+ if (current.trim()) chunks.push(current.trim());
101
+
102
+ return chunks;
103
+ }
104
+
105
+ /** @package */
106
+ export function chunkMarkdown(
107
+ content: string,
108
+ path: string,
109
+ options: ChunkOptions,
110
+ ): BaseChunk[] {
111
+ const { maxTokens, countTokens } = options;
112
+ const overlap = Math.floor(maxTokens * OVERLAP_RATIO);
113
+ const { data: metadata, content: body } = matter(content);
114
+ const fileHeading = extractH1(body) || basename(path, ".md");
115
+ const sections = body.split(/^## /m);
116
+ const chunks: BaseChunk[] = [];
117
+
118
+ for (let i = 0; i < sections.length; i++) {
119
+ const section = sections[i];
120
+ if (!section.trim()) continue;
121
+
122
+ let heading: string;
123
+ let text: string;
124
+
125
+ if (i === 0) {
126
+ // Content before the first ## — strip the H1 line and use fileHeading
127
+ heading = fileHeading;
128
+ const withoutH1 = section.replace(/^# .+$/m, "");
129
+ text = clean(withoutH1);
130
+ } else {
131
+ const [headingLine, ...rest] = section.split("\n");
132
+ heading = headingLine.trim();
133
+ text = clean(rest.join("\n"));
134
+ }
135
+
136
+ if (!text) continue;
137
+
138
+ const subChunks = splitWithOverlap(
139
+ text,
140
+ SUB_SEPARATORS,
141
+ maxTokens,
142
+ overlap,
143
+ countTokens,
144
+ );
145
+
146
+ for (const sub of subChunks) {
147
+ chunks.push({
148
+ path,
149
+ fileHeading,
150
+ heading,
151
+ text: sub,
152
+ metadata,
153
+ });
154
+ }
155
+ }
156
+
157
+ return chunks;
158
+ }
@@ -0,0 +1,120 @@
1
+ import { createHash } from "node:crypto";
2
+ import { glob, readFile } from "node:fs/promises";
3
+ import { relative } from "node:path";
4
+ import type { Embedder } from "../embedder/embedder.js";
5
+ import { logger } from "../logger.js";
6
+ import {
7
+ getAllDocumentHashes,
8
+ insertChunk,
9
+ removeDocument,
10
+ runInTransaction,
11
+ setDocumentHash,
12
+ } from "../store/db.js";
13
+ import { chunkMarkdown } from "./chunker.js";
14
+
15
// Chunks target 80% of the embedder's token limit — presumably headroom for
// special tokens added at embed time (NOTE(review): confirm intent).
const MAX_CHUNK_FILL = 0.8;
16
+
17
+ async function findMarkdownFiles(docsPath: string): Promise<string[]> {
18
+ const files: string[] = [];
19
+ for await (const entry of glob("**/*.md", { cwd: docsPath })) {
20
+ files.push(`${docsPath}/${entry}`);
21
+ }
22
+ return files.sort();
23
+ }
24
+
25
+ function hashContent(content: string): string {
26
+ return createHash("sha256").update(content).digest("hex");
27
+ }
28
+
29
/**
 * Incrementally (re)indexes all markdown files under `docsPath`.
 *
 * File contents are compared against stored SHA-256 hashes so only changed
 * or new files are re-chunked and re-embedded; documents whose source files
 * disappeared are removed. Each document's delete + re-insert is wrapped in
 * a transaction so the index never holds a partially-updated document.
 */
export async function indexDocs(
  embedder: Embedder,
  docsPath: string,
): Promise<void> {
  const files = await findMarkdownFiles(docsPath);

  if (files.length === 0) {
    logger.warn({ path: docsPath }, "No markdown files found");
    return;
  }

  // Read every file up front, keyed by path relative to docsPath — the same
  // key used for stored hashes and document rows.
  const fileContents = new Map<string, string>();
  for (const file of files) {
    const key = relative(docsPath, file);
    const content = await readFile(file, "utf-8");
    fileContents.set(key, content);
  }

  const storedHashes = getAllDocumentHashes();

  // Files whose content hash differs from the stored hash (includes new files).
  const changedKeys: string[] = [];
  for (const [key, content] of fileContents) {
    if (storedHashes[key] !== hashContent(content)) {
      changedKeys.push(key);
    }
  }

  // Previously indexed documents whose source file no longer exists.
  const removedKeys: string[] = [];
  for (const key of Object.keys(storedHashes)) {
    if (!fileContents.has(key)) {
      removedKeys.push(key);
    }
  }

  if (changedKeys.length === 0 && removedKeys.length === 0) {
    logger.info({ files: files.length }, "Index up to date");
    return;
  }

  const start = performance.now();

  logger.info(
    { changed: changedKeys.length, removed: removedKeys.length },
    "Indexing changed files",
  );

  // Drop stale documents in one transaction before re-embedding anything.
  if (removedKeys.length > 0) {
    runInTransaction(() => {
      for (const key of removedKeys) {
        removeDocument(key);
      }
    });
  }

  // Chunk sizes stay below the embedder's hard token limit (MAX_CHUNK_FILL
  // headroom), counted with the embedder's own tokenizer.
  const chunkOptions = {
    maxTokens: Math.floor(embedder.maxTokens * MAX_CHUNK_FILL),
    countTokens: (text: string) => embedder.tokenize(text).length,
  };

  let totalChunks = 0;
  for (const key of changedKeys) {
    // Key came from iterating fileContents, so the lookup always succeeds.
    const content = fileContents.get(key) as string;
    const chunks = chunkMarkdown(content, key, chunkOptions);

    logger.debug({ path: key, chunks: chunks.length }, "Embedding chunks");
    const embeddings = await embedder.embedBatch(chunks.map((c) => c.text));

    // Replace the document atomically: old chunks out, new hash + chunks in.
    runInTransaction(() => {
      removeDocument(key);
      setDocumentHash(key, hashContent(content));
      for (let i = 0; i < chunks.length; i++) {
        const chunk = chunks[i];
        insertChunk(
          chunk.path,
          chunk.fileHeading,
          chunk.heading,
          chunk.text,
          chunk.metadata,
          embeddings[i],
        );
      }
    });

    totalChunks += chunks.length;
  }

  const duration = ((performance.now() - start) / 1000).toFixed(1);
  logger.info(
    { duration: `${duration}s`, chunks: totalChunks },
    "Indexing complete",
  );
}
package/src/logger.ts ADDED
@@ -0,0 +1,39 @@
1
+ const level = process.env.LOG_LEVEL ?? "info";
2
+
3
+ const levels: Record<string, number> = {
4
+ debug: 0,
5
+ info: 1,
6
+ warn: 2,
7
+ error: 3,
8
+ };
9
+
10
+ const threshold = levels[level] ?? 1;
11
+
12
+ // All levels go to stderr to keep stdout free for the MCP stdio transport
13
+ function log(lvl: string, msg: string): void {
14
+ if ((levels[lvl] ?? 0) >= threshold) {
15
+ console.error(`[${lvl.toUpperCase()}] ${msg}`);
16
+ }
17
+ }
18
+
19
+ export const logger = {
20
+ debug: (msgOrObj: string | Record<string, unknown>, msg?: string) =>
21
+ log("debug", formatMsg(msgOrObj, msg)),
22
+ info: (msgOrObj: string | Record<string, unknown>, msg?: string) =>
23
+ log("info", formatMsg(msgOrObj, msg)),
24
+ warn: (msgOrObj: string | Record<string, unknown>, msg?: string) =>
25
+ log("warn", formatMsg(msgOrObj, msg)),
26
+ error: (msgOrObj: string | Record<string, unknown>, msg?: string) =>
27
+ log("error", formatMsg(msgOrObj, msg)),
28
+ };
29
+
30
+ function formatMsg(
31
+ msgOrObj: string | Record<string, unknown>,
32
+ msg?: string,
33
+ ): string {
34
+ if (typeof msgOrObj === "string") return msgOrObj;
35
+ const data = Object.entries(msgOrObj)
36
+ .map(([k, v]) => `${k}=${v}`)
37
+ .join(" ");
38
+ return msg ? `${msg} ${data}` : data;
39
+ }
@@ -0,0 +1,93 @@
1
+ import type { Embedder } from "../embedder/embedder.js";
2
+ import { getAllChunks, searchFts } from "../store/db.js";
3
+ import type { ChunkRow, SearchResult } from "../types.js";
4
+
5
+ /** @package */
6
+ export function cosineSimilarity(a: number[], b: number[]): number {
7
+ let dot = 0;
8
+ let normA = 0;
9
+ let normB = 0;
10
+
11
+ for (let i = 0; i < a.length; i++) {
12
+ dot += a[i] * b[i];
13
+ normA += a[i] * a[i];
14
+ normB += b[i] * b[i];
15
+ }
16
+
17
+ return dot / (Math.sqrt(normA) * Math.sqrt(normB));
18
+ }
19
+
20
// Reciprocal Rank Fusion smoothing constant (k = 60, the commonly used value
// from the RRF literature); larger k flattens the influence of top ranks.
const RRF_K = 60;
21
+
22
+ /** @package */
23
+ export function rankChunksHybrid(
24
+ chunks: ChunkRow[],
25
+ queryEmbedding: number[],
26
+ ftsRankedIds: number[],
27
+ limit: number,
28
+ ): SearchResult[] {
29
+ const vectorRanked = chunks
30
+ .map((chunk) => ({
31
+ chunk,
32
+ similarity: cosineSimilarity(queryEmbedding, chunk.embedding),
33
+ }))
34
+ .sort((a, b) => b.similarity - a.similarity);
35
+
36
+ const vectorRankMap = new Map<number, number>();
37
+ for (let i = 0; i < vectorRanked.length; i++) {
38
+ vectorRankMap.set(vectorRanked[i].chunk.id, i + 1);
39
+ }
40
+
41
+ const bm25RankMap = new Map<number, number>();
42
+ for (let i = 0; i < ftsRankedIds.length; i++) {
43
+ bm25RankMap.set(ftsRankedIds[i], i + 1);
44
+ }
45
+
46
+ const chunkById = new Map<number, ChunkRow>();
47
+ for (const chunk of chunks) {
48
+ chunkById.set(chunk.id, chunk);
49
+ }
50
+
51
+ const allIds = new Set<number>([
52
+ ...vectorRankMap.keys(),
53
+ ...bm25RankMap.keys(),
54
+ ]);
55
+
56
+ const scored: { chunk: ChunkRow; score: number }[] = [];
57
+ for (const id of allIds) {
58
+ const chunk = chunkById.get(id);
59
+ if (!chunk) continue;
60
+
61
+ const vectorRank = vectorRankMap.get(id);
62
+ const bm25Rank = bm25RankMap.get(id);
63
+
64
+ let score = 0;
65
+ if (vectorRank !== undefined) score += 1 / (RRF_K + vectorRank);
66
+ if (bm25Rank !== undefined) score += 1 / (RRF_K + bm25Rank);
67
+
68
+ scored.push({ chunk, score });
69
+ }
70
+
71
+ return scored
72
+ .sort((a, b) => b.score - a.score)
73
+ .slice(0, limit)
74
+ .map(({ chunk, score }) => ({
75
+ path: chunk.path,
76
+ fileHeading: chunk.fileHeading,
77
+ heading: chunk.heading,
78
+ text: chunk.text,
79
+ metadata: chunk.metadata,
80
+ score,
81
+ }));
82
+ }
83
+
84
+ export async function search(
85
+ embedder: Embedder,
86
+ query: string,
87
+ limit: number,
88
+ ): Promise<SearchResult[]> {
89
+ const queryEmbedding = await embedder.embed(query);
90
+ const chunks = getAllChunks();
91
+ const ftsRankedIds = searchFts(query, chunks.length);
92
+ return rankChunksHybrid(chunks, queryEmbedding, ftsRankedIds, limit);
93
+ }
package/src/server.ts ADDED
@@ -0,0 +1,96 @@
1
+ import { Server } from "@modelcontextprotocol/sdk/server/index.js";
2
+ import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
3
+ import {
4
+ CallToolRequestSchema,
5
+ ListToolsRequestSchema,
6
+ } from "@modelcontextprotocol/sdk/types.js";
7
+ import type { Embedder } from "./embedder/embedder.js";
8
+ import { logger } from "./logger.js";
9
+ import { search } from "./search/search.js";
10
+ import { getChunkCount } from "./store/db.js";
11
+ import { getVersion } from "./version.js";
12
+
13
/**
 * Builds the MCP server exposing a single `search_docs` tool.
 *
 * The tool accepts a natural-language `query` and an optional `limit`
 * (clamped to 1-20, defaulting to 5) and returns matching chunks formatted
 * as markdown sections separated by horizontal rules.
 */
async function createServer(embedder: Embedder): Promise<Server> {
  const server = new Server(
    {
      name: "inkdex",
      version: getVersion(),
    },
    {
      capabilities: {
        tools: {},
      },
    },
  );

  // Advertise the single search_docs tool with its JSON schema.
  server.setRequestHandler(ListToolsRequestSchema, async () => {
    return {
      tools: [
        {
          name: "search_docs",
          description:
            "Search markdown documentation for relevant information. Returns chunks of content that match the query semantically.",
          inputSchema: {
            type: "object" as const,
            properties: {
              query: {
                type: "string",
                description:
                  "Search query - natural language question or keywords",
              },
              limit: {
                type: "number",
                description: "Maximum number of results to return (1-20)",
                default: 5,
                minimum: 1,
                maximum: 20,
              },
            },
            required: ["query"],
          },
        },
      ],
    };
  });

  server.setRequestHandler(CallToolRequestSchema, async (request) => {
    if (request.params.name !== "search_docs") {
      throw new Error(`Unknown tool: ${request.params.name}`);
    }

    // Coerce arguments defensively: missing query becomes "", and limit is
    // clamped into [1, 20] (NaN/0/negative fall back to the default of 5).
    const query = String(request.params.arguments?.query || "");
    const limit = Math.min(
      Math.max(Number(request.params.arguments?.limit) || 5, 1),
      20,
    );

    logger.debug({ query, limit }, "Searching docs");

    const results = await search(embedder, query, limit);

    // Render each result as a markdown section with its source and score.
    const text = results
      .map(
        (r) =>
          `## ${r.fileHeading} > ${r.heading}\n_Source: ${r.path} (score: ${r.score.toFixed(3)})_\n\n${r.text}`,
      )
      .join("\n\n---\n\n");

    return {
      content: [{ type: "text", text: text || "No results found." }],
    };
  });

  return server;
}
85
+
86
+ export async function startServer(embedder: Embedder): Promise<void> {
87
+ const server = await createServer(embedder);
88
+
89
+ const transport = new StdioServerTransport();
90
+ await server.connect(transport);
91
+
92
+ logger.info(
93
+ { version: getVersion(), chunks: getChunkCount() },
94
+ "Server started",
95
+ );
96
+ }