inkdex 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. package/.claude/settings.local.json +15 -0
  2. package/.github/workflows/ci.yml +73 -0
  3. package/.github/workflows/release.yml +65 -0
  4. package/AGENTS.md +32 -0
  5. package/LICENSE +190 -0
  6. package/README.md +40 -0
  7. package/biome.json +43 -0
  8. package/dist/cli.d.ts +2 -0
  9. package/dist/cli.js +38 -0
  10. package/dist/embedder/embedder.d.ts +9 -0
  11. package/dist/embedder/embedder.js +39 -0
  12. package/dist/ingest/chunker.d.ts +7 -0
  13. package/dist/ingest/chunker.js +114 -0
  14. package/dist/ingest/index-docs.d.ts +2 -0
  15. package/dist/ingest/index-docs.js +78 -0
  16. package/dist/logger.d.ts +6 -0
  17. package/dist/logger.js +28 -0
  18. package/dist/search/search.d.ts +7 -0
  19. package/dist/search/search.js +70 -0
  20. package/dist/server.d.ts +2 -0
  21. package/dist/server.js +66 -0
  22. package/dist/store/db.d.ts +13 -0
  23. package/dist/store/db.js +149 -0
  24. package/dist/types.d.ts +14 -0
  25. package/dist/types.js +1 -0
  26. package/dist/version.d.ts +1 -0
  27. package/dist/version.js +13 -0
  28. package/inkdex-0.0.1.tgz +0 -0
  29. package/package.json +46 -0
  30. package/release.sh +33 -0
  31. package/src/cli.ts +45 -0
  32. package/src/embedder/embedder.ts +52 -0
  33. package/src/ingest/chunker.ts +158 -0
  34. package/src/ingest/index-docs.ts +120 -0
  35. package/src/logger.ts +39 -0
  36. package/src/search/search.ts +93 -0
  37. package/src/server.ts +96 -0
  38. package/src/store/db.ts +217 -0
  39. package/src/types.ts +16 -0
  40. package/src/version.ts +16 -0
  41. package/test/fixtures/docs/api.md +26 -0
  42. package/test/fixtures/docs/getting-started.md +13 -0
  43. package/test/helpers/index.ts +14 -0
  44. package/test/integration/embedder.test.ts +52 -0
  45. package/test/integration/server.test.ts +125 -0
  46. package/test/unit/chunker.test.ts +193 -0
  47. package/test/unit/db.test.ts +190 -0
  48. package/test/unit/index-docs.test.ts +120 -0
  49. package/test/unit/logger.test.ts +11 -0
  50. package/test/unit/search.test.ts +93 -0
  51. package/test/unit/version.test.ts +16 -0
  52. package/test-docs/api-reference.md +76 -0
  53. package/test-docs/deployment.md +55 -0
  54. package/test-docs/getting-started.md +52 -0
  55. package/tsconfig.json +18 -0
@@ -0,0 +1,158 @@
1
+ import { basename } from "node:path";
2
+ import matter from "gray-matter";
3
+ import type { BaseChunk } from "../types.js";
4
+
5
// Fraction of maxTokens carried over between adjacent chunks; note the
// overlap is applied as a CHARACTER slice (see splitWithOverlap/hardSplit).
const OVERLAP_RATIO = 0.1;
// Sub-section split points, tried coarsest-first: H3 headings, then blank
// lines (paragraphs), then ". " sentence boundaries.
const SUB_SEPARATORS = [/^### /m, /\n\n/, /\. /];
7
+
8
/** Configuration for splitting markdown into embedding-sized chunks. */
export interface ChunkOptions {
  /** Maximum number of tokens allowed in a single chunk. */
  readonly maxTokens: number;
  /** Returns the token count for a piece of text (typically the embedder's tokenizer). */
  readonly countTokens: (text: string) => number;
}
12
+
13
+ function extractH1(body: string): string | null {
14
+ const match = body.match(/^# (.+)$/m);
15
+ return match ? match[1].trim() : null;
16
+ }
17
+
18
+ function clean(text: string): string {
19
+ return text
20
+ .replace(/<!--.*?-->/gs, "")
21
+ .replace(/\n{3,}/g, "\n\n")
22
+ .trim();
23
+ }
24
+
25
/**
 * Recursively splits `text` into pieces of at most `maxTokens` tokens.
 *
 * Separators are tried in order (coarsest first). When a separator yields
 * multiple parts, parts are greedily packed into chunks; when a chunk is
 * closed, the last `overlap` CHARACTERS (not tokens) of it are prepended to
 * the next chunk. Oversized chunks are re-split with the remaining
 * separators, falling back to a word-level hard split when none helps.
 *
 * NOTE(review): `split` discards the separator itself, so "### " prefixes
 * and ". " sentence periods are not restored when parts are rejoined with
 * "\n\n" — confirm that loss is acceptable for retrieval quality. Also, the
 * character-based overlap slice may start mid-word and is concatenated to
 * the next part without a separator.
 */
function splitWithOverlap(
  text: string,
  separators: RegExp[],
  maxTokens: number,
  overlap: number,
  countTokens: (text: string) => number,
): string[] {
  // Fast path: already within budget.
  if (countTokens(text) <= maxTokens) return [text];

  const separator = separators[0];
  const remaining = separators.slice(1);

  const parts = text.split(separator).filter((p) => p.trim());
  if (parts.length <= 1) {
    // Separator didn't help — try the next one
    if (remaining.length > 0) {
      return splitWithOverlap(text, remaining, maxTokens, overlap, countTokens);
    }
    // Last resort: hard split
    return hardSplit(text, maxTokens, overlap, countTokens);
  }

  // Greedily pack parts until adding another would exceed the token budget.
  const chunks: string[] = [];
  let current = "";

  for (const part of parts) {
    const combined = current ? `${current}\n\n${part}` : part;
    if (current && countTokens(combined) > maxTokens) {
      chunks.push(current.trim());
      // Start next chunk with overlap from the end of the previous
      const overlapText = current.slice(-overlap);
      current = overlapText + part;
    } else {
      current = combined;
    }
  }
  if (current.trim()) chunks.push(current.trim());

  // Recursively split any chunks that are still too large
  return chunks.flatMap((chunk) => {
    if (countTokens(chunk) <= maxTokens) return [chunk];
    if (remaining.length > 0) {
      return splitWithOverlap(
        chunk,
        remaining,
        maxTokens,
        overlap,
        countTokens,
      );
    }
    return hardSplit(chunk, maxTokens, overlap, countTokens);
  });
}
78
+
79
+ function hardSplit(
80
+ text: string,
81
+ maxTokens: number,
82
+ overlap: number,
83
+ countTokens: (text: string) => number,
84
+ ): string[] {
85
+ const chunks: string[] = [];
86
+ const words = text.split(/\s+/);
87
+ let current = "";
88
+
89
+ for (const word of words) {
90
+ const next = current ? `${current} ${word}` : word;
91
+ if (countTokens(next) > maxTokens && current) {
92
+ chunks.push(current.trim());
93
+ // Keep overlap from end of current chunk
94
+ const overlapText = current.slice(-overlap);
95
+ current = overlapText + word;
96
+ } else {
97
+ current = next;
98
+ }
99
+ }
100
+ if (current.trim()) chunks.push(current.trim());
101
+
102
+ return chunks;
103
+ }
104
+
105
+ /** @package */
106
+ export function chunkMarkdown(
107
+ content: string,
108
+ path: string,
109
+ options: ChunkOptions,
110
+ ): BaseChunk[] {
111
+ const { maxTokens, countTokens } = options;
112
+ const overlap = Math.floor(maxTokens * OVERLAP_RATIO);
113
+ const { data: metadata, content: body } = matter(content);
114
+ const fileHeading = extractH1(body) || basename(path, ".md");
115
+ const sections = body.split(/^## /m);
116
+ const chunks: BaseChunk[] = [];
117
+
118
+ for (let i = 0; i < sections.length; i++) {
119
+ const section = sections[i];
120
+ if (!section.trim()) continue;
121
+
122
+ let heading: string;
123
+ let text: string;
124
+
125
+ if (i === 0) {
126
+ // Content before the first ## — strip the H1 line and use fileHeading
127
+ heading = fileHeading;
128
+ const withoutH1 = section.replace(/^# .+$/m, "");
129
+ text = clean(withoutH1);
130
+ } else {
131
+ const [headingLine, ...rest] = section.split("\n");
132
+ heading = headingLine.trim();
133
+ text = clean(rest.join("\n"));
134
+ }
135
+
136
+ if (!text) continue;
137
+
138
+ const subChunks = splitWithOverlap(
139
+ text,
140
+ SUB_SEPARATORS,
141
+ maxTokens,
142
+ overlap,
143
+ countTokens,
144
+ );
145
+
146
+ for (const sub of subChunks) {
147
+ chunks.push({
148
+ path,
149
+ fileHeading,
150
+ heading,
151
+ text: sub,
152
+ metadata,
153
+ });
154
+ }
155
+ }
156
+
157
+ return chunks;
158
+ }
@@ -0,0 +1,120 @@
1
+ import { createHash } from "node:crypto";
2
+ import { glob, readFile } from "node:fs/promises";
3
+ import { relative } from "node:path";
4
+ import type { Embedder } from "../embedder/embedder.js";
5
+ import { logger } from "../logger.js";
6
+ import {
7
+ getAllDocumentHashes,
8
+ insertChunk,
9
+ removeDocument,
10
+ runInTransaction,
11
+ setDocumentHash,
12
+ } from "../store/db.js";
13
+ import { chunkMarkdown } from "./chunker.js";
14
+
15
// Chunks target 80% of the embedder's token limit — presumably headroom for
// special tokens added at embed time (NOTE(review): confirm intent).
const MAX_CHUNK_FILL = 0.8;
16
+
17
+ async function findMarkdownFiles(docsPath: string): Promise<string[]> {
18
+ const files: string[] = [];
19
+ for await (const entry of glob("**/*.md", { cwd: docsPath })) {
20
+ files.push(`${docsPath}/${entry}`);
21
+ }
22
+ return files.sort();
23
+ }
24
+
25
+ function hashContent(content: string): string {
26
+ return createHash("sha256").update(content).digest("hex");
27
+ }
28
+
29
/**
 * Incrementally (re)indexes all markdown files under `docsPath`.
 *
 * File contents are compared against stored SHA-256 hashes so only changed
 * or new files are re-chunked and re-embedded; documents whose source files
 * disappeared are removed. Each document's delete + re-insert is wrapped in
 * a transaction so the index never holds a partially-updated document.
 */
export async function indexDocs(
  embedder: Embedder,
  docsPath: string,
): Promise<void> {
  const files = await findMarkdownFiles(docsPath);

  if (files.length === 0) {
    logger.warn({ path: docsPath }, "No markdown files found");
    return;
  }

  // Read every file up front, keyed by path relative to docsPath — the same
  // key used for stored hashes and document rows.
  const fileContents = new Map<string, string>();
  for (const file of files) {
    const key = relative(docsPath, file);
    const content = await readFile(file, "utf-8");
    fileContents.set(key, content);
  }

  const storedHashes = getAllDocumentHashes();

  // Files whose content hash differs from the stored hash (includes new files).
  const changedKeys: string[] = [];
  for (const [key, content] of fileContents) {
    if (storedHashes[key] !== hashContent(content)) {
      changedKeys.push(key);
    }
  }

  // Previously indexed documents whose source file no longer exists.
  const removedKeys: string[] = [];
  for (const key of Object.keys(storedHashes)) {
    if (!fileContents.has(key)) {
      removedKeys.push(key);
    }
  }

  if (changedKeys.length === 0 && removedKeys.length === 0) {
    logger.info({ files: files.length }, "Index up to date");
    return;
  }

  const start = performance.now();

  logger.info(
    { changed: changedKeys.length, removed: removedKeys.length },
    "Indexing changed files",
  );

  // Drop stale documents in one transaction before re-embedding anything.
  if (removedKeys.length > 0) {
    runInTransaction(() => {
      for (const key of removedKeys) {
        removeDocument(key);
      }
    });
  }

  // Chunk sizes stay below the embedder's hard token limit (MAX_CHUNK_FILL
  // headroom), counted with the embedder's own tokenizer.
  const chunkOptions = {
    maxTokens: Math.floor(embedder.maxTokens * MAX_CHUNK_FILL),
    countTokens: (text: string) => embedder.tokenize(text).length,
  };

  let totalChunks = 0;
  for (const key of changedKeys) {
    // Key came from iterating fileContents, so the lookup always succeeds.
    const content = fileContents.get(key) as string;
    const chunks = chunkMarkdown(content, key, chunkOptions);

    logger.debug({ path: key, chunks: chunks.length }, "Embedding chunks");
    const embeddings = await embedder.embedBatch(chunks.map((c) => c.text));

    // Replace the document atomically: old chunks out, new hash + chunks in.
    runInTransaction(() => {
      removeDocument(key);
      setDocumentHash(key, hashContent(content));
      for (let i = 0; i < chunks.length; i++) {
        const chunk = chunks[i];
        insertChunk(
          chunk.path,
          chunk.fileHeading,
          chunk.heading,
          chunk.text,
          chunk.metadata,
          embeddings[i],
        );
      }
    });

    totalChunks += chunks.length;
  }

  const duration = ((performance.now() - start) / 1000).toFixed(1);
  logger.info(
    { duration: `${duration}s`, chunks: totalChunks },
    "Indexing complete",
  );
}
package/src/logger.ts ADDED
@@ -0,0 +1,39 @@
1
+ const level = process.env.LOG_LEVEL ?? "info";
2
+
3
+ const levels: Record<string, number> = {
4
+ debug: 0,
5
+ info: 1,
6
+ warn: 2,
7
+ error: 3,
8
+ };
9
+
10
+ const threshold = levels[level] ?? 1;
11
+
12
+ // All levels go to stderr to keep stdout free for the MCP stdio transport
13
+ function log(lvl: string, msg: string): void {
14
+ if ((levels[lvl] ?? 0) >= threshold) {
15
+ console.error(`[${lvl.toUpperCase()}] ${msg}`);
16
+ }
17
+ }
18
+
19
+ export const logger = {
20
+ debug: (msgOrObj: string | Record<string, unknown>, msg?: string) =>
21
+ log("debug", formatMsg(msgOrObj, msg)),
22
+ info: (msgOrObj: string | Record<string, unknown>, msg?: string) =>
23
+ log("info", formatMsg(msgOrObj, msg)),
24
+ warn: (msgOrObj: string | Record<string, unknown>, msg?: string) =>
25
+ log("warn", formatMsg(msgOrObj, msg)),
26
+ error: (msgOrObj: string | Record<string, unknown>, msg?: string) =>
27
+ log("error", formatMsg(msgOrObj, msg)),
28
+ };
29
+
30
+ function formatMsg(
31
+ msgOrObj: string | Record<string, unknown>,
32
+ msg?: string,
33
+ ): string {
34
+ if (typeof msgOrObj === "string") return msgOrObj;
35
+ const data = Object.entries(msgOrObj)
36
+ .map(([k, v]) => `${k}=${v}`)
37
+ .join(" ");
38
+ return msg ? `${msg} ${data}` : data;
39
+ }
@@ -0,0 +1,93 @@
1
+ import type { Embedder } from "../embedder/embedder.js";
2
+ import { getAllChunks, searchFts } from "../store/db.js";
3
+ import type { ChunkRow, SearchResult } from "../types.js";
4
+
5
+ /** @package */
6
+ export function cosineSimilarity(a: number[], b: number[]): number {
7
+ let dot = 0;
8
+ let normA = 0;
9
+ let normB = 0;
10
+
11
+ for (let i = 0; i < a.length; i++) {
12
+ dot += a[i] * b[i];
13
+ normA += a[i] * a[i];
14
+ normB += b[i] * b[i];
15
+ }
16
+
17
+ return dot / (Math.sqrt(normA) * Math.sqrt(normB));
18
+ }
19
+
20
// Reciprocal Rank Fusion smoothing constant (k = 60, the commonly used value
// from the RRF literature); larger k flattens the influence of top ranks.
const RRF_K = 60;
21
+
22
+ /** @package */
23
+ export function rankChunksHybrid(
24
+ chunks: ChunkRow[],
25
+ queryEmbedding: number[],
26
+ ftsRankedIds: number[],
27
+ limit: number,
28
+ ): SearchResult[] {
29
+ const vectorRanked = chunks
30
+ .map((chunk) => ({
31
+ chunk,
32
+ similarity: cosineSimilarity(queryEmbedding, chunk.embedding),
33
+ }))
34
+ .sort((a, b) => b.similarity - a.similarity);
35
+
36
+ const vectorRankMap = new Map<number, number>();
37
+ for (let i = 0; i < vectorRanked.length; i++) {
38
+ vectorRankMap.set(vectorRanked[i].chunk.id, i + 1);
39
+ }
40
+
41
+ const bm25RankMap = new Map<number, number>();
42
+ for (let i = 0; i < ftsRankedIds.length; i++) {
43
+ bm25RankMap.set(ftsRankedIds[i], i + 1);
44
+ }
45
+
46
+ const chunkById = new Map<number, ChunkRow>();
47
+ for (const chunk of chunks) {
48
+ chunkById.set(chunk.id, chunk);
49
+ }
50
+
51
+ const allIds = new Set<number>([
52
+ ...vectorRankMap.keys(),
53
+ ...bm25RankMap.keys(),
54
+ ]);
55
+
56
+ const scored: { chunk: ChunkRow; score: number }[] = [];
57
+ for (const id of allIds) {
58
+ const chunk = chunkById.get(id);
59
+ if (!chunk) continue;
60
+
61
+ const vectorRank = vectorRankMap.get(id);
62
+ const bm25Rank = bm25RankMap.get(id);
63
+
64
+ let score = 0;
65
+ if (vectorRank !== undefined) score += 1 / (RRF_K + vectorRank);
66
+ if (bm25Rank !== undefined) score += 1 / (RRF_K + bm25Rank);
67
+
68
+ scored.push({ chunk, score });
69
+ }
70
+
71
+ return scored
72
+ .sort((a, b) => b.score - a.score)
73
+ .slice(0, limit)
74
+ .map(({ chunk, score }) => ({
75
+ path: chunk.path,
76
+ fileHeading: chunk.fileHeading,
77
+ heading: chunk.heading,
78
+ text: chunk.text,
79
+ metadata: chunk.metadata,
80
+ score,
81
+ }));
82
+ }
83
+
84
+ export async function search(
85
+ embedder: Embedder,
86
+ query: string,
87
+ limit: number,
88
+ ): Promise<SearchResult[]> {
89
+ const queryEmbedding = await embedder.embed(query);
90
+ const chunks = getAllChunks();
91
+ const ftsRankedIds = searchFts(query, chunks.length);
92
+ return rankChunksHybrid(chunks, queryEmbedding, ftsRankedIds, limit);
93
+ }
package/src/server.ts ADDED
@@ -0,0 +1,96 @@
1
+ import { Server } from "@modelcontextprotocol/sdk/server/index.js";
2
+ import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
3
+ import {
4
+ CallToolRequestSchema,
5
+ ListToolsRequestSchema,
6
+ } from "@modelcontextprotocol/sdk/types.js";
7
+ import type { Embedder } from "./embedder/embedder.js";
8
+ import { logger } from "./logger.js";
9
+ import { search } from "./search/search.js";
10
+ import { getChunkCount } from "./store/db.js";
11
+ import { getVersion } from "./version.js";
12
+
13
/**
 * Builds the MCP server exposing a single `search_docs` tool.
 *
 * The tool accepts a natural-language `query` and an optional `limit`
 * (clamped to 1-20, defaulting to 5) and returns matching chunks formatted
 * as markdown sections separated by horizontal rules.
 */
async function createServer(embedder: Embedder): Promise<Server> {
  const server = new Server(
    {
      name: "inkdex",
      version: getVersion(),
    },
    {
      capabilities: {
        tools: {},
      },
    },
  );

  // Advertise the single search_docs tool with its JSON schema.
  server.setRequestHandler(ListToolsRequestSchema, async () => {
    return {
      tools: [
        {
          name: "search_docs",
          description:
            "Search markdown documentation for relevant information. Returns chunks of content that match the query semantically.",
          inputSchema: {
            type: "object" as const,
            properties: {
              query: {
                type: "string",
                description:
                  "Search query - natural language question or keywords",
              },
              limit: {
                type: "number",
                description: "Maximum number of results to return (1-20)",
                default: 5,
                minimum: 1,
                maximum: 20,
              },
            },
            required: ["query"],
          },
        },
      ],
    };
  });

  server.setRequestHandler(CallToolRequestSchema, async (request) => {
    if (request.params.name !== "search_docs") {
      throw new Error(`Unknown tool: ${request.params.name}`);
    }

    // Coerce arguments defensively: missing query becomes "", and limit is
    // clamped into [1, 20] (NaN/0/negative fall back to the default of 5).
    const query = String(request.params.arguments?.query || "");
    const limit = Math.min(
      Math.max(Number(request.params.arguments?.limit) || 5, 1),
      20,
    );

    logger.debug({ query, limit }, "Searching docs");

    const results = await search(embedder, query, limit);

    // Render each result as a markdown section with its source and score.
    const text = results
      .map(
        (r) =>
          `## ${r.fileHeading} > ${r.heading}\n_Source: ${r.path} (score: ${r.score.toFixed(3)})_\n\n${r.text}`,
      )
      .join("\n\n---\n\n");

    return {
      content: [{ type: "text", text: text || "No results found." }],
    };
  });

  return server;
}
85
+
86
+ export async function startServer(embedder: Embedder): Promise<void> {
87
+ const server = await createServer(embedder);
88
+
89
+ const transport = new StdioServerTransport();
90
+ await server.connect(transport);
91
+
92
+ logger.info(
93
+ { version: getVersion(), chunks: getChunkCount() },
94
+ "Server started",
95
+ );
96
+ }