npm - pdf-brain - Versions diffs - 1.3.0 → 2.0.0 - Mend

pdf-brain 1.3.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27)

package/README.md +22 -2
package/package.json +2 -1
package/scripts/install.sh +1 -1
package/src/agent/hints.ts +426 -3
package/src/agent/manifest.ts +24 -4
package/src/agent/protocol.ts +52 -0
package/src/chunking.ts +130 -0
package/src/cli.contract.test.ts +239 -0
package/src/cli.ts +2573 -840
package/src/index.ts +259 -6
package/src/logger.ts +53 -0
package/src/services/AutoTagger.ts +26 -38
package/src/services/ClusterSummarizer.ts +3 -3
package/src/services/Clustering.test.ts +20 -5
package/src/services/Clustering.ts +48 -11
package/src/services/Database.ts +27 -0
package/src/services/EmbeddingProvider.ts +77 -7
package/src/services/Gateway.ts +8 -7
package/src/services/LibSQLDatabase.test.ts +139 -0
package/src/services/LibSQLDatabase.ts +228 -15
package/src/services/Migration.ts +1 -1
package/src/services/Ollama.ts +22 -7
package/src/services/PDFExtractor.test.ts +40 -1
package/src/services/PDFExtractor.ts +37 -6
package/src/types.test.ts +22 -0
package/src/types.ts +82 -2
package/src/updater.ts +8 -3

package/src/services/Clustering.ts CHANGED Viewed

@@ -55,6 +55,11 @@ export interface ClusterOptions {
   k: number;
   /** Maximum iterations for k-means (default: 100) */
   maxIterations?: number;
+  /**
+   * Optional deterministic seed for centroid initialization.
+   * Useful for reproducible runs and non-flaky tests.
+   */
+  seed?: number;
 }
 /**
@@ -67,6 +72,11 @@ export interface MiniBatchClusterOptions {
   batchSize?: number;
   /** Maximum iterations (default: 100) */
   maxIterations?: number;
+  /**
+   * Optional deterministic seed for centroid initialization + batch sampling.
+   * Useful for reproducible runs and non-flaky tests.
+   */
+  seed?: number;
 }
 // ============================================================================
@@ -227,13 +237,31 @@ function softmax(distances: number[], temperature = 1.0): number[] {
 // K-Means Algorithm
 // ============================================================================
/**
 * Build a deterministic pseudo-random generator (mulberry32) from a seed.
 *
 * @param seed Any number; it is coerced to an unsigned 32-bit integer, so
 *   fractional parts are dropped and negative values wrap around.
 * @returns A function that yields the next value in [0, 1) on every call.
 *   Two generators created from the same seed produce the same sequence.
 */
function makeSeededRng(seed: number): () => number {
  const INCREMENT = 0x6d2b79f5;
  const UINT32_SPAN = 2 ** 32;

  // Internal 32-bit state, advanced once per draw.
  let state = seed >>> 0;

  return function next(): number {
    state = (state + INCREMENT) >>> 0;
    // Two rounds of xor-shift / multiply mixing on the advanced state.
    const firstMix = Math.imul(state ^ (state >>> 15), state | 1);
    const secondMix =
      firstMix ^ (firstMix + Math.imul(firstMix ^ (firstMix >>> 7), firstMix | 61));
    // Reinterpret as unsigned 32-bit and scale down into [0, 1).
    return ((secondMix ^ (secondMix >>> 14)) >>> 0) / UINT32_SPAN;
  };
}
/**
 * Pick the random source for a clustering run.
 *
 * @param seed Optional seed. When it is a finite number a seeded,
 *   reproducible generator is returned; otherwise (omitted, NaN, ±Infinity)
 *   the shared `Math.random` is returned unchanged.
 * @returns A function producing numbers in [0, 1).
 */
function getRng(seed?: number): () => number {
  if (typeof seed !== "number" || !Number.isFinite(seed)) {
    return Math.random;
  }
  return makeSeededRng(seed);
}
 /**
  * K-means clustering algorithm
  */
 function kMeans(
   vectors: number[][],
   k: number,
-  maxIterations = 100
+  maxIterations = 100,
+  rng: () => number = Math.random
 ): { centroids: number[][]; assignments: number[] } {
   if (vectors.length === 0) {
     throw new Error("Cannot cluster empty vector array");
@@ -246,7 +274,7 @@ function kMeans(
   }
   // Initialize centroids with k-means++ for better convergence
-  const centroids = kMeansPlusPlusInit(vectors, k);
+  const centroids = kMeansPlusPlusInit(vectors, k, rng);
   let assignments = new Array(vectors.length).fill(0);
   for (let iter = 0; iter < maxIterations; iter++) {
@@ -283,11 +311,15 @@ function kMeans(
 /**
  * K-means++ initialization for better centroid selection
  */
-function kMeansPlusPlusInit(vectors: number[][], k: number): number[][] {
+function kMeansPlusPlusInit(
+  vectors: number[][],
+  k: number,
+  rng: () => number = Math.random
+): number[][] {
   const centroids: number[][] = [];
   // First centroid: random
-  const firstIdx = Math.floor(Math.random() * vectors.length);
+  const firstIdx = Math.floor(rng() * vectors.length);
   centroids.push([...vectors[firstIdx]]);
   // Remaining centroids: weighted by distance squared
@@ -300,7 +332,7 @@ function kMeansPlusPlusInit(vectors: number[][], k: number): number[][] {
     });
     const totalDist = distances.reduce((a, b) => a + b, 0);
-    let threshold = Math.random() * totalDist;
+    let threshold = rng() * totalDist;
     for (let j = 0; j < vectors.length; j++) {
       threshold -= distances[j];
@@ -312,7 +344,7 @@ function kMeansPlusPlusInit(vectors: number[][], k: number): number[][] {
     // Fallback if we didn't select (shouldn't happen)
     if (centroids.length === i) {
-      centroids.push([...vectors[Math.floor(Math.random() * vectors.length)]]);
+      centroids.push([...vectors[Math.floor(rng() * vectors.length)]]);
     }
   }
@@ -412,7 +444,8 @@ function miniBatchKMeans(
   vectors: number[][],
   k: number,
   batchSize = 100,
-  maxIterations = 100
+  maxIterations = 100,
+  rng: () => number = Math.random
 ): { centroids: number[][]; assignments: number[] } {
   if (vectors.length === 0) {
     throw new Error("Cannot cluster empty vector array");
@@ -428,7 +461,7 @@ function miniBatchKMeans(
   const actualBatchSize = Math.min(batchSize, n);
   // Initialize centroids with k-means++
-  const centroids = kMeansPlusPlusInit(vectors, k);
+  const centroids = kMeansPlusPlusInit(vectors, k, rng);
   // Track per-centroid sample counts for weighted updates
   const centroidCounts = new Array(k).fill(0);
@@ -443,7 +476,7 @@ function miniBatchKMeans(
     const batchSet = new Set<number>();
     while (batchIndices.length < actualBatchSize) {
-      const idx = Math.floor(Math.random() * n);
+      const idx = Math.floor(rng() * n);
       if (!batchSet.has(idx)) {
         batchSet.add(idx);
         batchIndices.push(idx);
@@ -572,10 +605,12 @@ export const ClusteringServiceImpl = {
         Effect.try({
           try: () => {
             const vectors = embeddings.map((e) => e.vector);
+            const rng = getRng(options.seed);
             const { centroids, assignments } = kMeans(
               vectors,
               options.k,
-              options.maxIterations
+              options.maxIterations,
+              rng
             );
             // Build cluster metadata
@@ -718,12 +753,14 @@ export const ClusteringServiceImpl = {
           try: () => {
             const vectors = embeddings.map((e) => e.vector);
             const { batchSize = 100, maxIterations = 100 } = options;
+            const rng = getRng(options.seed);
             const { centroids, assignments } = miniBatchKMeans(
               vectors,
               options.k,
               batchSize,
-              maxIterations
+              maxIterations,
+              rng
             );
             // Build cluster metadata

package/src/services/Database.ts CHANGED Viewed

@@ -9,6 +9,7 @@ import { Context, Effect } from "effect";
 import type {
   DatabaseError,
   Document,
+  PDFChunk,
   SearchOptions,
   SearchResult,
 } from "../types.js";
@@ -47,10 +48,31 @@ export class Database extends Context.Tag("Database")<
         content: string;
       }>
     ) => Effect.Effect<void, DatabaseError>;
+    readonly getChunk: (
+      chunkId: string
+    ) => Effect.Effect<PDFChunk | null, DatabaseError>;
+    readonly listChunksByDocument: (
+      docId: string,
+      opts?: { page?: number }
+    ) => Effect.Effect<PDFChunk[], DatabaseError>;
     readonly addEmbeddings: (
       embeddings: Array<{ chunkId: string; embedding: number[] }>
     ) => Effect.Effect<void, DatabaseError>;
+    // Atomic rebuild/replace (non-destructive): replace a document's chunks+embeddings
+    // in a single transaction so agents can safely rerun chunking algorithms.
+    readonly replaceDocument: (
+      doc: Document,
+      chunks: Array<{
+        id: string;
+        docId: string;
+        page: number;
+        chunkIndex: number;
+        content: string;
+      }>,
+      embeddings: Array<{ chunkId: string; embedding: number[] }>,
+    ) => Effect.Effect<void, DatabaseError>;
     // Search operations
     readonly vectorSearch: (
       embedding: number[],
@@ -78,6 +100,11 @@ export class Database extends Context.Tag("Database")<
       DatabaseError
     >;
+    // Cheap aggregation helpers (avoid loading full chunk content into memory)
+    readonly countChunksByDocumentIds: (
+      docIds: string[]
+    ) => Effect.Effect<Record<string, number>, DatabaseError>;
     // Maintenance
     readonly repair: () => Effect.Effect<
       {

package/src/services/EmbeddingProvider.ts CHANGED Viewed

@@ -26,6 +26,48 @@ export class EmbeddingProvider extends Context.Tag("EmbeddingProvider")<
   }
 >() {}
+/**
+ * Agent workflows tend to call `search` repeatedly with the same query within a
+ * single session (especially via MCP). Cache query embeddings in-process to
+ * avoid repeated embed calls.
+ *
+ * Notes:
+ * - This only wraps `embed()` (single text) and intentionally does NOT cache
+ *   `embedBatch()` (chunk embeddings would explode memory).
+ * - Cache is per-process (MCP session), not persisted.
+ */
// Cache capacity used when the environment does not supply a usable value.
const DEFAULT_QUERY_EMBED_CACHE_SIZE = 256;

/**
 * Resolve the query-embedding cache capacity from
 * `PDF_BRAIN_QUERY_EMBED_CACHE_SIZE`.
 *
 * @returns The base-10 integer parsed from the variable when it is finite and
 *   non-negative (0 is a valid result); the default of 256 when the variable
 *   is unset, not numeric, or negative.
 */
const readQueryEmbedCacheSize = (): number => {
  const configured = process.env.PDF_BRAIN_QUERY_EMBED_CACHE_SIZE;
  if (configured === undefined) {
    return DEFAULT_QUERY_EMBED_CACHE_SIZE;
  }

  // parseInt yields either an integer or NaN, so no further rounding is needed.
  const parsed = Number.parseInt(configured, 10);
  const usable = Number.isFinite(parsed) && parsed >= 0;
  return usable ? parsed : DEFAULT_QUERY_EMBED_CACHE_SIZE;
};
/**
 * Create a small string-keyed least-recently-used cache backed by a `Map`
 * (which iterates in insertion order, so the first key is the oldest).
 *
 * @param maxSize Maximum number of entries to retain. Any value that is not
 *   greater than zero (0, negatives, NaN) disables storage entirely.
 * @returns An object with:
 *   - `get(key)`: the cached value, or `undefined` on a miss. A hit marks the
 *     entry as most recently used. A stored `undefined` is indistinguishable
 *     from a miss.
 *   - `set(key, value)`: inserts or overwrites the entry as most recently
 *     used, evicting the least recently used entry when the cache is full.
 */
const makeLruCache = <V>(maxSize: number) => {
  const entries = new Map<string, V>();
  return {
    get(key: string): V | undefined {
      const value = entries.get(key);
      if (value === undefined) return undefined;
      // Refresh recency: re-inserting moves the key to the end of the Map.
      entries.delete(key);
      entries.set(key, value);
      return value;
    },
    set(key: string, value: V): void {
      // `!(x > 0)` also rejects NaN, which `x <= 0` would let through.
      if (!(maxSize > 0)) return;
      entries.delete(key);
      entries.set(key, value);
      if (entries.size <= maxSize) return;
      // Evict the oldest entry. Check the iterator's `done` flag rather than
      // the key's truthiness so that an empty-string key is still evicted.
      const oldest = entries.keys().next();
      if (!oldest.done) entries.delete(oldest.value);
    },
  };
};
 // ============================================================================
 // Implementation
 // ============================================================================
@@ -38,12 +80,31 @@ export const EmbeddingProviderLive = Layer.effect(
   Effect.gen(function* () {
     const config = loadConfig();
     const provider = config.embedding.provider;
+    const model = config.embedding.model;
+    const queryCacheSize = readQueryEmbedCacheSize();
+    const queryEmbedCache = makeLruCache<number[]>(queryCacheSize);
+    const wrapQueryCache = <E>(
+      embed: (text: string) => Effect.Effect<number[], E>,
+      label: string,
+    ) => {
+      if (queryCacheSize <= 0) return embed;
+      return (text: string) =>
+        Effect.gen(function* () {
+          const key = `${label}:${model}:${text}`;
+          const cached = queryEmbedCache.get(key);
+          if (cached) return cached;
+          const embedding = yield* embed(text);
+          queryEmbedCache.set(key, embedding);
+          return embedding;
+        });
+    };
     if (provider === "gateway") {
       // Use Gateway
       const gateway = yield* Gateway;
       return {
-        embed: gateway.embed,
+        embed: wrapQueryCache(gateway.embed, "gateway"),
         embedBatch: gateway.embedBatch,
         checkHealth: gateway.checkHealth,
         provider: "gateway" as const,
@@ -52,7 +113,7 @@ export const EmbeddingProviderLive = Layer.effect(
       // Default to Ollama
       const ollama = yield* Ollama;
       return {
-        embed: ollama.embed,
+        embed: wrapQueryCache(ollama.embed, "ollama"),
         embedBatch: ollama.embedBatch,
         checkHealth: ollama.checkHealth,
         provider: "ollama" as const,
@@ -62,9 +123,18 @@ export const EmbeddingProviderLive = Layer.effect(
 );
 /**
- * Full layer with dependencies - use this in app composition
+ * Full layer with dependencies - use this in app composition.
+ * Only constructs the provider layer that's actually configured.
  */
-export const EmbeddingProviderFullLive = Layer.provide(
-  EmbeddingProviderLive,
-  Layer.merge(OllamaLive, GatewayLive),
-);
export const EmbeddingProviderFullLive = (() => {
  // Evaluated once, when this module is first loaded.
  const useGateway = loadConfig().embedding.provider === "gateway";

  // Every operation of the placeholder Gateway fails with the same reason.
  const notConfigured = () =>
    Effect.fail(new GatewayError({ reason: "Gateway not configured" }));

  // The Ollama layer is supplied in both cases. The real Gateway layer is
  // only used when the gateway is the configured provider; otherwise a
  // placeholder service satisfies the Gateway dependency.
  const deps = useGateway
    ? Layer.merge(OllamaLive, GatewayLive)
    : Layer.merge(
        OllamaLive,
        Layer.succeed(Gateway, {
          embed: notConfigured,
          embedBatch: notConfigured,
          checkHealth: notConfigured,
        }),
      );

  return Layer.provide(EmbeddingProviderLive, deps);
})();

package/src/services/Gateway.ts CHANGED Viewed

@@ -13,6 +13,7 @@ import {
 } from "effect";
 import { embed, embedMany } from "ai";
 import { GatewayError, loadConfig } from "../types.js";
+import { logDebug } from "../logger.js";
 // ============================================================================
 // Service Definition
@@ -73,8 +74,8 @@ function validateEmbedding(
   // First embedding sets the expected dimension
   if (detectedEmbeddingDimension === null) {
     detectedEmbeddingDimension = embedding.length;
-    console.log(
-      `[Gateway] Detected embedding dimension: ${detectedEmbeddingDimension}`,
+    logDebug(
+      `Gateway embedding dimension detected: ${detectedEmbeddingDimension}`,
     );
   } else if (embedding.length !== detectedEmbeddingDimension) {
     // Subsequent embeddings must match
@@ -103,11 +104,11 @@ export const GatewayLive = Layer.effect(
     const config = loadConfig();
     const model = config.embedding.model; // e.g., "openai/text-embedding-3-small"
-    // Check API key at initialization time
-    const apiKey = process.env.AI_GATEWAY_API_KEY;
+    // Check API key at initialization time (config > env var)
+    const apiKey = config.gatewayApiKey;
     if (!apiKey) {
       return yield* Effect.fail(
-        new GatewayError({ reason: "AI_GATEWAY_API_KEY not set" }),
+        new GatewayError({ reason: "Gateway API key not set. Use: pdf-brain config set gateway.apiKey <key>" }),
       );
     }
@@ -149,9 +150,9 @@ export const GatewayLive = Layer.effect(
       checkHealth: () =>
         Effect.gen(function* () {
-          if (!process.env.AI_GATEWAY_API_KEY) {
+          if (!config.gatewayApiKey) {
             return yield* Effect.fail(
-              new GatewayError({ reason: "AI_GATEWAY_API_KEY not set" }),
+              new GatewayError({ reason: "Gateway API key not set. Use: pdf-brain config set gateway.apiKey <key>" }),
             );
           }
           // Do a test embedding to verify connectivity and model access

package/src/services/LibSQLDatabase.test.ts CHANGED Viewed

@@ -295,6 +295,145 @@ describe("LibSQLDatabase", () => {
       expect(stats.chunks).toBe(1);
       expect(stats.embeddings).toBe(0);
     });
+    test("countChunksByDocumentIds returns per-doc chunk counts (including 0s)", async () => {
+      const program = Effect.gen(function* () {
+        const db = yield* Database;
+        // Add documents
+        yield* db.addDocument(
+          new Document({
+            id: "doc-a",
+            title: "Doc A",
+            path: "/path/a.pdf",
+            addedAt: new Date(),
+            pageCount: 1,
+            sizeBytes: 100,
+            tags: [],
+          }),
+        );
+        yield* db.addDocument(
+          new Document({
+            id: "doc-b",
+            title: "Doc B",
+            path: "/path/b.pdf",
+            addedAt: new Date(),
+            pageCount: 1,
+            sizeBytes: 100,
+            tags: [],
+          }),
+        );
+        // Add chunks for each
+        yield* db.addChunks([
+          {
+            id: "chunk-a-1",
+            docId: "doc-a",
+            page: 1,
+            chunkIndex: 0,
+            content: "A1",
+          },
+          {
+            id: "chunk-a-2",
+            docId: "doc-a",
+            page: 1,
+            chunkIndex: 1,
+            content: "A2",
+          },
+          {
+            id: "chunk-b-1",
+            docId: "doc-b",
+            page: 1,
+            chunkIndex: 0,
+            content: "B1",
+          },
+        ]);
+        const counts = yield* db.countChunksByDocumentIds([
+          "doc-a",
+          "doc-b",
+          "doc-missing",
+        ]);
+        return counts;
+      });
+      const layer = LibSQLDatabase.make({ url: ":memory:" });
+      const counts = await Effect.runPromise(Effect.provide(program, layer));
+      expect(counts["doc-a"]).toBe(2);
+      expect(counts["doc-b"]).toBe(1);
+      expect(counts["doc-missing"]).toBe(0);
+    });
+    test("replaceDocument atomically replaces chunks+embeddings for an existing doc", async () => {
+      const url = "file::memory:?cache=shared";
+      const layer = LibSQLDatabase.make({ url });
+      const program = Effect.gen(function* () {
+        const db = yield* Database;
+        const doc = new Document({
+          id: "doc-replace",
+          title: "Replace Me",
+          path: "/path/replace.pdf",
+          addedAt: new Date("2025-01-01T00:00:00Z"),
+          pageCount: 1,
+          sizeBytes: 100,
+          tags: [],
+          metadata: {},
+        });
+        // Seed initial doc/chunks/embeddings
+        yield* db.addDocument(doc);
+        yield* db.addChunks([
+          { id: "doc-replace-0", docId: "doc-replace", page: 1, chunkIndex: 0, content: "old-0" },
+          { id: "doc-replace-1", docId: "doc-replace", page: 1, chunkIndex: 1, content: "old-1" },
+        ]);
+        const mkEmbedding = (seed: number) =>
+          Array.from({ length: 1024 }, (_, i) => seed + i * 0.00001);
+        yield* db.addEmbeddings([
+          { chunkId: "doc-replace-0", embedding: mkEmbedding(0.1) },
+          { chunkId: "doc-replace-1", embedding: mkEmbedding(0.2) },
+        ]);
+        // Now atomically replace with 3 chunks + 3 embeddings
+        const updatedDoc = new Document({
+          ...doc,
+          pageCount: 2,
+          sizeBytes: 200,
+          metadata: { chunker: { id: "test", version: 1, unit: "chars", chunkSize: 1, chunkOverlap: 0 } },
+        });
+        yield* db.replaceDocument(
+          updatedDoc,
+          [
+            { id: "doc-replace-0", docId: "doc-replace", page: 1, chunkIndex: 0, content: "new-0" },
+            { id: "doc-replace-1", docId: "doc-replace", page: 1, chunkIndex: 1, content: "new-1" },
+            { id: "doc-replace-2", docId: "doc-replace", page: 2, chunkIndex: 0, content: "new-2" },
+          ],
+          [
+            { chunkId: "doc-replace-0", embedding: mkEmbedding(1.1) },
+            { chunkId: "doc-replace-1", embedding: mkEmbedding(1.2) },
+            { chunkId: "doc-replace-2", embedding: mkEmbedding(1.3) },
+          ],
+        );
+        const chunks = yield* db.listChunksByDocument("doc-replace");
+        const stats = yield* db.getStats();
+        return { chunks, stats };
+      });
+      const result = await Effect.runPromise(Effect.provide(program, layer));
+      expect(result.stats.documents).toBe(1);
+      expect(result.stats.chunks).toBe(3);
+      expect(result.stats.embeddings).toBe(3);
+      expect(result.chunks.map((c) => c.content)).toEqual(["new-0", "new-1", "new-2"]);
+    });
   });
   describe("taxonomy schema (SKOS)", () => {