pdf-brain 1.3.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -55,6 +55,11 @@ export interface ClusterOptions {
55
55
  k: number;
56
56
  /** Maximum iterations for k-means (default: 100) */
57
57
  maxIterations?: number;
58
+ /**
59
+ * Optional deterministic seed for centroid initialization.
60
+ * Useful for reproducible runs and non-flaky tests.
61
+ */
62
+ seed?: number;
58
63
  }
59
64
 
60
65
  /**
@@ -67,6 +72,11 @@ export interface MiniBatchClusterOptions {
67
72
  batchSize?: number;
68
73
  /** Maximum iterations (default: 100) */
69
74
  maxIterations?: number;
75
+ /**
76
+ * Optional deterministic seed for centroid initialization + batch sampling.
77
+ * Useful for reproducible runs and non-flaky tests.
78
+ */
79
+ seed?: number;
70
80
  }
71
81
 
72
82
  // ============================================================================
@@ -227,13 +237,31 @@ function softmax(distances: number[], temperature = 1.0): number[] {
227
237
  // K-Means Algorithm
228
238
  // ============================================================================
229
239
 
240
/**
 * Builds a deterministic pseudo-random generator (mulberry32).
 *
 * The seed is coerced to an unsigned 32-bit integer, so two seeds that are
 * equal after that coercion produce the same sequence.
 *
 * @param seed - Starting value for the generator state.
 * @returns A function yielding numbers in the half-open range [0, 1).
 */
function makeSeededRng(seed: number): () => number {
  let state = seed >>> 0;

  return function next(): number {
    // Advance the 32-bit state by the mulberry32 increment, wrapping on overflow.
    state = (state + 0x6d2b79f5) >>> 0;

    const mixed = Math.imul(state ^ (state >>> 15), state | 1);
    const scrambled = mixed ^ (mixed + Math.imul(mixed ^ (mixed >>> 7), mixed | 61));

    // Reinterpret as unsigned and scale by 2^32 so the result is below 1.
    return ((scrambled ^ (scrambled >>> 14)) >>> 0) / 4294967296;
  };
}
250
+
251
/**
 * Selects the random source used for clustering.
 *
 * @param seed - Optional seed; ignored unless it is a finite number.
 * @returns A seeded generator from `makeSeededRng` when `seed` is a finite
 *   number, otherwise `Math.random`.
 */
function getRng(seed?: number): () => number {
  if (typeof seed !== "number" || !Number.isFinite(seed)) {
    return Math.random;
  }
  return makeSeededRng(seed);
}
256
+
230
257
  /**
231
258
  * K-means clustering algorithm
232
259
  */
233
260
  function kMeans(
234
261
  vectors: number[][],
235
262
  k: number,
236
- maxIterations = 100
263
+ maxIterations = 100,
264
+ rng: () => number = Math.random
237
265
  ): { centroids: number[][]; assignments: number[] } {
238
266
  if (vectors.length === 0) {
239
267
  throw new Error("Cannot cluster empty vector array");
@@ -246,7 +274,7 @@ function kMeans(
246
274
  }
247
275
 
248
276
  // Initialize centroids with k-means++ for better convergence
249
- const centroids = kMeansPlusPlusInit(vectors, k);
277
+ const centroids = kMeansPlusPlusInit(vectors, k, rng);
250
278
  let assignments = new Array(vectors.length).fill(0);
251
279
 
252
280
  for (let iter = 0; iter < maxIterations; iter++) {
@@ -283,11 +311,15 @@ function kMeans(
283
311
  /**
284
312
  * K-means++ initialization for better centroid selection
285
313
  */
286
- function kMeansPlusPlusInit(vectors: number[][], k: number): number[][] {
314
+ function kMeansPlusPlusInit(
315
+ vectors: number[][],
316
+ k: number,
317
+ rng: () => number = Math.random
318
+ ): number[][] {
287
319
  const centroids: number[][] = [];
288
320
 
289
321
  // First centroid: random
290
- const firstIdx = Math.floor(Math.random() * vectors.length);
322
+ const firstIdx = Math.floor(rng() * vectors.length);
291
323
  centroids.push([...vectors[firstIdx]]);
292
324
 
293
325
  // Remaining centroids: weighted by distance squared
@@ -300,7 +332,7 @@ function kMeansPlusPlusInit(vectors: number[][], k: number): number[][] {
300
332
  });
301
333
 
302
334
  const totalDist = distances.reduce((a, b) => a + b, 0);
303
- let threshold = Math.random() * totalDist;
335
+ let threshold = rng() * totalDist;
304
336
 
305
337
  for (let j = 0; j < vectors.length; j++) {
306
338
  threshold -= distances[j];
@@ -312,7 +344,7 @@ function kMeansPlusPlusInit(vectors: number[][], k: number): number[][] {
312
344
 
313
345
  // Fallback if we didn't select (shouldn't happen)
314
346
  if (centroids.length === i) {
315
- centroids.push([...vectors[Math.floor(Math.random() * vectors.length)]]);
347
+ centroids.push([...vectors[Math.floor(rng() * vectors.length)]]);
316
348
  }
317
349
  }
318
350
 
@@ -412,7 +444,8 @@ function miniBatchKMeans(
412
444
  vectors: number[][],
413
445
  k: number,
414
446
  batchSize = 100,
415
- maxIterations = 100
447
+ maxIterations = 100,
448
+ rng: () => number = Math.random
416
449
  ): { centroids: number[][]; assignments: number[] } {
417
450
  if (vectors.length === 0) {
418
451
  throw new Error("Cannot cluster empty vector array");
@@ -428,7 +461,7 @@ function miniBatchKMeans(
428
461
  const actualBatchSize = Math.min(batchSize, n);
429
462
 
430
463
  // Initialize centroids with k-means++
431
- const centroids = kMeansPlusPlusInit(vectors, k);
464
+ const centroids = kMeansPlusPlusInit(vectors, k, rng);
432
465
 
433
466
  // Track per-centroid sample counts for weighted updates
434
467
  const centroidCounts = new Array(k).fill(0);
@@ -443,7 +476,7 @@ function miniBatchKMeans(
443
476
  const batchSet = new Set<number>();
444
477
 
445
478
  while (batchIndices.length < actualBatchSize) {
446
- const idx = Math.floor(Math.random() * n);
479
+ const idx = Math.floor(rng() * n);
447
480
  if (!batchSet.has(idx)) {
448
481
  batchSet.add(idx);
449
482
  batchIndices.push(idx);
@@ -572,10 +605,12 @@ export const ClusteringServiceImpl = {
572
605
  Effect.try({
573
606
  try: () => {
574
607
  const vectors = embeddings.map((e) => e.vector);
608
+ const rng = getRng(options.seed);
575
609
  const { centroids, assignments } = kMeans(
576
610
  vectors,
577
611
  options.k,
578
- options.maxIterations
612
+ options.maxIterations,
613
+ rng
579
614
  );
580
615
 
581
616
  // Build cluster metadata
@@ -718,12 +753,14 @@ export const ClusteringServiceImpl = {
718
753
  try: () => {
719
754
  const vectors = embeddings.map((e) => e.vector);
720
755
  const { batchSize = 100, maxIterations = 100 } = options;
756
+ const rng = getRng(options.seed);
721
757
 
722
758
  const { centroids, assignments } = miniBatchKMeans(
723
759
  vectors,
724
760
  options.k,
725
761
  batchSize,
726
- maxIterations
762
+ maxIterations,
763
+ rng
727
764
  );
728
765
 
729
766
  // Build cluster metadata
@@ -9,6 +9,7 @@ import { Context, Effect } from "effect";
9
9
  import type {
10
10
  DatabaseError,
11
11
  Document,
12
+ PDFChunk,
12
13
  SearchOptions,
13
14
  SearchResult,
14
15
  } from "../types.js";
@@ -47,10 +48,31 @@ export class Database extends Context.Tag("Database")<
47
48
  content: string;
48
49
  }>
49
50
  ) => Effect.Effect<void, DatabaseError>;
51
+ readonly getChunk: (
52
+ chunkId: string
53
+ ) => Effect.Effect<PDFChunk | null, DatabaseError>;
54
+ readonly listChunksByDocument: (
55
+ docId: string,
56
+ opts?: { page?: number }
57
+ ) => Effect.Effect<PDFChunk[], DatabaseError>;
50
58
  readonly addEmbeddings: (
51
59
  embeddings: Array<{ chunkId: string; embedding: number[] }>
52
60
  ) => Effect.Effect<void, DatabaseError>;
53
61
 
62
+ // Atomic rebuild/replace (non-destructive): replace a document's chunks+embeddings
63
+ // in a single transaction so agents can safely rerun chunking algorithms.
64
+ readonly replaceDocument: (
65
+ doc: Document,
66
+ chunks: Array<{
67
+ id: string;
68
+ docId: string;
69
+ page: number;
70
+ chunkIndex: number;
71
+ content: string;
72
+ }>,
73
+ embeddings: Array<{ chunkId: string; embedding: number[] }>,
74
+ ) => Effect.Effect<void, DatabaseError>;
75
+
54
76
  // Search operations
55
77
  readonly vectorSearch: (
56
78
  embedding: number[],
@@ -78,6 +100,11 @@ export class Database extends Context.Tag("Database")<
78
100
  DatabaseError
79
101
  >;
80
102
 
103
+ // Cheap aggregation helpers (avoid loading full chunk content into memory)
104
+ readonly countChunksByDocumentIds: (
105
+ docIds: string[]
106
+ ) => Effect.Effect<Record<string, number>, DatabaseError>;
107
+
81
108
  // Maintenance
82
109
  readonly repair: () => Effect.Effect<
83
110
  {
@@ -26,6 +26,48 @@ export class EmbeddingProvider extends Context.Tag("EmbeddingProvider")<
26
26
  }
27
27
  >() {}
28
28
 
29
+ /**
30
+ * Agent workflows tend to call `search` repeatedly with the same query within a
31
+ * single session (especially via MCP). Cache query embeddings in-process to
32
+ * avoid repeated embed calls.
33
+ *
34
+ * Notes:
35
+ * - This only wraps `embed()` (single text) and intentionally does NOT cache
36
+ * `embedBatch()` (chunk embeddings would explode memory).
37
+ * - Cache is per-process (MCP session), not persisted.
38
+ */
39
const DEFAULT_QUERY_EMBED_CACHE_SIZE = 256;

/**
 * Reads the query-embedding cache capacity from the
 * `PDF_BRAIN_QUERY_EMBED_CACHE_SIZE` environment variable.
 *
 * Only a plain non-negative decimal integer (surrounding whitespace and an
 * optional leading `+` allowed) is accepted. Anything else — unset, empty,
 * negative, fractional, or partially numeric such as `"1e3"`, `"0x40"` or
 * `"10abc"` — falls back to the default rather than being truncated to a
 * surprising value.
 *
 * @returns The configured capacity (0 is valid and returned unchanged), or
 *   `DEFAULT_QUERY_EMBED_CACHE_SIZE` when the variable is missing or invalid.
 */
const readQueryEmbedCacheSize = (): number => {
  const raw = process.env.PDF_BRAIN_QUERY_EMBED_CACHE_SIZE;
  if (raw === undefined) return DEFAULT_QUERY_EMBED_CACHE_SIZE;

  const text = raw.trim();
  // Reject anything that is not purely digits; parseInt would silently accept
  // a numeric prefix and drop the rest.
  if (!/^\+?\d+$/.test(text)) return DEFAULT_QUERY_EMBED_CACHE_SIZE;

  const size = Number(text);
  // An absurdly long digit string overflows to Infinity.
  return Number.isFinite(size) ? size : DEFAULT_QUERY_EMBED_CACHE_SIZE;
};
48
+
49
/**
 * Creates a small string-keyed least-recently-used cache.
 *
 * Recency is tracked through `Map` insertion order: reading or writing a key
 * moves it to the end, and the entry at the front is evicted once the cache
 * holds more than `maxSize` entries.
 *
 * @param maxSize - Maximum number of entries to keep; `set` is a no-op when
 *   this is 0 or negative.
 * @returns An object with `get` (returns `undefined` on a miss) and `set`.
 */
const makeLruCache = <V>(maxSize: number) => {
  const map = new Map<string, V>();
  return {
    get(key: string): V | undefined {
      const value = map.get(key);
      if (value === undefined) return undefined;
      // Refresh recency by re-inserting at the end of the iteration order.
      map.delete(key);
      map.set(key, value);
      return value;
    },
    set(key: string, value: V): void {
      if (maxSize <= 0) return;
      // Delete first so an existing key moves to the most-recent position.
      map.delete(key);
      map.set(key, value);
      if (map.size <= maxSize) return;
      // Check the iterator's `done` flag rather than the key's truthiness,
      // so an empty-string key can still be evicted.
      const oldest = map.keys().next();
      if (!oldest.done) map.delete(oldest.value);
    },
  };
};
70
+
29
71
  // ============================================================================
30
72
  // Implementation
31
73
  // ============================================================================
@@ -38,12 +80,31 @@ export const EmbeddingProviderLive = Layer.effect(
38
80
  Effect.gen(function* () {
39
81
  const config = loadConfig();
40
82
  const provider = config.embedding.provider;
83
+ const model = config.embedding.model;
84
+ const queryCacheSize = readQueryEmbedCacheSize();
85
+ const queryEmbedCache = makeLruCache<number[]>(queryCacheSize);
86
+
87
+ const wrapQueryCache = <E>(
88
+ embed: (text: string) => Effect.Effect<number[], E>,
89
+ label: string,
90
+ ) => {
91
+ if (queryCacheSize <= 0) return embed;
92
+ return (text: string) =>
93
+ Effect.gen(function* () {
94
+ const key = `${label}:${model}:${text}`;
95
+ const cached = queryEmbedCache.get(key);
96
+ if (cached) return cached;
97
+ const embedding = yield* embed(text);
98
+ queryEmbedCache.set(key, embedding);
99
+ return embedding;
100
+ });
101
+ };
41
102
 
42
103
  if (provider === "gateway") {
43
104
  // Use Gateway
44
105
  const gateway = yield* Gateway;
45
106
  return {
46
- embed: gateway.embed,
107
+ embed: wrapQueryCache(gateway.embed, "gateway"),
47
108
  embedBatch: gateway.embedBatch,
48
109
  checkHealth: gateway.checkHealth,
49
110
  provider: "gateway" as const,
@@ -52,7 +113,7 @@ export const EmbeddingProviderLive = Layer.effect(
52
113
  // Default to Ollama
53
114
  const ollama = yield* Ollama;
54
115
  return {
55
- embed: ollama.embed,
116
+ embed: wrapQueryCache(ollama.embed, "ollama"),
56
117
  embedBatch: ollama.embedBatch,
57
118
  checkHealth: ollama.checkHealth,
58
119
  provider: "ollama" as const,
@@ -62,9 +123,18 @@ export const EmbeddingProviderLive = Layer.effect(
62
123
  );
63
124
 
64
125
  /**
65
- * Full layer with dependencies - use this in app composition
126
+ * Full layer with dependencies - use this in app composition.
127
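+ * Substitutes a failing Gateway stub when the configured provider is not "gateway", so GatewayLive (which fails without an API key) is only built when it is needed.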
66
128
  */
67
- export const EmbeddingProviderFullLive = Layer.provide(
68
- EmbeddingProviderLive,
69
- Layer.merge(OllamaLive, GatewayLive),
70
- );
129
+ export const EmbeddingProviderFullLive = (() => {
130
+ const config = loadConfig();
131
+ const deps =
132
+ config.embedding.provider === "gateway"
133
+ ? Layer.merge(OllamaLive, GatewayLive)
134
+ : Layer.merge(OllamaLive, Layer.succeed(Gateway, {
135
+ embed: () => Effect.fail(new GatewayError({ reason: "Gateway not configured" })),
136
+ embedBatch: () => Effect.fail(new GatewayError({ reason: "Gateway not configured" })),
137
+ checkHealth: () => Effect.fail(new GatewayError({ reason: "Gateway not configured" })),
138
+ }));
139
+ return Layer.provide(EmbeddingProviderLive, deps);
140
+ })();
@@ -13,6 +13,7 @@ import {
13
13
  } from "effect";
14
14
  import { embed, embedMany } from "ai";
15
15
  import { GatewayError, loadConfig } from "../types.js";
16
+ import { logDebug } from "../logger.js";
16
17
 
17
18
  // ============================================================================
18
19
  // Service Definition
@@ -73,8 +74,8 @@ function validateEmbedding(
73
74
  // First embedding sets the expected dimension
74
75
  if (detectedEmbeddingDimension === null) {
75
76
  detectedEmbeddingDimension = embedding.length;
76
- console.log(
77
- `[Gateway] Detected embedding dimension: ${detectedEmbeddingDimension}`,
77
+ logDebug(
78
+ `Gateway embedding dimension detected: ${detectedEmbeddingDimension}`,
78
79
  );
79
80
  } else if (embedding.length !== detectedEmbeddingDimension) {
80
81
  // Subsequent embeddings must match
@@ -103,11 +104,11 @@ export const GatewayLive = Layer.effect(
103
104
  const config = loadConfig();
104
105
  const model = config.embedding.model; // e.g., "openai/text-embedding-3-small"
105
106
 
106
- // Check API key at initialization time
107
- const apiKey = process.env.AI_GATEWAY_API_KEY;
107
+ // Check API key at initialization time (config > env var)
108
+ const apiKey = config.gatewayApiKey;
108
109
  if (!apiKey) {
109
110
  return yield* Effect.fail(
110
- new GatewayError({ reason: "AI_GATEWAY_API_KEY not set" }),
111
+ new GatewayError({ reason: "Gateway API key not set. Use: pdf-brain config set gateway.apiKey <key>" }),
111
112
  );
112
113
  }
113
114
 
@@ -149,9 +150,9 @@ export const GatewayLive = Layer.effect(
149
150
 
150
151
  checkHealth: () =>
151
152
  Effect.gen(function* () {
152
- if (!process.env.AI_GATEWAY_API_KEY) {
153
+ if (!config.gatewayApiKey) {
153
154
  return yield* Effect.fail(
154
- new GatewayError({ reason: "AI_GATEWAY_API_KEY not set" }),
155
+ new GatewayError({ reason: "Gateway API key not set. Use: pdf-brain config set gateway.apiKey <key>" }),
155
156
  );
156
157
  }
157
158
  // Do a test embedding to verify connectivity and model access
@@ -295,6 +295,145 @@ describe("LibSQLDatabase", () => {
295
295
  expect(stats.chunks).toBe(1);
296
296
  expect(stats.embeddings).toBe(0);
297
297
  });
298
+
299
+ test("countChunksByDocumentIds returns per-doc chunk counts (including 0s)", async () => {
300
+ const program = Effect.gen(function* () {
301
+ const db = yield* Database;
302
+
303
+ // Add documents
304
+ yield* db.addDocument(
305
+ new Document({
306
+ id: "doc-a",
307
+ title: "Doc A",
308
+ path: "/path/a.pdf",
309
+ addedAt: new Date(),
310
+ pageCount: 1,
311
+ sizeBytes: 100,
312
+ tags: [],
313
+ }),
314
+ );
315
+ yield* db.addDocument(
316
+ new Document({
317
+ id: "doc-b",
318
+ title: "Doc B",
319
+ path: "/path/b.pdf",
320
+ addedAt: new Date(),
321
+ pageCount: 1,
322
+ sizeBytes: 100,
323
+ tags: [],
324
+ }),
325
+ );
326
+
327
+ // Add chunks for each
328
+ yield* db.addChunks([
329
+ {
330
+ id: "chunk-a-1",
331
+ docId: "doc-a",
332
+ page: 1,
333
+ chunkIndex: 0,
334
+ content: "A1",
335
+ },
336
+ {
337
+ id: "chunk-a-2",
338
+ docId: "doc-a",
339
+ page: 1,
340
+ chunkIndex: 1,
341
+ content: "A2",
342
+ },
343
+ {
344
+ id: "chunk-b-1",
345
+ docId: "doc-b",
346
+ page: 1,
347
+ chunkIndex: 0,
348
+ content: "B1",
349
+ },
350
+ ]);
351
+
352
+ const counts = yield* db.countChunksByDocumentIds([
353
+ "doc-a",
354
+ "doc-b",
355
+ "doc-missing",
356
+ ]);
357
+ return counts;
358
+ });
359
+
360
+ const layer = LibSQLDatabase.make({ url: ":memory:" });
361
+ const counts = await Effect.runPromise(Effect.provide(program, layer));
362
+
363
+ expect(counts["doc-a"]).toBe(2);
364
+ expect(counts["doc-b"]).toBe(1);
365
+ expect(counts["doc-missing"]).toBe(0);
366
+ });
367
+
368
+ test("replaceDocument atomically replaces chunks+embeddings for an existing doc", async () => {
369
+ const url = "file::memory:?cache=shared";
370
+ const layer = LibSQLDatabase.make({ url });
371
+
372
+ const program = Effect.gen(function* () {
373
+ const db = yield* Database;
374
+
375
+ const doc = new Document({
376
+ id: "doc-replace",
377
+ title: "Replace Me",
378
+ path: "/path/replace.pdf",
379
+ addedAt: new Date("2025-01-01T00:00:00Z"),
380
+ pageCount: 1,
381
+ sizeBytes: 100,
382
+ tags: [],
383
+ metadata: {},
384
+ });
385
+
386
+ // Seed initial doc/chunks/embeddings
387
+ yield* db.addDocument(doc);
388
+ yield* db.addChunks([
389
+ { id: "doc-replace-0", docId: "doc-replace", page: 1, chunkIndex: 0, content: "old-0" },
390
+ { id: "doc-replace-1", docId: "doc-replace", page: 1, chunkIndex: 1, content: "old-1" },
391
+ ]);
392
+
393
+ const mkEmbedding = (seed: number) =>
394
+ Array.from({ length: 1024 }, (_, i) => seed + i * 0.00001);
395
+
396
+ yield* db.addEmbeddings([
397
+ { chunkId: "doc-replace-0", embedding: mkEmbedding(0.1) },
398
+ { chunkId: "doc-replace-1", embedding: mkEmbedding(0.2) },
399
+ ]);
400
+
401
+ // Now atomically replace with 3 chunks + 3 embeddings
402
+ const updatedDoc = new Document({
403
+ ...doc,
404
+ pageCount: 2,
405
+ sizeBytes: 200,
406
+ metadata: { chunker: { id: "test", version: 1, unit: "chars", chunkSize: 1, chunkOverlap: 0 } },
407
+ });
408
+
409
+ yield* db.replaceDocument(
410
+ updatedDoc,
411
+ [
412
+ { id: "doc-replace-0", docId: "doc-replace", page: 1, chunkIndex: 0, content: "new-0" },
413
+ { id: "doc-replace-1", docId: "doc-replace", page: 1, chunkIndex: 1, content: "new-1" },
414
+ { id: "doc-replace-2", docId: "doc-replace", page: 2, chunkIndex: 0, content: "new-2" },
415
+ ],
416
+ [
417
+ { chunkId: "doc-replace-0", embedding: mkEmbedding(1.1) },
418
+ { chunkId: "doc-replace-1", embedding: mkEmbedding(1.2) },
419
+ { chunkId: "doc-replace-2", embedding: mkEmbedding(1.3) },
420
+ ],
421
+ );
422
+
423
+ const chunks = yield* db.listChunksByDocument("doc-replace");
424
+ const stats = yield* db.getStats();
425
+
426
+ return { chunks, stats };
427
+ });
428
+
429
+ const result = await Effect.runPromise(Effect.provide(program, layer));
430
+
431
+ expect(result.stats.documents).toBe(1);
432
+ expect(result.stats.chunks).toBe(3);
433
+ expect(result.stats.embeddings).toBe(3);
434
+
435
+ expect(result.chunks.map((c) => c.content)).toEqual(["new-0", "new-1", "new-2"]);
436
+ });
298
437
  });
299
438
 
300
439
  describe("taxonomy schema (SKOS)", () => {