pdf-brain 1.3.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +22 -2
- package/package.json +2 -1
- package/scripts/install.sh +1 -1
- package/src/agent/hints.ts +426 -3
- package/src/agent/manifest.ts +24 -4
- package/src/agent/protocol.ts +52 -0
- package/src/chunking.ts +130 -0
- package/src/cli.contract.test.ts +239 -0
- package/src/cli.ts +2573 -840
- package/src/index.ts +259 -6
- package/src/logger.ts +53 -0
- package/src/services/AutoTagger.ts +26 -38
- package/src/services/ClusterSummarizer.ts +3 -3
- package/src/services/Clustering.test.ts +20 -5
- package/src/services/Clustering.ts +48 -11
- package/src/services/Database.ts +27 -0
- package/src/services/EmbeddingProvider.ts +77 -7
- package/src/services/Gateway.ts +8 -7
- package/src/services/LibSQLDatabase.test.ts +139 -0
- package/src/services/LibSQLDatabase.ts +228 -15
- package/src/services/Migration.ts +1 -1
- package/src/services/Ollama.ts +22 -7
- package/src/services/PDFExtractor.test.ts +40 -1
- package/src/services/PDFExtractor.ts +37 -6
- package/src/types.test.ts +22 -0
- package/src/types.ts +82 -2
- package/src/updater.ts +8 -3
|
@@ -55,6 +55,11 @@ export interface ClusterOptions {
|
|
|
55
55
|
k: number;
|
|
56
56
|
/** Maximum iterations for k-means (default: 100) */
|
|
57
57
|
maxIterations?: number;
|
|
58
|
+
/**
|
|
59
|
+
* Optional deterministic seed for centroid initialization.
|
|
60
|
+
* Useful for reproducible runs and non-flaky tests.
|
|
61
|
+
*/
|
|
62
|
+
seed?: number;
|
|
58
63
|
}
|
|
59
64
|
|
|
60
65
|
/**
|
|
@@ -67,6 +72,11 @@ export interface MiniBatchClusterOptions {
|
|
|
67
72
|
batchSize?: number;
|
|
68
73
|
/** Maximum iterations (default: 100) */
|
|
69
74
|
maxIterations?: number;
|
|
75
|
+
/**
|
|
76
|
+
* Optional deterministic seed for centroid initialization + batch sampling.
|
|
77
|
+
* Useful for reproducible runs and non-flaky tests.
|
|
78
|
+
*/
|
|
79
|
+
seed?: number;
|
|
70
80
|
}
|
|
71
81
|
|
|
72
82
|
// ============================================================================
|
|
@@ -227,13 +237,31 @@ function softmax(distances: number[], temperature = 1.0): number[] {
|
|
|
227
237
|
// K-Means Algorithm
|
|
228
238
|
// ============================================================================
|
|
229
239
|
|
|
240
|
+
/**
 * Create a deterministic pseudo-random generator (mulberry32).
 *
 * Integer seeds are reduced modulo 2^32 and used directly as the initial
 * state. Non-integer finite seeds are hashed from their IEEE-754 bit pattern
 * so that fractional seeds (for example 0.25 and 0.75) produce distinct
 * streams instead of all truncating to state 0.
 *
 * @param seed - Any number; the same seed always yields the same sequence.
 * @returns A function returning the next value in [0, 1) on every call.
 */
function makeSeededRng(seed: number): () => number {
  let state: number;
  if (Number.isInteger(seed) || !Number.isFinite(seed)) {
    // Integers wrap modulo 2^32; NaN / ±Infinity coerce to 0 under `>>> 0`.
    state = seed >>> 0;
  } else {
    // Fold both 32-bit halves of the float64 representation into one word.
    // DataView defaults to big-endian, so the result is platform-independent.
    const view = new DataView(new ArrayBuffer(8));
    view.setFloat64(0, seed);
    const high = view.getUint32(0);
    const low = view.getUint32(4);
    state = (low ^ Math.imul(high, 0x9e3779b1)) >>> 0;
  }

  return () => {
    // mulberry32: advance the 32-bit state, then scramble it into the output.
    state = (state + 0x6d2b79f5) >>> 0;
    let t = Math.imul(state ^ (state >>> 15), 1 | state);
    t = (t + Math.imul(t ^ (t >>> 7), 61 | t)) ^ t;
    return ((t ^ (t >>> 14)) >>> 0) / 4294967296;
  };
}
|
|
250
|
+
|
|
251
|
+
/**
 * Pick the random source for a clustering run.
 *
 * @param seed - Optional seed. Only finite numbers enable deterministic mode.
 * @returns A seeded generator when `seed` is a finite number, otherwise
 *   `Math.random` itself.
 */
function getRng(seed?: number): () => number {
  // Missing, NaN and infinite seeds all fall back to the non-deterministic source.
  if (typeof seed !== "number" || !Number.isFinite(seed)) {
    return Math.random;
  }
  return makeSeededRng(seed);
}
|
|
256
|
+
|
|
230
257
|
/**
|
|
231
258
|
* K-means clustering algorithm
|
|
232
259
|
*/
|
|
233
260
|
function kMeans(
|
|
234
261
|
vectors: number[][],
|
|
235
262
|
k: number,
|
|
236
|
-
maxIterations = 100
|
|
263
|
+
maxIterations = 100,
|
|
264
|
+
rng: () => number = Math.random
|
|
237
265
|
): { centroids: number[][]; assignments: number[] } {
|
|
238
266
|
if (vectors.length === 0) {
|
|
239
267
|
throw new Error("Cannot cluster empty vector array");
|
|
@@ -246,7 +274,7 @@ function kMeans(
|
|
|
246
274
|
}
|
|
247
275
|
|
|
248
276
|
// Initialize centroids with k-means++ for better convergence
|
|
249
|
-
const centroids = kMeansPlusPlusInit(vectors, k);
|
|
277
|
+
const centroids = kMeansPlusPlusInit(vectors, k, rng);
|
|
250
278
|
let assignments = new Array(vectors.length).fill(0);
|
|
251
279
|
|
|
252
280
|
for (let iter = 0; iter < maxIterations; iter++) {
|
|
@@ -283,11 +311,15 @@ function kMeans(
|
|
|
283
311
|
/**
|
|
284
312
|
* K-means++ initialization for better centroid selection
|
|
285
313
|
*/
|
|
286
|
-
function kMeansPlusPlusInit(
|
|
314
|
+
function kMeansPlusPlusInit(
|
|
315
|
+
vectors: number[][],
|
|
316
|
+
k: number,
|
|
317
|
+
rng: () => number = Math.random
|
|
318
|
+
): number[][] {
|
|
287
319
|
const centroids: number[][] = [];
|
|
288
320
|
|
|
289
321
|
// First centroid: random
|
|
290
|
-
const firstIdx = Math.floor(
|
|
322
|
+
const firstIdx = Math.floor(rng() * vectors.length);
|
|
291
323
|
centroids.push([...vectors[firstIdx]]);
|
|
292
324
|
|
|
293
325
|
// Remaining centroids: weighted by distance squared
|
|
@@ -300,7 +332,7 @@ function kMeansPlusPlusInit(vectors: number[][], k: number): number[][] {
|
|
|
300
332
|
});
|
|
301
333
|
|
|
302
334
|
const totalDist = distances.reduce((a, b) => a + b, 0);
|
|
303
|
-
let threshold =
|
|
335
|
+
let threshold = rng() * totalDist;
|
|
304
336
|
|
|
305
337
|
for (let j = 0; j < vectors.length; j++) {
|
|
306
338
|
threshold -= distances[j];
|
|
@@ -312,7 +344,7 @@ function kMeansPlusPlusInit(vectors: number[][], k: number): number[][] {
|
|
|
312
344
|
|
|
313
345
|
// Fallback if we didn't select (shouldn't happen)
|
|
314
346
|
if (centroids.length === i) {
|
|
315
|
-
centroids.push([...vectors[Math.floor(
|
|
347
|
+
centroids.push([...vectors[Math.floor(rng() * vectors.length)]]);
|
|
316
348
|
}
|
|
317
349
|
}
|
|
318
350
|
|
|
@@ -412,7 +444,8 @@ function miniBatchKMeans(
|
|
|
412
444
|
vectors: number[][],
|
|
413
445
|
k: number,
|
|
414
446
|
batchSize = 100,
|
|
415
|
-
maxIterations = 100
|
|
447
|
+
maxIterations = 100,
|
|
448
|
+
rng: () => number = Math.random
|
|
416
449
|
): { centroids: number[][]; assignments: number[] } {
|
|
417
450
|
if (vectors.length === 0) {
|
|
418
451
|
throw new Error("Cannot cluster empty vector array");
|
|
@@ -428,7 +461,7 @@ function miniBatchKMeans(
|
|
|
428
461
|
const actualBatchSize = Math.min(batchSize, n);
|
|
429
462
|
|
|
430
463
|
// Initialize centroids with k-means++
|
|
431
|
-
const centroids = kMeansPlusPlusInit(vectors, k);
|
|
464
|
+
const centroids = kMeansPlusPlusInit(vectors, k, rng);
|
|
432
465
|
|
|
433
466
|
// Track per-centroid sample counts for weighted updates
|
|
434
467
|
const centroidCounts = new Array(k).fill(0);
|
|
@@ -443,7 +476,7 @@ function miniBatchKMeans(
|
|
|
443
476
|
const batchSet = new Set<number>();
|
|
444
477
|
|
|
445
478
|
while (batchIndices.length < actualBatchSize) {
|
|
446
|
-
const idx = Math.floor(
|
|
479
|
+
const idx = Math.floor(rng() * n);
|
|
447
480
|
if (!batchSet.has(idx)) {
|
|
448
481
|
batchSet.add(idx);
|
|
449
482
|
batchIndices.push(idx);
|
|
@@ -572,10 +605,12 @@ export const ClusteringServiceImpl = {
|
|
|
572
605
|
Effect.try({
|
|
573
606
|
try: () => {
|
|
574
607
|
const vectors = embeddings.map((e) => e.vector);
|
|
608
|
+
const rng = getRng(options.seed);
|
|
575
609
|
const { centroids, assignments } = kMeans(
|
|
576
610
|
vectors,
|
|
577
611
|
options.k,
|
|
578
|
-
options.maxIterations
|
|
612
|
+
options.maxIterations,
|
|
613
|
+
rng
|
|
579
614
|
);
|
|
580
615
|
|
|
581
616
|
// Build cluster metadata
|
|
@@ -718,12 +753,14 @@ export const ClusteringServiceImpl = {
|
|
|
718
753
|
try: () => {
|
|
719
754
|
const vectors = embeddings.map((e) => e.vector);
|
|
720
755
|
const { batchSize = 100, maxIterations = 100 } = options;
|
|
756
|
+
const rng = getRng(options.seed);
|
|
721
757
|
|
|
722
758
|
const { centroids, assignments } = miniBatchKMeans(
|
|
723
759
|
vectors,
|
|
724
760
|
options.k,
|
|
725
761
|
batchSize,
|
|
726
|
-
maxIterations
|
|
762
|
+
maxIterations,
|
|
763
|
+
rng
|
|
727
764
|
);
|
|
728
765
|
|
|
729
766
|
// Build cluster metadata
|
package/src/services/Database.ts
CHANGED
|
@@ -9,6 +9,7 @@ import { Context, Effect } from "effect";
|
|
|
9
9
|
import type {
|
|
10
10
|
DatabaseError,
|
|
11
11
|
Document,
|
|
12
|
+
PDFChunk,
|
|
12
13
|
SearchOptions,
|
|
13
14
|
SearchResult,
|
|
14
15
|
} from "../types.js";
|
|
@@ -47,10 +48,31 @@ export class Database extends Context.Tag("Database")<
|
|
|
47
48
|
content: string;
|
|
48
49
|
}>
|
|
49
50
|
) => Effect.Effect<void, DatabaseError>;
|
|
51
|
+
readonly getChunk: (
|
|
52
|
+
chunkId: string
|
|
53
|
+
) => Effect.Effect<PDFChunk | null, DatabaseError>;
|
|
54
|
+
readonly listChunksByDocument: (
|
|
55
|
+
docId: string,
|
|
56
|
+
opts?: { page?: number }
|
|
57
|
+
) => Effect.Effect<PDFChunk[], DatabaseError>;
|
|
50
58
|
readonly addEmbeddings: (
|
|
51
59
|
embeddings: Array<{ chunkId: string; embedding: number[] }>
|
|
52
60
|
) => Effect.Effect<void, DatabaseError>;
|
|
53
61
|
|
|
62
|
+
// Atomic rebuild/replace (non-destructive): replace a document's chunks+embeddings
|
|
63
|
+
// in a single transaction so agents can safely rerun chunking algorithms.
|
|
64
|
+
readonly replaceDocument: (
|
|
65
|
+
doc: Document,
|
|
66
|
+
chunks: Array<{
|
|
67
|
+
id: string;
|
|
68
|
+
docId: string;
|
|
69
|
+
page: number;
|
|
70
|
+
chunkIndex: number;
|
|
71
|
+
content: string;
|
|
72
|
+
}>,
|
|
73
|
+
embeddings: Array<{ chunkId: string; embedding: number[] }>,
|
|
74
|
+
) => Effect.Effect<void, DatabaseError>;
|
|
75
|
+
|
|
54
76
|
// Search operations
|
|
55
77
|
readonly vectorSearch: (
|
|
56
78
|
embedding: number[],
|
|
@@ -78,6 +100,11 @@ export class Database extends Context.Tag("Database")<
|
|
|
78
100
|
DatabaseError
|
|
79
101
|
>;
|
|
80
102
|
|
|
103
|
+
// Cheap aggregation helpers (avoid loading full chunk content into memory)
|
|
104
|
+
readonly countChunksByDocumentIds: (
|
|
105
|
+
docIds: string[]
|
|
106
|
+
) => Effect.Effect<Record<string, number>, DatabaseError>;
|
|
107
|
+
|
|
81
108
|
// Maintenance
|
|
82
109
|
readonly repair: () => Effect.Effect<
|
|
83
110
|
{
|
|
@@ -26,6 +26,48 @@ export class EmbeddingProvider extends Context.Tag("EmbeddingProvider")<
|
|
|
26
26
|
}
|
|
27
27
|
>() {}
|
|
28
28
|
|
|
29
|
+
/**
|
|
30
|
+
* Agent workflows tend to call `search` repeatedly with the same query within a
|
|
31
|
+
* single session (especially via MCP). Cache query embeddings in-process to
|
|
32
|
+
* avoid repeated embed calls.
|
|
33
|
+
*
|
|
34
|
+
* Notes:
|
|
35
|
+
* - This only wraps `embed()` (single text) and intentionally does NOT cache
|
|
36
|
+
* `embedBatch()` (chunk embeddings would explode memory).
|
|
37
|
+
* - Cache is per-process (MCP session), not persisted.
|
|
38
|
+
*/
|
|
39
|
+
/** Number of query embeddings kept when no valid override is configured. */
const DEFAULT_QUERY_EMBED_CACHE_SIZE = 256;

/**
 * Read the query-embedding cache capacity from
 * `PDF_BRAIN_QUERY_EMBED_CACHE_SIZE`.
 *
 * The value must be a plain non-negative decimal integer (surrounding
 * whitespace and a leading `+` are tolerated). Anything else, such as an
 * unset variable, an empty string, a negative number, or text like "1e3",
 * "64kb" or "3.7", yields the default rather than a silently truncated number.
 *
 * @returns The configured capacity (0 disables caching) or the default.
 */
const readQueryEmbedCacheSize = (): number => {
  const raw = process.env.PDF_BRAIN_QUERY_EMBED_CACHE_SIZE;
  if (raw === undefined) return DEFAULT_QUERY_EMBED_CACHE_SIZE;

  const text = raw.trim();
  // parseInt would accept a numeric prefix followed by garbage; require digits only.
  if (!/^\+?\d+$/.test(text)) return DEFAULT_QUERY_EMBED_CACHE_SIZE;

  const size = Number(text);
  // Reject digit strings too large to be represented exactly.
  return Number.isSafeInteger(size) ? size : DEFAULT_QUERY_EMBED_CACHE_SIZE;
};
|
|
48
|
+
|
|
49
|
+
/**
 * Minimal string-keyed LRU cache backed by a `Map`.
 *
 * A `Map` iterates in insertion order, so re-inserting an entry on every hit
 * keeps the least recently used key at the front, where it is evicted first.
 *
 * @param maxSize - Maximum number of entries. A value that is not greater than
 *   zero (including NaN) disables storage entirely.
 * @returns An object with `get` (refreshes recency on a hit) and `set`
 *   (inserts or updates, then evicts the oldest entries beyond `maxSize`).
 */
const makeLruCache = <V>(maxSize: number) => {
  const map = new Map<string, V>();
  return {
    get(key: string): V | undefined {
      // Use `has` so a legitimately stored `undefined` still counts as a hit.
      if (!map.has(key)) return undefined;
      const value = map.get(key) as V;
      // Refresh recency: move the entry to the end of the iteration order.
      map.delete(key);
      map.set(key, value);
      return value;
    },
    set(key: string, value: V): void {
      if (!(maxSize > 0)) return;
      map.delete(key);
      map.set(key, value);
      while (map.size > maxSize) {
        const oldest = map.keys().next();
        if (oldest.done) break;
        // Check `done`, not truthiness: the empty string is a valid key.
        map.delete(oldest.value);
      }
    },
  };
};
|
|
70
|
+
|
|
29
71
|
// ============================================================================
|
|
30
72
|
// Implementation
|
|
31
73
|
// ============================================================================
|
|
@@ -38,12 +80,31 @@ export const EmbeddingProviderLive = Layer.effect(
|
|
|
38
80
|
Effect.gen(function* () {
|
|
39
81
|
const config = loadConfig();
|
|
40
82
|
const provider = config.embedding.provider;
|
|
83
|
+
const model = config.embedding.model;
|
|
84
|
+
const queryCacheSize = readQueryEmbedCacheSize();
|
|
85
|
+
const queryEmbedCache = makeLruCache<number[]>(queryCacheSize);
|
|
86
|
+
|
|
87
|
+
// Decorate a provider's single-text `embed` with the in-process query cache.
// Entries are keyed by provider label, model name and the exact query text.
const wrapQueryCache = <E>(
  embed: (text: string) => Effect.Effect<number[], E>,
  label: string,
) => {
  // A non-positive cache size turns caching off: hand back the provider's embed untouched.
  if (queryCacheSize <= 0) {
    return embed;
  }

  const keyPrefix = `${label}:${model}:`;
  const cachedEmbed = (text: string) =>
    Effect.gen(function* () {
      const cacheKey = keyPrefix + text;
      const hit = queryEmbedCache.get(cacheKey);
      if (hit) {
        return hit;
      }
      // Miss: ask the provider, then remember the vector for later calls.
      const fresh = yield* embed(text);
      queryEmbedCache.set(cacheKey, fresh);
      return fresh;
    });
  return cachedEmbed;
};
|
|
41
102
|
|
|
42
103
|
if (provider === "gateway") {
|
|
43
104
|
// Use Gateway
|
|
44
105
|
const gateway = yield* Gateway;
|
|
45
106
|
return {
|
|
46
|
-
embed: gateway.embed,
|
|
107
|
+
embed: wrapQueryCache(gateway.embed, "gateway"),
|
|
47
108
|
embedBatch: gateway.embedBatch,
|
|
48
109
|
checkHealth: gateway.checkHealth,
|
|
49
110
|
provider: "gateway" as const,
|
|
@@ -52,7 +113,7 @@ export const EmbeddingProviderLive = Layer.effect(
|
|
|
52
113
|
// Default to Ollama
|
|
53
114
|
const ollama = yield* Ollama;
|
|
54
115
|
return {
|
|
55
|
-
embed: ollama.embed,
|
|
116
|
+
embed: wrapQueryCache(ollama.embed, "ollama"),
|
|
56
117
|
embedBatch: ollama.embedBatch,
|
|
57
118
|
checkHealth: ollama.checkHealth,
|
|
58
119
|
provider: "ollama" as const,
|
|
@@ -62,9 +123,18 @@ export const EmbeddingProviderLive = Layer.effect(
|
|
|
62
123
|
);
|
|
63
124
|
|
|
64
125
|
/**
|
|
65
|
-
* Full layer with dependencies - use this in app composition
|
|
126
|
+
* Full layer with dependencies - use this in app composition.
|
|
127
|
+
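* Always includes the Ollama layer; the real Gateway layer is built only when
* the gateway provider is configured. Otherwise a stub Gateway whose calls
* fail with "Gateway not configured" is supplied instead.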
|
|
66
128
|
*/
|
|
67
|
-
export const EmbeddingProviderFullLive =
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
129
|
+
export const EmbeddingProviderFullLive = (() => {
  const useGateway = loadConfig().embedding.provider === "gateway";

  // Every method of the stand-in Gateway fails the same way.
  const gatewayNotConfigured = () =>
    Effect.fail(new GatewayError({ reason: "Gateway not configured" }));

  // The real Gateway layer is only built when it is the configured provider;
  // otherwise a stub service satisfies the dependency.
  const deps = useGateway
    ? Layer.merge(OllamaLive, GatewayLive)
    : Layer.merge(
        OllamaLive,
        Layer.succeed(Gateway, {
          embed: gatewayNotConfigured,
          embedBatch: gatewayNotConfigured,
          checkHealth: gatewayNotConfigured,
        }),
      );

  return Layer.provide(EmbeddingProviderLive, deps);
})();
|
package/src/services/Gateway.ts
CHANGED
|
@@ -13,6 +13,7 @@ import {
|
|
|
13
13
|
} from "effect";
|
|
14
14
|
import { embed, embedMany } from "ai";
|
|
15
15
|
import { GatewayError, loadConfig } from "../types.js";
|
|
16
|
+
import { logDebug } from "../logger.js";
|
|
16
17
|
|
|
17
18
|
// ============================================================================
|
|
18
19
|
// Service Definition
|
|
@@ -73,8 +74,8 @@ function validateEmbedding(
|
|
|
73
74
|
// First embedding sets the expected dimension
|
|
74
75
|
if (detectedEmbeddingDimension === null) {
|
|
75
76
|
detectedEmbeddingDimension = embedding.length;
|
|
76
|
-
|
|
77
|
-
`
|
|
77
|
+
logDebug(
|
|
78
|
+
`Gateway embedding dimension detected: ${detectedEmbeddingDimension}`,
|
|
78
79
|
);
|
|
79
80
|
} else if (embedding.length !== detectedEmbeddingDimension) {
|
|
80
81
|
// Subsequent embeddings must match
|
|
@@ -103,11 +104,11 @@ export const GatewayLive = Layer.effect(
|
|
|
103
104
|
const config = loadConfig();
|
|
104
105
|
const model = config.embedding.model; // e.g., "openai/text-embedding-3-small"
|
|
105
106
|
|
|
106
|
-
// Check API key at initialization time
|
|
107
|
-
const apiKey =
|
|
107
|
+
// Check API key at initialization time (config > env var)
|
|
108
|
+
const apiKey = config.gatewayApiKey;
|
|
108
109
|
if (!apiKey) {
|
|
109
110
|
return yield* Effect.fail(
|
|
110
|
-
new GatewayError({ reason: "
|
|
111
|
+
new GatewayError({ reason: "Gateway API key not set. Use: pdf-brain config set gateway.apiKey <key>" }),
|
|
111
112
|
);
|
|
112
113
|
}
|
|
113
114
|
|
|
@@ -149,9 +150,9 @@ export const GatewayLive = Layer.effect(
|
|
|
149
150
|
|
|
150
151
|
checkHealth: () =>
|
|
151
152
|
Effect.gen(function* () {
|
|
152
|
-
if (!
|
|
153
|
+
if (!config.gatewayApiKey) {
|
|
153
154
|
return yield* Effect.fail(
|
|
154
|
-
new GatewayError({ reason: "
|
|
155
|
+
new GatewayError({ reason: "Gateway API key not set. Use: pdf-brain config set gateway.apiKey <key>" }),
|
|
155
156
|
);
|
|
156
157
|
}
|
|
157
158
|
// Do a test embedding to verify connectivity and model access
|
|
@@ -295,6 +295,145 @@ describe("LibSQLDatabase", () => {
|
|
|
295
295
|
expect(stats.chunks).toBe(1);
|
|
296
296
|
expect(stats.embeddings).toBe(0);
|
|
297
297
|
});
|
|
298
|
+
|
|
299
|
+
test("countChunksByDocumentIds returns per-doc chunk counts (including 0s)", async () => {
  // Fixture builders: every document and chunk shares the same boilerplate fields.
  const makeDoc = (id: string, title: string, path: string) =>
    new Document({
      id,
      title,
      path,
      addedAt: new Date(),
      pageCount: 1,
      sizeBytes: 100,
      tags: [],
    });
  const makeChunk = (
    id: string,
    docId: string,
    chunkIndex: number,
    content: string,
  ) => ({ id, docId, page: 1, chunkIndex, content });

  const countProgram = Effect.gen(function* () {
    const db = yield* Database;

    // Two documents: doc-a gets two chunks, doc-b gets one.
    yield* db.addDocument(makeDoc("doc-a", "Doc A", "/path/a.pdf"));
    yield* db.addDocument(makeDoc("doc-b", "Doc B", "/path/b.pdf"));
    yield* db.addChunks([
      makeChunk("chunk-a-1", "doc-a", 0, "A1"),
      makeChunk("chunk-a-2", "doc-a", 1, "A2"),
      makeChunk("chunk-b-1", "doc-b", 0, "B1"),
    ]);

    // Include an id that was never added; the assertions expect 0 for it.
    return yield* db.countChunksByDocumentIds(["doc-a", "doc-b", "doc-missing"]);
  });

  const layer = LibSQLDatabase.make({ url: ":memory:" });
  const counts = await Effect.runPromise(Effect.provide(countProgram, layer));

  expect(counts["doc-a"]).toBe(2);
  expect(counts["doc-b"]).toBe(1);
  expect(counts["doc-missing"]).toBe(0);
});
|
|
367
|
+
|
|
368
|
+
test("replaceDocument atomically replaces chunks+embeddings for an existing doc", async () => {
  const layer = LibSQLDatabase.make({ url: "file::memory:?cache=shared" });

  // Fixture builders for chunks and embeddings belonging to the single test document.
  const chunkOf = (id: string, page: number, chunkIndex: number, content: string) => ({
    id,
    docId: "doc-replace",
    page,
    chunkIndex,
    content,
  });
  // 1024-element vector whose values are derived from `seed`.
  const mkEmbedding = (seed: number) =>
    Array.from({ length: 1024 }, (_, i) => seed + i * 0.00001);
  const embeddingOf = (chunkId: string, seed: number) => ({
    chunkId,
    embedding: mkEmbedding(seed),
  });

  const replaceProgram = Effect.gen(function* () {
    const db = yield* Database;

    const originalDoc = new Document({
      id: "doc-replace",
      title: "Replace Me",
      path: "/path/replace.pdf",
      addedAt: new Date("2025-01-01T00:00:00Z"),
      pageCount: 1,
      sizeBytes: 100,
      tags: [],
      metadata: {},
    });

    // Initial state: one document with two chunks and two embeddings.
    yield* db.addDocument(originalDoc);
    yield* db.addChunks([
      chunkOf("doc-replace-0", 1, 0, "old-0"),
      chunkOf("doc-replace-1", 1, 1, "old-1"),
    ]);
    yield* db.addEmbeddings([
      embeddingOf("doc-replace-0", 0.1),
      embeddingOf("doc-replace-1", 0.2),
    ]);

    // Replace with three chunks + three embeddings, reusing two of the old chunk ids.
    const replacementDoc = new Document({
      ...originalDoc,
      pageCount: 2,
      sizeBytes: 200,
      metadata: { chunker: { id: "test", version: 1, unit: "chars", chunkSize: 1, chunkOverlap: 0 } },
    });
    yield* db.replaceDocument(
      replacementDoc,
      [
        chunkOf("doc-replace-0", 1, 0, "new-0"),
        chunkOf("doc-replace-1", 1, 1, "new-1"),
        chunkOf("doc-replace-2", 2, 0, "new-2"),
      ],
      [
        embeddingOf("doc-replace-0", 1.1),
        embeddingOf("doc-replace-1", 1.2),
        embeddingOf("doc-replace-2", 1.3),
      ],
    );

    const chunks = yield* db.listChunksByDocument("doc-replace");
    const stats = yield* db.getStats();
    return { chunks, stats };
  });

  const { chunks, stats } = await Effect.runPromise(Effect.provide(replaceProgram, layer));

  // Only the replacement rows should remain.
  expect(stats.documents).toBe(1);
  expect(stats.chunks).toBe(3);
  expect(stats.embeddings).toBe(3);

  expect(chunks.map((c) => c.content)).toEqual(["new-0", "new-1", "new-2"]);
});
|
|
298
437
|
});
|
|
299
438
|
|
|
300
439
|
describe("taxonomy schema (SKOS)", () => {
|