@et0and/ovid 0.0.3 → 0.0.5

This diff shows the contents of publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
package/package.json CHANGED
@@ -1,11 +1,8 @@
  {
  "name": "@et0and/ovid",
- "version": "0.0.3",
+ "version": "0.0.5",
  "description": "Browse a repository's files by semantic meaning",
  "type": "module",
- "bin": {
- "ovid": "./bin/semantic-navigator.js"
- },
  "files": [
  "bin",
  "src",
package/src/cache.ts ADDED
@@ -0,0 +1,119 @@
+ /**
+ * Two-layer result cache for semantic-navigator.
+ *
+ * Layer 1 — Embedding cache (~/.cache/semantic-navigator/embeddings.json)
+ * Content-addressed: maps sha256(text)[0:16] → number[]
+ * Per-entry granularity: only re-embed chunks whose text changed.
+ *
+ * Layer 2 — Tree cache (~/.cache/semantic-navigator/trees/<fingerprint>.json)
+ * Keyed by sha256(model + sorted(path:contentHash pairs)).
+ * A single changed file invalidates the whole tree, forcing a fresh
+ * cluster + label run, but embeddings are still reused from layer 1.
+ */
+
+ import { createHash } from "node:crypto"
+ import { mkdirSync, existsSync, readFileSync, writeFileSync } from "node:fs"
+ import { join } from "node:path"
+ import { homedir } from "node:os"
+ import type { Tree } from "./tree.ts"
+
+ // ---------------------------------------------------------------------------
+ // Paths
+ // ---------------------------------------------------------------------------
+
+ const CACHE_DIR = join(homedir(), ".cache", "semantic-navigator")
+ const EMBED_CACHE_PATH = join(CACHE_DIR, "embeddings.json")
+ const TREES_DIR = join(CACHE_DIR, "trees")
+
+ function ensureDirs(): void {
+ mkdirSync(CACHE_DIR, { recursive: true })
+ mkdirSync(TREES_DIR, { recursive: true })
+ }
+
+ // ---------------------------------------------------------------------------
+ // Helpers
+ // ---------------------------------------------------------------------------
+
+ /** Short hex digest: sha256(text).slice(0, 16) */
+ export function textHash(text: string): string {
+ return createHash("sha256").update(text).digest("hex").slice(0, 16)
+ }
+
+ /**
+ * Fingerprint for the tree cache: sha256(model + sorted path:contentHash pairs).
+ * `fileHashes` is a map from relative path to sha256(file content).
+ */
+ export function treeFingerprint(model: string, fileHashes: Map<string, string>): string {
+ const entries = Array.from(fileHashes.entries())
+ .map(([p, h]) => `${p}:${h}`)
+ .sort()
+ .join("\n")
+ return createHash("sha256").update(model + "\n" + entries).digest("hex")
+ }
+
+ // ---------------------------------------------------------------------------
+ // Layer 1: Embedding cache
+ // ---------------------------------------------------------------------------
+
+ type EmbedCacheMap = Record<string, number[]>
+
+ let _embedCache: EmbedCacheMap | null = null
+
+ function loadEmbedCache(): EmbedCacheMap {
+ if (_embedCache !== null) return _embedCache
+ if (existsSync(EMBED_CACHE_PATH)) {
+ try {
+ _embedCache = JSON.parse(readFileSync(EMBED_CACHE_PATH, "utf-8")) as EmbedCacheMap
+ } catch {
+ _embedCache = {}
+ }
+ } else {
+ _embedCache = {}
+ }
+ return _embedCache
+ }
+
+ /** Look up a cached embedding by text content. Returns null on miss. */
+ export function getCachedEmbedding(text: string): Float32Array | null {
+ const cache = loadEmbedCache()
+ const key = textHash(text)
+ const vec = cache[key]
+ if (vec === undefined) return null
+ return Float32Array.from(vec)
+ }
+
+ /** Store an embedding in the in-memory cache (call flushEmbedCache to persist). */
+ export function setCachedEmbedding(text: string, embedding: Float32Array): void {
+ const cache = loadEmbedCache()
+ const key = textHash(text)
+ cache[key] = Array.from(embedding)
+ }
+
+ /** Persist the in-memory embedding cache to disk. */
+ export function flushEmbedCache(): void {
+ if (_embedCache === null) return
+ ensureDirs()
+ writeFileSync(EMBED_CACHE_PATH, JSON.stringify(_embedCache), "utf-8")
+ }
+
+ // ---------------------------------------------------------------------------
+ // Layer 2: Tree cache
+ // ---------------------------------------------------------------------------
+
+ /** Look up a cached Tree by fingerprint. Returns null on miss. */
+ export function getCachedTree(fingerprint: string): Tree | null {
+ const treePath = join(TREES_DIR, `${fingerprint}.json`)
+ if (!existsSync(treePath)) return null
+ try {
+ return JSON.parse(readFileSync(treePath, "utf-8")) as Tree
+ } catch {
+ return null
+ }
+ }
+
+ /** Persist a Tree to the tree cache. */
+ export function setCachedTree(fingerprint: string, tree: Tree): void {
+ ensureDirs()
+ const treePath = join(TREES_DIR, `${fingerprint}.json`)
+ writeFileSync(treePath, JSON.stringify(tree), "utf-8")
+ }
package/src/cluster.ts CHANGED
@@ -95,19 +95,24 @@ function connectedComponents(indices: Int32Array[], N: number): number {
  }

  /**
- * Build the (dense) normalised Laplacian from the affinity matrix (stored as
- * a list of sparse {row,col,val} triples) and return it as a dense matrix
- * plus the degree diagonal `dd`.
+ * Build the degree diagonal `dd` from sparse affinity triplets, then return
+ * a sparse matvec closure for the *negated* normalised Laplacian (-L_norm).
+ *
+ * L_norm = I - D^{-1/2} A D^{-1/2}
+ * -L_norm = D^{-1/2} A D^{-1/2} - I
+ *
+ * The matvec avoids allocating an N×N dense matrix — at N=2000, k_sparse≈7,
+ * this reduces the cost per multiply from O(N²)=4M to O(N·k)=14k ops.
  */
- function buildNormalisedLaplacian(
+ function buildNormLaplacianSparseMatvec(
  sparseAffinity: Array<{ i: number; j: number; v: number }>,
  N: number
- ): { L: Matrix; dd: Float64Array } {
+ ): { matvec: (v: Float64Array) => Float64Array; dd: Float64Array } {
  // Accumulate row sums (degree) for normalisation
  const degree = new Float64Array(N)
  for (const { i, j, v } of sparseAffinity) {
- degree[i] = (degree[i] ?? 0) + v
- if (i !== j) degree[j] = (degree[j] ?? 0) + v
+ degree[i]! += v
+ if (i !== j) degree[j]! += v
  }

  const dd = new Float64Array(N)
@@ -115,31 +120,29 @@ function buildNormalisedLaplacian(
  dd[i] = degree[i]! > 1e-12 ? 1 / Math.sqrt(degree[i]!) : 0
  }

- // L_norm = I - D^{-1/2} A D^{-1/2}
- // We start from identity
- const L: Matrix = Array.from({ length: N }, (_, i) => {
- const row = new Float64Array(N)
- row[i] = 1
- return row
- })
-
- // Subtract normalised affinity contributions
- for (const { i, j, v } of sparseAffinity) {
- const w = v * dd[i]! * dd[j]!
- const rowI = L[i]!
- rowI[j] = (rowI[j] ?? 0) - w
- if (i !== j) {
- const rowJ = L[j]!
- rowJ[i] = (rowJ[i] ?? 0) - w
+ // Pre-compute normalised weights once so the closure stays cheap.
+ // w_ij = v * dd[i] * dd[j] (the off-diagonal contribution to A_norm)
+ const normAffinity = sparseAffinity.map(({ i, j, v }) => ({
+ i,
+ j,
+ w: v * dd[i]! * dd[j]!,
+ }))
+
+ // Matvec for -L_norm = A_norm - I
+ // result[i] = -v[i] + sum_j w_ij * v[j] (using symmetry)
+ const matvecFn = (vec: Float64Array): Float64Array => {
+ const out = new Float64Array(N)
+ // Start from -I · vec
+ for (let i = 0; i < N; i++) out[i] = -vec[i]!
+ // Add symmetric A_norm contributions
+ for (const { i, j, w } of normAffinity) {
+ out[i]! += w * vec[j]!
+ if (i !== j) out[j]! += w * vec[i]!
  }
+ return out
  }

- // Clamp diagonal to 1 (matches scipy behaviour after set_diag)
- for (let i = 0; i < N; i++) {
- L[i]![i] = 1
- }
-
- return { L, dd }
+ return { matvec: matvecFn, dd }
  }

  /**
@@ -171,35 +174,24 @@ function normaliseVec(v: Float64Array): number {
  return n
  }

- /** Multiply matrix M by vector v */
- function matvec(M: Matrix, v: Float64Array): Float64Array<ArrayBuffer> {
- const N = M.length
- const out = new Float64Array(N) as Float64Array<ArrayBuffer>
- for (let i = 0; i < N; i++) {
- out[i] = dot(M[i]!, v)
- }
- return out
- }
-
  /**
  * Randomised power-iteration with deflation to extract the `k` eigenpairs
- * corresponding to the *smallest* eigenvalues of a symmetric matrix M.
+ * corresponding to the *smallest* eigenvalues of a symmetric matrix.
  *
- * M is the **negated** Laplacian (M = -L), so its *largest* eigenvalues
- * correspond to L's smallest matching the Python code which does `laplacian *= -1`.
- *
- * We use shifted inverse iteration: to find small eigenvalues of L we find
- * large eigenvalues of (-L + shift*I) where shift ≈ 1 (the diagonal was set
- * to 1 above). We iterate on M = -L and take the top-k eigenvectors, then
- * negate the eigenvalues back.
+ * Instead of a dense matrix, accepts a sparse `matvecFn` closure so that the
+ * per-iteration cost is O(N·k_sparse) rather than O(N²). The closure should
+ * implement multiplication by the *negated* Laplacian (-L_norm), whose top
+ * eigenvalues correspond to L_norm's bottom ones (matching the Python code
+ * which does `laplacian *= -1`).
  */
  function topKEigenpairs(
- negL: Matrix,
+ matvecFn: (v: Float64Array) => Float64Array,
+ n: number,
  k: number,
  maxIter = 300,
  tol = 1e-6
  ): { values: Float64Array<ArrayBuffer>; vectors: Float64Array<ArrayBuffer>[] } {
- const N = negL.length
+ const N = n
  const rng = seededRng(42)

  const vectors: Float64Array<ArrayBuffer>[] = []
@@ -207,7 +199,7 @@ function topKEigenpairs(

  for (let idx = 0; idx < k; idx++) {
  // Random start
- let v = Float64Array.from({ length: N }, () => rng() * 2 - 1)
+ let v = Float64Array.from({ length: N }, () => rng() * 2 - 1) as Float64Array<ArrayBuffer>
  normaliseVec(v)

  // Deflate against already-found vectors
@@ -216,7 +208,7 @@ function topKEigenpairs(

  let lambda = 0
  for (let iter = 0; iter < maxIter; iter++) {
- const Mv = matvec(negL, v)
+ const Mv = matvecFn(v) as Float64Array<ArrayBuffer>

  // Deflate
  for (const u of vectors) subtractProjection(Mv, u)
@@ -353,135 +345,231 @@ function kmeans(
  return labels
  }

- /**
- * Recursively split a Cluster into sub-clusters using spectral clustering.
- * Returns [input] when the cluster is small enough to be a leaf.
- */
- export function splitCluster(input: Cluster): Cluster[] {
- const N = input.entries.length
-
- if (N <= MAX_LEAVES) return [input]
+ const MINI_BATCH_THRESHOLD = 512
+ const MINI_BATCH_SIZE = 128
+ const MINI_BATCH_ITERS = 120
+ const KMEANS_MAX_ITER = 60
+ const KMEANS_RETRIES = 2
+ const MINI_BATCH_RETRIES = 2

- const normalized = normaliseRows(matFromEmbeds(input.entries))
+ interface ClusterState {
+ entries: EmbedEntry[]
+ points: Float64Array[]
+ }

- // --- Adaptive k-NN: find smallest k that gives 1 connected component ---
- const candidateKs: number[] = []
- for (let n = 0; ; n++) {
- const k = Math.round(Math.exp(n))
- if (k >= N) break
- candidateKs.push(k)
+ function countLabels(labels: Int32Array, k: number): Int32Array {
+ const counts = new Int32Array(k)
+ for (let i = 0; i < labels.length; i++) {
+ const label = labels[i]
+ if (label !== undefined) counts[label] = (counts[label] ?? 0) + 1
  }
- candidateKs.push(Math.floor(N / 2))
-
- let chosenK = candidateKs[candidateKs.length - 1]!
- let chosenKnnResult: { distances: Float64Array[]; indices: Int32Array[] } | null = null
-
- for (const k of candidateKs) {
- const knnResult = knn(normalized, k)
- const nComponents = connectedComponents(knnResult.indices, N)
- if (nComponents === 1) {
- chosenK = k
- chosenKnnResult = knnResult
- break
+ return counts
+ }
+
+ function nearestCentroid(point: Float64Array, centroids: Float64Array[]): number {
+ let best = 0
+ let bestDist = Infinity
+ for (let c = 0; c < centroids.length; c++) {
+ const d = distSq(point, centroids[c]!)
+ if (d < bestDist) {
+ bestDist = d
+ best = c
  }
  }
+ return best
+ }

- if (chosenKnnResult === null) {
- // Fallback: compute for the last candidate (floor(N/2))
- chosenKnnResult = knn(normalized, chosenK)
+ function assignLabels(points: Float64Array[], centroids: Float64Array[]): Int32Array {
+ const labels = new Int32Array(points.length)
+ for (let i = 0; i < points.length; i++) {
+ labels[i] = nearestCentroid(points[i]!, centroids)
  }
+ return labels
+ }

- const { distances, indices } = chosenKnnResult
+ function initRandomCentroids(
+ points: Float64Array[],
+ k: number,
+ rng: () => number
+ ): Float64Array[] {
+ const N = points.length
+ const centroids: Float64Array[] = []
+ const used = new Set<number>()
+ for (let c = 0; c < k; c++) {
+ let idx = Math.floor(rng() * N)
+ for (let attempts = 0; attempts < 4 && used.has(idx); attempts++) {
+ idx = Math.floor(rng() * N)
+ }
+ used.add(idx)
+ centroids.push(Float64Array.from(points[idx]!))
+ }
+ return centroids
+ }
+
+ function miniBatchKmeans(
+ points: Float64Array[],
+ k: number,
+ rng: () => number,
+ opts: { batchSize: number; maxIter: number }
+ ): Int32Array {
+ const N = points.length
+ if (N === 0) return new Int32Array()
+
+ const dim = points[0]!.length
+ const centroids = initRandomCentroids(points, k, rng)
+ const counts = new Int32Array(k)
+ const batchSize = Math.min(opts.batchSize, N)
+
+ for (let iter = 0; iter < opts.maxIter; iter++) {
+ for (let b = 0; b < batchSize; b++) {
+ const idx = Math.floor(rng() * N)
+ const point = points[idx]!
+ const c = nearestCentroid(point, centroids)
+ counts[c] = (counts[c] ?? 0) + 1
+ const centroid = centroids[c]!
+ const eta = 1 / (counts[c] ?? 1)
+ for (let d = 0; d < dim; d++) {
+ centroid[d]! = centroid[d]! + eta * (point[d]! - centroid[d]!)
+ }
+ }
+ }

- // --- Build affinity matrix (sparse triplets) ---
- // σ[i] = distance to Kth nearest neighbour
- const sigmas = distances.map((d) => d[d.length - 1]!)
+ return assignLabels(points, centroids)
+ }

- const sparseAffinity: Array<{ i: number; j: number; v: number }> = []
+ function splitByProjection(points: Float64Array[], rng: () => number): Int32Array {
+ const N = points.length
+ const labels = new Int32Array(N)
+ if (N <= 1) return labels

+ const dim = points[0]!.length
+ const a = Math.floor(rng() * N)
+ let b = Math.floor(rng() * N)
+ if (b === a) b = (a + 1) % N
+
+ const pa = points[a]!
+ const pb = points[b]!
+ const dir = new Float64Array(dim)
+ for (let d = 0; d < dim; d++) dir[d]! = pa[d]! - pb[d]!
+
+ let min = Infinity
+ let max = -Infinity
+ const proj = new Float64Array(N)
  for (let i = 0; i < N; i++) {
- for (let n = 0; n < chosenK; n++) {
- const j = indices[i]![n]!
- const d = distances[i]![n]!
- const sigma_i = sigmas[i]!
- const sigma_j = sigmas[j]!
- const denom = Math.max(sigma_i * sigma_j, 1e-12)
- const v = Math.exp(-(d * d) / denom)
- sparseAffinity.push({ i, j, v })
- }
+ const p = points[i]!
+ let dot = 0
+ for (let d = 0; d < dim; d++) dot += p[d]! * dir[d]!
+ proj[i] = dot
+ if (dot < min) min = dot
+ if (dot > max) max = dot
  }

- // --- Normalised Laplacian ---
- const { L, dd } = buildNormalisedLaplacian(sparseAffinity, N)
-
- // Negate L (as Python does `laplacian *= -1`) so power iteration finds
- // eigenvectors of -L, whose top eigenvalues correspond to L's bottom ones.
- const negL: Matrix = L.map((row) => {
- const r = Float64Array.from(row)
- for (let i = 0; i < r.length; i++) r[i]! *= -1
- return r
- })
-
- const k = Math.min(MAX_CLUSTERS + 1, N)
- const { values: rawValues, vectors } = topKEigenpairs(negL, k)
-
- // Eigenvalues were of -L; flip sign back to get L eigenvalues
- const eigenvalues = Float64Array.from(rawValues, (v) => -v)
-
- // Sort by eigenvalue ascending (smallest first), skip index 0
- const sortedIdx = Array.from({ length: k }, (_, i) => i).sort(
- (a, b) => eigenvalues[a]! - eigenvalues[b]!
- )
-
- const sortedEigenvalues = Float64Array.from(sortedIdx, (i) => eigenvalues[i]!)
- const sortedVectors = sortedIdx.map((i) => vectors[i]!)
-
- deterministicSignFlip(sortedVectors)
-
- // --- Eigengap heuristic (skip λ₀ ≈ 0) ---
- // n_clusters = argmax(diff(eigenvalues[1:])) + 2
- let maxGap = -Infinity
- let nClusters = 2
- for (let i = 1; i < sortedEigenvalues.length - 1; i++) {
- const gap = sortedEigenvalues[i + 1]! - sortedEigenvalues[i]!
- if (gap > maxGap) {
- maxGap = gap
- nClusters = i + 1 // 1-indexed + 1 for the off-by-one vs Python
+ const threshold = (min + max) / 2
+ for (let i = 0; i < N; i++) labels[i] = proj[i]! <= threshold ? 0 : 1
+
+ const counts = countLabels(labels, 2)
+ if ((counts[0] ?? 0) === 0 || (counts[1] ?? 0) === 0) {
+ const mid = Math.floor(N / 2)
+ for (let i = 0; i < N; i++) labels[i] = i < mid ? 0 : 1
+ }
+
+ return labels
+ }
+
+ function chooseBisectLabels(points: Float64Array[], rng: () => number): Int32Array {
+ const N = points.length
+ if (N <= 1) return new Int32Array(N)
+
+ const useMiniBatch = N >= MINI_BATCH_THRESHOLD
+ const retries = useMiniBatch ? MINI_BATCH_RETRIES : KMEANS_RETRIES
+
+ for (let attempt = 0; attempt <= retries; attempt++) {
+ const seed = Math.floor(rng() * 1_000_000_000)
+ const labels = useMiniBatch
+ ? miniBatchKmeans(points, 2, seededRng(seed), {
+ batchSize: MINI_BATCH_SIZE,
+ maxIter: MINI_BATCH_ITERS,
+ })
+ : kmeans(points, 2, KMEANS_MAX_ITER, seed)
+
+ const counts = countLabels(labels, 2)
+ const left = counts[0] ?? 0
+ const right = counts[1] ?? 0
+ if (left > 0 && right > 0) return labels
+ }
+
+ return splitByProjection(points, rng)
+ }
+
+ function bisectCluster(
+ cluster: ClusterState,
+ rng: () => number
+ ): { left: ClusterState; right: ClusterState } {
+ const { entries, points } = cluster
+ const N = entries.length
+ if (N <= 1) {
+ return {
+ left: { entries, points },
+ right: { entries: [], points: [] },
  }
  }
- nClusters = Math.max(2, Math.min(nClusters, MAX_CLUSTERS))
-
- // --- Spectral embeddings: use eigenvectors 1..nClusters (skip 0) ---
- // Build [N × nClusters] matrix, normalise each row
- const spectralPoints: Float64Array[] = Array.from({ length: N }, () =>
- new Float64Array(nClusters)
- )
- for (let c = 0; c < nClusters; c++) {
- const vec = sortedVectors[c + 1] // skip smallest (index 0)
- if (vec === undefined) break
- for (let i = 0; i < N; i++) {
- // Divide by dd[i] (matches Python `wide_spectral_embeddings = eigenvectors.T / dd`)
- spectralPoints[i]![c] = (vec[i]! / dd[i]!)
+
+ let labels = chooseBisectLabels(points, rng)
+ let leftEntries: EmbedEntry[] = []
+ let rightEntries: EmbedEntry[] = []
+ let leftPoints: Float64Array[] = []
+ let rightPoints: Float64Array[] = []
+
+ for (let i = 0; i < N; i++) {
+ if (labels[i] === 0) {
+ leftEntries.push(entries[i]!)
+ leftPoints.push(points[i]!)
+ } else {
+ rightEntries.push(entries[i]!)
+ rightPoints.push(points[i]!)
  }
  }
- // L2-normalise each row
- for (const row of spectralPoints) {
- let norm = 0
- for (const v of row) norm += v * v
- norm = Math.sqrt(norm)
- if (norm > 1e-12) for (let d = 0; d < row.length; d++) row[d]! /= norm
+
+ if (leftEntries.length === 0 || rightEntries.length === 0) {
+ const mid = Math.floor(N / 2)
+ leftEntries = entries.slice(0, mid)
+ rightEntries = entries.slice(mid)
+ leftPoints = points.slice(0, mid)
+ rightPoints = points.slice(mid)
  }

- // --- K-means ---
- const labels = kmeans(spectralPoints, nClusters)
+ return {
+ left: { entries: leftEntries, points: leftPoints },
+ right: { entries: rightEntries, points: rightPoints },
+ }
+ }

- // Group entries by cluster label, preserving order
- const groups = new Map<number, EmbedEntry[]>()
- for (let i = 0; i < N; i++) {
- const label = labels[i]!
- if (!groups.has(label)) groups.set(label, [])
- groups.get(label)!.push(input.entries[i]!)
+ /**
+ * Recursively split a Cluster into sub-clusters using bisecting k-means.
+ * Returns [input] when the cluster is small enough to be a leaf.
+ */
+ export function splitCluster(input: Cluster): Cluster[] {
+ const N = input.entries.length
+
+ if (N <= MAX_LEAVES) return [input]
+
+ const normalized = normaliseRows(matFromEmbeds(input.entries))
+ const rng = seededRng(42)
+
+ const work: ClusterState[] = [{ entries: input.entries, points: normalized }]
+ const leaves: Cluster[] = []
+
+ while (work.length > 0) {
+ const cluster = work.pop()!
+ if (cluster.entries.length <= MAX_LEAVES) {
+ leaves.push({ entries: cluster.entries })
+ continue
+ }
+
+ const { left, right } = bisectCluster(cluster, rng)
+ if (right.entries.length > 0) work.push(right)
+ if (left.entries.length > 0) work.push(left)
  }

- return Array.from(groups.values()).map((entries) => ({ entries }))
+ return leaves
  }
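
The mini-batch path in miniBatchKmeans uses per-sample centroid updates with a learning rate of 1 / count, so each centroid stays equal to the running mean of every point ever assigned to it. A standalone one-dimensional illustration of that update rule (not package code):

// Running-mean update: centroid += (1 / count) * (x - centroid), illustrative only.
let centroid = 0
let count = 0
for (const x of [4, 8, 6]) {        // points assigned to this centroid, in arrival order
  count += 1
  const eta = 1 / count             // same learning rate as miniBatchKmeans uses
  centroid = centroid + eta * (x - centroid)
}
console.log(centroid)               // 6, exactly the mean of 4, 8 and 6
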
package/src/embed.ts CHANGED
@@ -1,5 +1,6 @@
  import { pipeline, type FeatureExtractionPipeline } from "@huggingface/transformers"
  import type { Chunk } from "./tokenize.ts"
+ import { getCachedEmbedding, setCachedEmbedding, flushEmbedCache } from "./cache.ts"

  export const DEFAULT_EMBEDDING_MODEL = "Xenova/all-MiniLM-L6-v2"

@@ -19,6 +20,8 @@ export interface EmbedOptions {
  model: string
  batchSize: number
  concurrency: number
+ /** When true, skip reading from cache (but still write to it). */
+ noCache?: boolean
  }

  let _pipe: FeatureExtractionPipeline | null = null
@@ -81,6 +84,7 @@ async function embedBatch(

  /**
  * Embed all chunks using the local model, with batching + concurrency limits.
+ * Chunks whose text is already in the embedding cache are skipped.
  * Calls `onProgress(done, total)` after each batch completes.
  */
  export async function embedChunks(
@@ -90,44 +94,60 @@ export async function embedChunks(
  ): Promise<EmbedEntry[]> {
  if (chunks.length === 0) return []

- const pipe = await getEmbedPipeline(opts.model)
-
- const batches: Chunk[][] = []
- for (let i = 0; i < chunks.length; i += opts.batchSize) {
- batches.push(chunks.slice(i, i + opts.batchSize))
+ // --- Cache pass: resolve hits immediately, collect misses for the model ---
+ const entries: EmbedEntry[] = new Array(chunks.length)
+ const missIndices: number[] = []
+
+ if (!opts.noCache) {
+ for (let i = 0; i < chunks.length; i++) {
+ const cached = getCachedEmbedding(chunks[i]!.text)
+ if (cached !== null) {
+ entries[i] = { path: chunks[i]!.path, text: chunks[i]!.text, embedding: cached }
+ } else {
+ missIndices.push(i)
+ }
+ }
+ } else {
+ for (let i = 0; i < chunks.length; i++) missIndices.push(i)
  }

- const entries: EmbedEntry[] = new Array(chunks.length)
- let chunkIndex = 0
- let done = 0
+ let done = chunks.length - missIndices.length
+ onProgress?.(done, chunks.length)

- for (let i = 0; i < batches.length; i += opts.concurrency) {
- const concurrentBatches = batches.slice(i, i + opts.concurrency)
- const startIndex = chunkIndex
+ if (missIndices.length > 0) {
+ const pipe = await getEmbedPipeline(opts.model)
+ const missChunks = missIndices.map((i) => chunks[i]!)

- const batchResults = await Promise.all(
- concurrentBatches.map((batch) =>
- embedBatch(pipe, batch.map((c) => c.text))
+ const batches: number[][] = []
+ for (let i = 0; i < missChunks.length; i += opts.batchSize) {
+ batches.push(missIndices.slice(i, i + opts.batchSize))
+ }
+
+ for (let i = 0; i < batches.length; i += opts.concurrency) {
+ const concurrentBatches = batches.slice(i, i + opts.concurrency)
+
+ const batchResults = await Promise.all(
+ concurrentBatches.map((idxBatch) =>
+ embedBatch(pipe, idxBatch.map((ci) => chunks[ci]!.text))
+ )
  )
- )
-
- let offset = startIndex
- for (let b = 0; b < concurrentBatches.length; b++) {
- const batch = concurrentBatches[b]!
- const embeddings = batchResults[b]!
- for (let j = 0; j < batch.length; j++) {
- const chunk = batch[j]!
- entries[offset] = {
- path: chunk.path,
- text: chunk.text,
- embedding: embeddings[j]!,
+
+ for (let b = 0; b < concurrentBatches.length; b++) {
+ const idxBatch = concurrentBatches[b]!
+ const embeddings = batchResults[b]!
+ for (let j = 0; j < idxBatch.length; j++) {
+ const ci = idxBatch[j]!
+ const chunk = chunks[ci]!
+ const embedding = embeddings[j]!
+ entries[ci] = { path: chunk.path, text: chunk.text, embedding }
+ setCachedEmbedding(chunk.text, embedding)
  }
- offset++
+ done += idxBatch.length
+ onProgress?.(done, chunks.length)
  }
- chunkIndex += batch.length
- done += batch.length
- onProgress?.(done, chunks.length)
  }
+
+ flushEmbedCache()
  }

  return entries
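
With these changes embedChunks first resolves cache hits, reports them as already-completed progress, then batches only the misses through the model and writes each fresh embedding back before a final flush. A hedged call-site sketch, assuming the existing (chunks, opts, onProgress) parameter order and using a placeholder chunk literal:

// Illustrative call site only; the chunk literal is a stand-in, not package code.
import type { Chunk } from "./tokenize.ts"
import { embedChunks, DEFAULT_EMBEDDING_MODEL } from "./embed.ts"

const chunks = [{ path: "src/a.ts", text: "export const a = 1" }] as Chunk[]  // placeholder
const entries = await embedChunks(
  chunks,
  { model: DEFAULT_EMBEDDING_MODEL, batchSize: 32, concurrency: 2, noCache: false },
  (done, total) => console.log(`${done}/${total}`),  // fires once up front for cache hits
)
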
package/src/main.ts CHANGED
@@ -15,6 +15,7 @@

  import { Command } from "commander"
  import path from "node:path"
+ import { createHash } from "node:crypto"

  import { discoverFiles, DEFAULT_EXCLUDES, type FsOptions } from "./fs.ts"
  import { chunkFile } from "./tokenize.ts"
@@ -24,6 +25,7 @@ import { buildTree } from "./tree.ts"
  import { clearAuthCache, getCopilotToken } from "./auth.ts"
  import { SemanticNavigatorUI, type ProgressState } from "./ui.ts"
  import type { CopilotConfig } from "./labels.ts"
+ import { treeFingerprint, getCachedTree, setCachedTree } from "./cache.ts"

  // ---------------------------------------------------------------------------
  // CLI definition
@@ -40,6 +42,7 @@ const program = new Command()
  .option("--read-concurrency <n>", "Concurrent file reads", (v) => parseInt(v, 10), 64)
  .option("--embed-batch-size <n>", "Chunks per embedding batch", (v) => parseInt(v, 10), 32)
  .option("--embed-concurrency <n>", "Concurrent embedding batches", (v) => parseInt(v, 10), 2)
+ .option("--no-cache", "Skip reading from cache; force re-embed and re-label")
  .option("--logout", "Clear cached GitHub / Copilot credentials and exit")
  .helpOption("-h, --help", "Show help")

@@ -60,6 +63,7 @@ async function main(): Promise<void> {
  readConcurrency: number
  embedBatchSize: number
  embedConcurrency: number
+ cache: boolean
  logout: boolean | undefined
  }>()

@@ -167,6 +171,7 @@ async function main(): Promise<void> {
  model: DEFAULT_EMBEDDING_MODEL,
  batchSize: opts.embedBatchSize,
  concurrency: opts.embedConcurrency,
+ noCache: !opts.cache,
  }

  let embedEntriesRaw: EmbedEntry[] | undefined
@@ -184,7 +189,32 @@ async function main(): Promise<void> {
  }

  // ---------------------------------------------------------------------------
- // Step 5: Spectral clustering (CPU-bound, synchronous)
+ // Step 5: Compute tree fingerprint and check tree cache
+ // ---------------------------------------------------------------------------
+
+ // Build a map of relativePath → sha256(content) for all discovered files.
+ const fileHashes = new Map<string, string>()
+ for (const f of resolvedFiles) {
+ fileHashes.set(
+ f.relativePath,
+ createHash("sha256").update(f.content).digest("hex").slice(0, 16)
+ )
+ }
+
+ const fingerprint = treeFingerprint(opts.completionModel, fileHashes)
+ const noCache = !opts.cache
+
+ if (!noCache) {
+ const cached = getCachedTree(fingerprint)
+ if (cached !== null) {
+ ui.setTree(cached)
+ // The UI event loop keeps the process alive until the user presses q/Esc.
+ return
+ }
+ }
+
+ // ---------------------------------------------------------------------------
+ // Step 6: Spectral clustering (CPU-bound, synchronous)
  // ---------------------------------------------------------------------------

  ui.updateProgress({
@@ -201,7 +231,7 @@ async function main(): Promise<void> {
  await Bun.sleep(0)

  // ---------------------------------------------------------------------------
- // Step 6: Build labelled tree
+ // Step 7: Build labelled tree
  // ---------------------------------------------------------------------------

  ui.updateProgress({
@@ -223,8 +253,11 @@ async function main(): Promise<void> {
  }
  const tree = treeRaw!

+ // Persist to tree cache for future runs
+ setCachedTree(fingerprint, tree)
+
  // ---------------------------------------------------------------------------
- // Step 7: Hand the tree to the UI
+ // Step 8: Hand the tree to the UI
  // ---------------------------------------------------------------------------

  ui.setTree(tree)
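
A small but important detail in the wiring above: because the flag is declared as --no-cache, Commander exposes it as a boolean cache option that defaults to true and becomes false when the flag is passed, which is why the code derives noCache from !opts.cache. A minimal standalone illustration (not the package's actual CLI):

// How a Commander --no-<name> flag surfaces in opts (illustrative only).
import { Command } from "commander"

const program = new Command()
  .option("--no-cache", "Skip reading from cache")

program.parse(["node", "demo", "--no-cache"])
console.log(program.opts().cache)   // false; without the flag it would default to true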