@et0and/ovid 0.0.3 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -4
- package/src/cache.ts +119 -0
- package/src/cluster.ts +247 -159
- package/src/embed.ts +50 -30
- package/src/main.ts +36 -3
package/package.json
CHANGED
package/src/cache.ts
ADDED
@@ -0,0 +1,119 @@
+/**
+ * Two-layer result cache for semantic-navigator.
+ *
+ * Layer 1 — Embedding cache (~/.cache/semantic-navigator/embeddings.json)
+ * Content-addressed: maps sha256(text)[0:16] → number[]
+ * Per-entry granularity: only re-embed chunks whose text changed.
+ *
+ * Layer 2 — Tree cache (~/.cache/semantic-navigator/trees/<fingerprint>.json)
+ * Keyed by sha256(model + sorted(path:contentHash pairs)).
+ * A single changed file invalidates the whole tree, forcing a fresh
+ * cluster + label run, but embeddings are still reused from layer 1.
+ */
+
+import { createHash } from "node:crypto"
+import { mkdirSync, existsSync, readFileSync, writeFileSync } from "node:fs"
+import { join } from "node:path"
+import { homedir } from "node:os"
+import type { Tree } from "./tree.ts"
+
+// ---------------------------------------------------------------------------
+// Paths
+// ---------------------------------------------------------------------------
+
+const CACHE_DIR = join(homedir(), ".cache", "semantic-navigator")
+const EMBED_CACHE_PATH = join(CACHE_DIR, "embeddings.json")
+const TREES_DIR = join(CACHE_DIR, "trees")
+
+function ensureDirs(): void {
+  mkdirSync(CACHE_DIR, { recursive: true })
+  mkdirSync(TREES_DIR, { recursive: true })
+}
+
+// ---------------------------------------------------------------------------
+// Helpers
+// ---------------------------------------------------------------------------
+
+/** Short hex digest: sha256(text).slice(0, 16) */
+export function textHash(text: string): string {
+  return createHash("sha256").update(text).digest("hex").slice(0, 16)
+}
+
+/**
+ * Fingerprint for the tree cache: sha256(model + sorted path:contentHash pairs).
+ * `fileHashes` is a map from relative path to sha256(file content).
+ */
+export function treeFingerprint(model: string, fileHashes: Map<string, string>): string {
+  const entries = Array.from(fileHashes.entries())
+    .map(([p, h]) => `${p}:${h}`)
+    .sort()
+    .join("\n")
+  return createHash("sha256").update(model + "\n" + entries).digest("hex")
+}
+
+// ---------------------------------------------------------------------------
+// Layer 1: Embedding cache
+// ---------------------------------------------------------------------------
+
+type EmbedCacheMap = Record<string, number[]>
+
+let _embedCache: EmbedCacheMap | null = null
+
+function loadEmbedCache(): EmbedCacheMap {
+  if (_embedCache !== null) return _embedCache
+  if (existsSync(EMBED_CACHE_PATH)) {
+    try {
+      _embedCache = JSON.parse(readFileSync(EMBED_CACHE_PATH, "utf-8")) as EmbedCacheMap
+    } catch {
+      _embedCache = {}
+    }
+  } else {
+    _embedCache = {}
+  }
+  return _embedCache
+}
+
+/** Look up a cached embedding by text content. Returns null on miss. */
+export function getCachedEmbedding(text: string): Float32Array | null {
+  const cache = loadEmbedCache()
+  const key = textHash(text)
+  const vec = cache[key]
+  if (vec === undefined) return null
+  return Float32Array.from(vec)
+}
+
+/** Store an embedding in the in-memory cache (call flushEmbedCache to persist). */
+export function setCachedEmbedding(text: string, embedding: Float32Array): void {
+  const cache = loadEmbedCache()
+  const key = textHash(text)
+  cache[key] = Array.from(embedding)
+}
+
+/** Persist the in-memory embedding cache to disk. */
+export function flushEmbedCache(): void {
+  if (_embedCache === null) return
+  ensureDirs()
+  writeFileSync(EMBED_CACHE_PATH, JSON.stringify(_embedCache), "utf-8")
+}
+
+// ---------------------------------------------------------------------------
+// Layer 2: Tree cache
+// ---------------------------------------------------------------------------
+
+/** Look up a cached Tree by fingerprint. Returns null on miss. */
+export function getCachedTree(fingerprint: string): Tree | null {
+  const treePath = join(TREES_DIR, `${fingerprint}.json`)
+  if (!existsSync(treePath)) return null
+  try {
+    return JSON.parse(readFileSync(treePath, "utf-8")) as Tree
+  } catch {
+    return null
+  }
+}
+
+/** Persist a Tree to the tree cache. */
+export function setCachedTree(fingerprint: string, tree: Tree): void {
+  ensureDirs()
+  const treePath = join(TREES_DIR, `${fingerprint}.json`)
+  writeFileSync(treePath, JSON.stringify(tree), "utf-8")
+}
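Taken together, the exported surface of the new module composes into a simple warm-run protocol. A minimal usage sketch (hypothetical call site — the chunk text, file hash, and model name are invented; only the imported signatures come from the file above):

import {
  textHash,
  treeFingerprint,
  getCachedEmbedding,
  setCachedEmbedding,
  flushEmbedCache,
  getCachedTree,
  setCachedTree,
} from "./cache.ts"

// Layer 1: keys are sha256(chunk text), so hits survive file moves/renames.
const text = "export const answer = 42"
let vec = getCachedEmbedding(text)
if (vec === null) {
  vec = Float32Array.from([0.1, 0.2, 0.3]) // stand-in for a real model call
  setCachedEmbedding(text, vec)            // in-memory only...
  flushEmbedCache()                        // ...until flushed once at the end
}

// Layer 2: the key covers the model plus every path:contentHash pair,
// so editing any single file misses the whole tree.
const fileHashes = new Map([["src/a.ts", textHash("file content")]])
const fp = treeFingerprint("some-model", fileHashes)
const cachedTree = getCachedTree(fp) // null on miss → rebuild, then setCachedTree(fp, tree)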
package/src/cluster.ts
CHANGED
@@ -95,19 +95,24 @@ function connectedComponents(indices: Int32Array[], N: number): number {
 }
 
 /**
- * Build the
- * a
- *
+ * Build the degree diagonal `dd` from sparse affinity triplets, then return
+ * a sparse matvec closure for the *negated* normalised Laplacian (-L_norm).
+ *
+ * L_norm = I - D^{-1/2} A D^{-1/2}
+ * -L_norm = D^{-1/2} A D^{-1/2} - I
+ *
+ * The matvec avoids allocating an N×N dense matrix — at N=2000, k_sparse≈7,
+ * this reduces the cost per multiply from O(N²)=4M to O(N·k)=14k ops.
  */
-function
+function buildNormLaplacianSparseMatvec(
   sparseAffinity: Array<{ i: number; j: number; v: number }>,
   N: number
-): {
+): { matvec: (v: Float64Array) => Float64Array; dd: Float64Array } {
   // Accumulate row sums (degree) for normalisation
   const degree = new Float64Array(N)
   for (const { i, j, v } of sparseAffinity) {
-    degree[i]
-    if (i !== j) degree[j]
+    degree[i]! += v
+    if (i !== j) degree[j]! += v
   }
 
   const dd = new Float64Array(N)
@@ -115,31 +120,29 @@ function buildNormalisedLaplacian(
     dd[i] = degree[i]! > 1e-12 ? 1 / Math.sqrt(degree[i]!) : 0
   }
 
-  //
-  //
-  const
-
-
-
-  })
-
-  //
-
-
-  const
-
-
-
-
+  // Pre-compute normalised weights once so the closure stays cheap.
+  // w_ij = v * dd[i] * dd[j] (the off-diagonal contribution to A_norm)
+  const normAffinity = sparseAffinity.map(({ i, j, v }) => ({
+    i,
+    j,
+    w: v * dd[i]! * dd[j]!,
+  }))
+
+  // Matvec for -L_norm = A_norm - I
+  // result[i] = -v[i] + sum_j w_ij * v[j] (using symmetry)
+  const matvecFn = (vec: Float64Array): Float64Array => {
+    const out = new Float64Array(N)
+    // Start from -I · vec
+    for (let i = 0; i < N; i++) out[i] = -vec[i]!
+    // Add symmetric A_norm contributions
+    for (const { i, j, w } of normAffinity) {
+      out[i]! += w * vec[j]!
+      if (i !== j) out[j]! += w * vec[i]!
    }
+    return out
  }
 
-
-  for (let i = 0; i < N; i++) {
-    L[i]![i] = 1
-  }
-
-  return { L, dd }
+  return { matvec: matvecFn, dd }
 }
 
 /**
@@ -171,35 +174,24 @@ function normaliseVec(v: Float64Array): number {
   return n
 }
 
-/** Multiply matrix M by vector v */
-function matvec(M: Matrix, v: Float64Array): Float64Array<ArrayBuffer> {
-  const N = M.length
-  const out = new Float64Array(N) as Float64Array<ArrayBuffer>
-  for (let i = 0; i < N; i++) {
-    out[i] = dot(M[i]!, v)
-  }
-  return out
-}
-
 /**
  * Randomised power-iteration with deflation to extract the `k` eigenpairs
- * corresponding to the *smallest* eigenvalues of a symmetric matrix
+ * corresponding to the *smallest* eigenvalues of a symmetric matrix.
  *
- *
- *
- *
- *
- *
- * to 1 above). We iterate on M = -L and take the top-k eigenvectors, then
- * negate the eigenvalues back.
+ * Instead of a dense matrix, accepts a sparse `matvecFn` closure so that the
+ * per-iteration cost is O(N·k_sparse) rather than O(N²). The closure should
+ * implement multiplication by the *negated* Laplacian (-L_norm), whose top
+ * eigenvalues correspond to L_norm's bottom ones (matching the Python code
+ * which does `laplacian *= -1`).
  */
 function topKEigenpairs(
-
+  matvecFn: (v: Float64Array) => Float64Array,
+  n: number,
   k: number,
   maxIter = 300,
   tol = 1e-6
 ): { values: Float64Array<ArrayBuffer>; vectors: Float64Array<ArrayBuffer>[] } {
-  const N =
+  const N = n
   const rng = seededRng(42)
 
   const vectors: Float64Array<ArrayBuffer>[] = []
@@ -207,7 +199,7 @@ function topKEigenpairs(
 
   for (let idx = 0; idx < k; idx++) {
     // Random start
-    let v = Float64Array.from({ length: N }, () => rng() * 2 - 1)
+    let v = Float64Array.from({ length: N }, () => rng() * 2 - 1) as Float64Array<ArrayBuffer>
     normaliseVec(v)
 
     // Deflate against already-found vectors
@@ -216,7 +208,7 @@ function topKEigenpairs(
 
     let lambda = 0
     for (let iter = 0; iter < maxIter; iter++) {
-      const Mv =
+      const Mv = matvecFn(v) as Float64Array<ArrayBuffer>
 
       // Deflate
      for (const u of vectors) subtractProjection(Mv, u)
@@ -353,135 +345,231 @@ function kmeans(
   return labels
 }
 
-
-
-
-
-
-
-
-  if (N <= MAX_LEAVES) return [input]
+const MINI_BATCH_THRESHOLD = 512
+const MINI_BATCH_SIZE = 128
+const MINI_BATCH_ITERS = 120
+const KMEANS_MAX_ITER = 60
+const KMEANS_RETRIES = 2
+const MINI_BATCH_RETRIES = 2
 
-
+interface ClusterState {
+  entries: EmbedEntry[]
+  points: Float64Array[]
+}
 
-
-  const
-  for (let
-    const
-    if (
-      candidateKs.push(k)
+function countLabels(labels: Int32Array, k: number): Int32Array {
+  const counts = new Int32Array(k)
+  for (let i = 0; i < labels.length; i++) {
+    const label = labels[i]
+    if (label !== undefined) counts[label] = (counts[label] ?? 0) + 1
   }
-
-
-
-
-
-
-
-  const
-  if (
-
-
-    break
+  return counts
+}
+
+function nearestCentroid(point: Float64Array, centroids: Float64Array[]): number {
+  let best = 0
+  let bestDist = Infinity
+  for (let c = 0; c < centroids.length; c++) {
+    const d = distSq(point, centroids[c]!)
+    if (d < bestDist) {
+      bestDist = d
+      best = c
    }
  }
+  return best
+}
 
-
-
-
+function assignLabels(points: Float64Array[], centroids: Float64Array[]): Int32Array {
+  const labels = new Int32Array(points.length)
+  for (let i = 0; i < points.length; i++) {
+    labels[i] = nearestCentroid(points[i]!, centroids)
  }
+  return labels
+}
 
-
+function initRandomCentroids(
+  points: Float64Array[],
+  k: number,
+  rng: () => number
+): Float64Array[] {
+  const N = points.length
+  const centroids: Float64Array[] = []
+  const used = new Set<number>()
+  for (let c = 0; c < k; c++) {
+    let idx = Math.floor(rng() * N)
+    for (let attempts = 0; attempts < 4 && used.has(idx); attempts++) {
+      idx = Math.floor(rng() * N)
+    }
+    used.add(idx)
+    centroids.push(Float64Array.from(points[idx]!))
+  }
+  return centroids
+}
+
+function miniBatchKmeans(
+  points: Float64Array[],
+  k: number,
+  rng: () => number,
+  opts: { batchSize: number; maxIter: number }
+): Int32Array {
+  const N = points.length
+  if (N === 0) return new Int32Array()
+
+  const dim = points[0]!.length
+  const centroids = initRandomCentroids(points, k, rng)
+  const counts = new Int32Array(k)
+  const batchSize = Math.min(opts.batchSize, N)
+
+  for (let iter = 0; iter < opts.maxIter; iter++) {
+    for (let b = 0; b < batchSize; b++) {
+      const idx = Math.floor(rng() * N)
+      const point = points[idx]!
+      const c = nearestCentroid(point, centroids)
+      counts[c] = (counts[c] ?? 0) + 1
+      const centroid = centroids[c]!
+      const eta = 1 / (counts[c] ?? 1)
+      for (let d = 0; d < dim; d++) {
+        centroid[d]! = centroid[d]! + eta * (point[d]! - centroid[d]!)
+      }
+    }
+  }
 
-
-
-  const sigmas = distances.map((d) => d[d.length - 1]!)
+  return assignLabels(points, centroids)
+}
 
-
+function splitByProjection(points: Float64Array[], rng: () => number): Int32Array {
+  const N = points.length
+  const labels = new Int32Array(N)
+  if (N <= 1) return labels
 
+  const dim = points[0]!.length
+  const a = Math.floor(rng() * N)
+  let b = Math.floor(rng() * N)
+  if (b === a) b = (a + 1) % N
+
+  const pa = points[a]!
+  const pb = points[b]!
+  const dir = new Float64Array(dim)
+  for (let d = 0; d < dim; d++) dir[d]! = pa[d]! - pb[d]!
+
+  let min = Infinity
+  let max = -Infinity
+  const proj = new Float64Array(N)
   for (let i = 0; i < N; i++) {
-
-
-
-
-
-
-      const v = Math.exp(-(d * d) / denom)
-      sparseAffinity.push({ i, j, v })
-    }
+    const p = points[i]!
+    let dot = 0
+    for (let d = 0; d < dim; d++) dot += p[d]! * dir[d]!
+    proj[i] = dot
+    if (dot < min) min = dot
+    if (dot > max) max = dot
  }
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+  const threshold = (min + max) / 2
+  for (let i = 0; i < N; i++) labels[i] = proj[i]! <= threshold ? 0 : 1
+
+  const counts = countLabels(labels, 2)
+  if ((counts[0] ?? 0) === 0 || (counts[1] ?? 0) === 0) {
+    const mid = Math.floor(N / 2)
+    for (let i = 0; i < N; i++) labels[i] = i < mid ? 0 : 1
+  }
+
+  return labels
+}
+
+function chooseBisectLabels(points: Float64Array[], rng: () => number): Int32Array {
+  const N = points.length
+  if (N <= 1) return new Int32Array(N)
+
+  const useMiniBatch = N >= MINI_BATCH_THRESHOLD
+  const retries = useMiniBatch ? MINI_BATCH_RETRIES : KMEANS_RETRIES
+
+  for (let attempt = 0; attempt <= retries; attempt++) {
+    const seed = Math.floor(rng() * 1_000_000_000)
+    const labels = useMiniBatch
+      ? miniBatchKmeans(points, 2, seededRng(seed), {
+          batchSize: MINI_BATCH_SIZE,
+          maxIter: MINI_BATCH_ITERS,
+        })
+      : kmeans(points, 2, KMEANS_MAX_ITER, seed)
+
+    const counts = countLabels(labels, 2)
+    const left = counts[0] ?? 0
+    const right = counts[1] ?? 0
+    if (left > 0 && right > 0) return labels
+  }
+
+  return splitByProjection(points, rng)
+}
+
+function bisectCluster(
+  cluster: ClusterState,
+  rng: () => number
+): { left: ClusterState; right: ClusterState } {
+  const { entries, points } = cluster
+  const N = entries.length
+  if (N <= 1) {
+    return {
+      left: { entries, points },
+      right: { entries: [], points: [] },
    }
  }
-
-
-
-
-
-
-
-  for (let
-
-
-
-
-
+
+  let labels = chooseBisectLabels(points, rng)
+  let leftEntries: EmbedEntry[] = []
+  let rightEntries: EmbedEntry[] = []
+  let leftPoints: Float64Array[] = []
+  let rightPoints: Float64Array[] = []
+
+  for (let i = 0; i < N; i++) {
+    if (labels[i] === 0) {
+      leftEntries.push(entries[i]!)
+      leftPoints.push(points[i]!)
+    } else {
+      rightEntries.push(entries[i]!)
+      rightPoints.push(points[i]!)
    }
  }
-
-
-
-
-
-
+
+  if (leftEntries.length === 0 || rightEntries.length === 0) {
+    const mid = Math.floor(N / 2)
+    leftEntries = entries.slice(0, mid)
+    rightEntries = entries.slice(mid)
+    leftPoints = points.slice(0, mid)
+    rightPoints = points.slice(mid)
  }
 
-
-
+  return {
+    left: { entries: leftEntries, points: leftPoints },
+    right: { entries: rightEntries, points: rightPoints },
+  }
+}
 
-
-
-
-
-
-
+/**
+ * Recursively split a Cluster into sub-clusters using bisecting k-means.
+ * Returns [input] when the cluster is small enough to be a leaf.
+ */
+export function splitCluster(input: Cluster): Cluster[] {
+  const N = input.entries.length
+
+  if (N <= MAX_LEAVES) return [input]
+
+  const normalized = normaliseRows(matFromEmbeds(input.entries))
+  const rng = seededRng(42)
+
+  const work: ClusterState[] = [{ entries: input.entries, points: normalized }]
+  const leaves: Cluster[] = []
+
+  while (work.length > 0) {
+    const cluster = work.pop()!
+    if (cluster.entries.length <= MAX_LEAVES) {
+      leaves.push({ entries: cluster.entries })
+      continue
+    }
+
+    const { left, right } = bisectCluster(cluster, rng)
+    if (right.entries.length > 0) work.push(right)
+    if (left.entries.length > 0) work.push(left)
  }
 
-  return
+  return leaves
 }
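The spectral path above now threads a closure instead of a matrix. A sketch of how the two changed functions presumably compose inside the module (toy affinity triplets; the call shapes follow the signatures visible in this diff):

// Sparse symmetric affinity for a tiny 3-node graph, as upper-triangular
// triplets { i, j, v }.
const sparseAffinity = [
  { i: 0, j: 1, v: 0.9 },
  { i: 1, j: 2, v: 0.4 },
]
const N = 3

// One O(nnz) closure replaces the old dense N×N normalised Laplacian;
// `dd` holds the D^{-1/2} diagonal.
const { matvec, dd } = buildNormLaplacianSparseMatvec(sparseAffinity, N)

// Power iteration only ever calls matvec, so -L_norm is never materialised.
// Its top-k eigenpairs are L_norm's bottom-k; negate `values` to map back.
const { values, vectors } = topKEigenpairs(matvec, N, 2)

On the k-means side, the new chooseBisectLabels degrades in stages: mini-batch k-means for clusters of 512+ points, full k-means below that, and the deterministic splitByProjection fallback if every retried attempt leaves one side empty.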
package/src/embed.ts
CHANGED
@@ -1,5 +1,6 @@
 import { pipeline, type FeatureExtractionPipeline } from "@huggingface/transformers"
 import type { Chunk } from "./tokenize.ts"
+import { getCachedEmbedding, setCachedEmbedding, flushEmbedCache } from "./cache.ts"
 
 export const DEFAULT_EMBEDDING_MODEL = "Xenova/all-MiniLM-L6-v2"
 
@@ -19,6 +20,8 @@ export interface EmbedOptions {
   model: string
   batchSize: number
   concurrency: number
+  /** When true, skip reading from cache (but still write to it). */
+  noCache?: boolean
 }
 
 let _pipe: FeatureExtractionPipeline | null = null
@@ -81,6 +84,7 @@ async function embedBatch(
 
 /**
  * Embed all chunks using the local model, with batching + concurrency limits.
+ * Chunks whose text is already in the embedding cache are skipped.
  * Calls `onProgress(done, total)` after each batch completes.
  */
 export async function embedChunks(
@@ -90,44 +94,60 @@ export async function embedChunks(
 ): Promise<EmbedEntry[]> {
   if (chunks.length === 0) return []
 
-
-
-  const
-
-
+  // --- Cache pass: resolve hits immediately, collect misses for the model ---
+  const entries: EmbedEntry[] = new Array(chunks.length)
+  const missIndices: number[] = []
+
+  if (!opts.noCache) {
+    for (let i = 0; i < chunks.length; i++) {
+      const cached = getCachedEmbedding(chunks[i]!.text)
+      if (cached !== null) {
+        entries[i] = { path: chunks[i]!.path, text: chunks[i]!.text, embedding: cached }
+      } else {
+        missIndices.push(i)
+      }
+    }
+  } else {
+    for (let i = 0; i < chunks.length; i++) missIndices.push(i)
  }
 
-
-
-  let done = 0
+  let done = chunks.length - missIndices.length
+  onProgress?.(done, chunks.length)
 
-
-  const
-  const
+  if (missIndices.length > 0) {
+    const pipe = await getEmbedPipeline(opts.model)
+    const missChunks = missIndices.map((i) => chunks[i]!)
 
-  const
-
-
+    const batches: number[][] = []
+    for (let i = 0; i < missChunks.length; i += opts.batchSize) {
+      batches.push(missIndices.slice(i, i + opts.batchSize))
+    }
+
+    for (let i = 0; i < batches.length; i += opts.concurrency) {
+      const concurrentBatches = batches.slice(i, i + opts.concurrency)
+
+      const batchResults = await Promise.all(
+        concurrentBatches.map((idxBatch) =>
+          embedBatch(pipe, idxBatch.map((ci) => chunks[ci]!.text))
+        )
      )
-
-
-
-
-
-
-
-
-
-        text: chunk.text,
-        embedding: embeddings[j]!,
+
+      for (let b = 0; b < concurrentBatches.length; b++) {
+        const idxBatch = concurrentBatches[b]!
+        const embeddings = batchResults[b]!
+        for (let j = 0; j < idxBatch.length; j++) {
+          const ci = idxBatch[j]!
+          const chunk = chunks[ci]!
+          const embedding = embeddings[j]!
+          entries[ci] = { path: chunk.path, text: chunk.text, embedding }
+          setCachedEmbedding(chunk.text, embedding)
        }
-
+        done += idxBatch.length
+        onProgress?.(done, chunks.length)
      }
-      chunkIndex += batch.length
-      done += batch.length
-      onProgress?.(done, chunks.length)
    }
+
+    flushEmbedCache()
  }
 
  return entries
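The effect of the cache pass is easiest to see at a call site. A hypothetical second run over unchanged chunks (chunk values invented; the options object and the `(done, total)` callback match the shapes used above, though the exact parameter order of embedChunks is not fully visible in this diff):

const chunks = [
  { path: "src/a.ts", text: "export const a = 1" },
  { path: "src/b.ts", text: "export const b = 2" },
]

const entries = await embedChunks(
  chunks,
  { model: DEFAULT_EMBEDDING_MODEL, batchSize: 32, concurrency: 2, noCache: false },
  (done, total) => console.log(`embedding ${done}/${total}`)
)
// Run 1: both chunks miss → logs 0/2, then 2/2 after the model batch finishes.
// Run 2: both chunks hit layer 1 → logs 2/2 immediately and the embedding
//        pipeline is never even loaded.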
package/src/main.ts
CHANGED
@@ -15,6 +15,7 @@
 
 import { Command } from "commander"
 import path from "node:path"
+import { createHash } from "node:crypto"
 
 import { discoverFiles, DEFAULT_EXCLUDES, type FsOptions } from "./fs.ts"
 import { chunkFile } from "./tokenize.ts"
@@ -24,6 +25,7 @@ import { buildTree } from "./tree.ts"
 import { clearAuthCache, getCopilotToken } from "./auth.ts"
 import { SemanticNavigatorUI, type ProgressState } from "./ui.ts"
 import type { CopilotConfig } from "./labels.ts"
+import { treeFingerprint, getCachedTree, setCachedTree } from "./cache.ts"
 
 // ---------------------------------------------------------------------------
 // CLI definition
@@ -40,6 +42,7 @@ const program = new Command()
   .option("--read-concurrency <n>", "Concurrent file reads", (v) => parseInt(v, 10), 64)
   .option("--embed-batch-size <n>", "Chunks per embedding batch", (v) => parseInt(v, 10), 32)
   .option("--embed-concurrency <n>", "Concurrent embedding batches", (v) => parseInt(v, 10), 2)
+  .option("--no-cache", "Skip reading from cache; force re-embed and re-label")
   .option("--logout", "Clear cached GitHub / Copilot credentials and exit")
   .helpOption("-h, --help", "Show help")
 
@@ -60,6 +63,7 @@ async function main(): Promise<void> {
     readConcurrency: number
     embedBatchSize: number
     embedConcurrency: number
+    cache: boolean
     logout: boolean | undefined
   }>()
 
@@ -167,6 +171,7 @@ async function main(): Promise<void> {
     model: DEFAULT_EMBEDDING_MODEL,
     batchSize: opts.embedBatchSize,
     concurrency: opts.embedConcurrency,
+    noCache: !opts.cache,
   }
 
   let embedEntriesRaw: EmbedEntry[] | undefined
@@ -184,7 +189,32 @@ async function main(): Promise<void> {
   }
 
   // ---------------------------------------------------------------------------
-  // Step 5:
+  // Step 5: Compute tree fingerprint and check tree cache
+  // ---------------------------------------------------------------------------
+
+  // Build a map of relativePath → sha256(content) for all discovered files.
+  const fileHashes = new Map<string, string>()
+  for (const f of resolvedFiles) {
+    fileHashes.set(
+      f.relativePath,
+      createHash("sha256").update(f.content).digest("hex").slice(0, 16)
+    )
+  }
+
+  const fingerprint = treeFingerprint(opts.completionModel, fileHashes)
+  const noCache = !opts.cache
+
+  if (!noCache) {
+    const cached = getCachedTree(fingerprint)
+    if (cached !== null) {
+      ui.setTree(cached)
+      // The UI event loop keeps the process alive until the user presses q/Esc.
+      return
+    }
+  }
+
+  // ---------------------------------------------------------------------------
+  // Step 6: Spectral clustering (CPU-bound, synchronous)
   // ---------------------------------------------------------------------------
 
   ui.updateProgress({
@@ -201,7 +231,7 @@ async function main(): Promise<void> {
   await Bun.sleep(0)
 
   // ---------------------------------------------------------------------------
-  // Step
+  // Step 7: Build labelled tree
   // ---------------------------------------------------------------------------
 
   ui.updateProgress({
@@ -223,8 +253,11 @@ async function main(): Promise<void> {
   }
   const tree = treeRaw!
 
+  // Persist to tree cache for future runs
+  setCachedTree(fingerprint, tree)
+
   // ---------------------------------------------------------------------------
-  // Step
+  // Step 8: Hand the tree to the UI
   // ---------------------------------------------------------------------------
 
   ui.setTree(tree)