npm - @et0and/ovid - Versions diffs - 0.0.3 → 0.0.4 - Mend

@et0and/ovid 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

package/package.json CHANGED Viewed

@@ -1,11 +1,8 @@
 {
   "name": "@et0and/ovid",
-  "version": "0.0.3",
+  "version": "0.0.4",
   "description": "Browse a repository's files by semantic meaning",
   "type": "module",
-  "bin": {
-    "ovid": "./bin/semantic-navigator.js"
-  },
   "files": [
     "bin",
     "src",

package/src/cache.ts ADDED Viewed

@@ -0,0 +1,119 @@
+/**
+ * Two-layer result cache for semantic-navigator.
+ *
+ * Layer 1 — Embedding cache (~/.cache/semantic-navigator/embeddings.json)
+ *   Content-addressed: maps sha256(text)[0:16] → number[]
+ *   Per-entry granularity: only re-embed chunks whose text changed.
+ *
+ * Layer 2 — Tree cache (~/.cache/semantic-navigator/trees/<fingerprint>.json)
+ *   Keyed by sha256(model + sorted(path:contentHash pairs)).
+ *   A single changed file invalidates the whole tree, forcing a fresh
+ *   cluster + label run, but embeddings are still reused from layer 1.
+ */
+import { createHash } from "node:crypto"
+import { mkdirSync, existsSync, readFileSync, writeFileSync } from "node:fs"
+import { join } from "node:path"
+import { homedir } from "node:os"
+import type { Tree } from "./tree.ts"
+// ---------------------------------------------------------------------------
+// Paths
+// ---------------------------------------------------------------------------
+const CACHE_DIR = join(homedir(), ".cache", "semantic-navigator")
+const EMBED_CACHE_PATH = join(CACHE_DIR, "embeddings.json")
+const TREES_DIR = join(CACHE_DIR, "trees")
+function ensureDirs(): void {
+  mkdirSync(CACHE_DIR, { recursive: true })
+  mkdirSync(TREES_DIR, { recursive: true })
+}
+// ---------------------------------------------------------------------------
+// Helpers
+// ---------------------------------------------------------------------------
+/** Short hex digest: sha256(text).slice(0, 16) */
+export function textHash(text: string): string {
+  return createHash("sha256").update(text).digest("hex").slice(0, 16)
+}
+/**
+ * Fingerprint for the tree cache: sha256(model + sorted path:contentHash pairs).
+ * `fileHashes` maps each relative path to a hash of that file's content (the caller passes a sha256 hex digest truncated to 16 chars).
+ */
+export function treeFingerprint(model: string, fileHashes: Map<string, string>): string {
+  const entries = Array.from(fileHashes.entries())
+    .map(([p, h]) => `${p}:${h}`)
+    .sort()
+    .join("\n")
+  return createHash("sha256").update(model + "\n" + entries).digest("hex")
+}
+// ---------------------------------------------------------------------------
+// Layer 1: Embedding cache
+// ---------------------------------------------------------------------------
+type EmbedCacheMap = Record<string, number[]>
+let _embedCache: EmbedCacheMap | null = null
+function loadEmbedCache(): EmbedCacheMap {
+  if (_embedCache !== null) return _embedCache
+  if (existsSync(EMBED_CACHE_PATH)) {
+    try {
+      _embedCache = JSON.parse(readFileSync(EMBED_CACHE_PATH, "utf-8")) as EmbedCacheMap
+    } catch {
+      _embedCache = {}
+    }
+  } else {
+    _embedCache = {}
+  }
+  return _embedCache
+}
+/** Look up a cached embedding by text content. Returns null on miss. */
+export function getCachedEmbedding(text: string): Float32Array | null {
+  const cache = loadEmbedCache()
+  const key = textHash(text)
+  const vec = cache[key]
+  if (vec === undefined) return null
+  return Float32Array.from(vec)
+}
+/** Store an embedding in the in-memory cache (call flushEmbedCache to persist). */
+export function setCachedEmbedding(text: string, embedding: Float32Array): void {
+  const cache = loadEmbedCache()
+  const key = textHash(text)
+  cache[key] = Array.from(embedding)
+}
+/** Persist the in-memory embedding cache to disk. */
+export function flushEmbedCache(): void {
+  if (_embedCache === null) return
+  ensureDirs()
+  writeFileSync(EMBED_CACHE_PATH, JSON.stringify(_embedCache), "utf-8")
+}
+// ---------------------------------------------------------------------------
+// Layer 2: Tree cache
+// ---------------------------------------------------------------------------
+/** Look up a cached Tree by fingerprint. Returns null on miss. */
+export function getCachedTree(fingerprint: string): Tree | null {
+  const treePath = join(TREES_DIR, `${fingerprint}.json`)
+  if (!existsSync(treePath)) return null
+  try {
+    return JSON.parse(readFileSync(treePath, "utf-8")) as Tree
+  } catch {
+    return null
+  }
+}
+/** Persist a Tree to the tree cache. */
+export function setCachedTree(fingerprint: string, tree: Tree): void {
+  ensureDirs()
+  const treePath = join(TREES_DIR, `${fingerprint}.json`)
+  writeFileSync(treePath, JSON.stringify(tree), "utf-8")
+}

package/src/cluster.ts CHANGED Viewed

@@ -95,19 +95,24 @@ function connectedComponents(indices: Int32Array[], N: number): number {
 }
 /**
- * Build the (dense) normalised Laplacian from the affinity matrix (stored as
- * a list of sparse {row,col,val} triples) and return it as a dense matrix
- * plus the degree diagonal `dd`.
+ * Build the degree diagonal `dd` from sparse affinity triplets, then return
+ * a sparse matvec closure for the *negated* normalised Laplacian (-L_norm).
+ *
+ * L_norm = I - D^{-1/2} A D^{-1/2}
+ * -L_norm = D^{-1/2} A D^{-1/2} - I
+ *
+ * The matvec avoids allocating an N×N dense matrix — e.g. at N=2000, k_sparse≈7,
+ * each multiply drops from roughly N² = 4M to roughly N·k_sparse = 14k operations.
  */
-function buildNormalisedLaplacian(
+function buildNormLaplacianSparseMatvec(
   sparseAffinity: Array<{ i: number; j: number; v: number }>,
   N: number
-): { L: Matrix; dd: Float64Array } {
+): { matvec: (v: Float64Array) => Float64Array; dd: Float64Array } {
   // Accumulate row sums (degree) for normalisation
   const degree = new Float64Array(N)
   for (const { i, j, v } of sparseAffinity) {
-    degree[i] = (degree[i] ?? 0) + v
-    if (i !== j) degree[j] = (degree[j] ?? 0) + v
+    degree[i]! += v
+    if (i !== j) degree[j]! += v
   }
   const dd = new Float64Array(N)
@@ -115,31 +120,29 @@ function buildNormalisedLaplacian(
     dd[i] = degree[i]! > 1e-12 ? 1 / Math.sqrt(degree[i]!) : 0
   }
-  // L_norm = I - D^{-1/2} A D^{-1/2}
-  // We start from identity
-  const L: Matrix = Array.from({ length: N }, (_, i) => {
-    const row = new Float64Array(N)
-    row[i] = 1
-    return row
-  })
-  // Subtract normalised affinity contributions
-  for (const { i, j, v } of sparseAffinity) {
-    const w = v * dd[i]! * dd[j]!
-    const rowI = L[i]!
-    rowI[j] = (rowI[j] ?? 0) - w
-    if (i !== j) {
-      const rowJ = L[j]!
-      rowJ[i] = (rowJ[i] ?? 0) - w
+  // Pre-compute normalised weights once so the closure stays cheap.
+  // w_ij = v * dd[i] * dd[j]  (this triple's entry in A_norm = D^{-1/2} A D^{-1/2}; i === j is allowed)
+  const normAffinity = sparseAffinity.map(({ i, j, v }) => ({
+    i,
+    j,
+    w: v * dd[i]! * dd[j]!,
+  }))
+  // Matvec for -L_norm = A_norm - I
+  // result[i] = -v[i] + sum_j w_ij * v[j]   (using symmetry)
+  const matvecFn = (vec: Float64Array): Float64Array => {
+    const out = new Float64Array(N)
+    // Start from -I · vec
+    for (let i = 0; i < N; i++) out[i] = -vec[i]!
+    // Add symmetric A_norm contributions
+    for (const { i, j, w } of normAffinity) {
+      out[i]! += w * vec[j]!
+      if (i !== j) out[j]! += w * vec[i]!
     }
+    return out
   }
-  // Clamp diagonal to 1 (matches scipy behaviour after set_diag)
-  for (let i = 0; i < N; i++) {
-    L[i]![i] = 1
-  }
-  return { L, dd }
+  return { matvec: matvecFn, dd }
 }
 /**
@@ -171,35 +174,24 @@ function normaliseVec(v: Float64Array): number {
   return n
 }
-/** Multiply matrix M by vector v */
-function matvec(M: Matrix, v: Float64Array): Float64Array<ArrayBuffer> {
-  const N = M.length
-  const out = new Float64Array(N) as Float64Array<ArrayBuffer>
-  for (let i = 0; i < N; i++) {
-    out[i] = dot(M[i]!, v)
-  }
-  return out
-}
 /**
  * Randomised power-iteration with deflation to extract the `k` eigenpairs
- * corresponding to the *smallest* eigenvalues of a symmetric matrix M.
- *
- * M is the **negated** Laplacian (M = -L), so its *largest* eigenvalues
- * correspond to L's smallest — matching the Python code which does `laplacian *= -1`.
+ * corresponding to the *smallest* eigenvalues of a symmetric matrix.
  *
- * We use shifted inverse iteration: to find small eigenvalues of L we find
- * large eigenvalues of (-L + shift*I) where shift ≈ 1 (the diagonal was set
- * to 1 above).  We iterate on M = -L and take the top-k eigenvectors, then
- * negate the eigenvalues back.
+ * Instead of a dense matrix, accepts a sparse `matvecFn` closure so that the
+ * per-iteration cost is O(N·k_sparse) rather than O(N²).  The closure should
+ * implement multiplication by the *negated* Laplacian (-L_norm), whose top
+ * eigenvalues correspond to L_norm's bottom ones (matching the Python code
+ * which does `laplacian *= -1`).
  */
 function topKEigenpairs(
-  negL: Matrix,
+  matvecFn: (v: Float64Array) => Float64Array,
+  n: number,
   k: number,
   maxIter = 300,
   tol = 1e-6
 ): { values: Float64Array<ArrayBuffer>; vectors: Float64Array<ArrayBuffer>[] } {
-  const N = negL.length
+  const N = n
   const rng = seededRng(42)
   const vectors: Float64Array<ArrayBuffer>[] = []
@@ -207,7 +199,7 @@ function topKEigenpairs(
   for (let idx = 0; idx < k; idx++) {
     // Random start
-    let v = Float64Array.from({ length: N }, () => rng() * 2 - 1)
+    let v = Float64Array.from({ length: N }, () => rng() * 2 - 1) as Float64Array<ArrayBuffer>
     normaliseVec(v)
     // Deflate against already-found vectors
@@ -216,7 +208,7 @@ function topKEigenpairs(
     let lambda = 0
     for (let iter = 0; iter < maxIter; iter++) {
-      const Mv = matvec(negL, v)
+      const Mv = matvecFn(v) as Float64Array<ArrayBuffer>
       // Deflate
       for (const u of vectors) subtractProjection(Mv, u)
@@ -364,6 +356,29 @@ export function splitCluster(input: Cluster): Cluster[] {
   const normalized = normaliseRows(matFromEmbeds(input.entries))
+  // --- Precompute all pairwise distances once (O(N²) distance evaluations) ---
+  // Each row is then sorted ascending (O(N² log N) overall) so we can slice any k cheaply.
+  const allDistances: Array<Array<[number, number]>> = Array.from({ length: N }, () => [])
+  for (let i = 0; i < N; i++) {
+    for (let j = 0; j < N; j++) {
+      if (j === i) continue
+      allDistances[i]!.push([cosDist(normalized[i]!, normalized[j]!), j])
+    }
+    allDistances[i]!.sort((a, b) => a[0] - b[0])
+  }
+  /** Slice sorted rows to get k-NN result for any k in O(N·k). */
+  function knnFromPrecomputed(k: number): { distances: Float64Array[]; indices: Int32Array[] } {
+    const distances: Float64Array[] = []
+    const indices: Int32Array[] = []
+    for (let i = 0; i < N; i++) {
+      const row = allDistances[i]!.slice(0, k)
+      distances.push(Float64Array.from(row.map((x) => x[0])))
+      indices.push(Int32Array.from(row.map((x) => x[1])))
+    }
+    return { distances, indices }
+  }
   // --- Adaptive k-NN: find smallest k that gives 1 connected component ---
   const candidateKs: number[] = []
   for (let n = 0; ; n++) {
@@ -377,7 +392,7 @@ export function splitCluster(input: Cluster): Cluster[] {
   let chosenKnnResult: { distances: Float64Array[]; indices: Int32Array[] } | null = null
   for (const k of candidateKs) {
-    const knnResult = knn(normalized, k)
+    const knnResult = knnFromPrecomputed(k)
     const nComponents = connectedComponents(knnResult.indices, N)
     if (nComponents === 1) {
       chosenK = k
@@ -387,8 +402,7 @@ export function splitCluster(input: Cluster): Cluster[] {
   }
   if (chosenKnnResult === null) {
-    // Fallback: compute for the last candidate (floor(N/2))
-    chosenKnnResult = knn(normalized, chosenK)
+    chosenKnnResult = knnFromPrecomputed(chosenK)
   }
   const { distances, indices } = chosenKnnResult
@@ -411,19 +425,12 @@ export function splitCluster(input: Cluster): Cluster[] {
     }
   }
-  // --- Normalised Laplacian ---
-  const { L, dd } = buildNormalisedLaplacian(sparseAffinity, N)
-  // Negate L (as Python does `laplacian *= -1`) so power iteration finds
-  // eigenvectors of -L, whose top eigenvalues correspond to L's bottom ones.
-  const negL: Matrix = L.map((row) => {
-    const r = Float64Array.from(row)
-    for (let i = 0; i < r.length; i++) r[i]! *= -1
-    return r
-  })
+  // --- Sparse normalised Laplacian matvec ---
+  // Avoids building an N×N dense matrix; matvec cost is O(N·k_sparse) vs O(N²).
+  const { matvec: negLMatvec, dd } = buildNormLaplacianSparseMatvec(sparseAffinity, N)
   const k = Math.min(MAX_CLUSTERS + 1, N)
-  const { values: rawValues, vectors } = topKEigenpairs(negL, k)
+  const { values: rawValues, vectors } = topKEigenpairs(negLMatvec, N, k)
   // Eigenvalues were of -L; flip sign back to get L eigenvalues
   const eigenvalues = Float64Array.from(rawValues, (v) => -v)

package/src/embed.ts CHANGED Viewed

@@ -1,5 +1,6 @@
 import { pipeline, type FeatureExtractionPipeline } from "@huggingface/transformers"
 import type { Chunk } from "./tokenize.ts"
+import { getCachedEmbedding, setCachedEmbedding, flushEmbedCache } from "./cache.ts"
 export const DEFAULT_EMBEDDING_MODEL = "Xenova/all-MiniLM-L6-v2"
@@ -19,6 +20,8 @@ export interface EmbedOptions {
   model: string
   batchSize: number
   concurrency: number
+  /** When true, skip reading from cache (but still write to it). */
+  noCache?: boolean
 }
 let _pipe: FeatureExtractionPipeline | null = null
@@ -81,6 +84,7 @@ async function embedBatch(
 /**
  * Embed all chunks using the local model, with batching + concurrency limits.
+ * Chunks whose text is already in the embedding cache reuse the cached vector instead of being re-embedded (unless `opts.noCache` is set).
  * Calls `onProgress(done, total)` after each batch completes.
  */
 export async function embedChunks(
@@ -90,44 +94,60 @@ export async function embedChunks(
 ): Promise<EmbedEntry[]> {
   if (chunks.length === 0) return []
-  const pipe = await getEmbedPipeline(opts.model)
-  const batches: Chunk[][] = []
-  for (let i = 0; i < chunks.length; i += opts.batchSize) {
-    batches.push(chunks.slice(i, i + opts.batchSize))
+  // --- Cache pass: resolve hits immediately, collect misses for the model ---
+  const entries: EmbedEntry[] = new Array(chunks.length)
+  const missIndices: number[] = []
+  if (!opts.noCache) {
+    for (let i = 0; i < chunks.length; i++) {
+      const cached = getCachedEmbedding(chunks[i]!.text)
+      if (cached !== null) {
+        entries[i] = { path: chunks[i]!.path, text: chunks[i]!.text, embedding: cached }
+      } else {
+        missIndices.push(i)
+      }
+    }
+  } else {
+    for (let i = 0; i < chunks.length; i++) missIndices.push(i)
   }
-  const entries: EmbedEntry[] = new Array(chunks.length)
-  let chunkIndex = 0
-  let done = 0
+  let done = chunks.length - missIndices.length
+  onProgress?.(done, chunks.length)
-  for (let i = 0; i < batches.length; i += opts.concurrency) {
-    const concurrentBatches = batches.slice(i, i + opts.concurrency)
-    const startIndex = chunkIndex
+  if (missIndices.length > 0) {
+    const pipe = await getEmbedPipeline(opts.model)
+    const missChunks = missIndices.map((i) => chunks[i]!)
-    const batchResults = await Promise.all(
-      concurrentBatches.map((batch) =>
-        embedBatch(pipe, batch.map((c) => c.text))
+    const batches: number[][] = []
+    for (let i = 0; i < missChunks.length; i += opts.batchSize) {
+      batches.push(missIndices.slice(i, i + opts.batchSize))
+    }
+    for (let i = 0; i < batches.length; i += opts.concurrency) {
+      const concurrentBatches = batches.slice(i, i + opts.concurrency)
+      const batchResults = await Promise.all(
+        concurrentBatches.map((idxBatch) =>
+          embedBatch(pipe, idxBatch.map((ci) => chunks[ci]!.text))
+        )
       )
-    )
-    let offset = startIndex
-    for (let b = 0; b < concurrentBatches.length; b++) {
-      const batch = concurrentBatches[b]!
-      const embeddings = batchResults[b]!
-      for (let j = 0; j < batch.length; j++) {
-        const chunk = batch[j]!
-        entries[offset] = {
-          path: chunk.path,
-          text: chunk.text,
-          embedding: embeddings[j]!,
+      for (let b = 0; b < concurrentBatches.length; b++) {
+        const idxBatch = concurrentBatches[b]!
+        const embeddings = batchResults[b]!
+        for (let j = 0; j < idxBatch.length; j++) {
+          const ci = idxBatch[j]!
+          const chunk = chunks[ci]!
+          const embedding = embeddings[j]!
+          entries[ci] = { path: chunk.path, text: chunk.text, embedding }
+          setCachedEmbedding(chunk.text, embedding)
         }
-        offset++
+        done += idxBatch.length
+        onProgress?.(done, chunks.length)
       }
-      chunkIndex += batch.length
-      done += batch.length
-      onProgress?.(done, chunks.length)
     }
+    flushEmbedCache()
   }
   return entries

package/src/main.ts CHANGED Viewed

@@ -15,6 +15,7 @@
 import { Command } from "commander"
 import path from "node:path"
+import { createHash } from "node:crypto"
 import { discoverFiles, DEFAULT_EXCLUDES, type FsOptions } from "./fs.ts"
 import { chunkFile } from "./tokenize.ts"
@@ -24,6 +25,7 @@ import { buildTree } from "./tree.ts"
 import { clearAuthCache, getCopilotToken } from "./auth.ts"
 import { SemanticNavigatorUI, type ProgressState } from "./ui.ts"
 import type { CopilotConfig } from "./labels.ts"
+import { treeFingerprint, getCachedTree, setCachedTree } from "./cache.ts"
 // ---------------------------------------------------------------------------
 // CLI definition
@@ -40,6 +42,7 @@ const program = new Command()
   .option("--read-concurrency <n>", "Concurrent file reads", (v) => parseInt(v, 10), 64)
   .option("--embed-batch-size <n>", "Chunks per embedding batch", (v) => parseInt(v, 10), 32)
   .option("--embed-concurrency <n>", "Concurrent embedding batches", (v) => parseInt(v, 10), 2)
+  .option("--no-cache", "Skip reading from cache; force re-embed and re-label")
   .option("--logout", "Clear cached GitHub / Copilot credentials and exit")
   .helpOption("-h, --help", "Show help")
@@ -60,6 +63,7 @@ async function main(): Promise<void> {
     readConcurrency: number
     embedBatchSize: number
     embedConcurrency: number
+    cache: boolean
     logout: boolean | undefined
   }>()
@@ -167,6 +171,7 @@ async function main(): Promise<void> {
     model: DEFAULT_EMBEDDING_MODEL,
     batchSize: opts.embedBatchSize,
     concurrency: opts.embedConcurrency,
+    noCache: !opts.cache,
   }
   let embedEntriesRaw: EmbedEntry[] | undefined
@@ -184,7 +189,32 @@ async function main(): Promise<void> {
   }
   // ---------------------------------------------------------------------------
-  // Step 5: Spectral clustering (CPU-bound, synchronous)
+  // Step 5: Compute tree fingerprint and check tree cache
+  // ---------------------------------------------------------------------------
+  // Build a map of relativePath → sha256(content)[0:16] for all discovered files.
+  const fileHashes = new Map<string, string>()
+  for (const f of resolvedFiles) {
+    fileHashes.set(
+      f.relativePath,
+      createHash("sha256").update(f.content).digest("hex").slice(0, 16)
+    )
+  }
+  const fingerprint = treeFingerprint(opts.completionModel, fileHashes)
+  const noCache = !opts.cache
+  if (!noCache) {
+    const cached = getCachedTree(fingerprint)
+    if (cached !== null) {
+      ui.setTree(cached)
+      // The UI event loop keeps the process alive until the user presses q/Esc.
+      return
+    }
+  }
+  // ---------------------------------------------------------------------------
+  // Step 6: Spectral clustering (CPU-bound, synchronous)
   // ---------------------------------------------------------------------------
   ui.updateProgress({
@@ -201,7 +231,7 @@ async function main(): Promise<void> {
   await Bun.sleep(0)
   // ---------------------------------------------------------------------------
-  // Step 6: Build labelled tree
+  // Step 7: Build labelled tree
   // ---------------------------------------------------------------------------
   ui.updateProgress({
@@ -223,8 +253,11 @@ async function main(): Promise<void> {
   }
   const tree = treeRaw!
+  // Persist to tree cache for future runs
+  setCachedTree(fingerprint, tree)
   // ---------------------------------------------------------------------------
-  // Step 7: Hand the tree to the UI
+  // Step 8: Hand the tree to the UI
   // ---------------------------------------------------------------------------
   ui.setTree(tree)