@et0and/ovid 0.0.4 → 0.0.6

This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in the public registry.
Files changed (3)
  1. package/README.md +35 -1
  2. package/package.json +1 -1
  3. package/src/cluster.ts +196 -115
package/README.md CHANGED
@@ -1 +1,35 @@
- # ovid - a semantic project navigator (ported to Typescript and Bun from [this](https://github.com/Gabriella439/semantic-navigator))
+ # ovid
+
+ Semantic project navigator for local repos. Builds a labeled tree of files by meaning and renders it in a TUI.
+
+ Port of the original Python project by [Gabriella Gonzalez](https://github.com/Gabriella439/semantic-navigator)
+
+ Key differences in this port:
+
+ - TypeScript + Bun CLI, OpenTUI renderer
+ - Local embeddings via `@huggingface/transformers` (`Xenova/all-MiniLM-L6-v2`)
+ - Labels generated through GitHub Copilot device flow (no OpenAI API key)
+ - Two-layer cache under `~/.cache/semantic-navigator`
+
+ **What it does**
+
+ - Reads tracked files (git) or top-level files (non-git)
+ - Embeds file chunks locally with `Xenova/all-MiniLM-L6-v2`
+ - Clusters files with bisecting k-means
+ - Labels clusters and leaves via GitHub Copilot
+ - Renders an interactive tree (OpenTUI)
+
+ **How it works**
+
+ - Discover files → chunk text → embed → cluster → label → display
+ - Embeddings are cached in `~/.cache/semantic-navigator/embeddings.json`
+ - Trees are cached in `~/.cache/semantic-navigator/trees/<fingerprint>.json`
+
+ **Usage**
+
+ - `bun run src/main.ts [directory]`
+ - `bun run src/main.ts --help`
+
+ **Notes**
+
+ - First run prompts for GitHub device flow to access Copilot.
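The two-layer cache the new README describes maps to two concrete locations on disk. The sketch below resolves those paths with Node/Bun built-ins; only the paths come from the README, and the `example-fingerprint` value is a placeholder, since this diff does not show how ovid derives the tree fingerprint.

```ts
// Minimal sketch of the cache layout described in the README above.
// The fingerprint value is a made-up placeholder for illustration.
import { join } from "node:path"
import { homedir } from "node:os"
import { existsSync } from "node:fs"

const cacheRoot = join(homedir(), ".cache", "semantic-navigator")

// Layer 1: one shared embeddings cache for all repos
const embeddingsCache = join(cacheRoot, "embeddings.json")

// Layer 2: one cached tree per repo fingerprint
const treeCache = (fingerprint: string) => join(cacheRoot, "trees", `${fingerprint}.json`)

console.log("embeddings cached:", existsSync(embeddingsCache))
console.log("tree cached:", existsSync(treeCache("example-fingerprint")))
```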
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "@et0and/ovid",
-   "version": "0.0.4",
+   "version": "0.0.6",
    "description": "Browse a repository's files by semantic meaning",
    "type": "module",
    "files": [
@@ -345,150 +345,231 @@ function kmeans(
    return labels
  }

- /**
-  * Recursively split a Cluster into sub-clusters using spectral clustering.
-  * Returns [input] when the cluster is small enough to be a leaf.
-  */
- export function splitCluster(input: Cluster): Cluster[] {
-   const N = input.entries.length
-
-   if (N <= MAX_LEAVES) return [input]
-
-   const normalized = normaliseRows(matFromEmbeds(input.entries))
+ const MINI_BATCH_THRESHOLD = 512
+ const MINI_BATCH_SIZE = 128
+ const MINI_BATCH_ITERS = 120
+ const KMEANS_MAX_ITER = 60
+ const KMEANS_RETRIES = 2
+ const MINI_BATCH_RETRIES = 2
+
+ interface ClusterState {
+   entries: EmbedEntry[]
+   points: Float64Array[]
+ }

-   // --- Precompute all pairwise distances once (O(N²)) ---
-   // Each row is sorted ascending so we can slice any k cheaply.
-   const allDistances: Array<Array<[number, number]>> = Array.from({ length: N }, () => [])
-   for (let i = 0; i < N; i++) {
-     for (let j = 0; j < N; j++) {
-       if (j === i) continue
-       allDistances[i]!.push([cosDist(normalized[i]!, normalized[j]!), j])
-     }
-     allDistances[i]!.sort((a, b) => a[0] - b[0])
+ function countLabels(labels: Int32Array, k: number): Int32Array {
+   const counts = new Int32Array(k)
+   for (let i = 0; i < labels.length; i++) {
+     const label = labels[i]
+     if (label !== undefined) counts[label] = (counts[label] ?? 0) + 1
    }
+   return counts
+ }

-   /** Slice sorted rows to get k-NN result for any k in O(N·k). */
-   function knnFromPrecomputed(k: number): { distances: Float64Array[]; indices: Int32Array[] } {
-     const distances: Float64Array[] = []
-     const indices: Int32Array[] = []
-     for (let i = 0; i < N; i++) {
-       const row = allDistances[i]!.slice(0, k)
-       distances.push(Float64Array.from(row.map((x) => x[0])))
-       indices.push(Int32Array.from(row.map((x) => x[1])))
+ function nearestCentroid(point: Float64Array, centroids: Float64Array[]): number {
+   let best = 0
+   let bestDist = Infinity
+   for (let c = 0; c < centroids.length; c++) {
+     const d = distSq(point, centroids[c]!)
+     if (d < bestDist) {
+       bestDist = d
+       best = c
      }
-     return { distances, indices }
    }
+   return best
+ }

-   // --- Adaptive k-NN: find smallest k that gives 1 connected component ---
-   const candidateKs: number[] = []
-   for (let n = 0; ; n++) {
-     const k = Math.round(Math.exp(n))
-     if (k >= N) break
-     candidateKs.push(k)
+ function assignLabels(points: Float64Array[], centroids: Float64Array[]): Int32Array {
+   const labels = new Int32Array(points.length)
+   for (let i = 0; i < points.length; i++) {
+     labels[i] = nearestCentroid(points[i]!, centroids)
    }
-   candidateKs.push(Math.floor(N / 2))
-
-   let chosenK = candidateKs[candidateKs.length - 1]!
-   let chosenKnnResult: { distances: Float64Array[]; indices: Int32Array[] } | null = null
-
-   for (const k of candidateKs) {
-     const knnResult = knnFromPrecomputed(k)
-     const nComponents = connectedComponents(knnResult.indices, N)
-     if (nComponents === 1) {
-       chosenK = k
-       chosenKnnResult = knnResult
-       break
+   return labels
+ }
+
+ function initRandomCentroids(
+   points: Float64Array[],
+   k: number,
+   rng: () => number
+ ): Float64Array[] {
+   const N = points.length
+   const centroids: Float64Array[] = []
+   const used = new Set<number>()
+   for (let c = 0; c < k; c++) {
+     let idx = Math.floor(rng() * N)
+     for (let attempts = 0; attempts < 4 && used.has(idx); attempts++) {
+       idx = Math.floor(rng() * N)
      }
+     used.add(idx)
+     centroids.push(Float64Array.from(points[idx]!))
    }
+   return centroids
+ }

-   if (chosenKnnResult === null) {
-     chosenKnnResult = knnFromPrecomputed(chosenK)
-   }
+ function miniBatchKmeans(
+   points: Float64Array[],
+   k: number,
+   rng: () => number,
+   opts: { batchSize: number; maxIter: number }
+ ): Int32Array {
+   const N = points.length
+   if (N === 0) return new Int32Array()

-   const { distances, indices } = chosenKnnResult
+   const dim = points[0]!.length
+   const centroids = initRandomCentroids(points, k, rng)
+   const counts = new Int32Array(k)
+   const batchSize = Math.min(opts.batchSize, N)
+
+   for (let iter = 0; iter < opts.maxIter; iter++) {
+     for (let b = 0; b < batchSize; b++) {
+       const idx = Math.floor(rng() * N)
+       const point = points[idx]!
+       const c = nearestCentroid(point, centroids)
+       counts[c] = (counts[c] ?? 0) + 1
+       const centroid = centroids[c]!
+       const eta = 1 / (counts[c] ?? 1)
+       for (let d = 0; d < dim; d++) {
+         centroid[d]! = centroid[d]! + eta * (point[d]! - centroid[d]!)
+       }
+     }
+   }

-   // --- Build affinity matrix (sparse triplets) ---
-   // σ[i] = distance to Kth nearest neighbour
-   const sigmas = distances.map((d) => d[d.length - 1]!)
+   return assignLabels(points, centroids)
+ }

-   const sparseAffinity: Array<{ i: number; j: number; v: number }> = []
+ function splitByProjection(points: Float64Array[], rng: () => number): Int32Array {
+   const N = points.length
+   const labels = new Int32Array(N)
+   if (N <= 1) return labels

+   const dim = points[0]!.length
+   const a = Math.floor(rng() * N)
+   let b = Math.floor(rng() * N)
+   if (b === a) b = (a + 1) % N
+
+   const pa = points[a]!
+   const pb = points[b]!
+   const dir = new Float64Array(dim)
+   for (let d = 0; d < dim; d++) dir[d]! = pa[d]! - pb[d]!
+
+   let min = Infinity
+   let max = -Infinity
+   const proj = new Float64Array(N)
    for (let i = 0; i < N; i++) {
-     for (let n = 0; n < chosenK; n++) {
-       const j = indices[i]![n]!
-       const d = distances[i]![n]!
-       const sigma_i = sigmas[i]!
-       const sigma_j = sigmas[j]!
-       const denom = Math.max(sigma_i * sigma_j, 1e-12)
-       const v = Math.exp(-(d * d) / denom)
-       sparseAffinity.push({ i, j, v })
-     }
+     const p = points[i]!
+     let dot = 0
+     for (let d = 0; d < dim; d++) dot += p[d]! * dir[d]!
+     proj[i] = dot
+     if (dot < min) min = dot
+     if (dot > max) max = dot
    }

-   // --- Sparse normalised Laplacian matvec ---
-   // Avoids building an N×N dense matrix; matvec cost is O(N·k_sparse) vs O(N²).
-   const { matvec: negLMatvec, dd } = buildNormLaplacianSparseMatvec(sparseAffinity, N)
-
-   const k = Math.min(MAX_CLUSTERS + 1, N)
-   const { values: rawValues, vectors } = topKEigenpairs(negLMatvec, N, k)
+   const threshold = (min + max) / 2
+   for (let i = 0; i < N; i++) labels[i] = proj[i]! <= threshold ? 0 : 1

-   // Eigenvalues were of -L; flip sign back to get L eigenvalues
-   const eigenvalues = Float64Array.from(rawValues, (v) => -v)
+   const counts = countLabels(labels, 2)
+   if ((counts[0] ?? 0) === 0 || (counts[1] ?? 0) === 0) {
+     const mid = Math.floor(N / 2)
+     for (let i = 0; i < N; i++) labels[i] = i < mid ? 0 : 1
+   }

-   // Sort by eigenvalue ascending (smallest first), skip index 0
-   const sortedIdx = Array.from({ length: k }, (_, i) => i).sort(
-     (a, b) => eigenvalues[a]! - eigenvalues[b]!
-   )
+   return labels
+ }

-   const sortedEigenvalues = Float64Array.from(sortedIdx, (i) => eigenvalues[i]!)
-   const sortedVectors = sortedIdx.map((i) => vectors[i]!)
+ function chooseBisectLabels(points: Float64Array[], rng: () => number): Int32Array {
+   const N = points.length
+   if (N <= 1) return new Int32Array(N)
+
+   const useMiniBatch = N >= MINI_BATCH_THRESHOLD
+   const retries = useMiniBatch ? MINI_BATCH_RETRIES : KMEANS_RETRIES
+
+   for (let attempt = 0; attempt <= retries; attempt++) {
+     const seed = Math.floor(rng() * 1_000_000_000)
+     const labels = useMiniBatch
+       ? miniBatchKmeans(points, 2, seededRng(seed), {
+           batchSize: MINI_BATCH_SIZE,
+           maxIter: MINI_BATCH_ITERS,
+         })
+       : kmeans(points, 2, KMEANS_MAX_ITER, seed)
+
+     const counts = countLabels(labels, 2)
+     const left = counts[0] ?? 0
+     const right = counts[1] ?? 0
+     if (left > 0 && right > 0) return labels
+   }

-   deterministicSignFlip(sortedVectors)
+   return splitByProjection(points, rng)
+ }

-   // --- Eigengap heuristic (skip λ₀ ≈ 0) ---
-   // n_clusters = argmax(diff(eigenvalues[1:])) + 2
-   let maxGap = -Infinity
-   let nClusters = 2
-   for (let i = 1; i < sortedEigenvalues.length - 1; i++) {
-     const gap = sortedEigenvalues[i + 1]! - sortedEigenvalues[i]!
-     if (gap > maxGap) {
-       maxGap = gap
-       nClusters = i + 1 // 1-indexed + 1 for the off-by-one vs Python
+ function bisectCluster(
+   cluster: ClusterState,
+   rng: () => number
+ ): { left: ClusterState; right: ClusterState } {
+   const { entries, points } = cluster
+   const N = entries.length
+   if (N <= 1) {
+     return {
+       left: { entries, points },
+       right: { entries: [], points: [] },
      }
    }
-   nClusters = Math.max(2, Math.min(nClusters, MAX_CLUSTERS))
-
-   // --- Spectral embeddings: use eigenvectors 1..nClusters (skip 0) ---
-   // Build [N × nClusters] matrix, normalise each row
-   const spectralPoints: Float64Array[] = Array.from({ length: N }, () =>
-     new Float64Array(nClusters)
-   )
-   for (let c = 0; c < nClusters; c++) {
-     const vec = sortedVectors[c + 1] // skip smallest (index 0)
-     if (vec === undefined) break
-     for (let i = 0; i < N; i++) {
-       // Divide by dd[i] (matches Python `wide_spectral_embeddings = eigenvectors.T / dd`)
-       spectralPoints[i]![c] = (vec[i]! / dd[i]!)
+
+   let labels = chooseBisectLabels(points, rng)
+   let leftEntries: EmbedEntry[] = []
+   let rightEntries: EmbedEntry[] = []
+   let leftPoints: Float64Array[] = []
+   let rightPoints: Float64Array[] = []
+
+   for (let i = 0; i < N; i++) {
+     if (labels[i] === 0) {
+       leftEntries.push(entries[i]!)
+       leftPoints.push(points[i]!)
+     } else {
+       rightEntries.push(entries[i]!)
+       rightPoints.push(points[i]!)
      }
    }
-   // L2-normalise each row
-   for (const row of spectralPoints) {
-     let norm = 0
-     for (const v of row) norm += v * v
-     norm = Math.sqrt(norm)
-     if (norm > 1e-12) for (let d = 0; d < row.length; d++) row[d]! /= norm
+
+   if (leftEntries.length === 0 || rightEntries.length === 0) {
+     const mid = Math.floor(N / 2)
+     leftEntries = entries.slice(0, mid)
+     rightEntries = entries.slice(mid)
+     leftPoints = points.slice(0, mid)
+     rightPoints = points.slice(mid)
    }

-   // --- K-means ---
-   const labels = kmeans(spectralPoints, nClusters)
+   return {
+     left: { entries: leftEntries, points: leftPoints },
+     right: { entries: rightEntries, points: rightPoints },
+   }
+ }

-   // Group entries by cluster label, preserving order
-   const groups = new Map<number, EmbedEntry[]>()
-   for (let i = 0; i < N; i++) {
-     const label = labels[i]!
-     if (!groups.has(label)) groups.set(label, [])
-     groups.get(label)!.push(input.entries[i]!)
+ /**
+  * Recursively split a Cluster into sub-clusters using bisecting k-means.
+  * Returns [input] when the cluster is small enough to be a leaf.
+  */
+ export function splitCluster(input: Cluster): Cluster[] {
+   const N = input.entries.length
+
+   if (N <= MAX_LEAVES) return [input]
+
+   const normalized = normaliseRows(matFromEmbeds(input.entries))
+   const rng = seededRng(42)
+
+   const work: ClusterState[] = [{ entries: input.entries, points: normalized }]
+   const leaves: Cluster[] = []
+
+   while (work.length > 0) {
+     const cluster = work.pop()!
+     if (cluster.entries.length <= MAX_LEAVES) {
+       leaves.push({ entries: cluster.entries })
+       continue
+     }
+
+     const { left, right } = bisectCluster(cluster, rng)
+     if (right.entries.length > 0) work.push(right)
+     if (left.entries.length > 0) work.push(left)
    }

-   return Array.from(groups.values()).map((entries) => ({ entries }))
+   return leaves
  }
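The net effect of this hunk is that `splitCluster` no longer builds a k-NN graph and spectral embedding. Instead it repeatedly bisects any cluster larger than `MAX_LEAVES`: plain k-means for small clusters, mini-batch k-means above `MINI_BATCH_THRESHOLD` points, and a random-projection split as a fallback when a bisection leaves one side empty. The snippet below is a self-contained toy illustration of the mini-batch centroid update that `miniBatchKmeans` relies on (step size `1 / count` per centroid); it uses plain number arrays and a fixed batch for clarity and is not the package's actual code.

```ts
// Toy illustration of the mini-batch k-means update: each sampled point pulls
// its nearest centroid toward itself with a shrinking step size of 1 / count.
function nearest(p: number[], centroids: number[][]): number {
  let best = 0
  let bestDist = Infinity
  for (let c = 0; c < centroids.length; c++) {
    let d = 0
    for (let i = 0; i < p.length; i++) d += (p[i] - centroids[c][i]) ** 2
    if (d < bestDist) {
      bestDist = d
      best = c
    }
  }
  return best
}

function miniBatchStep(
  points: number[][],
  centroids: number[][],
  counts: number[],
  batch: number[]
): void {
  for (const idx of batch) {
    const p = points[idx]
    const c = nearest(p, centroids)
    counts[c] += 1
    const eta = 1 / counts[c] // per-centroid learning rate shrinks as it absorbs points
    for (let d = 0; d < p.length; d++) centroids[c][d] += eta * (p[d] - centroids[c][d])
  }
}

// Toy usage: two 2-D blobs around (0, 0) and (10, 10).
const pts = [[0, 0], [1, 0], [0, 1], [10, 10], [9, 10], [10, 9]]
const cents = [[0.5, 0.5], [9.5, 9.5]]
const counts = [0, 0]
miniBatchStep(pts, cents, counts, [0, 3, 1, 4, 2, 5])
console.log(cents) // centroids drift toward the blob means
```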