@et0and/ovid 0.0.4 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +35 -1
- package/package.json +1 -1
- package/src/cluster.ts +196 -115
package/README.md
CHANGED
|
@@ -1 +1,35 @@
|
|
|
1
|
-
# ovid
|
|
1
|
+
# ovid
|
|
2
|
+
|
|
3
|
+
Semantic project navigator for local repos. Builds a labeled tree of files by meaning and renders it in a TUI.
|
|
4
|
+
|
|
5
|
+
Port of the original Python project by [Gabriella Gonzalez](https://github.com/Gabriella439/semantic-navigator).
|
|
6
|
+
|
|
7
|
+
Key differences in this port:
|
|
8
|
+
|
|
9
|
+
- TypeScript + Bun CLI, OpenTUI renderer
|
|
10
|
+
- Local embeddings via `@huggingface/transformers` (`Xenova/all-MiniLM-L6-v2`)
|
|
11
|
+
- Labels generated through GitHub Copilot device flow (no OpenAI API key)
|
|
12
|
+
- Two-layer cache under `~/.cache/semantic-navigator`
|
|
13
|
+
|
|
14
|
+
**What it does**
|
|
15
|
+
|
|
16
|
+
- Reads tracked files (git) or top-level files (non-git)
|
|
17
|
+
- Embeds file chunks locally with `Xenova/all-MiniLM-L6-v2`
|
|
18
|
+
- Clusters files with bisecting k-means
|
|
19
|
+
- Labels clusters and leaves via GitHub Copilot
|
|
20
|
+
- Renders an interactive tree (OpenTUI)
|
|
21
|
+
|
|
22
|
+
**How it works**
|
|
23
|
+
|
|
24
|
+
- Discover files → chunk text → embed → cluster → label → display
|
|
25
|
+
- Embeddings are cached in `~/.cache/semantic-navigator/embeddings.json`
|
|
26
|
+
- Trees are cached in `~/.cache/semantic-navigator/trees/<fingerprint>.json`
|
|
27
|
+
|
|
28
|
+
**Usage**
|
|
29
|
+
|
|
30
|
+
- `bun run src/main.ts [directory]`
|
|
31
|
+
- `bun run src/main.ts --help`
|
|
32
|
+
|
|
33
|
+
**Notes**
|
|
34
|
+
|
|
35
|
+
- First run prompts for GitHub device flow to access Copilot.
|
package/package.json
CHANGED
package/src/cluster.ts
CHANGED
|
@@ -345,150 +345,231 @@ function kmeans(
|
|
|
345
345
|
return labels
|
|
346
346
|
}
|
|
347
347
|
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
348
|
+
const MINI_BATCH_THRESHOLD = 512
|
|
349
|
+
const MINI_BATCH_SIZE = 128
|
|
350
|
+
const MINI_BATCH_ITERS = 120
|
|
351
|
+
const KMEANS_MAX_ITER = 60
|
|
352
|
+
const KMEANS_RETRIES = 2
|
|
353
|
+
const MINI_BATCH_RETRIES = 2
|
|
354
|
+
|
|
355
|
+
interface ClusterState {
|
|
356
|
+
entries: EmbedEntry[]
|
|
357
|
+
points: Float64Array[]
|
|
358
|
+
}
|
|
358
359
|
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
if (j === i) continue
|
|
365
|
-
allDistances[i]!.push([cosDist(normalized[i]!, normalized[j]!), j])
|
|
366
|
-
}
|
|
367
|
-
allDistances[i]!.sort((a, b) => a[0] - b[0])
|
|
360
|
+
function countLabels(labels: Int32Array, k: number): Int32Array {
|
|
361
|
+
const counts = new Int32Array(k)
|
|
362
|
+
for (let i = 0; i < labels.length; i++) {
|
|
363
|
+
const label = labels[i]
|
|
364
|
+
if (label !== undefined) counts[label] = (counts[label] ?? 0) + 1
|
|
368
365
|
}
|
|
366
|
+
return counts
|
|
367
|
+
}
|
|
369
368
|
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
369
|
+
function nearestCentroid(point: Float64Array, centroids: Float64Array[]): number {
|
|
370
|
+
let best = 0
|
|
371
|
+
let bestDist = Infinity
|
|
372
|
+
for (let c = 0; c < centroids.length; c++) {
|
|
373
|
+
const d = distSq(point, centroids[c]!)
|
|
374
|
+
if (d < bestDist) {
|
|
375
|
+
bestDist = d
|
|
376
|
+
best = c
|
|
378
377
|
}
|
|
379
|
-
return { distances, indices }
|
|
380
378
|
}
|
|
379
|
+
return best
|
|
380
|
+
}
|
|
381
381
|
|
|
382
|
-
|
|
383
|
-
const
|
|
384
|
-
for (let
|
|
385
|
-
|
|
386
|
-
if (k >= N) break
|
|
387
|
-
candidateKs.push(k)
|
|
382
|
+
function assignLabels(points: Float64Array[], centroids: Float64Array[]): Int32Array {
|
|
383
|
+
const labels = new Int32Array(points.length)
|
|
384
|
+
for (let i = 0; i < points.length; i++) {
|
|
385
|
+
labels[i] = nearestCentroid(points[i]!, centroids)
|
|
388
386
|
}
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
387
|
+
return labels
|
|
388
|
+
}
|
|
389
|
+
|
|
390
|
+
function initRandomCentroids(
|
|
391
|
+
points: Float64Array[],
|
|
392
|
+
k: number,
|
|
393
|
+
rng: () => number
|
|
394
|
+
): Float64Array[] {
|
|
395
|
+
const N = points.length
|
|
396
|
+
const centroids: Float64Array[] = []
|
|
397
|
+
const used = new Set<number>()
|
|
398
|
+
for (let c = 0; c < k; c++) {
|
|
399
|
+
let idx = Math.floor(rng() * N)
|
|
400
|
+
for (let attempts = 0; attempts < 4 && used.has(idx); attempts++) {
|
|
401
|
+
idx = Math.floor(rng() * N)
|
|
401
402
|
}
|
|
403
|
+
used.add(idx)
|
|
404
|
+
centroids.push(Float64Array.from(points[idx]!))
|
|
402
405
|
}
|
|
406
|
+
return centroids
|
|
407
|
+
}
|
|
403
408
|
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
409
|
+
function miniBatchKmeans(
|
|
410
|
+
points: Float64Array[],
|
|
411
|
+
k: number,
|
|
412
|
+
rng: () => number,
|
|
413
|
+
opts: { batchSize: number; maxIter: number }
|
|
414
|
+
): Int32Array {
|
|
415
|
+
const N = points.length
|
|
416
|
+
if (N === 0) return new Int32Array()
|
|
407
417
|
|
|
408
|
-
const
|
|
418
|
+
const dim = points[0]!.length
|
|
419
|
+
const centroids = initRandomCentroids(points, k, rng)
|
|
420
|
+
const counts = new Int32Array(k)
|
|
421
|
+
const batchSize = Math.min(opts.batchSize, N)
|
|
422
|
+
|
|
423
|
+
for (let iter = 0; iter < opts.maxIter; iter++) {
|
|
424
|
+
for (let b = 0; b < batchSize; b++) {
|
|
425
|
+
const idx = Math.floor(rng() * N)
|
|
426
|
+
const point = points[idx]!
|
|
427
|
+
const c = nearestCentroid(point, centroids)
|
|
428
|
+
counts[c] = (counts[c] ?? 0) + 1
|
|
429
|
+
const centroid = centroids[c]!
|
|
430
|
+
const eta = 1 / (counts[c] ?? 1)
|
|
431
|
+
for (let d = 0; d < dim; d++) {
|
|
432
|
+
centroid[d]! = centroid[d]! + eta * (point[d]! - centroid[d]!)
|
|
433
|
+
}
|
|
434
|
+
}
|
|
435
|
+
}
|
|
409
436
|
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
const sigmas = distances.map((d) => d[d.length - 1]!)
|
|
437
|
+
return assignLabels(points, centroids)
|
|
438
|
+
}
|
|
413
439
|
|
|
414
|
-
|
|
440
|
+
function splitByProjection(points: Float64Array[], rng: () => number): Int32Array {
|
|
441
|
+
const N = points.length
|
|
442
|
+
const labels = new Int32Array(N)
|
|
443
|
+
if (N <= 1) return labels
|
|
415
444
|
|
|
445
|
+
const dim = points[0]!.length
|
|
446
|
+
const a = Math.floor(rng() * N)
|
|
447
|
+
let b = Math.floor(rng() * N)
|
|
448
|
+
if (b === a) b = (a + 1) % N
|
|
449
|
+
|
|
450
|
+
const pa = points[a]!
|
|
451
|
+
const pb = points[b]!
|
|
452
|
+
const dir = new Float64Array(dim)
|
|
453
|
+
for (let d = 0; d < dim; d++) dir[d]! = pa[d]! - pb[d]!
|
|
454
|
+
|
|
455
|
+
let min = Infinity
|
|
456
|
+
let max = -Infinity
|
|
457
|
+
const proj = new Float64Array(N)
|
|
416
458
|
for (let i = 0; i < N; i++) {
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
const v = Math.exp(-(d * d) / denom)
|
|
424
|
-
sparseAffinity.push({ i, j, v })
|
|
425
|
-
}
|
|
459
|
+
const p = points[i]!
|
|
460
|
+
let dot = 0
|
|
461
|
+
for (let d = 0; d < dim; d++) dot += p[d]! * dir[d]!
|
|
462
|
+
proj[i] = dot
|
|
463
|
+
if (dot < min) min = dot
|
|
464
|
+
if (dot > max) max = dot
|
|
426
465
|
}
|
|
427
466
|
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
const { matvec: negLMatvec, dd } = buildNormLaplacianSparseMatvec(sparseAffinity, N)
|
|
431
|
-
|
|
432
|
-
const k = Math.min(MAX_CLUSTERS + 1, N)
|
|
433
|
-
const { values: rawValues, vectors } = topKEigenpairs(negLMatvec, N, k)
|
|
467
|
+
const threshold = (min + max) / 2
|
|
468
|
+
for (let i = 0; i < N; i++) labels[i] = proj[i]! <= threshold ? 0 : 1
|
|
434
469
|
|
|
435
|
-
|
|
436
|
-
|
|
470
|
+
const counts = countLabels(labels, 2)
|
|
471
|
+
if ((counts[0] ?? 0) === 0 || (counts[1] ?? 0) === 0) {
|
|
472
|
+
const mid = Math.floor(N / 2)
|
|
473
|
+
for (let i = 0; i < N; i++) labels[i] = i < mid ? 0 : 1
|
|
474
|
+
}
|
|
437
475
|
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
(a, b) => eigenvalues[a]! - eigenvalues[b]!
|
|
441
|
-
)
|
|
476
|
+
return labels
|
|
477
|
+
}
|
|
442
478
|
|
|
443
|
-
|
|
444
|
-
const
|
|
479
|
+
function chooseBisectLabels(points: Float64Array[], rng: () => number): Int32Array {
|
|
480
|
+
const N = points.length
|
|
481
|
+
if (N <= 1) return new Int32Array(N)
|
|
482
|
+
|
|
483
|
+
const useMiniBatch = N >= MINI_BATCH_THRESHOLD
|
|
484
|
+
const retries = useMiniBatch ? MINI_BATCH_RETRIES : KMEANS_RETRIES
|
|
485
|
+
|
|
486
|
+
for (let attempt = 0; attempt <= retries; attempt++) {
|
|
487
|
+
const seed = Math.floor(rng() * 1_000_000_000)
|
|
488
|
+
const labels = useMiniBatch
|
|
489
|
+
? miniBatchKmeans(points, 2, seededRng(seed), {
|
|
490
|
+
batchSize: MINI_BATCH_SIZE,
|
|
491
|
+
maxIter: MINI_BATCH_ITERS,
|
|
492
|
+
})
|
|
493
|
+
: kmeans(points, 2, KMEANS_MAX_ITER, seed)
|
|
494
|
+
|
|
495
|
+
const counts = countLabels(labels, 2)
|
|
496
|
+
const left = counts[0] ?? 0
|
|
497
|
+
const right = counts[1] ?? 0
|
|
498
|
+
if (left > 0 && right > 0) return labels
|
|
499
|
+
}
|
|
445
500
|
|
|
446
|
-
|
|
501
|
+
return splitByProjection(points, rng)
|
|
502
|
+
}
|
|
447
503
|
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
504
|
+
function bisectCluster(
|
|
505
|
+
cluster: ClusterState,
|
|
506
|
+
rng: () => number
|
|
507
|
+
): { left: ClusterState; right: ClusterState } {
|
|
508
|
+
const { entries, points } = cluster
|
|
509
|
+
const N = entries.length
|
|
510
|
+
if (N <= 1) {
|
|
511
|
+
return {
|
|
512
|
+
left: { entries, points },
|
|
513
|
+
right: { entries: [], points: [] },
|
|
457
514
|
}
|
|
458
515
|
}
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
for (let
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
516
|
+
|
|
517
|
+
let labels = chooseBisectLabels(points, rng)
|
|
518
|
+
let leftEntries: EmbedEntry[] = []
|
|
519
|
+
let rightEntries: EmbedEntry[] = []
|
|
520
|
+
let leftPoints: Float64Array[] = []
|
|
521
|
+
let rightPoints: Float64Array[] = []
|
|
522
|
+
|
|
523
|
+
for (let i = 0; i < N; i++) {
|
|
524
|
+
if (labels[i] === 0) {
|
|
525
|
+
leftEntries.push(entries[i]!)
|
|
526
|
+
leftPoints.push(points[i]!)
|
|
527
|
+
} else {
|
|
528
|
+
rightEntries.push(entries[i]!)
|
|
529
|
+
rightPoints.push(points[i]!)
|
|
472
530
|
}
|
|
473
531
|
}
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
532
|
+
|
|
533
|
+
if (leftEntries.length === 0 || rightEntries.length === 0) {
|
|
534
|
+
const mid = Math.floor(N / 2)
|
|
535
|
+
leftEntries = entries.slice(0, mid)
|
|
536
|
+
rightEntries = entries.slice(mid)
|
|
537
|
+
leftPoints = points.slice(0, mid)
|
|
538
|
+
rightPoints = points.slice(mid)
|
|
480
539
|
}
|
|
481
540
|
|
|
482
|
-
|
|
483
|
-
|
|
541
|
+
return {
|
|
542
|
+
left: { entries: leftEntries, points: leftPoints },
|
|
543
|
+
right: { entries: rightEntries, points: rightPoints },
|
|
544
|
+
}
|
|
545
|
+
}
|
|
484
546
|
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
547
|
+
/**
|
|
548
|
+
* Recursively split a Cluster into sub-clusters using bisecting k-means.
|
|
549
|
+
* Returns [input] when the cluster is small enough to be a leaf.
|
|
550
|
+
*/
|
|
551
|
+
export function splitCluster(input: Cluster): Cluster[] {
|
|
552
|
+
const N = input.entries.length
|
|
553
|
+
|
|
554
|
+
if (N <= MAX_LEAVES) return [input]
|
|
555
|
+
|
|
556
|
+
const normalized = normaliseRows(matFromEmbeds(input.entries))
|
|
557
|
+
const rng = seededRng(42)
|
|
558
|
+
|
|
559
|
+
const work: ClusterState[] = [{ entries: input.entries, points: normalized }]
|
|
560
|
+
const leaves: Cluster[] = []
|
|
561
|
+
|
|
562
|
+
while (work.length > 0) {
|
|
563
|
+
const cluster = work.pop()!
|
|
564
|
+
if (cluster.entries.length <= MAX_LEAVES) {
|
|
565
|
+
leaves.push({ entries: cluster.entries })
|
|
566
|
+
continue
|
|
567
|
+
}
|
|
568
|
+
|
|
569
|
+
const { left, right } = bisectCluster(cluster, rng)
|
|
570
|
+
if (right.entries.length > 0) work.push(right)
|
|
571
|
+
if (left.entries.length > 0) work.push(left)
|
|
491
572
|
}
|
|
492
573
|
|
|
493
|
-
return
|
|
574
|
+
return leaves
|
|
494
575
|
}
|