@et0and/ovid 0.0.3 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -4
- package/src/cache.ts +119 -0
- package/src/cluster.ts +247 -159
- package/src/embed.ts +50 -30
- package/src/main.ts +36 -3
package/package.json
CHANGED
package/src/cache.ts
ADDED
@@ -0,0 +1,119 @@
+/**
+ * Two-layer result cache for semantic-navigator.
+ *
+ * Layer 1 — Embedding cache (~/.cache/semantic-navigator/embeddings.json)
+ * Content-addressed: maps sha256(text)[0:16] → number[]
+ * Per-entry granularity: only re-embed chunks whose text changed.
+ *
+ * Layer 2 — Tree cache (~/.cache/semantic-navigator/trees/<fingerprint>.json)
+ * Keyed by sha256(model + sorted(path:contentHash pairs)).
+ * A single changed file invalidates the whole tree, forcing a fresh
+ * cluster + label run, but embeddings are still reused from layer 1.
+ */
+
+import { createHash } from "node:crypto"
+import { mkdirSync, existsSync, readFileSync, writeFileSync } from "node:fs"
+import { join } from "node:path"
+import { homedir } from "node:os"
+import type { Tree } from "./tree.ts"
+
+// ---------------------------------------------------------------------------
+// Paths
+// ---------------------------------------------------------------------------
+
+const CACHE_DIR = join(homedir(), ".cache", "semantic-navigator")
+const EMBED_CACHE_PATH = join(CACHE_DIR, "embeddings.json")
+const TREES_DIR = join(CACHE_DIR, "trees")
+
+function ensureDirs(): void {
+  mkdirSync(CACHE_DIR, { recursive: true })
+  mkdirSync(TREES_DIR, { recursive: true })
+}
+
+// ---------------------------------------------------------------------------
+// Helpers
+// ---------------------------------------------------------------------------
+
+/** Short hex digest: sha256(text).slice(0, 16) */
+export function textHash(text: string): string {
+  return createHash("sha256").update(text).digest("hex").slice(0, 16)
+}
+
+/**
+ * Fingerprint for the tree cache: sha256(model + sorted path:contentHash pairs).
+ * `fileHashes` is a map from relative path to sha256(file content).
+ */
+export function treeFingerprint(model: string, fileHashes: Map<string, string>): string {
+  const entries = Array.from(fileHashes.entries())
+    .map(([p, h]) => `${p}:${h}`)
+    .sort()
+    .join("\n")
+  return createHash("sha256").update(model + "\n" + entries).digest("hex")
+}
+
+// ---------------------------------------------------------------------------
+// Layer 1: Embedding cache
+// ---------------------------------------------------------------------------
+
+type EmbedCacheMap = Record<string, number[]>
+
+let _embedCache: EmbedCacheMap | null = null
+
+function loadEmbedCache(): EmbedCacheMap {
+  if (_embedCache !== null) return _embedCache
+  if (existsSync(EMBED_CACHE_PATH)) {
+    try {
+      _embedCache = JSON.parse(readFileSync(EMBED_CACHE_PATH, "utf-8")) as EmbedCacheMap
+    } catch {
+      _embedCache = {}
+    }
+  } else {
+    _embedCache = {}
+  }
+  return _embedCache
+}
+
+/** Look up a cached embedding by text content. Returns null on miss. */
+export function getCachedEmbedding(text: string): Float32Array | null {
+  const cache = loadEmbedCache()
+  const key = textHash(text)
+  const vec = cache[key]
+  if (vec === undefined) return null
+  return Float32Array.from(vec)
+}
+
+/** Store an embedding in the in-memory cache (call flushEmbedCache to persist). */
+export function setCachedEmbedding(text: string, embedding: Float32Array): void {
+  const cache = loadEmbedCache()
+  const key = textHash(text)
+  cache[key] = Array.from(embedding)
+}
+
+/** Persist the in-memory embedding cache to disk. */
+export function flushEmbedCache(): void {
+  if (_embedCache === null) return
+  ensureDirs()
+  writeFileSync(EMBED_CACHE_PATH, JSON.stringify(_embedCache), "utf-8")
+}
+
+// ---------------------------------------------------------------------------
+// Layer 2: Tree cache
+// ---------------------------------------------------------------------------
+
+/** Look up a cached Tree by fingerprint. Returns null on miss. */
+export function getCachedTree(fingerprint: string): Tree | null {
+  const treePath = join(TREES_DIR, `${fingerprint}.json`)
+  if (!existsSync(treePath)) return null
+  try {
+    return JSON.parse(readFileSync(treePath, "utf-8")) as Tree
+  } catch {
+    return null
+  }
+}
+
+/** Persist a Tree to the tree cache. */
+export function setCachedTree(fingerprint: string, tree: Tree): void {
+  ensureDirs()
+  const treePath = join(TREES_DIR, `${fingerprint}.json`)
+  writeFileSync(treePath, JSON.stringify(tree), "utf-8")
+}
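Taken together, the exported surface of the new module composes into a simple warm-run protocol. A minimal usage sketch (hypothetical call site — the chunk text, file hash, and model name are invented; only the imported signatures come from the file above):

import {
  textHash,
  treeFingerprint,
  getCachedEmbedding,
  setCachedEmbedding,
  flushEmbedCache,
  getCachedTree,
  setCachedTree,
} from "./cache.ts"

// Layer 1: keys are sha256(chunk text), so hits survive file moves/renames.
const text = "export const answer = 42"
let vec = getCachedEmbedding(text)
if (vec === null) {
  vec = Float32Array.from([0.1, 0.2, 0.3]) // stand-in for a real model call
  setCachedEmbedding(text, vec)            // in-memory only...
  flushEmbedCache()                        // ...until flushed once at the end
}

// Layer 2: the key covers the model plus every path:contentHash pair,
// so editing any single file misses the whole tree.
const fileHashes = new Map([["src/a.ts", textHash("file content")]])
const fp = treeFingerprint("some-model", fileHashes)
const cachedTree = getCachedTree(fp) // null on miss → rebuild, then setCachedTree(fp, tree)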
package/src/cluster.ts
CHANGED
@@ -95,19 +95,24 @@ function connectedComponents(indices: Int32Array[], N: number): number {
 }
 
 /**
- * Build the
- * a
- *
+ * Build the degree diagonal `dd` from sparse affinity triplets, then return
+ * a sparse matvec closure for the *negated* normalised Laplacian (-L_norm).
+ *
+ * L_norm = I - D^{-1/2} A D^{-1/2}
+ * -L_norm = D^{-1/2} A D^{-1/2} - I
+ *
+ * The matvec avoids allocating an N×N dense matrix — at N=2000, k_sparse≈7,
+ * this reduces the cost per multiply from O(N²)=4M to O(N·k)=14k ops.
  */
-function
+function buildNormLaplacianSparseMatvec(
   sparseAffinity: Array<{ i: number; j: number; v: number }>,
   N: number
-): {
+): { matvec: (v: Float64Array) => Float64Array; dd: Float64Array } {
   // Accumulate row sums (degree) for normalisation
   const degree = new Float64Array(N)
   for (const { i, j, v } of sparseAffinity) {
-    degree[i]
-    if (i !== j) degree[j]
+    degree[i]! += v
+    if (i !== j) degree[j]! += v
   }
 
   const dd = new Float64Array(N)
@@ -115,31 +120,29 @@ function buildNormalisedLaplacian(
     dd[i] = degree[i]! > 1e-12 ? 1 / Math.sqrt(degree[i]!) : 0
   }
 
-  //
-  //
-  const
-
-
-
-  })
-
-  //
-
-
-  const
-
-
-
-
+  // Pre-compute normalised weights once so the closure stays cheap.
+  // w_ij = v * dd[i] * dd[j] (the off-diagonal contribution to A_norm)
+  const normAffinity = sparseAffinity.map(({ i, j, v }) => ({
+    i,
+    j,
+    w: v * dd[i]! * dd[j]!,
+  }))
+
+  // Matvec for -L_norm = A_norm - I
+  // result[i] = -v[i] + sum_j w_ij * v[j] (using symmetry)
+  const matvecFn = (vec: Float64Array): Float64Array => {
+    const out = new Float64Array(N)
+    // Start from -I · vec
+    for (let i = 0; i < N; i++) out[i] = -vec[i]!
+    // Add symmetric A_norm contributions
+    for (const { i, j, w } of normAffinity) {
+      out[i]! += w * vec[j]!
+      if (i !== j) out[j]! += w * vec[i]!
    }
+    return out
  }
 
-
-  for (let i = 0; i < N; i++) {
-    L[i]![i] = 1
-  }
-
-  return { L, dd }
+  return { matvec: matvecFn, dd }
 }
 
 /**
@@ -171,35 +174,24 @@ function normaliseVec(v: Float64Array): number {
   return n
 }
 
-/** Multiply matrix M by vector v */
-function matvec(M: Matrix, v: Float64Array): Float64Array<ArrayBuffer> {
-  const N = M.length
-  const out = new Float64Array(N) as Float64Array<ArrayBuffer>
-  for (let i = 0; i < N; i++) {
-    out[i] = dot(M[i]!, v)
-  }
-  return out
-}
-
 /**
  * Randomised power-iteration with deflation to extract the `k` eigenpairs
- * corresponding to the *smallest* eigenvalues of a symmetric matrix
+ * corresponding to the *smallest* eigenvalues of a symmetric matrix.
  *
- *
- *
- *
- *
- *
- * to 1 above). We iterate on M = -L and take the top-k eigenvectors, then
- * negate the eigenvalues back.
+ * Instead of a dense matrix, accepts a sparse `matvecFn` closure so that the
+ * per-iteration cost is O(N·k_sparse) rather than O(N²). The closure should
+ * implement multiplication by the *negated* Laplacian (-L_norm), whose top
+ * eigenvalues correspond to L_norm's bottom ones (matching the Python code
+ * which does `laplacian *= -1`).
  */
 function topKEigenpairs(
-
+  matvecFn: (v: Float64Array) => Float64Array,
+  n: number,
   k: number,
   maxIter = 300,
   tol = 1e-6
 ): { values: Float64Array<ArrayBuffer>; vectors: Float64Array<ArrayBuffer>[] } {
-  const N =
+  const N = n
   const rng = seededRng(42)
 
   const vectors: Float64Array<ArrayBuffer>[] = []
@@ -207,7 +199,7 @@ function topKEigenpairs(
 
   for (let idx = 0; idx < k; idx++) {
     // Random start
-    let v = Float64Array.from({ length: N }, () => rng() * 2 - 1)
+    let v = Float64Array.from({ length: N }, () => rng() * 2 - 1) as Float64Array<ArrayBuffer>
     normaliseVec(v)
 
     // Deflate against already-found vectors
@@ -216,7 +208,7 @@ function topKEigenpairs(
 
     let lambda = 0
     for (let iter = 0; iter < maxIter; iter++) {
-      const Mv =
+      const Mv = matvecFn(v) as Float64Array<ArrayBuffer>
 
       // Deflate
      for (const u of vectors) subtractProjection(Mv, u)
@@ -353,135 +345,231 @@ function kmeans(
   return labels
 }
 
-
-
-
-
-
-
-
-  if (N <= MAX_LEAVES) return [input]
+const MINI_BATCH_THRESHOLD = 512
+const MINI_BATCH_SIZE = 128
+const MINI_BATCH_ITERS = 120
+const KMEANS_MAX_ITER = 60
+const KMEANS_RETRIES = 2
+const MINI_BATCH_RETRIES = 2
 
-
+interface ClusterState {
+  entries: EmbedEntry[]
+  points: Float64Array[]
+}
 
-
-  const
-  for (let
-    const
-    if (
-      candidateKs.push(k)
+function countLabels(labels: Int32Array, k: number): Int32Array {
+  const counts = new Int32Array(k)
+  for (let i = 0; i < labels.length; i++) {
+    const label = labels[i]
+    if (label !== undefined) counts[label] = (counts[label] ?? 0) + 1
   }
-
-
-
-
-
-
-
-  const
-  if (
-
-
-    break
+  return counts
+}
+
+function nearestCentroid(point: Float64Array, centroids: Float64Array[]): number {
+  let best = 0
+  let bestDist = Infinity
+  for (let c = 0; c < centroids.length; c++) {
+    const d = distSq(point, centroids[c]!)
+    if (d < bestDist) {
+      bestDist = d
+      best = c
    }
  }
+  return best
+}
 
-
-
-
+function assignLabels(points: Float64Array[], centroids: Float64Array[]): Int32Array {
+  const labels = new Int32Array(points.length)
+  for (let i = 0; i < points.length; i++) {
+    labels[i] = nearestCentroid(points[i]!, centroids)
  }
+  return labels
+}
 
-
+function initRandomCentroids(
+  points: Float64Array[],
+  k: number,
+  rng: () => number
+): Float64Array[] {
+  const N = points.length
+  const centroids: Float64Array[] = []
+  const used = new Set<number>()
+  for (let c = 0; c < k; c++) {
+    let idx = Math.floor(rng() * N)
+    for (let attempts = 0; attempts < 4 && used.has(idx); attempts++) {
+      idx = Math.floor(rng() * N)
+    }
+    used.add(idx)
+    centroids.push(Float64Array.from(points[idx]!))
+  }
+  return centroids
+}
+
+function miniBatchKmeans(
+  points: Float64Array[],
+  k: number,
+  rng: () => number,
+  opts: { batchSize: number; maxIter: number }
+): Int32Array {
+  const N = points.length
+  if (N === 0) return new Int32Array()
+
+  const dim = points[0]!.length
+  const centroids = initRandomCentroids(points, k, rng)
+  const counts = new Int32Array(k)
+  const batchSize = Math.min(opts.batchSize, N)
+
+  for (let iter = 0; iter < opts.maxIter; iter++) {
+    for (let b = 0; b < batchSize; b++) {
+      const idx = Math.floor(rng() * N)
+      const point = points[idx]!
+      const c = nearestCentroid(point, centroids)
+      counts[c] = (counts[c] ?? 0) + 1
+      const centroid = centroids[c]!
+      const eta = 1 / (counts[c] ?? 1)
+      for (let d = 0; d < dim; d++) {
+        centroid[d]! = centroid[d]! + eta * (point[d]! - centroid[d]!)
+      }
+    }
+  }
 
-
-
-  const sigmas = distances.map((d) => d[d.length - 1]!)
+  return assignLabels(points, centroids)
+}
 
-
+function splitByProjection(points: Float64Array[], rng: () => number): Int32Array {
+  const N = points.length
+  const labels = new Int32Array(N)
+  if (N <= 1) return labels
 
+  const dim = points[0]!.length
+  const a = Math.floor(rng() * N)
+  let b = Math.floor(rng() * N)
+  if (b === a) b = (a + 1) % N
+
+  const pa = points[a]!
+  const pb = points[b]!
+  const dir = new Float64Array(dim)
+  for (let d = 0; d < dim; d++) dir[d]! = pa[d]! - pb[d]!
+
+  let min = Infinity
+  let max = -Infinity
+  const proj = new Float64Array(N)
   for (let i = 0; i < N; i++) {
-
-
-
-
-
-
-      const v = Math.exp(-(d * d) / denom)
-      sparseAffinity.push({ i, j, v })
-    }
+    const p = points[i]!
+    let dot = 0
+    for (let d = 0; d < dim; d++) dot += p[d]! * dir[d]!
+    proj[i] = dot
+    if (dot < min) min = dot
+    if (dot > max) max = dot
  }
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+  const threshold = (min + max) / 2
+  for (let i = 0; i < N; i++) labels[i] = proj[i]! <= threshold ? 0 : 1
+
+  const counts = countLabels(labels, 2)
+  if ((counts[0] ?? 0) === 0 || (counts[1] ?? 0) === 0) {
+    const mid = Math.floor(N / 2)
+    for (let i = 0; i < N; i++) labels[i] = i < mid ? 0 : 1
+  }
+
+  return labels
+}
+
+function chooseBisectLabels(points: Float64Array[], rng: () => number): Int32Array {
+  const N = points.length
+  if (N <= 1) return new Int32Array(N)
+
+  const useMiniBatch = N >= MINI_BATCH_THRESHOLD
+  const retries = useMiniBatch ? MINI_BATCH_RETRIES : KMEANS_RETRIES
+
+  for (let attempt = 0; attempt <= retries; attempt++) {
+    const seed = Math.floor(rng() * 1_000_000_000)
+    const labels = useMiniBatch
+      ? miniBatchKmeans(points, 2, seededRng(seed), {
+          batchSize: MINI_BATCH_SIZE,
+          maxIter: MINI_BATCH_ITERS,
+        })
+      : kmeans(points, 2, KMEANS_MAX_ITER, seed)
+
+    const counts = countLabels(labels, 2)
+    const left = counts[0] ?? 0
+    const right = counts[1] ?? 0
+    if (left > 0 && right > 0) return labels
+  }
+
+  return splitByProjection(points, rng)
+}
+
+function bisectCluster(
+  cluster: ClusterState,
+  rng: () => number
+): { left: ClusterState; right: ClusterState } {
+  const { entries, points } = cluster
+  const N = entries.length
+  if (N <= 1) {
+    return {
+      left: { entries, points },
+      right: { entries: [], points: [] },
    }
  }
-
-
-
-
-
-
-
-  for (let
-
-
-
-
-
+
+  let labels = chooseBisectLabels(points, rng)
+  let leftEntries: EmbedEntry[] = []
+  let rightEntries: EmbedEntry[] = []
+  let leftPoints: Float64Array[] = []
+  let rightPoints: Float64Array[] = []
+
+  for (let i = 0; i < N; i++) {
+    if (labels[i] === 0) {
+      leftEntries.push(entries[i]!)
+      leftPoints.push(points[i]!)
+    } else {
+      rightEntries.push(entries[i]!)
+      rightPoints.push(points[i]!)
    }
  }
-
-
-
-
-
-
+
+  if (leftEntries.length === 0 || rightEntries.length === 0) {
+    const mid = Math.floor(N / 2)
+    leftEntries = entries.slice(0, mid)
+    rightEntries = entries.slice(mid)
+    leftPoints = points.slice(0, mid)
+    rightPoints = points.slice(mid)
  }
 
-
-
+  return {
+    left: { entries: leftEntries, points: leftPoints },
+    right: { entries: rightEntries, points: rightPoints },
+  }
+}
 
-
-
-
-
-
-
+/**
+ * Recursively split a Cluster into sub-clusters using bisecting k-means.
+ * Returns [input] when the cluster is small enough to be a leaf.
+ */
+export function splitCluster(input: Cluster): Cluster[] {
+  const N = input.entries.length
+
+  if (N <= MAX_LEAVES) return [input]
+
+  const normalized = normaliseRows(matFromEmbeds(input.entries))
+  const rng = seededRng(42)
+
+  const work: ClusterState[] = [{ entries: input.entries, points: normalized }]
+  const leaves: Cluster[] = []
+
+  while (work.length > 0) {
+    const cluster = work.pop()!
+    if (cluster.entries.length <= MAX_LEAVES) {
+      leaves.push({ entries: cluster.entries })
+      continue
+    }
+
+    const { left, right } = bisectCluster(cluster, rng)
+    if (right.entries.length > 0) work.push(right)
+    if (left.entries.length > 0) work.push(left)
  }
 
-  return
+  return leaves
 }
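The spectral path above now threads a closure instead of a matrix. A sketch of how the two changed functions presumably compose inside the module (toy affinity triplets; the call shapes follow the signatures visible in this diff):

// Sparse symmetric affinity for a tiny 3-node graph, as upper-triangular
// triplets { i, j, v }.
const sparseAffinity = [
  { i: 0, j: 1, v: 0.9 },
  { i: 1, j: 2, v: 0.4 },
]
const N = 3

// One O(nnz) closure replaces the old dense N×N normalised Laplacian;
// `dd` holds the D^{-1/2} diagonal.
const { matvec, dd } = buildNormLaplacianSparseMatvec(sparseAffinity, N)

// Power iteration only ever calls matvec, so -L_norm is never materialised.
// Its top-k eigenpairs are L_norm's bottom-k; negate `values` to map back.
const { values, vectors } = topKEigenpairs(matvec, N, 2)

On the k-means side, the new chooseBisectLabels degrades in stages: mini-batch k-means for clusters of 512+ points, full k-means below that, and the deterministic splitByProjection fallback if every retried attempt leaves one side empty.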
package/src/embed.ts
CHANGED
@@ -1,5 +1,6 @@
 import { pipeline, type FeatureExtractionPipeline } from "@huggingface/transformers"
 import type { Chunk } from "./tokenize.ts"
+import { getCachedEmbedding, setCachedEmbedding, flushEmbedCache } from "./cache.ts"
 
 export const DEFAULT_EMBEDDING_MODEL = "Xenova/all-MiniLM-L6-v2"
 
@@ -19,6 +20,8 @@ export interface EmbedOptions {
   model: string
   batchSize: number
   concurrency: number
+  /** When true, skip reading from cache (but still write to it). */
+  noCache?: boolean
 }
 
 let _pipe: FeatureExtractionPipeline | null = null
@@ -81,6 +84,7 @@ async function embedBatch(
 
 /**
  * Embed all chunks using the local model, with batching + concurrency limits.
+ * Chunks whose text is already in the embedding cache are skipped.
  * Calls `onProgress(done, total)` after each batch completes.
  */
 export async function embedChunks(
@@ -90,44 +94,60 @@ export async function embedChunks(
 ): Promise<EmbedEntry[]> {
   if (chunks.length === 0) return []
 
-
-
-  const
-
-
+  // --- Cache pass: resolve hits immediately, collect misses for the model ---
+  const entries: EmbedEntry[] = new Array(chunks.length)
+  const missIndices: number[] = []
+
+  if (!opts.noCache) {
+    for (let i = 0; i < chunks.length; i++) {
+      const cached = getCachedEmbedding(chunks[i]!.text)
+      if (cached !== null) {
+        entries[i] = { path: chunks[i]!.path, text: chunks[i]!.text, embedding: cached }
+      } else {
+        missIndices.push(i)
+      }
+    }
+  } else {
+    for (let i = 0; i < chunks.length; i++) missIndices.push(i)
  }
 
-
-
-  let done = 0
+  let done = chunks.length - missIndices.length
+  onProgress?.(done, chunks.length)
 
-
-  const
-  const
+  if (missIndices.length > 0) {
+    const pipe = await getEmbedPipeline(opts.model)
+    const missChunks = missIndices.map((i) => chunks[i]!)
 
-  const
-
-
+    const batches: number[][] = []
+    for (let i = 0; i < missChunks.length; i += opts.batchSize) {
+      batches.push(missIndices.slice(i, i + opts.batchSize))
+    }
+
+    for (let i = 0; i < batches.length; i += opts.concurrency) {
+      const concurrentBatches = batches.slice(i, i + opts.concurrency)
+
+      const batchResults = await Promise.all(
+        concurrentBatches.map((idxBatch) =>
+          embedBatch(pipe, idxBatch.map((ci) => chunks[ci]!.text))
+        )
      )
-
-
-
-
-
-
-
-
-
-        text: chunk.text,
-        embedding: embeddings[j]!,
+
+      for (let b = 0; b < concurrentBatches.length; b++) {
+        const idxBatch = concurrentBatches[b]!
+        const embeddings = batchResults[b]!
+        for (let j = 0; j < idxBatch.length; j++) {
+          const ci = idxBatch[j]!
+          const chunk = chunks[ci]!
+          const embedding = embeddings[j]!
+          entries[ci] = { path: chunk.path, text: chunk.text, embedding }
+          setCachedEmbedding(chunk.text, embedding)
        }
-
+        done += idxBatch.length
+        onProgress?.(done, chunks.length)
      }
-      chunkIndex += batch.length
-      done += batch.length
-      onProgress?.(done, chunks.length)
    }
+
+    flushEmbedCache()
  }
 
  return entries
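The effect of the cache pass is easiest to see at a call site. A hypothetical second run over unchanged chunks (chunk values invented; the options object and the `(done, total)` callback match the shapes used above, though the exact parameter order of embedChunks is not fully visible in this diff):

const chunks = [
  { path: "src/a.ts", text: "export const a = 1" },
  { path: "src/b.ts", text: "export const b = 2" },
]

const entries = await embedChunks(
  chunks,
  { model: DEFAULT_EMBEDDING_MODEL, batchSize: 32, concurrency: 2, noCache: false },
  (done, total) => console.log(`embedding ${done}/${total}`)
)
// Run 1: both chunks miss → logs 0/2, then 2/2 after the model batch finishes.
// Run 2: both chunks hit layer 1 → logs 2/2 immediately and the embedding
//        pipeline is never even loaded.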
package/src/main.ts
CHANGED
@@ -15,6 +15,7 @@
 
 import { Command } from "commander"
 import path from "node:path"
+import { createHash } from "node:crypto"
 
 import { discoverFiles, DEFAULT_EXCLUDES, type FsOptions } from "./fs.ts"
 import { chunkFile } from "./tokenize.ts"
@@ -24,6 +25,7 @@ import { buildTree } from "./tree.ts"
 import { clearAuthCache, getCopilotToken } from "./auth.ts"
 import { SemanticNavigatorUI, type ProgressState } from "./ui.ts"
 import type { CopilotConfig } from "./labels.ts"
+import { treeFingerprint, getCachedTree, setCachedTree } from "./cache.ts"
 
 // ---------------------------------------------------------------------------
 // CLI definition
@@ -40,6 +42,7 @@ const program = new Command()
   .option("--read-concurrency <n>", "Concurrent file reads", (v) => parseInt(v, 10), 64)
   .option("--embed-batch-size <n>", "Chunks per embedding batch", (v) => parseInt(v, 10), 32)
   .option("--embed-concurrency <n>", "Concurrent embedding batches", (v) => parseInt(v, 10), 2)
+  .option("--no-cache", "Skip reading from cache; force re-embed and re-label")
   .option("--logout", "Clear cached GitHub / Copilot credentials and exit")
   .helpOption("-h, --help", "Show help")
 
@@ -60,6 +63,7 @@ async function main(): Promise<void> {
     readConcurrency: number
     embedBatchSize: number
     embedConcurrency: number
+    cache: boolean
     logout: boolean | undefined
   }>()
 
@@ -167,6 +171,7 @@ async function main(): Promise<void> {
     model: DEFAULT_EMBEDDING_MODEL,
     batchSize: opts.embedBatchSize,
     concurrency: opts.embedConcurrency,
+    noCache: !opts.cache,
   }
 
   let embedEntriesRaw: EmbedEntry[] | undefined
@@ -184,7 +189,32 @@ async function main(): Promise<void> {
   }
 
   // ---------------------------------------------------------------------------
-  // Step 5:
+  // Step 5: Compute tree fingerprint and check tree cache
+  // ---------------------------------------------------------------------------
+
+  // Build a map of relativePath → sha256(content) for all discovered files.
+  const fileHashes = new Map<string, string>()
+  for (const f of resolvedFiles) {
+    fileHashes.set(
+      f.relativePath,
+      createHash("sha256").update(f.content).digest("hex").slice(0, 16)
+    )
+  }
+
+  const fingerprint = treeFingerprint(opts.completionModel, fileHashes)
+  const noCache = !opts.cache
+
+  if (!noCache) {
+    const cached = getCachedTree(fingerprint)
+    if (cached !== null) {
+      ui.setTree(cached)
+      // The UI event loop keeps the process alive until the user presses q/Esc.
+      return
+    }
+  }
+
+  // ---------------------------------------------------------------------------
+  // Step 6: Spectral clustering (CPU-bound, synchronous)
   // ---------------------------------------------------------------------------
 
   ui.updateProgress({
@@ -201,7 +231,7 @@ async function main(): Promise<void> {
   await Bun.sleep(0)
 
   // ---------------------------------------------------------------------------
-  // Step
+  // Step 7: Build labelled tree
   // ---------------------------------------------------------------------------
 
   ui.updateProgress({
@@ -223,8 +253,11 @@ async function main(): Promise<void> {
   }
   const tree = treeRaw!
 
+  // Persist to tree cache for future runs
+  setCachedTree(fingerprint, tree)
+
   // ---------------------------------------------------------------------------
-  // Step
+  // Step 8: Hand the tree to the UI
   // ---------------------------------------------------------------------------
 
   ui.setTree(tree)