@et0and/ovid 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1 @@
1
+ # ovid - a semantic project navigator (ported to TypeScript and Bun from [this](https://github.com/Gabriella439/semantic-navigator))
package/bin/semantic-navigator.js ADDED
@@ -0,0 +1,2 @@
1
+ #!/usr/bin/env bun
2
+ import "../src/main.ts"
package/package.json ADDED
@@ -0,0 +1,33 @@
1
+ {
2
+ "name": "@et0and/ovid",
3
+ "version": "0.0.2",
4
+ "description": "Browse a repository's files by semantic meaning",
5
+ "type": "module",
6
+ "bin": {
7
+ "ovid": "./bin/semantic-navigator.js"
8
+ },
9
+ "files": [
10
+ "bin",
11
+ "src",
12
+ "README.md"
13
+ ],
14
+ "scripts": {
15
+ "start": "bun run src/main.ts",
16
+ "build": "bun build src/main.ts --target=bun --outdir=dist --external=onnxruntime-node",
17
+ "check": "bunx tsc --noEmit",
18
+ "prepublishOnly": "bun run check"
19
+ },
20
+ "dependencies": {
21
+ "@huggingface/transformers": "^3.4.0",
22
+ "@octokit/auth-oauth-device": "^8.0.3",
23
+ "@opentui/core": "^0.1.80",
24
+ "commander": "^13.1.0",
25
+ "ml-kmeans": "^6.0.0",
26
+ "tiktoken": "^1.0.21",
27
+ "zod": "^3.24.2"
28
+ },
29
+ "devDependencies": {
30
+ "@types/bun": "latest",
31
+ "typescript": "^5.8.2"
32
+ }
33
+ }
package/src/auth.ts ADDED
@@ -0,0 +1,137 @@
1
+ /**
2
+ * GitHub Copilot OAuth authentication.
3
+ *
4
+ * Flow:
5
+ * 1. GitHub device flow → GitHub OAuth token (scope: read:user, copilot)
6
+ * 2. Exchange GitHub OAuth token → Copilot access token (short-lived)
7
+ * 3. Cache Copilot token with expiry under ~/.config/semantic-navigator/
8
+ *
9
+ * References:
10
+ * - https://docs.github.com/en/apps/oauth-apps/building-oauth-apps/authorizing-oauth-apps#device-flow
11
+ * - Copilot token endpoint: https://api.github.com/copilot_internal/v2/token
12
+ */
13
+
14
+ import { createOAuthDeviceAuth } from "@octokit/auth-oauth-device"
15
+ import path from "node:path"
16
+ import fs from "node:fs"
17
+ import os from "node:os"
18
+
19
+ const GITHUB_CLIENT_ID = "Iv1.b507a08c87ecfe98"
20
+
21
+ const COPILOT_TOKEN_URL =
22
+ "https://api.github.com/copilot_internal/v2/token"
23
+
24
+ const TOKEN_CACHE_DIR = path.join(
25
+ os.homedir(),
26
+ ".config",
27
+ "semantic-navigator"
28
+ )
29
+ const TOKEN_CACHE_FILE = path.join(TOKEN_CACHE_DIR, "auth.json")
30
+
31
+ interface AuthCache {
32
+ githubToken: string
33
+ copilotToken: string
34
+ expiresAt: number
35
+ }
36
+
37
+ function readCache(): AuthCache | null {
38
+ try {
39
+ const raw = fs.readFileSync(TOKEN_CACHE_FILE, "utf-8")
40
+ return JSON.parse(raw) as AuthCache
41
+ } catch {
42
+ return null
43
+ }
44
+ }
45
+
46
+ function writeCache(cache: AuthCache): void {
47
+ fs.mkdirSync(TOKEN_CACHE_DIR, { recursive: true })
48
+ fs.writeFileSync(TOKEN_CACHE_FILE, JSON.stringify(cache, null, 2), "utf-8")
49
+ }
50
+
51
+ async function acquireGithubToken(
52
+ onVerification: (url: string, code: string) => void
53
+ ): Promise<string> {
54
+ const auth = createOAuthDeviceAuth({
55
+ clientType: "oauth-app",
56
+ clientId: GITHUB_CLIENT_ID,
57
+ scopes: ["read:user"],
58
+ onVerification: (verification) => {
59
+ onVerification(verification.verification_uri, verification.user_code)
60
+ },
61
+ })
62
+
63
+ const result = await auth({ type: "oauth" })
64
+ return result.token
65
+ }
66
+
67
+ interface CopilotTokenResponse {
68
+ token: string
69
+ expires_at: number
70
+ }
71
+
72
+ async function fetchCopilotToken(githubToken: string): Promise<{ token: string; expiresAt: number }> {
73
+ const resp = await fetch(COPILOT_TOKEN_URL, {
74
+ method: "GET",
75
+ headers: {
76
+ Authorization: `token ${githubToken}`,
77
+ "Editor-Version": "semantic-navigator/1.0.0",
78
+ "Editor-Plugin-Version": "semantic-navigator/1.0.0",
79
+ "User-Agent": "semantic-navigator",
80
+ },
81
+ })
82
+
83
+ if (!resp.ok) {
84
+ throw new Error(
85
+ `Failed to fetch Copilot token: HTTP ${resp.status} ${resp.statusText}`
86
+ )
87
+ }
88
+
89
+ const data = (await resp.json()) as CopilotTokenResponse
90
+ return { token: data.token, expiresAt: data.expires_at }
91
+ }
92
+
93
+ export async function getCopilotToken(
94
+ onVerification: (url: string, code: string) => void
95
+ ): Promise<string> {
96
+ const now = Math.floor(Date.now() / 1000)
97
+ const cache = readCache()
98
+
99
+ if (cache !== null && cache.expiresAt - now > 60) {
100
+ return cache.copilotToken
101
+ }
102
+
103
+ let githubToken = cache?.githubToken ?? null
104
+
105
+ if (githubToken === null) {
106
+ githubToken = await acquireGithubToken(onVerification)
107
+ }
108
+
109
+ let copilotResult: { token: string; expiresAt: number }
110
+ try {
111
+ copilotResult = await fetchCopilotToken(githubToken)
112
+ } catch (err) {
113
+ // GitHub token may have been revoked — start fresh
114
+ if (cache !== null) {
115
+ githubToken = await acquireGithubToken(onVerification)
116
+ copilotResult = await fetchCopilotToken(githubToken)
117
+ } else {
118
+ throw err
119
+ }
120
+ }
121
+
122
+ writeCache({
123
+ githubToken,
124
+ copilotToken: copilotResult.token,
125
+ expiresAt: copilotResult.expiresAt,
126
+ })
127
+
128
+ return copilotResult.token
129
+ }
130
+
131
+ export function clearAuthCache(): void {
132
+ try {
133
+ fs.unlinkSync(TOKEN_CACHE_FILE)
134
+ } catch {
135
+ // Already gone :)
136
+ }
137
+ }
package/src/cluster.ts ADDED
@@ -0,0 +1,487 @@
1
+ import type { EmbedEntry } from "./embed.ts"
2
+
3
+ export const MAX_CLUSTERS = 20
4
+ export const MAX_LEAVES = 20
5
+
6
+ export interface Cluster {
7
+ entries: EmbedEntry[]
8
+ }
9
+
10
+ type Matrix = Float64Array[]
11
+
12
+ function matFromEmbeds(entries: EmbedEntry[]): Matrix {
13
+ return entries.map((e) => Float64Array.from(e.embedding))
14
+ }
15
+
16
+ /** L2-normalise each row of a matrix in-place. */
17
+ function normaliseRows(m: Matrix): Matrix {
18
+ for (const row of m) {
19
+ let norm = 0
20
+ for (let i = 0; i < row.length; i++) norm += row[i]! * row[i]!
21
+ norm = Math.sqrt(norm)
22
+ if (norm > 1e-12) {
23
+ for (let i = 0; i < row.length; i++) row[i]! /= norm
24
+ }
25
+ }
26
+ return m
27
+ }
28
+
29
+ function cosDist(a: Float64Array, b: Float64Array): number {
30
+ let dot = 0
31
+ for (let i = 0; i < a.length; i++) dot += a[i]! * b[i]!
32
+ return Math.max(0, 1 - dot)
33
+ }
34
+
35
+ /**
36
+ * Brute-force k nearest neighbours (cosine distance).
37
+ * Returns { distances, indices } each of shape [N][k].
38
+ * Acceptable for N ≤ ~10k on a laptop.
39
+ */
40
+ function knn(
41
+ normalized: Matrix,
42
+ k: number
43
+ ): { distances: Float64Array[]; indices: Int32Array[] } {
44
+ const N = normalized.length
45
+ const distances: Float64Array[] = []
46
+ const indices: Int32Array[] = []
47
+
48
+ for (let i = 0; i < N; i++) {
49
+ // Compute distances to all other points
50
+ const dists: Array<[number, number]> = []
51
+ for (let j = 0; j < N; j++) {
52
+ if (j === i) continue
53
+ dists.push([cosDist(normalized[i]!, normalized[j]!), j])
54
+ }
55
+ // Partial sort: we only need the k smallest
56
+ dists.sort((a, b) => a[0] - b[0])
57
+ const kNearest = dists.slice(0, k)
58
+ distances.push(Float64Array.from(kNearest.map((x) => x[0])))
59
+ indices.push(Int32Array.from(kNearest.map((x) => x[1])))
60
+ }
61
+
62
+ return { distances, indices }
63
+ }
64
+
65
+ /**
66
+ * Count connected components in an undirected k-NN connectivity graph.
67
+ * Uses union-find.
68
+ */
69
+ function connectedComponents(indices: Int32Array[], N: number): number {
70
+ const parent = Int32Array.from({ length: N }, (_, i) => i)
71
+
72
+ function find(x: number): number {
73
+ while (parent[x] !== x) {
74
+ parent[x] = parent[parent[x]!]!
75
+ x = parent[x]!
76
+ }
77
+ return x
78
+ }
79
+
80
+ function union(a: number, b: number): void {
81
+ const ra = find(a)
82
+ const rb = find(b)
83
+ if (ra !== rb) parent[ra] = rb
84
+ }
85
+
86
+ for (let i = 0; i < N; i++) {
87
+ for (const j of indices[i]!) {
88
+ union(i, j)
89
+ }
90
+ }
91
+
92
+ const roots = new Set<number>()
93
+ for (let i = 0; i < N; i++) roots.add(find(i))
94
+ return roots.size
95
+ }
96
+
97
+ /**
98
+ * Build the (dense) normalised Laplacian from the affinity matrix (stored as
99
+ * a list of sparse {row,col,val} triples) and return it as a dense matrix
100
+ * plus the degree diagonal `dd`.
101
+ */
102
+ function buildNormalisedLaplacian(
103
+ sparseAffinity: Array<{ i: number; j: number; v: number }>,
104
+ N: number
105
+ ): { L: Matrix; dd: Float64Array } {
106
+ // Accumulate row sums (degree) for normalisation
107
+ const degree = new Float64Array(N)
108
+ for (const { i, j, v } of sparseAffinity) {
109
+ degree[i] = (degree[i] ?? 0) + v
110
+ if (i !== j) degree[j] = (degree[j] ?? 0) + v
111
+ }
112
+
113
+ const dd = new Float64Array(N)
114
+ for (let i = 0; i < N; i++) {
115
+ dd[i] = degree[i]! > 1e-12 ? 1 / Math.sqrt(degree[i]!) : 0
116
+ }
117
+
118
+ // L_norm = I - D^{-1/2} A D^{-1/2}
119
+ // We start from identity
120
+ const L: Matrix = Array.from({ length: N }, (_, i) => {
121
+ const row = new Float64Array(N)
122
+ row[i] = 1
123
+ return row
124
+ })
125
+
126
+ // Subtract normalised affinity contributions
127
+ for (const { i, j, v } of sparseAffinity) {
128
+ const w = v * dd[i]! * dd[j]!
129
+ const rowI = L[i]!
130
+ rowI[j] = (rowI[j] ?? 0) - w
131
+ if (i !== j) {
132
+ const rowJ = L[j]!
133
+ rowJ[i] = (rowJ[i] ?? 0) - w
134
+ }
135
+ }
136
+
137
+ // Clamp diagonal to 1 (matches scipy behaviour after set_diag)
138
+ for (let i = 0; i < N; i++) {
139
+ L[i]![i] = 1
140
+ }
141
+
142
+ return { L, dd }
143
+ }
144
+
145
+ /**
146
+ * Symmetric QR algorithm (Francis double-shift, implicit).
147
+ * Returns { values, vectors } where vectors is column-major (vectors[j] = j-th eigenvector).
148
+ *
149
+ * This operates on dense matrices — fine for N ≤ ~500. For larger N we use
150
+ * a power-iteration / deflation approach to extract only the smallest
151
+ * `maxK` eigenpairs.
152
+ */
153
+
154
+ /** Dot product of two arrays */
155
+ function dot(a: Float64Array, b: Float64Array): number {
156
+ let s = 0
157
+ for (let i = 0; i < a.length; i++) s += a[i]! * b[i]!
158
+ return s
159
+ }
160
+
161
+ /** Subtract projection: a -= (dot(a,b)/dot(b,b)) * b, in place */
162
+ function subtractProjection(a: Float64Array, b: Float64Array): void {
163
+ const scale = dot(a, b) / (dot(b, b) + 1e-15)
164
+ for (let i = 0; i < a.length; i++) a[i]! -= scale * b[i]!
165
+ }
166
+
167
+ /** Normalise a vector in place, return its norm */
168
+ function normaliseVec(v: Float64Array): number {
169
+ const n = Math.sqrt(dot(v, v))
170
+ if (n > 1e-12) for (let i = 0; i < v.length; i++) v[i]! /= n
171
+ return n
172
+ }
173
+
174
+ /** Multiply matrix M by vector v */
175
+ function matvec(M: Matrix, v: Float64Array): Float64Array<ArrayBuffer> {
176
+ const N = M.length
177
+ const out = new Float64Array(N) as Float64Array<ArrayBuffer>
178
+ for (let i = 0; i < N; i++) {
179
+ out[i] = dot(M[i]!, v)
180
+ }
181
+ return out
182
+ }
183
+
184
+ /**
185
+ * Randomised power-iteration with deflation to extract the `k` eigenpairs
186
+ * corresponding to the *smallest* eigenvalues of a symmetric matrix M.
187
+ *
188
+ * M is the **negated** Laplacian (M = -L), so its *largest* eigenvalues
189
+ * correspond to L's smallest — matching the Python code which does `laplacian *= -1`.
190
+ *
191
+ * We use shifted inverse iteration: to find small eigenvalues of L we find
192
+ * large eigenvalues of (-L + shift*I) where shift ≈ 1 (the diagonal was set
193
+ * to 1 above). We iterate on M = -L and take the top-k eigenvectors, then
194
+ * negate the eigenvalues back.
195
+ */
196
+ function topKEigenpairs(
197
+ negL: Matrix,
198
+ k: number,
199
+ maxIter = 300,
200
+ tol = 1e-6
201
+ ): { values: Float64Array<ArrayBuffer>; vectors: Float64Array<ArrayBuffer>[] } {
202
+ const N = negL.length
203
+ const rng = seededRng(42)
204
+
205
+ const vectors: Float64Array<ArrayBuffer>[] = []
206
+ const values = new Float64Array(k)
207
+
208
+ for (let idx = 0; idx < k; idx++) {
209
+ // Random start
210
+ let v = Float64Array.from({ length: N }, () => rng() * 2 - 1)
211
+ normaliseVec(v)
212
+
213
+ // Deflate against already-found vectors
214
+ for (const u of vectors) subtractProjection(v, u)
215
+ normaliseVec(v)
216
+
217
+ let lambda = 0
218
+ for (let iter = 0; iter < maxIter; iter++) {
219
+ const Mv = matvec(negL, v)
220
+
221
+ // Deflate
222
+ for (const u of vectors) subtractProjection(Mv, u)
223
+
224
+ const newLambda = dot(v, Mv)
225
+ const norm = normaliseVec(Mv)
226
+
227
+ if (norm < 1e-14) break
228
+
229
+ const diff = Math.abs(newLambda - lambda)
230
+ lambda = newLambda
231
+ v = Mv
232
+
233
+ if (iter > 10 && diff < tol) break
234
+ }
235
+
236
+ vectors.push(v)
237
+ values[idx] = lambda
238
+ }
239
+
240
+ return { values, vectors }
241
+ }
242
+
243
+ /** Deterministic sign flip: each eigenvector's sign is chosen so that the
244
+ * component with the largest absolute value is positive (matches sklearn). */
245
+ function deterministicSignFlip(vectors: Float64Array[]): void {
246
+ for (const v of vectors) {
247
+ let maxAbs = 0
248
+ let maxSign = 1
249
+ for (const x of v) {
250
+ if (Math.abs(x) > maxAbs) {
251
+ maxAbs = Math.abs(x)
252
+ maxSign = x >= 0 ? 1 : -1
253
+ }
254
+ }
255
+ if (maxSign < 0) {
256
+ for (let i = 0; i < v.length; i++) v[i]! *= -1
257
+ }
258
+ }
259
+ }
260
+
261
+ /** Simple seeded pseudo-random number generator (mulberry32). */
262
+ function seededRng(seed: number): () => number {
263
+ let s = seed
264
+ return () => {
265
+ s |= 0
266
+ s = (s + 0x6d2b79f5) | 0
267
+ let t = Math.imul(s ^ (s >>> 15), 1 | s)
268
+ t = (t + Math.imul(t ^ (t >>> 7), 61 | t)) ^ t
269
+ return ((t ^ (t >>> 14)) >>> 0) / 4294967296
270
+ }
271
+ }
272
+
273
+ /** Euclidean distance squared */
274
+ function distSq(a: Float64Array, b: Float64Array): number {
275
+ let s = 0
276
+ for (let i = 0; i < a.length; i++) s += (a[i]! - b[i]!) ** 2
277
+ return s
278
+ }
279
+
280
+ function kmeans(
281
+ points: Float64Array[],
282
+ k: number,
283
+ maxIter = 300,
284
+ seed = 0
285
+ ): Int32Array {
286
+ const N = points.length
287
+ if (k >= N) return Int32Array.from({ length: N }, (_, i) => i % k)
288
+
289
+ const dim = points[0]!.length
290
+ const rng = seededRng(seed)
291
+
292
+ // K-means++ initialisation
293
+ const centroids: Float64Array[] = []
294
+ centroids.push(points[Math.floor(rng() * N)]!)
295
+
296
+ for (let c = 1; c < k; c++) {
297
+ const dists = points.map((p) =>
298
+ Math.min(...centroids.map((cent) => distSq(p, cent)))
299
+ )
300
+ const total = dists.reduce((a, b) => a + b, 0)
301
+ let r = rng() * total
302
+ let chosen = 0
303
+ for (let i = 0; i < N; i++) {
304
+ r -= dists[i]!
305
+ if (r <= 0) {
306
+ chosen = i
307
+ break
308
+ }
309
+ }
310
+ centroids.push(Float64Array.from(points[chosen]!))
311
+ }
312
+
313
+ const labels = new Int32Array(N)
314
+
315
+ for (let iter = 0; iter < maxIter; iter++) {
316
+ // Assignment
317
+ let changed = false
318
+ for (let i = 0; i < N; i++) {
319
+ let bestDist = Infinity
320
+ let bestLabel = 0
321
+ for (let c = 0; c < k; c++) {
322
+ const d = distSq(points[i]!, centroids[c]!)
323
+ if (d < bestDist) {
324
+ bestDist = d
325
+ bestLabel = c
326
+ }
327
+ }
328
+ if (labels[i] !== bestLabel) {
329
+ labels[i] = bestLabel
330
+ changed = true
331
+ }
332
+ }
333
+
334
+ if (!changed) break
335
+
336
+ // Update centroids
337
+ for (let c = 0; c < k; c++) {
338
+ const newCentroid = new Float64Array(dim)
339
+ let count = 0
340
+ for (let i = 0; i < N; i++) {
341
+ if (labels[i] === c) {
342
+ for (let d = 0; d < dim; d++) newCentroid[d]! += points[i]![d]!
343
+ count++
344
+ }
345
+ }
346
+ if (count > 0) {
347
+ for (let d = 0; d < dim; d++) newCentroid[d]! /= count
348
+ centroids[c] = newCentroid
349
+ }
350
+ }
351
+ }
352
+
353
+ return labels
354
+ }
355
+
356
+ /**
357
+ * Recursively split a Cluster into sub-clusters using spectral clustering.
358
+ * Returns [input] when the cluster is small enough to be a leaf.
359
+ */
360
+ export function splitCluster(input: Cluster): Cluster[] {
361
+ const N = input.entries.length
362
+
363
+ if (N <= MAX_LEAVES) return [input]
364
+
365
+ const normalized = normaliseRows(matFromEmbeds(input.entries))
366
+
367
+ // --- Adaptive k-NN: find smallest k that gives 1 connected component ---
368
+ const candidateKs: number[] = []
369
+ for (let n = 0; ; n++) {
370
+ const k = Math.round(Math.exp(n))
371
+ if (k >= N) break
372
+ candidateKs.push(k)
373
+ }
374
+ candidateKs.push(Math.floor(N / 2))
375
+
376
+ let chosenK = candidateKs[candidateKs.length - 1]!
377
+ let chosenKnnResult: { distances: Float64Array[]; indices: Int32Array[] } | null = null
378
+
379
+ for (const k of candidateKs) {
380
+ const knnResult = knn(normalized, k)
381
+ const nComponents = connectedComponents(knnResult.indices, N)
382
+ if (nComponents === 1) {
383
+ chosenK = k
384
+ chosenKnnResult = knnResult
385
+ break
386
+ }
387
+ }
388
+
389
+ if (chosenKnnResult === null) {
390
+ // Fallback: compute for the last candidate (floor(N/2))
391
+ chosenKnnResult = knn(normalized, chosenK)
392
+ }
393
+
394
+ const { distances, indices } = chosenKnnResult
395
+
396
+ // --- Build affinity matrix (sparse triplets) ---
397
+ // σ[i] = distance to Kth nearest neighbour
398
+ const sigmas = distances.map((d) => d[d.length - 1]!)
399
+
400
+ const sparseAffinity: Array<{ i: number; j: number; v: number }> = []
401
+
402
+ for (let i = 0; i < N; i++) {
403
+ for (let n = 0; n < chosenK; n++) {
404
+ const j = indices[i]![n]!
405
+ const d = distances[i]![n]!
406
+ const sigma_i = sigmas[i]!
407
+ const sigma_j = sigmas[j]!
408
+ const denom = Math.max(sigma_i * sigma_j, 1e-12)
409
+ const v = Math.exp(-(d * d) / denom)
410
+ sparseAffinity.push({ i, j, v })
411
+ }
412
+ }
413
+
414
+ // --- Normalised Laplacian ---
415
+ const { L, dd } = buildNormalisedLaplacian(sparseAffinity, N)
416
+
417
+ // Negate L (as Python does `laplacian *= -1`) so power iteration finds
418
+ // eigenvectors of -L, whose top eigenvalues correspond to L's bottom ones.
419
+ const negL: Matrix = L.map((row) => {
420
+ const r = Float64Array.from(row)
421
+ for (let i = 0; i < r.length; i++) r[i]! *= -1
422
+ return r
423
+ })
424
+
425
+ const k = Math.min(MAX_CLUSTERS + 1, N)
426
+ const { values: rawValues, vectors } = topKEigenpairs(negL, k)
427
+
428
+ // Eigenvalues were of -L; flip sign back to get L eigenvalues
429
+ const eigenvalues = Float64Array.from(rawValues, (v) => -v)
430
+
431
+ // Sort by eigenvalue ascending (smallest first), skip index 0
432
+ const sortedIdx = Array.from({ length: k }, (_, i) => i).sort(
433
+ (a, b) => eigenvalues[a]! - eigenvalues[b]!
434
+ )
435
+
436
+ const sortedEigenvalues = Float64Array.from(sortedIdx, (i) => eigenvalues[i]!)
437
+ const sortedVectors = sortedIdx.map((i) => vectors[i]!)
438
+
439
+ deterministicSignFlip(sortedVectors)
440
+
441
+ // --- Eigengap heuristic (skip λ₀ ≈ 0) ---
442
+ // n_clusters = argmax(diff(eigenvalues[1:])) + 2
443
+ let maxGap = -Infinity
444
+ let nClusters = 2
445
+ for (let i = 1; i < sortedEigenvalues.length - 1; i++) {
446
+ const gap = sortedEigenvalues[i + 1]! - sortedEigenvalues[i]!
447
+ if (gap > maxGap) {
448
+ maxGap = gap
449
+ nClusters = i + 1 // 1-indexed + 1 for the off-by-one vs Python
450
+ }
451
+ }
452
+ nClusters = Math.max(2, Math.min(nClusters, MAX_CLUSTERS))
453
+
454
+ // --- Spectral embeddings: use eigenvectors 1..nClusters (skip 0) ---
455
+ // Build [N × nClusters] matrix, normalise each row
456
+ const spectralPoints: Float64Array[] = Array.from({ length: N }, () =>
457
+ new Float64Array(nClusters)
458
+ )
459
+ for (let c = 0; c < nClusters; c++) {
460
+ const vec = sortedVectors[c + 1] // skip smallest (index 0)
461
+ if (vec === undefined) break
462
+ for (let i = 0; i < N; i++) {
463
+ // Divide by dd[i] (matches Python `wide_spectral_embeddings = eigenvectors.T / dd`)
464
+ spectralPoints[i]![c] = (vec[i]! / dd[i]!)
465
+ }
466
+ }
467
+ // L2-normalise each row
468
+ for (const row of spectralPoints) {
469
+ let norm = 0
470
+ for (const v of row) norm += v * v
471
+ norm = Math.sqrt(norm)
472
+ if (norm > 1e-12) for (let d = 0; d < row.length; d++) row[d]! /= norm
473
+ }
474
+
475
+ // --- K-means ---
476
+ const labels = kmeans(spectralPoints, nClusters)
477
+
478
+ // Group entries by cluster label, preserving order
479
+ const groups = new Map<number, EmbedEntry[]>()
480
+ for (let i = 0; i < N; i++) {
481
+ const label = labels[i]!
482
+ if (!groups.has(label)) groups.set(label, [])
483
+ groups.get(label)!.push(input.entries[i]!)
484
+ }
485
+
486
+ return Array.from(groups.values()).map((entries) => ({ entries }))
487
+ }