@et0and/ovid 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -0
- package/bin/semantic-navigator.js +2 -0
- package/package.json +33 -0
- package/src/auth.ts +137 -0
- package/src/cluster.ts +487 -0
- package/src/embed.ts +143 -0
- package/src/fs.ts +187 -0
- package/src/labels.ts +205 -0
- package/src/main.ts +239 -0
- package/src/tokenize.ts +76 -0
- package/src/tree.ts +176 -0
- package/src/ui.ts +460 -0
package/README.md
ADDED
@@ -0,0 +1 @@
+# ovid - a semantic project navigator (ported to TypeScript and Bun from [this](https://github.com/Gabriella439/semantic-navigator))
package/package.json
ADDED
@@ -0,0 +1,33 @@
+{
+  "name": "@et0and/ovid",
+  "version": "0.0.2",
+  "description": "Browse a repository's files by semantic meaning",
+  "type": "module",
+  "bin": {
+    "ovid": "./bin/semantic-navigator.js"
+  },
+  "files": [
+    "bin",
+    "src",
+    "README.md"
+  ],
+  "scripts": {
+    "start": "bun run src/main.ts",
+    "build": "bun build src/main.ts --target=bun --outdir=dist --external=onnxruntime-node",
+    "check": "bunx tsc --noEmit",
+    "prepublishOnly": "bun run check"
+  },
+  "dependencies": {
+    "@huggingface/transformers": "^3.4.0",
+    "@octokit/auth-oauth-device": "^8.0.3",
+    "@opentui/core": "^0.1.80",
+    "commander": "^13.1.0",
+    "ml-kmeans": "^6.0.0",
+    "tiktoken": "^1.0.21",
+    "zod": "^3.24.2"
+  },
+  "devDependencies": {
+    "@types/bun": "latest",
+    "typescript": "^5.8.2"
+  }
+}
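A note on the `bin` entry: the `ovid` command points at `package/bin/semantic-navigator.js`, which the file list at the top records as +2 lines but whose body is not included in this diff. Purely as an illustration of the layout (not the file's actual contents), a two-line Bun shim wiring the command to the TypeScript entry point could look like:

```js
#!/usr/bin/env bun
import "../src/main.ts"
```

The `--external=onnxruntime-node` flag in the `build` script is presumably there to keep the native ONNX runtime backend pulled in by `@huggingface/transformers` out of the bundled output.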
package/src/auth.ts
ADDED
@@ -0,0 +1,137 @@
+/**
+ * GitHub Copilot OAuth authentication.
+ *
+ * Flow:
+ * 1. GitHub device flow → GitHub OAuth token (scope: read:user, copilot)
+ * 2. Exchange GitHub OAuth token → Copilot access token (short-lived)
+ * 3. Cache Copilot token with expiry under ~/.config/semantic-navigator/
+ *
+ * References:
+ * - https://docs.github.com/en/apps/oauth-apps/building-oauth-apps/authorizing-oauth-apps#device-flow
+ * - Copilot token endpoint: https://api.github.com/copilot_internal/v2/token
+ */
+
+import { createOAuthDeviceAuth } from "@octokit/auth-oauth-device"
+import path from "node:path"
+import fs from "node:fs"
+import os from "node:os"
+
+const GITHUB_CLIENT_ID = "Iv1.b507a08c87ecfe98"
+
+const COPILOT_TOKEN_URL =
+  "https://api.github.com/copilot_internal/v2/token"
+
+const TOKEN_CACHE_DIR = path.join(
+  os.homedir(),
+  ".config",
+  "semantic-navigator"
+)
+const TOKEN_CACHE_FILE = path.join(TOKEN_CACHE_DIR, "auth.json")
+
+interface AuthCache {
+  githubToken: string
+  copilotToken: string
+  expiresAt: number
+}
+
+function readCache(): AuthCache | null {
+  try {
+    const raw = fs.readFileSync(TOKEN_CACHE_FILE, "utf-8")
+    return JSON.parse(raw) as AuthCache
+  } catch {
+    return null
+  }
+}
+
+function writeCache(cache: AuthCache): void {
+  fs.mkdirSync(TOKEN_CACHE_DIR, { recursive: true })
+  fs.writeFileSync(TOKEN_CACHE_FILE, JSON.stringify(cache, null, 2), "utf-8")
+}
+
+async function acquireGithubToken(
+  onVerification: (url: string, code: string) => void
+): Promise<string> {
+  const auth = createOAuthDeviceAuth({
+    clientType: "oauth-app",
+    clientId: GITHUB_CLIENT_ID,
+    scopes: ["read:user"],
+    onVerification: (verification) => {
+      onVerification(verification.verification_uri, verification.user_code)
+    },
+  })
+
+  const result = await auth({ type: "oauth" })
+  return result.token
+}
+
+interface CopilotTokenResponse {
+  token: string
+  expires_at: number
+}
+
+async function fetchCopilotToken(githubToken: string): Promise<{ token: string; expiresAt: number }> {
+  const resp = await fetch(COPILOT_TOKEN_URL, {
+    method: "GET",
+    headers: {
+      Authorization: `token ${githubToken}`,
+      "Editor-Version": "semantic-navigator/1.0.0",
+      "Editor-Plugin-Version": "semantic-navigator/1.0.0",
+      "User-Agent": "semantic-navigator",
+    },
+  })
+
+  if (!resp.ok) {
+    throw new Error(
+      `Failed to fetch Copilot token: HTTP ${resp.status} ${resp.statusText}`
+    )
+  }
+
+  const data = (await resp.json()) as CopilotTokenResponse
+  return { token: data.token, expiresAt: data.expires_at }
+}
+
+export async function getCopilotToken(
+  onVerification: (url: string, code: string) => void
+): Promise<string> {
+  const now = Math.floor(Date.now() / 1000)
+  const cache = readCache()
+
+  if (cache !== null && cache.expiresAt - now > 60) {
+    return cache.copilotToken
+  }
+
+  let githubToken = cache?.githubToken ?? null
+
+  if (githubToken === null) {
+    githubToken = await acquireGithubToken(onVerification)
+  }
+
+  let copilotResult: { token: string; expiresAt: number }
+  try {
+    copilotResult = await fetchCopilotToken(githubToken)
+  } catch (err) {
+    // GitHub token may have been revoked — start fresh
+    if (cache !== null) {
+      githubToken = await acquireGithubToken(onVerification)
+      copilotResult = await fetchCopilotToken(githubToken)
+    } else {
+      throw err
+    }
+  }
+
+  writeCache({
+    githubToken,
+    copilotToken: copilotResult.token,
+    expiresAt: copilotResult.expiresAt,
+  })
+
+  return copilotResult.token
+}
+
+export function clearAuthCache(): void {
+  try {
+    fs.unlinkSync(TOKEN_CACHE_FILE)
+  } catch {
+    // Already gone :)
+  }
+}
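Not part of the diff: a minimal sketch of how a caller might drive the exports above. The prompt callback and its stderr logging are assumptions for illustration, not code from this package:

```ts
// Hypothetical caller for auth.ts (illustrative only).
import { getCopilotToken, clearAuthCache } from "./auth.ts"

const token = await getCopilotToken((url, code) => {
  // Invoked only when a fresh device-flow login is needed.
  console.error(`Open ${url} and enter code ${code}`)
})
// `token` is the short-lived Copilot token; it is served from
// ~/.config/semantic-navigator/auth.json until ~60s before expiry.

// If a downstream Copilot call rejects the token, a caller could
// drop the cache and call getCopilotToken() again:
// clearAuthCache()
```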
package/src/cluster.ts
ADDED
@@ -0,0 +1,487 @@
+import type { EmbedEntry } from "./embed.ts"
+
+export const MAX_CLUSTERS = 20
+export const MAX_LEAVES = 20
+
+export interface Cluster {
+  entries: EmbedEntry[]
+}
+
+type Matrix = Float64Array[]
+
+function matFromEmbeds(entries: EmbedEntry[]): Matrix {
+  return entries.map((e) => Float64Array.from(e.embedding))
+}
+
+/** L2-normalise each row of a matrix in-place. */
+function normaliseRows(m: Matrix): Matrix {
+  for (const row of m) {
+    let norm = 0
+    for (let i = 0; i < row.length; i++) norm += row[i]! * row[i]!
+    norm = Math.sqrt(norm)
+    if (norm > 1e-12) {
+      for (let i = 0; i < row.length; i++) row[i]! /= norm
+    }
+  }
+  return m
+}
+
+function cosDist(a: Float64Array, b: Float64Array): number {
+  let dot = 0
+  for (let i = 0; i < a.length; i++) dot += a[i]! * b[i]!
+  return Math.max(0, 1 - dot)
+}
+
+/**
+ * Brute-force k nearest neighbours (cosine distance).
+ * Returns { distances, indices } each of shape [N][k].
+ * Acceptable for N ≤ ~10k on a laptop.
+ */
+function knn(
+  normalized: Matrix,
+  k: number
+): { distances: Float64Array[]; indices: Int32Array[] } {
+  const N = normalized.length
+  const distances: Float64Array[] = []
+  const indices: Int32Array[] = []
+
+  for (let i = 0; i < N; i++) {
+    // Compute distances to all other points
+    const dists: Array<[number, number]> = []
+    for (let j = 0; j < N; j++) {
+      if (j === i) continue
+      dists.push([cosDist(normalized[i]!, normalized[j]!), j])
+    }
+    // Full sort, then keep the k smallest (a partial sort would suffice)
+    dists.sort((a, b) => a[0] - b[0])
+    const kNearest = dists.slice(0, k)
+    distances.push(Float64Array.from(kNearest.map((x) => x[0])))
+    indices.push(Int32Array.from(kNearest.map((x) => x[1])))
+  }
+
+  return { distances, indices }
+}
+
+/**
+ * Count connected components in an undirected k-NN connectivity graph.
+ * Uses union-find.
+ */
+function connectedComponents(indices: Int32Array[], N: number): number {
+  const parent = Int32Array.from({ length: N }, (_, i) => i)
+
+  function find(x: number): number {
+    while (parent[x] !== x) {
+      parent[x] = parent[parent[x]!]!
+      x = parent[x]!
+    }
+    return x
+  }
+
+  function union(a: number, b: number): void {
+    const ra = find(a)
+    const rb = find(b)
+    if (ra !== rb) parent[ra] = rb
+  }
+
+  for (let i = 0; i < N; i++) {
+    for (const j of indices[i]!) {
+      union(i, j)
+    }
+  }
+
+  const roots = new Set<number>()
+  for (let i = 0; i < N; i++) roots.add(find(i))
+  return roots.size
+}
+
+/**
+ * Build the (dense) normalised Laplacian from the affinity matrix (stored as
+ * a list of sparse {row,col,val} triples) and return it as a dense matrix
+ * plus the degree diagonal `dd`.
+ */
+function buildNormalisedLaplacian(
+  sparseAffinity: Array<{ i: number; j: number; v: number }>,
+  N: number
+): { L: Matrix; dd: Float64Array } {
+  // Accumulate row sums (degree) for normalisation
+  const degree = new Float64Array(N)
+  for (const { i, j, v } of sparseAffinity) {
+    degree[i] = (degree[i] ?? 0) + v
+    if (i !== j) degree[j] = (degree[j] ?? 0) + v
+  }
+
+  const dd = new Float64Array(N)
+  for (let i = 0; i < N; i++) {
+    dd[i] = degree[i]! > 1e-12 ? 1 / Math.sqrt(degree[i]!) : 0
+  }
+
+  // L_norm = I - D^{-1/2} A D^{-1/2}
+  // We start from identity
+  const L: Matrix = Array.from({ length: N }, (_, i) => {
+    const row = new Float64Array(N)
+    row[i] = 1
+    return row
+  })
+
+  // Subtract normalised affinity contributions
+  for (const { i, j, v } of sparseAffinity) {
+    const w = v * dd[i]! * dd[j]!
+    const rowI = L[i]!
+    rowI[j] = (rowI[j] ?? 0) - w
+    if (i !== j) {
+      const rowJ = L[j]!
+      rowJ[i] = (rowJ[i] ?? 0) - w
+    }
+  }
+
+  // Clamp diagonal to 1 (matches scipy behaviour after set_diag)
+  for (let i = 0; i < N; i++) {
+    L[i]![i] = 1
+  }
+
+  return { L, dd }
+}
+
+/**
+ * Eigen-decomposition helpers. A dense symmetric QR solver (Francis
+ * double-shift) would also work for N ≤ ~500, but instead we use power
+ * iteration with deflation to extract only the handful of eigenpairs we
+ * need (see topKEigenpairs below).
+ */
+
+/** Dot product of two arrays */
+function dot(a: Float64Array, b: Float64Array): number {
+  let s = 0
+  for (let i = 0; i < a.length; i++) s += a[i]! * b[i]!
+  return s
+}
+
+/** Subtract projection: a -= (dot(a,b)/dot(b,b)) * b, in place */
+function subtractProjection(a: Float64Array, b: Float64Array): void {
+  const scale = dot(a, b) / (dot(b, b) + 1e-15)
+  for (let i = 0; i < a.length; i++) a[i]! -= scale * b[i]!
+}
+
+/** Normalise a vector in place, return its norm */
+function normaliseVec(v: Float64Array): number {
+  const n = Math.sqrt(dot(v, v))
+  if (n > 1e-12) for (let i = 0; i < v.length; i++) v[i]! /= n
+  return n
+}
+
+/** Multiply matrix M by vector v */
+function matvec(M: Matrix, v: Float64Array): Float64Array<ArrayBuffer> {
+  const N = M.length
+  const out = new Float64Array(N) as Float64Array<ArrayBuffer>
+  for (let i = 0; i < N; i++) {
+    out[i] = dot(M[i]!, v)
+  }
+  return out
+}
+
+/**
+ * Randomised power-iteration with deflation to extract the `k` eigenpairs
+ * corresponding to the *smallest* eigenvalues of a symmetric matrix M.
+ *
+ * M is the **negated** Laplacian (M = -L), so its *largest* eigenvalues
+ * correspond to L's smallest — matching the Python code which does `laplacian *= -1`.
+ *
+ * We use shifted power iteration: to find small eigenvalues of L we find
+ * large eigenvalues of (-L + shift*I) where shift ≈ 1 (the diagonal was set
+ * to 1 above). We iterate on M = -L and take the top-k eigenvectors, then
+ * negate the eigenvalues back.
+ */
+function topKEigenpairs(
+  negL: Matrix,
+  k: number,
+  maxIter = 300,
+  tol = 1e-6
+): { values: Float64Array<ArrayBuffer>; vectors: Float64Array<ArrayBuffer>[] } {
+  const N = negL.length
+  const rng = seededRng(42)
+
+  const vectors: Float64Array<ArrayBuffer>[] = []
+  const values = new Float64Array(k)
+
+  for (let idx = 0; idx < k; idx++) {
+    // Random start
+    let v = Float64Array.from({ length: N }, () => rng() * 2 - 1)
+    normaliseVec(v)
+
+    // Deflate against already-found vectors
+    for (const u of vectors) subtractProjection(v, u)
+    normaliseVec(v)
+
+    let lambda = 0
+    for (let iter = 0; iter < maxIter; iter++) {
+      const Mv = matvec(negL, v)
+
+      // Deflate
+      for (const u of vectors) subtractProjection(Mv, u)
+
+      const newLambda = dot(v, Mv)
+      const norm = normaliseVec(Mv)
+
+      if (norm < 1e-14) break
+
+      const diff = Math.abs(newLambda - lambda)
+      lambda = newLambda
+      v = Mv
+
+      if (iter > 10 && diff < tol) break
+    }
+
+    vectors.push(v)
+    values[idx] = lambda
+  }
+
+  return { values, vectors }
+}
+
+/** Deterministic sign flip: each eigenvector's sign is chosen so that the
+ * component with the largest absolute value is positive (matches sklearn). */
+function deterministicSignFlip(vectors: Float64Array[]): void {
+  for (const v of vectors) {
+    let maxAbs = 0
+    let maxSign = 1
+    for (const x of v) {
+      if (Math.abs(x) > maxAbs) {
+        maxAbs = Math.abs(x)
+        maxSign = x >= 0 ? 1 : -1
+      }
+    }
+    if (maxSign < 0) {
+      for (let i = 0; i < v.length; i++) v[i]! *= -1
+    }
+  }
+}
+
+/** Simple seeded pseudo-random number generator (mulberry32). */
+function seededRng(seed: number): () => number {
+  let s = seed
+  return () => {
+    s |= 0
+    s = (s + 0x6d2b79f5) | 0
+    let t = Math.imul(s ^ (s >>> 15), 1 | s)
+    t = (t + Math.imul(t ^ (t >>> 7), 61 | t)) ^ t
+    return ((t ^ (t >>> 14)) >>> 0) / 4294967296
+  }
+}
+
+/** Euclidean distance squared */
+function distSq(a: Float64Array, b: Float64Array): number {
+  let s = 0
+  for (let i = 0; i < a.length; i++) s += (a[i]! - b[i]!) ** 2
+  return s
+}
+
+function kmeans(
+  points: Float64Array[],
+  k: number,
+  maxIter = 300,
+  seed = 0
+): Int32Array {
+  const N = points.length
+  if (k >= N) return Int32Array.from({ length: N }, (_, i) => i % k)
+
+  const dim = points[0]!.length
+  const rng = seededRng(seed)
+
+  // K-means++ initialisation
+  const centroids: Float64Array[] = []
+  centroids.push(points[Math.floor(rng() * N)]!)
+
+  for (let c = 1; c < k; c++) {
+    const dists = points.map((p) =>
+      Math.min(...centroids.map((cent) => distSq(p, cent)))
+    )
+    const total = dists.reduce((a, b) => a + b, 0)
+    let r = rng() * total
+    let chosen = 0
+    for (let i = 0; i < N; i++) {
+      r -= dists[i]!
+      if (r <= 0) {
+        chosen = i
+        break
+      }
+    }
+    centroids.push(Float64Array.from(points[chosen]!))
+  }
+
+  const labels = new Int32Array(N)
+
+  for (let iter = 0; iter < maxIter; iter++) {
+    // Assignment
+    let changed = false
+    for (let i = 0; i < N; i++) {
+      let bestDist = Infinity
+      let bestLabel = 0
+      for (let c = 0; c < k; c++) {
+        const d = distSq(points[i]!, centroids[c]!)
+        if (d < bestDist) {
+          bestDist = d
+          bestLabel = c
+        }
+      }
+      if (labels[i] !== bestLabel) {
+        labels[i] = bestLabel
+        changed = true
+      }
+    }
+
+    if (!changed) break
+
+    // Update centroids
+    for (let c = 0; c < k; c++) {
+      const newCentroid = new Float64Array(dim)
+      let count = 0
+      for (let i = 0; i < N; i++) {
+        if (labels[i] === c) {
+          for (let d = 0; d < dim; d++) newCentroid[d]! += points[i]![d]!
+          count++
+        }
+      }
+      if (count > 0) {
+        for (let d = 0; d < dim; d++) newCentroid[d]! /= count
+        centroids[c] = newCentroid
+      }
+    }
+  }
+
+  return labels
+}
+
+/**
+ * Recursively split a Cluster into sub-clusters using spectral clustering.
+ * Returns [input] when the cluster is small enough to be a leaf.
+ */
+export function splitCluster(input: Cluster): Cluster[] {
+  const N = input.entries.length
+
+  if (N <= MAX_LEAVES) return [input]
+
+  const normalized = normaliseRows(matFromEmbeds(input.entries))
+
+  // --- Adaptive k-NN: find smallest k that gives 1 connected component ---
+  const candidateKs: number[] = []
+  for (let n = 0; ; n++) {
+    const k = Math.round(Math.exp(n))
+    if (k >= N) break
+    candidateKs.push(k)
+  }
+  candidateKs.push(Math.floor(N / 2))
+
+  let chosenK = candidateKs[candidateKs.length - 1]!
+  let chosenKnnResult: { distances: Float64Array[]; indices: Int32Array[] } | null = null
+
+  for (const k of candidateKs) {
+    const knnResult = knn(normalized, k)
+    const nComponents = connectedComponents(knnResult.indices, N)
+    if (nComponents === 1) {
+      chosenK = k
+      chosenKnnResult = knnResult
+      break
+    }
+  }
+
+  if (chosenKnnResult === null) {
+    // Fallback: compute for the last candidate (floor(N/2))
+    chosenKnnResult = knn(normalized, chosenK)
+  }
+
+  const { distances, indices } = chosenKnnResult
+
+  // --- Build affinity matrix (sparse triplets) ---
+  // σ[i] = distance to Kth nearest neighbour
+  const sigmas = distances.map((d) => d[d.length - 1]!)
+
+  const sparseAffinity: Array<{ i: number; j: number; v: number }> = []
+
+  for (let i = 0; i < N; i++) {
+    for (let n = 0; n < chosenK; n++) {
+      const j = indices[i]![n]!
+      const d = distances[i]![n]!
+      const sigma_i = sigmas[i]!
+      const sigma_j = sigmas[j]!
+      const denom = Math.max(sigma_i * sigma_j, 1e-12)
+      const v = Math.exp(-(d * d) / denom)
+      sparseAffinity.push({ i, j, v })
+    }
+  }
+
+  // --- Normalised Laplacian ---
+  const { L, dd } = buildNormalisedLaplacian(sparseAffinity, N)
+
+  // Negate L (as Python does `laplacian *= -1`) so power iteration finds
+  // eigenvectors of -L, whose top eigenvalues correspond to L's bottom ones.
+  const negL: Matrix = L.map((row) => {
+    const r = Float64Array.from(row)
+    for (let i = 0; i < r.length; i++) r[i]! *= -1
+    return r
+  })
+
+  const k = Math.min(MAX_CLUSTERS + 1, N)
+  const { values: rawValues, vectors } = topKEigenpairs(negL, k)
+
+  // Eigenvalues were of -L; flip sign back to get L eigenvalues
+  const eigenvalues = Float64Array.from(rawValues, (v) => -v)
+
+  // Sort by eigenvalue ascending (smallest first), skip index 0
+  const sortedIdx = Array.from({ length: k }, (_, i) => i).sort(
+    (a, b) => eigenvalues[a]! - eigenvalues[b]!
+  )
+
+  const sortedEigenvalues = Float64Array.from(sortedIdx, (i) => eigenvalues[i]!)
+  const sortedVectors = sortedIdx.map((i) => vectors[i]!)
+
+  deterministicSignFlip(sortedVectors)
+
+  // --- Eigengap heuristic (skip λ₀ ≈ 0) ---
+  // n_clusters = argmax(diff(eigenvalues[1:])) + 2
+  let maxGap = -Infinity
+  let nClusters = 2
+  for (let i = 1; i < sortedEigenvalues.length - 1; i++) {
+    const gap = sortedEigenvalues[i + 1]! - sortedEigenvalues[i]!
+    if (gap > maxGap) {
+      maxGap = gap
+      nClusters = i + 1 // 1-indexed + 1 for the off-by-one vs Python
+    }
+  }
+  nClusters = Math.max(2, Math.min(nClusters, MAX_CLUSTERS))
+
+  // --- Spectral embeddings: use eigenvectors 1..nClusters (skip 0) ---
+  // Build [N × nClusters] matrix, normalise each row
+  const spectralPoints: Float64Array[] = Array.from({ length: N }, () =>
+    new Float64Array(nClusters)
+  )
+  for (let c = 0; c < nClusters; c++) {
+    const vec = sortedVectors[c + 1] // skip smallest (index 0)
+    if (vec === undefined) break
+    for (let i = 0; i < N; i++) {
+      // Divide by dd[i] (matches Python `wide_spectral_embeddings = eigenvectors.T / dd`)
+      spectralPoints[i]![c] = (vec[i]! / dd[i]!)
+    }
+  }
+  // L2-normalise each row
+  for (const row of spectralPoints) {
+    let norm = 0
+    for (const v of row) norm += v * v
+    norm = Math.sqrt(norm)
+    if (norm > 1e-12) for (let d = 0; d < row.length; d++) row[d]! /= norm
+  }
+
+  // --- K-means ---
+  const labels = kmeans(spectralPoints, nClusters)
+
+  // Group entries by cluster label, preserving order
+  const groups = new Map<number, EmbedEntry[]>()
+  for (let i = 0; i < N; i++) {
+    const label = labels[i]!
+    if (!groups.has(label)) groups.set(label, [])
+    groups.get(label)!.push(input.entries[i]!)
+  }
+
+  return Array.from(groups.values()).map((entries) => ({ entries }))
+}
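Also not part of the diff: `splitCluster` returns its input unchanged once a cluster has at most MAX_LEAVES entries, so building the navigation hierarchy amounts to recursing until fixpoint (the package's src/tree.ts, not shown here, presumably does something along these lines). A minimal sketch under that assumption:

```ts
import { splitCluster, MAX_LEAVES, type Cluster } from "./cluster.ts"
import type { EmbedEntry } from "./embed.ts"

// Recursively split until every cluster is a leaf (≤ MAX_LEAVES entries).
// The parts.length === 1 guard stops the recursion in the unlikely case
// that k-means leaves an oversized cluster unsplit.
function leafClusters(entries: EmbedEntry[]): Cluster[] {
  const parts = splitCluster({ entries })
  if (parts.length === 1) return parts
  return parts.flatMap((c) =>
    c.entries.length <= MAX_LEAVES ? [c] : leafClusters(c.entries)
  )
}
```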