@et0and/ovid 0.0.3 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -4
- package/src/cache.ts +119 -0
- package/src/cluster.ts +72 -65
- package/src/embed.ts +50 -30
- package/src/main.ts +36 -3
package/package.json
CHANGED
package/src/cache.ts
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Two-layer result cache for semantic-navigator.
|
|
3
|
+
*
|
|
4
|
+
* Layer 1 — Embedding cache (~/.cache/semantic-navigator/embeddings.json)
|
|
5
|
+
* Content-addressed: maps sha256(text)[0:16] → number[]
|
|
6
|
+
* Per-entry granularity: only re-embed chunks whose text changed.
|
|
7
|
+
*
|
|
8
|
+
* Layer 2 — Tree cache (~/.cache/semantic-navigator/trees/<fingerprint>.json)
|
|
9
|
+
* Keyed by sha256(model + sorted(path:contentHash pairs)).
|
|
10
|
+
* A single changed file invalidates the whole tree, forcing a fresh
|
|
11
|
+
* cluster + label run, but embeddings are still reused from layer 1.
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
import { createHash } from "node:crypto"
import { existsSync, mkdirSync, readFileSync, renameSync, rmSync, writeFileSync } from "node:fs"
import { homedir } from "node:os"
import { join } from "node:path"
import type { Tree } from "./tree.ts"
|
|
19
|
+
|
|
20
|
+
// ---------------------------------------------------------------------------
|
|
21
|
+
// Paths
|
|
22
|
+
// ---------------------------------------------------------------------------
|
|
23
|
+
|
|
24
|
+
/** Root directory for every cache file, located under the current user's home directory. */
const CACHE_DIR = join(homedir(), ".cache", "semantic-navigator")

/** JSON file that stores the layer-1 embedding cache. */
const EMBED_CACHE_PATH = join(CACHE_DIR, "embeddings.json")

/** Directory that stores one JSON file per layer-2 tree fingerprint. */
const TREES_DIR = join(CACHE_DIR, "trees")

/**
 * Make sure the cache root and the trees directory exist.
 * Safe to call repeatedly: `recursive: true` makes an existing directory a no-op.
 */
function ensureDirs(): void {
  for (const dir of [CACHE_DIR, TREES_DIR]) {
    mkdirSync(dir, { recursive: true })
  }
}
|
|
32
|
+
|
|
33
|
+
// ---------------------------------------------------------------------------
|
|
34
|
+
// Helpers
|
|
35
|
+
// ---------------------------------------------------------------------------
|
|
36
|
+
|
|
37
|
+
/**
 * Compute the short content hash used as a cache key.
 *
 * @param text - The string to hash.
 * @returns The first 16 hexadecimal characters of the SHA-256 digest of `text`.
 */
export function textHash(text: string): string {
  const hasher = createHash("sha256")
  hasher.update(text)
  const fullDigest = hasher.digest("hex")
  return fullDigest.substring(0, 16)
}
|
|
41
|
+
|
|
42
|
+
/**
 * Derive the tree-cache key from the model name and the set of files.
 *
 * Each map entry becomes a `path:hash` line; the lines are sorted so the
 * result does not depend on the map's insertion order, joined with newlines,
 * prefixed with `model` and a newline, and hashed with SHA-256.
 *
 * @param model - Model identifier mixed into the fingerprint.
 * @param fileHashes - Map from relative file path to that file's content hash.
 * @returns The full SHA-256 digest as a hex string.
 */
export function treeFingerprint(model: string, fileHashes: Map<string, string>): string {
  const lines: string[] = []
  for (const [relPath, contentHash] of fileHashes) {
    lines.push(`${relPath}:${contentHash}`)
  }
  lines.sort()

  const hasher = createHash("sha256")
  hasher.update(`${model}\n${lines.join("\n")}`)
  return hasher.digest("hex")
}
|
|
53
|
+
|
|
54
|
+
// ---------------------------------------------------------------------------
|
|
55
|
+
// Layer 1: Embedding cache
|
|
56
|
+
// ---------------------------------------------------------------------------
|
|
57
|
+
|
|
58
|
+
/** Shape of the embedding cache, in memory and on disk: textHash(text) → embedding components. */
type EmbedCacheMap = Record<string, number[]>

/** Lazily loaded in-memory copy of the embedding cache; `null` until the first access. */
let _embedCache: EmbedCacheMap | null = null

/**
 * Turn an arbitrary parsed-JSON value into an EmbedCacheMap.
 *
 * Anything that is not a plain object (null, an array, a primitive) yields an
 * empty map. Inside an object, only entries whose value is an array made
 * entirely of numbers are kept; malformed entries are dropped so the
 * corresponding text is simply re-embedded.
 */
function toEmbedCacheMap(parsed: unknown): EmbedCacheMap {
  if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) return {}
  const wellFormed = Object.entries(parsed).filter(
    (entry): entry is [string, number[]] =>
      Array.isArray(entry[1]) && entry[1].every((x: unknown) => typeof x === "number")
  )
  // Object.fromEntries defines own properties, so an odd key such as
  // "__proto__" cannot replace the prototype of the resulting map.
  return Object.fromEntries(wellFormed)
}

/**
 * Return the in-memory embedding cache, reading EMBED_CACHE_PATH on first use.
 *
 * The cache is best-effort: a missing, unreadable, unparsable, or wrongly
 * shaped file results in an empty cache rather than an error. The returned
 * object is the shared instance, so callers mutate it to add entries.
 */
function loadEmbedCache(): EmbedCacheMap {
  if (_embedCache !== null) return _embedCache

  let loaded: EmbedCacheMap
  try {
    // Read directly instead of checking existence first: a missing file
    // throws and is handled by the same fallback as a corrupt one.
    loaded = toEmbedCacheMap(JSON.parse(readFileSync(EMBED_CACHE_PATH, "utf-8")))
  } catch {
    loaded = {}
  }

  _embedCache = loaded
  return loaded
}
|
|
75
|
+
|
|
76
|
+
/**
 * Fetch the cached embedding for a piece of text.
 *
 * @param text - The chunk text; its textHash is the cache key.
 * @returns A new Float32Array copied from the stored numbers, or `null` when
 *   there is no entry for this text.
 */
export function getCachedEmbedding(text: string): Float32Array | null {
  const stored = loadEmbedCache()[textHash(text)]
  return stored === undefined ? null : Float32Array.from(stored)
}
|
|
84
|
+
|
|
85
|
+
/**
 * Record an embedding for a piece of text in the in-memory cache.
 *
 * The vector is copied into a plain number array under the text's textHash.
 * Nothing is written to disk here; call flushEmbedCache to persist.
 *
 * @param text - The chunk text; its textHash is the cache key.
 * @param embedding - The embedding vector to store.
 */
export function setCachedEmbedding(text: string, embedding: Float32Array): void {
  const store = loadEmbedCache()
  store[textHash(text)] = Array.from(embedding)
}
|
|
91
|
+
|
|
92
|
+
/**
 * Persist the in-memory embedding cache to disk.
 *
 * Does nothing when the cache has never been loaded. The JSON is first
 * written to a sibling temporary file and then renamed over
 * EMBED_CACHE_PATH, so an interrupted write cannot leave a truncated cache
 * file behind (a truncated file fails to parse on the next run and is then
 * treated as an empty cache, discarding every stored embedding).
 *
 * @throws Re-throws any error from the underlying file-system calls
 *   (for example permission denied or disk full) after removing the
 *   temporary file.
 */
export function flushEmbedCache(): void {
  if (_embedCache === null) return
  ensureDirs()

  // Per-process temp name so two concurrent runs never share a temp file.
  const tmpPath = `${EMBED_CACHE_PATH}.${process.pid}.tmp`
  try {
    writeFileSync(tmpPath, JSON.stringify(_embedCache), "utf-8")
    renameSync(tmpPath, EMBED_CACHE_PATH)
  } catch (err: unknown) {
    // Do not leave a partial temp file lying around; `force` ignores "not found".
    rmSync(tmpPath, { force: true })
    throw err
  }
}
|
|
98
|
+
|
|
99
|
+
// ---------------------------------------------------------------------------
|
|
100
|
+
// Layer 2: Tree cache
|
|
101
|
+
// ---------------------------------------------------------------------------
|
|
102
|
+
|
|
103
|
+
/**
 * Load a previously cached Tree.
 *
 * @param fingerprint - Cache key; the tree is read from `<fingerprint>.json`
 *   inside TREES_DIR.
 * @returns The parsed file contents typed as Tree (the shape is not
 *   validated), or `null` when the file does not exist or cannot be read or
 *   parsed.
 */
export function getCachedTree(fingerprint: string): Tree | null {
  const file = join(TREES_DIR, `${fingerprint}.json`)
  let result: Tree | null = null
  if (existsSync(file)) {
    try {
      result = JSON.parse(readFileSync(file, "utf-8")) as Tree
    } catch {
      // Unreadable or corrupt cache entry counts as a miss.
      result = null
    }
  }
  return result
}
|
|
113
|
+
|
|
114
|
+
/**
 * Write a Tree into the tree cache.
 *
 * Creates the cache directories if needed, then stores the tree as JSON in
 * `<fingerprint>.json` inside TREES_DIR, overwriting any existing file.
 *
 * @param fingerprint - Cache key used as the file's base name.
 * @param tree - The tree to serialise.
 */
export function setCachedTree(fingerprint: string, tree: Tree): void {
  ensureDirs()
  const serialized = JSON.stringify(tree)
  writeFileSync(join(TREES_DIR, `${fingerprint}.json`), serialized, "utf-8")
}
|
package/src/cluster.ts
CHANGED
|
@@ -95,19 +95,24 @@ function connectedComponents(indices: Int32Array[], N: number): number {
|
|
|
95
95
|
}
|
|
96
96
|
|
|
97
97
|
/**
|
|
98
|
-
* Build the
|
|
99
|
-
* a
|
|
100
|
-
*
|
|
98
|
+
* Build the degree diagonal `dd` from sparse affinity triplets, then return
|
|
99
|
+
* a sparse matvec closure for the *negated* normalised Laplacian (-L_norm).
|
|
100
|
+
*
|
|
101
|
+
* L_norm = I - D^{-1/2} A D^{-1/2}
|
|
102
|
+
* -L_norm = D^{-1/2} A D^{-1/2} - I
|
|
103
|
+
*
|
|
104
|
+
* The matvec avoids allocating an N×N dense matrix — at N=2000, k_sparse≈7,
|
|
105
|
+
* this reduces the cost per multiply from O(N²)=4M to O(N·k)=14k ops.
|
|
101
106
|
*/
|
|
102
|
-
function
|
|
107
|
+
function buildNormLaplacianSparseMatvec(
|
|
103
108
|
sparseAffinity: Array<{ i: number; j: number; v: number }>,
|
|
104
109
|
N: number
|
|
105
|
-
): {
|
|
110
|
+
): { matvec: (v: Float64Array) => Float64Array; dd: Float64Array } {
|
|
106
111
|
// Accumulate row sums (degree) for normalisation
|
|
107
112
|
const degree = new Float64Array(N)
|
|
108
113
|
for (const { i, j, v } of sparseAffinity) {
|
|
109
|
-
degree[i]
|
|
110
|
-
if (i !== j) degree[j]
|
|
114
|
+
degree[i]! += v
|
|
115
|
+
if (i !== j) degree[j]! += v
|
|
111
116
|
}
|
|
112
117
|
|
|
113
118
|
const dd = new Float64Array(N)
|
|
@@ -115,31 +120,29 @@ function buildNormalisedLaplacian(
|
|
|
115
120
|
dd[i] = degree[i]! > 1e-12 ? 1 / Math.sqrt(degree[i]!) : 0
|
|
116
121
|
}
|
|
117
122
|
|
|
118
|
-
//
|
|
119
|
-
//
|
|
120
|
-
const
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
})
|
|
125
|
-
|
|
126
|
-
//
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
const
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
123
|
+
// Pre-compute normalised weights once so the closure stays cheap.
|
|
124
|
+
// w_ij = v * dd[i] * dd[j] (the off-diagonal contribution to A_norm)
|
|
125
|
+
const normAffinity = sparseAffinity.map(({ i, j, v }) => ({
|
|
126
|
+
i,
|
|
127
|
+
j,
|
|
128
|
+
w: v * dd[i]! * dd[j]!,
|
|
129
|
+
}))
|
|
130
|
+
|
|
131
|
+
// Matvec for -L_norm = A_norm - I
|
|
132
|
+
// result[i] = -v[i] + sum_j w_ij * v[j] (using symmetry)
|
|
133
|
+
const matvecFn = (vec: Float64Array): Float64Array => {
|
|
134
|
+
const out = new Float64Array(N)
|
|
135
|
+
// Start from -I · vec
|
|
136
|
+
for (let i = 0; i < N; i++) out[i] = -vec[i]!
|
|
137
|
+
// Add symmetric A_norm contributions
|
|
138
|
+
for (const { i, j, w } of normAffinity) {
|
|
139
|
+
out[i]! += w * vec[j]!
|
|
140
|
+
if (i !== j) out[j]! += w * vec[i]!
|
|
134
141
|
}
|
|
142
|
+
return out
|
|
135
143
|
}
|
|
136
144
|
|
|
137
|
-
|
|
138
|
-
for (let i = 0; i < N; i++) {
|
|
139
|
-
L[i]![i] = 1
|
|
140
|
-
}
|
|
141
|
-
|
|
142
|
-
return { L, dd }
|
|
145
|
+
return { matvec: matvecFn, dd }
|
|
143
146
|
}
|
|
144
147
|
|
|
145
148
|
/**
|
|
@@ -171,35 +174,24 @@ function normaliseVec(v: Float64Array): number {
|
|
|
171
174
|
return n
|
|
172
175
|
}
|
|
173
176
|
|
|
174
|
-
/** Multiply matrix M by vector v */
|
|
175
|
-
function matvec(M: Matrix, v: Float64Array): Float64Array<ArrayBuffer> {
|
|
176
|
-
const N = M.length
|
|
177
|
-
const out = new Float64Array(N) as Float64Array<ArrayBuffer>
|
|
178
|
-
for (let i = 0; i < N; i++) {
|
|
179
|
-
out[i] = dot(M[i]!, v)
|
|
180
|
-
}
|
|
181
|
-
return out
|
|
182
|
-
}
|
|
183
|
-
|
|
184
177
|
/**
|
|
185
178
|
* Randomised power-iteration with deflation to extract the `k` eigenpairs
|
|
186
|
-
* corresponding to the *smallest* eigenvalues of a symmetric matrix
|
|
187
|
-
*
|
|
188
|
-
* M is the **negated** Laplacian (M = -L), so its *largest* eigenvalues
|
|
189
|
-
* correspond to L's smallest — matching the Python code which does `laplacian *= -1`.
|
|
179
|
+
* corresponding to the *smallest* eigenvalues of a symmetric matrix.
|
|
190
180
|
*
|
|
191
|
-
*
|
|
192
|
-
*
|
|
193
|
-
*
|
|
194
|
-
*
|
|
181
|
+
* Instead of a dense matrix, accepts a sparse `matvecFn` closure so that the
|
|
182
|
+
* per-iteration cost is O(N·k_sparse) rather than O(N²). The closure should
|
|
183
|
+
* implement multiplication by the *negated* Laplacian (-L_norm), whose top
|
|
184
|
+
* eigenvalues correspond to L_norm's bottom ones (matching the Python code
|
|
185
|
+
* which does `laplacian *= -1`).
|
|
195
186
|
*/
|
|
196
187
|
function topKEigenpairs(
|
|
197
|
-
|
|
188
|
+
matvecFn: (v: Float64Array) => Float64Array,
|
|
189
|
+
n: number,
|
|
198
190
|
k: number,
|
|
199
191
|
maxIter = 300,
|
|
200
192
|
tol = 1e-6
|
|
201
193
|
): { values: Float64Array<ArrayBuffer>; vectors: Float64Array<ArrayBuffer>[] } {
|
|
202
|
-
const N =
|
|
194
|
+
const N = n
|
|
203
195
|
const rng = seededRng(42)
|
|
204
196
|
|
|
205
197
|
const vectors: Float64Array<ArrayBuffer>[] = []
|
|
@@ -207,7 +199,7 @@ function topKEigenpairs(
|
|
|
207
199
|
|
|
208
200
|
for (let idx = 0; idx < k; idx++) {
|
|
209
201
|
// Random start
|
|
210
|
-
let v = Float64Array.from({ length: N }, () => rng() * 2 - 1)
|
|
202
|
+
let v = Float64Array.from({ length: N }, () => rng() * 2 - 1) as Float64Array<ArrayBuffer>
|
|
211
203
|
normaliseVec(v)
|
|
212
204
|
|
|
213
205
|
// Deflate against already-found vectors
|
|
@@ -216,7 +208,7 @@ function topKEigenpairs(
|
|
|
216
208
|
|
|
217
209
|
let lambda = 0
|
|
218
210
|
for (let iter = 0; iter < maxIter; iter++) {
|
|
219
|
-
const Mv =
|
|
211
|
+
const Mv = matvecFn(v) as Float64Array<ArrayBuffer>
|
|
220
212
|
|
|
221
213
|
// Deflate
|
|
222
214
|
for (const u of vectors) subtractProjection(Mv, u)
|
|
@@ -364,6 +356,29 @@ export function splitCluster(input: Cluster): Cluster[] {
|
|
|
364
356
|
|
|
365
357
|
const normalized = normaliseRows(matFromEmbeds(input.entries))
|
|
366
358
|
|
|
359
|
+
// --- Precompute all pairwise distances once (O(N²)) ---
|
|
360
|
+
// Each row is sorted ascending so we can slice any k cheaply.
|
|
361
|
+
const allDistances: Array<Array<[number, number]>> = Array.from({ length: N }, () => [])
|
|
362
|
+
for (let i = 0; i < N; i++) {
|
|
363
|
+
for (let j = 0; j < N; j++) {
|
|
364
|
+
if (j === i) continue
|
|
365
|
+
allDistances[i]!.push([cosDist(normalized[i]!, normalized[j]!), j])
|
|
366
|
+
}
|
|
367
|
+
allDistances[i]!.sort((a, b) => a[0] - b[0])
|
|
368
|
+
}
|
|
369
|
+
|
|
370
|
+
/**
 * Build a k-NN result for the given `k` from the precomputed, ascending-sorted
 * distance rows in `allDistances`.
 *
 * For each of the N points the first `k` [distance, index] pairs of its row
 * are taken; a row with fewer than `k` pairs contributes all of them.
 *
 * @param k - Number of nearest neighbours to keep per point.
 * @returns Parallel per-point arrays: `distances[i]` and `indices[i]` hold the
 *   distances to, and the indices of, point i's nearest neighbours.
 */
function knnFromPrecomputed(k: number): { distances: Float64Array[]; indices: Int32Array[] } {
  const distances: Float64Array[] = []
  const indices: Int32Array[] = []

  for (let point = 0; point < N; point++) {
    const nearest = allDistances[point]!.slice(0, k)
    const rowDistances = new Float64Array(nearest.length)
    const rowIndices = new Int32Array(nearest.length)
    nearest.forEach(([dist, neighbour], pos) => {
      rowDistances[pos] = dist
      rowIndices[pos] = neighbour
    })
    distances.push(rowDistances)
    indices.push(rowIndices)
  }

  return { distances, indices }
}
|
|
381
|
+
|
|
367
382
|
// --- Adaptive k-NN: find smallest k that gives 1 connected component ---
|
|
368
383
|
const candidateKs: number[] = []
|
|
369
384
|
for (let n = 0; ; n++) {
|
|
@@ -377,7 +392,7 @@ export function splitCluster(input: Cluster): Cluster[] {
|
|
|
377
392
|
let chosenKnnResult: { distances: Float64Array[]; indices: Int32Array[] } | null = null
|
|
378
393
|
|
|
379
394
|
for (const k of candidateKs) {
|
|
380
|
-
const knnResult =
|
|
395
|
+
const knnResult = knnFromPrecomputed(k)
|
|
381
396
|
const nComponents = connectedComponents(knnResult.indices, N)
|
|
382
397
|
if (nComponents === 1) {
|
|
383
398
|
chosenK = k
|
|
@@ -387,8 +402,7 @@ export function splitCluster(input: Cluster): Cluster[] {
|
|
|
387
402
|
}
|
|
388
403
|
|
|
389
404
|
if (chosenKnnResult === null) {
|
|
390
|
-
|
|
391
|
-
chosenKnnResult = knn(normalized, chosenK)
|
|
405
|
+
chosenKnnResult = knnFromPrecomputed(chosenK)
|
|
392
406
|
}
|
|
393
407
|
|
|
394
408
|
const { distances, indices } = chosenKnnResult
|
|
@@ -411,19 +425,12 @@ export function splitCluster(input: Cluster): Cluster[] {
|
|
|
411
425
|
}
|
|
412
426
|
}
|
|
413
427
|
|
|
414
|
-
// ---
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
// Negate L (as Python does `laplacian *= -1`) so power iteration finds
|
|
418
|
-
// eigenvectors of -L, whose top eigenvalues correspond to L's bottom ones.
|
|
419
|
-
const negL: Matrix = L.map((row) => {
|
|
420
|
-
const r = Float64Array.from(row)
|
|
421
|
-
for (let i = 0; i < r.length; i++) r[i]! *= -1
|
|
422
|
-
return r
|
|
423
|
-
})
|
|
428
|
+
// --- Sparse normalised Laplacian matvec ---
|
|
429
|
+
// Avoids building an N×N dense matrix; matvec cost is O(N·k_sparse) vs O(N²).
|
|
430
|
+
const { matvec: negLMatvec, dd } = buildNormLaplacianSparseMatvec(sparseAffinity, N)
|
|
424
431
|
|
|
425
432
|
const k = Math.min(MAX_CLUSTERS + 1, N)
|
|
426
|
-
const { values: rawValues, vectors } = topKEigenpairs(
|
|
433
|
+
const { values: rawValues, vectors } = topKEigenpairs(negLMatvec, N, k)
|
|
427
434
|
|
|
428
435
|
// Eigenvalues were of -L; flip sign back to get L eigenvalues
|
|
429
436
|
const eigenvalues = Float64Array.from(rawValues, (v) => -v)
|
package/src/embed.ts
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import { pipeline, type FeatureExtractionPipeline } from "@huggingface/transformers"
|
|
2
2
|
import type { Chunk } from "./tokenize.ts"
|
|
3
|
+
import { getCachedEmbedding, setCachedEmbedding, flushEmbedCache } from "./cache.ts"
|
|
3
4
|
|
|
4
5
|
export const DEFAULT_EMBEDDING_MODEL = "Xenova/all-MiniLM-L6-v2"
|
|
5
6
|
|
|
@@ -19,6 +20,8 @@ export interface EmbedOptions {
|
|
|
19
20
|
model: string
|
|
20
21
|
batchSize: number
|
|
21
22
|
concurrency: number
|
|
23
|
+
/** When true, skip reading from cache (but still write to it). */
|
|
24
|
+
noCache?: boolean
|
|
22
25
|
}
|
|
23
26
|
|
|
24
27
|
let _pipe: FeatureExtractionPipeline | null = null
|
|
@@ -81,6 +84,7 @@ async function embedBatch(
|
|
|
81
84
|
|
|
82
85
|
/**
|
|
83
86
|
* Embed all chunks using the local model, with batching + concurrency limits.
|
|
87
|
+
* Chunks whose text is already in the embedding cache are skipped.
|
|
84
88
|
* Calls `onProgress(done, total)` after each batch completes.
|
|
85
89
|
*/
|
|
86
90
|
export async function embedChunks(
|
|
@@ -90,44 +94,60 @@ export async function embedChunks(
|
|
|
90
94
|
): Promise<EmbedEntry[]> {
|
|
91
95
|
if (chunks.length === 0) return []
|
|
92
96
|
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
const
|
|
96
|
-
|
|
97
|
-
|
|
97
|
+
// --- Cache pass: resolve hits immediately, collect misses for the model ---
|
|
98
|
+
const entries: EmbedEntry[] = new Array(chunks.length)
|
|
99
|
+
const missIndices: number[] = []
|
|
100
|
+
|
|
101
|
+
if (!opts.noCache) {
|
|
102
|
+
for (let i = 0; i < chunks.length; i++) {
|
|
103
|
+
const cached = getCachedEmbedding(chunks[i]!.text)
|
|
104
|
+
if (cached !== null) {
|
|
105
|
+
entries[i] = { path: chunks[i]!.path, text: chunks[i]!.text, embedding: cached }
|
|
106
|
+
} else {
|
|
107
|
+
missIndices.push(i)
|
|
108
|
+
}
|
|
109
|
+
}
|
|
110
|
+
} else {
|
|
111
|
+
for (let i = 0; i < chunks.length; i++) missIndices.push(i)
|
|
98
112
|
}
|
|
99
113
|
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
let done = 0
|
|
114
|
+
let done = chunks.length - missIndices.length
|
|
115
|
+
onProgress?.(done, chunks.length)
|
|
103
116
|
|
|
104
|
-
|
|
105
|
-
const
|
|
106
|
-
const
|
|
117
|
+
if (missIndices.length > 0) {
|
|
118
|
+
const pipe = await getEmbedPipeline(opts.model)
|
|
119
|
+
const missChunks = missIndices.map((i) => chunks[i]!)
|
|
107
120
|
|
|
108
|
-
const
|
|
109
|
-
|
|
110
|
-
|
|
121
|
+
const batches: number[][] = []
|
|
122
|
+
for (let i = 0; i < missChunks.length; i += opts.batchSize) {
|
|
123
|
+
batches.push(missIndices.slice(i, i + opts.batchSize))
|
|
124
|
+
}
|
|
125
|
+
|
|
126
|
+
for (let i = 0; i < batches.length; i += opts.concurrency) {
|
|
127
|
+
const concurrentBatches = batches.slice(i, i + opts.concurrency)
|
|
128
|
+
|
|
129
|
+
const batchResults = await Promise.all(
|
|
130
|
+
concurrentBatches.map((idxBatch) =>
|
|
131
|
+
embedBatch(pipe, idxBatch.map((ci) => chunks[ci]!.text))
|
|
132
|
+
)
|
|
111
133
|
)
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
text: chunk.text,
|
|
123
|
-
embedding: embeddings[j]!,
|
|
134
|
+
|
|
135
|
+
for (let b = 0; b < concurrentBatches.length; b++) {
|
|
136
|
+
const idxBatch = concurrentBatches[b]!
|
|
137
|
+
const embeddings = batchResults[b]!
|
|
138
|
+
for (let j = 0; j < idxBatch.length; j++) {
|
|
139
|
+
const ci = idxBatch[j]!
|
|
140
|
+
const chunk = chunks[ci]!
|
|
141
|
+
const embedding = embeddings[j]!
|
|
142
|
+
entries[ci] = { path: chunk.path, text: chunk.text, embedding }
|
|
143
|
+
setCachedEmbedding(chunk.text, embedding)
|
|
124
144
|
}
|
|
125
|
-
|
|
145
|
+
done += idxBatch.length
|
|
146
|
+
onProgress?.(done, chunks.length)
|
|
126
147
|
}
|
|
127
|
-
chunkIndex += batch.length
|
|
128
|
-
done += batch.length
|
|
129
|
-
onProgress?.(done, chunks.length)
|
|
130
148
|
}
|
|
149
|
+
|
|
150
|
+
flushEmbedCache()
|
|
131
151
|
}
|
|
132
152
|
|
|
133
153
|
return entries
|
package/src/main.ts
CHANGED
|
@@ -15,6 +15,7 @@
|
|
|
15
15
|
|
|
16
16
|
import { Command } from "commander"
|
|
17
17
|
import path from "node:path"
|
|
18
|
+
import { createHash } from "node:crypto"
|
|
18
19
|
|
|
19
20
|
import { discoverFiles, DEFAULT_EXCLUDES, type FsOptions } from "./fs.ts"
|
|
20
21
|
import { chunkFile } from "./tokenize.ts"
|
|
@@ -24,6 +25,7 @@ import { buildTree } from "./tree.ts"
|
|
|
24
25
|
import { clearAuthCache, getCopilotToken } from "./auth.ts"
|
|
25
26
|
import { SemanticNavigatorUI, type ProgressState } from "./ui.ts"
|
|
26
27
|
import type { CopilotConfig } from "./labels.ts"
|
|
28
|
+
import { treeFingerprint, getCachedTree, setCachedTree } from "./cache.ts"
|
|
27
29
|
|
|
28
30
|
// ---------------------------------------------------------------------------
|
|
29
31
|
// CLI definition
|
|
@@ -40,6 +42,7 @@ const program = new Command()
|
|
|
40
42
|
.option("--read-concurrency <n>", "Concurrent file reads", (v) => parseInt(v, 10), 64)
|
|
41
43
|
.option("--embed-batch-size <n>", "Chunks per embedding batch", (v) => parseInt(v, 10), 32)
|
|
42
44
|
.option("--embed-concurrency <n>", "Concurrent embedding batches", (v) => parseInt(v, 10), 2)
|
|
45
|
+
.option("--no-cache", "Skip reading from cache; force re-embed and re-label")
|
|
43
46
|
.option("--logout", "Clear cached GitHub / Copilot credentials and exit")
|
|
44
47
|
.helpOption("-h, --help", "Show help")
|
|
45
48
|
|
|
@@ -60,6 +63,7 @@ async function main(): Promise<void> {
|
|
|
60
63
|
readConcurrency: number
|
|
61
64
|
embedBatchSize: number
|
|
62
65
|
embedConcurrency: number
|
|
66
|
+
cache: boolean
|
|
63
67
|
logout: boolean | undefined
|
|
64
68
|
}>()
|
|
65
69
|
|
|
@@ -167,6 +171,7 @@ async function main(): Promise<void> {
|
|
|
167
171
|
model: DEFAULT_EMBEDDING_MODEL,
|
|
168
172
|
batchSize: opts.embedBatchSize,
|
|
169
173
|
concurrency: opts.embedConcurrency,
|
|
174
|
+
noCache: !opts.cache,
|
|
170
175
|
}
|
|
171
176
|
|
|
172
177
|
let embedEntriesRaw: EmbedEntry[] | undefined
|
|
@@ -184,7 +189,32 @@ async function main(): Promise<void> {
|
|
|
184
189
|
}
|
|
185
190
|
|
|
186
191
|
// ---------------------------------------------------------------------------
|
|
187
|
-
// Step 5:
|
|
192
|
+
// Step 5: Compute tree fingerprint and check tree cache
|
|
193
|
+
// ---------------------------------------------------------------------------
|
|
194
|
+
|
|
195
|
+
// Build a map of relativePath → sha256(content) for all discovered files.
|
|
196
|
+
const fileHashes = new Map<string, string>()
|
|
197
|
+
for (const f of resolvedFiles) {
|
|
198
|
+
fileHashes.set(
|
|
199
|
+
f.relativePath,
|
|
200
|
+
createHash("sha256").update(f.content).digest("hex").slice(0, 16)
|
|
201
|
+
)
|
|
202
|
+
}
|
|
203
|
+
|
|
204
|
+
const fingerprint = treeFingerprint(opts.completionModel, fileHashes)
|
|
205
|
+
const noCache = !opts.cache
|
|
206
|
+
|
|
207
|
+
if (!noCache) {
|
|
208
|
+
const cached = getCachedTree(fingerprint)
|
|
209
|
+
if (cached !== null) {
|
|
210
|
+
ui.setTree(cached)
|
|
211
|
+
// The UI event loop keeps the process alive until the user presses q/Esc.
|
|
212
|
+
return
|
|
213
|
+
}
|
|
214
|
+
}
|
|
215
|
+
|
|
216
|
+
// ---------------------------------------------------------------------------
|
|
217
|
+
// Step 6: Spectral clustering (CPU-bound, synchronous)
|
|
188
218
|
// ---------------------------------------------------------------------------
|
|
189
219
|
|
|
190
220
|
ui.updateProgress({
|
|
@@ -201,7 +231,7 @@ async function main(): Promise<void> {
|
|
|
201
231
|
await Bun.sleep(0)
|
|
202
232
|
|
|
203
233
|
// ---------------------------------------------------------------------------
|
|
204
|
-
// Step
|
|
234
|
+
// Step 7: Build labelled tree
|
|
205
235
|
// ---------------------------------------------------------------------------
|
|
206
236
|
|
|
207
237
|
ui.updateProgress({
|
|
@@ -223,8 +253,11 @@ async function main(): Promise<void> {
|
|
|
223
253
|
}
|
|
224
254
|
const tree = treeRaw!
|
|
225
255
|
|
|
256
|
+
// Persist to tree cache for future runs
|
|
257
|
+
setCachedTree(fingerprint, tree)
|
|
258
|
+
|
|
226
259
|
// ---------------------------------------------------------------------------
|
|
227
|
-
// Step
|
|
260
|
+
// Step 8: Hand the tree to the UI
|
|
228
261
|
// ---------------------------------------------------------------------------
|
|
229
262
|
|
|
230
263
|
ui.setTree(tree)
|