@cruxy/cli 0.1.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +35 -1
- package/dist/cli/commands/index.d.ts +7 -0
- package/dist/cli/commands/index.js +59 -0
- package/dist/cli/commands/skills.d.ts +8 -0
- package/dist/cli/commands/skills.js +51 -0
- package/dist/cli/program.js +4 -0
- package/dist/config/schema.d.ts +199 -0
- package/dist/config/schema.js +55 -0
- package/dist/constants.d.ts +13 -0
- package/dist/constants.js +13 -0
- package/dist/indexing/chunker.d.ts +28 -0
- package/dist/indexing/chunker.js +65 -0
- package/dist/indexing/embedder.d.ts +98 -0
- package/dist/indexing/embedder.js +140 -0
- package/dist/indexing/index.d.ts +9 -0
- package/dist/indexing/index.js +9 -0
- package/dist/indexing/indexer.d.ts +45 -0
- package/dist/indexing/indexer.js +104 -0
- package/dist/indexing/retriever.d.ts +32 -0
- package/dist/indexing/retriever.js +53 -0
- package/dist/indexing/service.d.ts +49 -0
- package/dist/indexing/service.js +132 -0
- package/dist/indexing/store.d.ts +103 -0
- package/dist/indexing/store.js +279 -0
- package/dist/indexing/types.d.ts +71 -0
- package/dist/indexing/types.js +6 -0
- package/dist/indexing/util.d.ts +34 -0
- package/dist/indexing/util.js +97 -0
- package/dist/indexing/walker.d.ts +42 -0
- package/dist/indexing/walker.js +166 -0
- package/dist/skills/index.d.ts +4 -0
- package/dist/skills/index.js +4 -0
- package/dist/skills/loader.d.ts +42 -0
- package/dist/skills/loader.js +0 -0
- package/dist/skills/parser.d.ts +29 -0
- package/dist/skills/parser.js +90 -0
- package/dist/skills/service.d.ts +41 -0
- package/dist/skills/service.js +92 -0
- package/dist/skills/types.d.ts +94 -0
- package/dist/skills/types.js +21 -0
- package/dist/tools/index.d.ts +3 -0
- package/dist/tools/index.js +3 -0
- package/dist/tools/list-skills.d.ts +9 -0
- package/dist/tools/list-skills.js +34 -0
- package/dist/tools/load-skill.d.ts +21 -0
- package/dist/tools/load-skill.js +49 -0
- package/dist/tools/registry.js +6 -0
- package/dist/tools/search-codebase.d.ts +25 -0
- package/dist/tools/search-codebase.js +70 -0
- package/package.json +6 -2
- package/skills/git-commit/SKILL.md +60 -0
- package/skills/using-skills/SKILL.md +62 -0
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
import type DatabaseNs from "better-sqlite3";
|
|
2
|
+
import type { EmbeddedChunk, ScoredRecord } from "./types.js";
|
|
3
|
+
/**
|
|
4
|
+
* Vector storage for the codebase index. The {@link VectorStore} interface hides
|
|
5
|
+
* the backend so the brute-force cosine search here can be swapped for an ANN
|
|
6
|
+
* index later without touching the indexer, retriever, or tool. Two backends
|
|
7
|
+
* ship: {@link SqliteVectorStore} (persistent, the default) and
|
|
8
|
+
* {@link InMemoryVectorStore} (ephemeral, for tests).
|
|
9
|
+
*
|
|
10
|
+
* Vectors are stored L2-normalized, so cosine similarity reduces to a dot
|
|
11
|
+
* product (see {@link dot}).
|
|
12
|
+
*/
|
|
13
|
+
/**
 * Embedder identity + dimensionality the index was built with. Stores hand this
 * back from `getMeta()` once `setMeta()` has recorded it.
 */
export interface VectorStoreMeta {
    /** Identifier of the embedder that produced the stored vectors. */
    embedderId: string;
    /** Number of components in each stored vector. */
    dim: number;
}
|
|
18
|
+
/**
 * Backend-agnostic contract for persisting embedded chunks per file and
 * searching them by vector similarity. All methods are synchronous.
 */
export interface VectorStore {
    /** Embedder identity + dim recorded for this index, or null if never set. */
    getMeta(): VectorStoreMeta | null;
    /** Record the embedder identity + dim. Called when the index is (re)initialized. */
    setMeta(meta: VectorStoreMeta): void;
    /** Map of indexed file path → content hash, for incremental decisions. */
    getFileHashes(): Map<string, string>;
    /** Replace every chunk for `path` with `chunks` and record its hash. Atomic. */
    upsertFile(path: string, hash: string, chunks: EmbeddedChunk[]): void;
    /** Remove a file and all of its chunks. */
    deleteByPath(path: string): void;
    /**
     * Brute-force cosine top-k over stored vectors, optionally restricted to paths
     * for which `pathFilter` returns true. Results are sorted by score, descending,
     * and contain at most `k` records.
     */
    search(query: Float32Array, k: number, pathFilter?: (path: string) => boolean): ScoredRecord[];
    /** Number of indexed files. */
    countFiles(): number;
    /** Number of stored chunks. */
    countChunks(): number;
    /** Release any held resources (e.g. the SQLite handle). Idempotent. */
    close(): void;
}
|
|
41
|
+
/**
 * Dot product over the overlapping prefix of two vectors.
 *
 * @param a - First vector.
 * @param b - Second vector.
 * @returns Sum of `a[i] * b[i]` for every index below the shorter length;
 *   `0` when either vector is empty.
 */
export declare function dot(a: Float32Array, b: Float32Array): number;
|
|
43
|
+
/**
 * Cosine similarity in [-1, 1]. Returns 0 if either vector is all-zero.
 *
 * The dot product and both norms are accumulated over the overlapping prefix
 * only, so components beyond the shorter vector's length are ignored.
 *
 * @param a - First vector (need not be normalized).
 * @param b - Second vector (need not be normalized).
 */
export declare function cosineSimilarity(a: Float32Array, b: Float32Array): number;
|
|
45
|
+
/**
 * Serialize a vector to a little-endian Float32 BLOB (an owning copy).
 *
 * @param v - Vector to serialize.
 * @returns A new Buffer of `v.length * 4` bytes.
 */
export declare function vectorToBlob(v: Float32Array): Buffer;
|
|
47
|
+
/**
 * Deserialize a little-endian Float32 BLOB back into a vector.
 *
 * @param buf - BLOB produced by {@link vectorToBlob}.
 * @returns A new vector of `buf.byteLength >> 2` components; trailing bytes
 *   that do not fill a whole 4-byte float are ignored.
 */
export declare function blobToVector(buf: Buffer): Float32Array;
|
|
49
|
+
/**
 * Indices of the `k` largest scores, sorted descending.
 *
 * @param scores - Score per candidate; the returned numbers index into this.
 * @param k - Maximum number of indices to return; values below 0 yield an
 *   empty result and values above `scores.length` return every index.
 */
export declare function topKIndices(scores: ArrayLike<number>, k: number): number[];
|
|
51
|
+
/**
 * A non-persistent {@link VectorStore} backed by plain JS maps. Used by the test
 * suite and as the `auto` fallback when the SQLite native dependency is
 * unavailable. Holds everything in memory; rebuilt on each process.
 */
export declare class InMemoryVectorStore implements VectorStore {
    /** Last value passed to `setMeta`, or null. */
    private meta;
    /** File path → content hash. */
    private readonly fileHashes;
    /** File path → embedded chunks stored for that file. */
    private readonly chunksByPath;
    getMeta(): VectorStoreMeta | null;
    setMeta(meta: VectorStoreMeta): void;
    /** Returns a fresh Map; mutating it does not affect the store. */
    getFileHashes(): Map<string, string>;
    upsertFile(path: string, hash: string, chunks: EmbeddedChunk[]): void;
    deleteByPath(path: string): void;
    countFiles(): number;
    countChunks(): number;
    /** Scores every (filtered) chunk with a dot product and returns the top `k`. */
    search(query: Float32Array, k: number, pathFilter?: (path: string) => boolean): ScoredRecord[];
    /** No-op: there is nothing to release. */
    close(): void;
}
|
|
70
|
+
/** Handle type of an open better-sqlite3 database. */
type DB = DatabaseNs.Database;
|
|
71
|
+
/**
 * Persistent {@link VectorStore} backed by better-sqlite3 at `.cruxy/index.db`.
 * Vectors are stored as Float32 BLOBs; search lazily loads them into one
 * contiguous matrix (cached until the next write) so warm queries are a tight
 * dot-product scan rather than per-row deserialization.
 *
 * Construct via {@link createSqliteVectorStore}, which loads the native module.
 */
export declare class SqliteVectorStore implements VectorStore {
    /** The open database handle; closed by `close()`. */
    private readonly db;
    /** In-memory matrix of all stored vectors, or null when it must be rebuilt. */
    private cache;
    /** Set once `close()` has run, making later calls no-ops. */
    private closed;
    /** Applies pragmas and creates the `meta`, `files` and `chunks` tables if missing. */
    constructor(db: DB);
    getMeta(): VectorStoreMeta | null;
    setMeta(meta: VectorStoreMeta): void;
    getFileHashes(): Map<string, string>;
    /** Runs in a single transaction and invalidates the search cache. */
    upsertFile(path: string, hash: string, chunks: EmbeddedChunk[]): void;
    /** Runs in a single transaction and invalidates the search cache. */
    deleteByPath(path: string): void;
    countFiles(): number;
    countChunks(): number;
    search(query: Float32Array, k: number, pathFilter?: (path: string) => boolean): ScoredRecord[];
    close(): void;
    /** Builds (or returns) the cached vector matrix. */
    private ensureCache;
    /** Reads one value from the `meta` table. */
    private getMetaValue;
    /** Upserts one value into the `meta` table. */
    private setMetaValue;
}
|
|
97
|
+
/**
 * Open (or create) a {@link SqliteVectorStore} at `dbPath`. The native module is
 * imported lazily here, so nothing loads it until an index is actually used.
 * The parent directory must already exist.
 *
 * @param dbPath - Filesystem path of the SQLite database file.
 * @returns A store wrapping the newly opened database. The promise rejects if
 *   the `better-sqlite3` import fails.
 */
export declare function createSqliteVectorStore(dbPath: string): Promise<SqliteVectorStore>;
|
|
103
|
+
export {};
|
|
@@ -0,0 +1,279 @@
|
|
|
1
|
+
// ── Pure vector helpers (exported for direct testing) ─────────────────────────
|
|
2
|
+
/**
 * Dot product of two vectors, taken over the indices both of them have.
 * Components past the end of the shorter vector are ignored.
 */
export function dot(a, b) {
    const shared = Math.min(a.length, b.length);
    let acc = 0;
    let i = 0;
    while (i < shared) {
        acc += a[i] * b[i];
        i += 1;
    }
    return acc;
}
|
|
10
|
+
/**
 * Cosine similarity in [-1, 1]. Returns 0 if either vector is all-zero.
 *
 * The dot product and both norms are accumulated over the overlapping prefix
 * of `a` and `b`, so extra trailing components of the longer vector are
 * ignored.
 *
 * @param a - First vector (need not be normalized).
 * @param b - Second vector (need not be normalized).
 * @returns The cosine of the angle between the two prefixes, clamped to [-1, 1].
 */
export function cosineSimilarity(a, b) {
    const n = Math.min(a.length, b.length);
    let d = 0;
    let na = 0;
    let nb = 0;
    for (let i = 0; i < n; i++) {
        d += a[i] * b[i];
        na += a[i] * a[i];
        nb += b[i] * b[i];
    }
    const denom = Math.sqrt(na) * Math.sqrt(nb);
    if (denom === 0)
        return 0;
    // sqrt(x) * sqrt(x) can round to just below x, which would push the ratio a
    // hair past ±1 for (anti)parallel vectors; clamp to the documented range.
    return Math.max(-1, Math.min(1, d / denom));
}
|
|
24
|
+
/**
 * Encode a vector as a freshly allocated BLOB of little-endian 32-bit floats,
 * four bytes per component, in order.
 */
export function vectorToBlob(v) {
    const BYTES_PER_FLOAT = 4;
    // Every byte is overwritten below, so the uninitialized allocation is safe.
    const blob = Buffer.allocUnsafe(v.length * BYTES_PER_FLOAT);
    let offset = 0;
    let index = 0;
    while (index < v.length) {
        blob.writeFloatLE(v[index], offset);
        offset += BYTES_PER_FLOAT;
        index += 1;
    }
    return blob;
}
|
|
31
|
+
/**
 * Decode a BLOB of little-endian 32-bit floats into a new Float32Array.
 * Any trailing bytes that do not make up a whole float are ignored.
 */
export function blobToVector(buf) {
    const count = buf.byteLength >> 2;
    const vec = new Float32Array(count);
    let offset = 0;
    for (let slot = 0; slot < count; slot += 1) {
        vec[slot] = buf.readFloatLE(offset);
        offset += 4;
    }
    return vec;
}
|
|
38
|
+
/**
 * Indices of the `k` largest scores, sorted descending.
 *
 * @param scores - Score per candidate; returned numbers index into this.
 * @param k - Maximum number of indices wanted. Fractions are rounded down,
 *   values below 0 (or NaN) yield an empty result, and values above
 *   `scores.length` return every index.
 * @returns A new array of indices ordered from highest to lowest score. Equal
 *   scores keep their original relative order; NaN scores rank last.
 */
export function topKIndices(scores, k) {
    const n = scores.length;
    // Array lengths must be non-negative integers: floor fractions and treat a
    // NaN request as "nothing" instead of letting the truncation below throw.
    const wanted = Math.floor(Number(k));
    const count = Number.isNaN(wanted) ? 0 : Math.min(Math.max(0, wanted), n);
    if (count === 0)
        return [];
    // NaN never compares consistently, so rank it below every real score.
    const rank = (i) => (Number.isNaN(scores[i]) ? -Infinity : scores[i]);
    const idx = Array.from({ length: n }, (_, i) => i);
    idx.sort((a, b) => {
        const sa = rank(a);
        const sb = rank(b);
        // Compare explicitly rather than subtracting: Infinity - Infinity is NaN.
        if (sa === sb)
            return 0;
        return sa < sb ? 1 : -1;
    });
    idx.length = count;
    return idx;
}
|
|
48
|
+
// ── In-memory backend ─────────────────────────────────────────────────────────
|
|
49
|
+
/**
 * A non-persistent {@link VectorStore} backed by plain JS maps. Used by the test
 * suite and as the `auto` fallback when the SQLite native dependency is
 * unavailable. Holds everything in memory; rebuilt on each process.
 *
 * Values crossing the API boundary are shallow-copied in both directions, so a
 * caller mutating its own array or a returned object cannot alter the store.
 */
export class InMemoryVectorStore {
    /** Last metadata recorded via `setMeta`, or null. */
    meta = null;
    /** File path → content hash. */
    fileHashes = new Map();
    /** File path → embedded chunks stored for that file. */
    chunksByPath = new Map();
    /** Returns a copy of the recorded metadata, or null if none was set. */
    getMeta() {
        return this.meta === null ? null : { ...this.meta };
    }
    /** Records a copy of `meta`. */
    setMeta(meta) {
        this.meta = { ...meta };
    }
    /** Returns a fresh path → hash map. */
    getFileHashes() {
        return new Map(this.fileHashes);
    }
    /** Replaces the hash and chunk list stored for `path`. */
    upsertFile(path, hash, chunks) {
        this.fileHashes.set(path, hash);
        // Keep our own array so later pushes/splices by the caller don't leak in.
        this.chunksByPath.set(path, [...chunks]);
    }
    /** Forgets `path` and every chunk stored under it. */
    deleteByPath(path) {
        this.fileHashes.delete(path);
        this.chunksByPath.delete(path);
    }
    /** Number of files with a recorded hash. */
    countFiles() {
        return this.fileHashes.size;
    }
    /** Total number of chunks across all files. */
    countChunks() {
        let total = 0;
        for (const chunks of this.chunksByPath.values())
            total += chunks.length;
        return total;
    }
    /**
     * Scores every chunk whose file passes `pathFilter` (all files when it is
     * omitted) with a dot product against `query` and returns the `k` best,
     * highest score first.
     */
    search(query, k, pathFilter) {
        const records = [];
        const scores = [];
        for (const [path, chunks] of this.chunksByPath) {
            if (pathFilter && !pathFilter(path))
                continue;
            for (const ec of chunks) {
                const score = dot(query, ec.vector);
                records.push({
                    path: ec.chunk.path,
                    startLine: ec.chunk.startLine,
                    endLine: ec.chunk.endLine,
                    text: ec.chunk.text,
                    score,
                });
                scores.push(score);
            }
        }
        return topKIndices(scores, k).map((i) => records[i]);
    }
    close() {
        /* nothing to release */
    }
}
|
|
107
|
+
const SCHEMA_VERSION = "1";
|
|
108
|
+
/**
|
|
109
|
+
* Persistent {@link VectorStore} backed by better-sqlite3 at `.cruxy/index.db`.
|
|
110
|
+
* Vectors are stored as Float32 BLOBs; search lazily loads them into one
|
|
111
|
+
* contiguous matrix (cached until the next write) so warm queries are a tight
|
|
112
|
+
* dot-product scan rather than per-row deserialization.
|
|
113
|
+
*
|
|
114
|
+
* Construct via {@link createSqliteVectorStore}, which loads the native module.
|
|
115
|
+
*/
|
|
116
|
+
export class SqliteVectorStore {
|
|
117
|
+
db;
|
|
118
|
+
cache = null;
|
|
119
|
+
closed = false;
|
|
120
|
+
constructor(db) {
|
|
121
|
+
this.db = db;
|
|
122
|
+
this.db.pragma("journal_mode = WAL");
|
|
123
|
+
this.db.pragma("synchronous = NORMAL");
|
|
124
|
+
this.db.exec(`
|
|
125
|
+
CREATE TABLE IF NOT EXISTS meta (
|
|
126
|
+
key TEXT PRIMARY KEY,
|
|
127
|
+
value TEXT NOT NULL
|
|
128
|
+
);
|
|
129
|
+
CREATE TABLE IF NOT EXISTS files (
|
|
130
|
+
path TEXT PRIMARY KEY,
|
|
131
|
+
hash TEXT NOT NULL
|
|
132
|
+
);
|
|
133
|
+
CREATE TABLE IF NOT EXISTS chunks (
|
|
134
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
135
|
+
path TEXT NOT NULL,
|
|
136
|
+
start_line INTEGER NOT NULL,
|
|
137
|
+
end_line INTEGER NOT NULL,
|
|
138
|
+
text TEXT NOT NULL,
|
|
139
|
+
vector BLOB NOT NULL
|
|
140
|
+
);
|
|
141
|
+
CREATE INDEX IF NOT EXISTS idx_chunks_path ON chunks(path);
|
|
142
|
+
`);
|
|
143
|
+
this.setMetaValue("schema_version", SCHEMA_VERSION);
|
|
144
|
+
}
|
|
145
|
+
getMeta() {
|
|
146
|
+
const embedderId = this.getMetaValue("embedder_id");
|
|
147
|
+
const dimStr = this.getMetaValue("dim");
|
|
148
|
+
if (embedderId === null || dimStr === null)
|
|
149
|
+
return null;
|
|
150
|
+
return { embedderId, dim: Number(dimStr) };
|
|
151
|
+
}
|
|
152
|
+
setMeta(meta) {
|
|
153
|
+
this.setMetaValue("embedder_id", meta.embedderId);
|
|
154
|
+
this.setMetaValue("dim", String(meta.dim));
|
|
155
|
+
}
|
|
156
|
+
getFileHashes() {
|
|
157
|
+
const rows = this.db
|
|
158
|
+
.prepare("SELECT path, hash FROM files")
|
|
159
|
+
.all();
|
|
160
|
+
const map = new Map();
|
|
161
|
+
for (const r of rows)
|
|
162
|
+
map.set(r.path, r.hash);
|
|
163
|
+
return map;
|
|
164
|
+
}
|
|
165
|
+
upsertFile(path, hash, chunks) {
|
|
166
|
+
const tx = this.db.transaction(() => {
|
|
167
|
+
this.db.prepare("DELETE FROM chunks WHERE path = ?").run(path);
|
|
168
|
+
this.db
|
|
169
|
+
.prepare("INSERT INTO files (path, hash) VALUES (?, ?) " +
|
|
170
|
+
"ON CONFLICT(path) DO UPDATE SET hash = excluded.hash")
|
|
171
|
+
.run(path, hash);
|
|
172
|
+
const insert = this.db.prepare("INSERT INTO chunks (path, start_line, end_line, text, vector) VALUES (?, ?, ?, ?, ?)");
|
|
173
|
+
for (const { chunk, vector } of chunks) {
|
|
174
|
+
insert.run(path, chunk.startLine, chunk.endLine, chunk.text, vectorToBlob(vector));
|
|
175
|
+
}
|
|
176
|
+
});
|
|
177
|
+
tx();
|
|
178
|
+
this.cache = null;
|
|
179
|
+
}
|
|
180
|
+
deleteByPath(path) {
|
|
181
|
+
const tx = this.db.transaction(() => {
|
|
182
|
+
this.db.prepare("DELETE FROM chunks WHERE path = ?").run(path);
|
|
183
|
+
this.db.prepare("DELETE FROM files WHERE path = ?").run(path);
|
|
184
|
+
});
|
|
185
|
+
tx();
|
|
186
|
+
this.cache = null;
|
|
187
|
+
}
|
|
188
|
+
countFiles() {
|
|
189
|
+
return this.db.prepare("SELECT COUNT(*) AS n FROM files").get().n;
|
|
190
|
+
}
|
|
191
|
+
countChunks() {
|
|
192
|
+
return this.db.prepare("SELECT COUNT(*) AS n FROM chunks").get().n;
|
|
193
|
+
}
|
|
194
|
+
search(query, k, pathFilter) {
|
|
195
|
+
const cache = this.ensureCache();
|
|
196
|
+
if (cache.n === 0)
|
|
197
|
+
return [];
|
|
198
|
+
const { dim, matrix } = cache;
|
|
199
|
+
const candidates = [];
|
|
200
|
+
const scores = [];
|
|
201
|
+
for (let i = 0; i < cache.n; i++) {
|
|
202
|
+
if (pathFilter && !pathFilter(cache.paths[i]))
|
|
203
|
+
continue;
|
|
204
|
+
let s = 0;
|
|
205
|
+
const off = i * dim;
|
|
206
|
+
for (let d = 0; d < dim; d++)
|
|
207
|
+
s += query[d] * matrix[off + d];
|
|
208
|
+
candidates.push(i);
|
|
209
|
+
scores.push(s);
|
|
210
|
+
}
|
|
211
|
+
const getText = this.db.prepare("SELECT text FROM chunks WHERE id = ?");
|
|
212
|
+
return topKIndices(scores, k).map((j) => {
|
|
213
|
+
const row = candidates[j];
|
|
214
|
+
const text = getText.get(cache.ids[row])?.text ??
|
|
215
|
+
"";
|
|
216
|
+
return {
|
|
217
|
+
path: cache.paths[row],
|
|
218
|
+
startLine: cache.starts[row],
|
|
219
|
+
endLine: cache.ends[row],
|
|
220
|
+
text,
|
|
221
|
+
score: scores[j],
|
|
222
|
+
};
|
|
223
|
+
});
|
|
224
|
+
}
|
|
225
|
+
close() {
|
|
226
|
+
if (this.closed)
|
|
227
|
+
return;
|
|
228
|
+
this.db.close();
|
|
229
|
+
this.closed = true;
|
|
230
|
+
}
|
|
231
|
+
// ── internals ──────────────────────────────────────────────────────────────
|
|
232
|
+
ensureCache() {
|
|
233
|
+
if (this.cache)
|
|
234
|
+
return this.cache;
|
|
235
|
+
const rows = this.db
|
|
236
|
+
.prepare("SELECT id, path, start_line, end_line, vector FROM chunks ORDER BY id")
|
|
237
|
+
.all();
|
|
238
|
+
const n = rows.length;
|
|
239
|
+
const dim = n > 0 ? rows[0].vector.byteLength >> 2 : (this.getMeta()?.dim ?? 0);
|
|
240
|
+
const matrix = new Float32Array(n * dim);
|
|
241
|
+
const ids = new Array(n);
|
|
242
|
+
const paths = new Array(n);
|
|
243
|
+
const starts = new Array(n);
|
|
244
|
+
const ends = new Array(n);
|
|
245
|
+
for (let i = 0; i < n; i++) {
|
|
246
|
+
const r = rows[i];
|
|
247
|
+
ids[i] = r.id;
|
|
248
|
+
paths[i] = r.path;
|
|
249
|
+
starts[i] = r.start_line;
|
|
250
|
+
ends[i] = r.end_line;
|
|
251
|
+
const off = i * dim;
|
|
252
|
+
for (let d = 0; d < dim; d++)
|
|
253
|
+
matrix[off + d] = r.vector.readFloatLE(d << 2);
|
|
254
|
+
}
|
|
255
|
+
this.cache = { n, dim, matrix, ids, paths, starts, ends };
|
|
256
|
+
return this.cache;
|
|
257
|
+
}
|
|
258
|
+
getMetaValue(key) {
|
|
259
|
+
const row = this.db
|
|
260
|
+
.prepare("SELECT value FROM meta WHERE key = ?")
|
|
261
|
+
.get(key);
|
|
262
|
+
return row ? row.value : null;
|
|
263
|
+
}
|
|
264
|
+
setMetaValue(key, value) {
|
|
265
|
+
this.db
|
|
266
|
+
.prepare("INSERT INTO meta (key, value) VALUES (?, ?) " +
|
|
267
|
+
"ON CONFLICT(key) DO UPDATE SET value = excluded.value")
|
|
268
|
+
.run(key, value);
|
|
269
|
+
}
|
|
270
|
+
}
|
|
271
|
+
/**
 * Open the SQLite database at `dbPath` (creating the file if needed) and wrap
 * it in a {@link SqliteVectorStore}. The `better-sqlite3` native module is
 * loaded on demand by this call rather than at import time. The directory
 * containing `dbPath` must already exist.
 */
export async function createSqliteVectorStore(dbPath) {
    const sqlite = await import("better-sqlite3");
    const Database = sqlite.default;
    const handle = new Database(dbPath);
    return new SqliteVectorStore(handle);
}
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared types for the codebase index (C.17). Kept in one place so the walker,
|
|
3
|
+
* chunker, embedder, store, indexer, and retriever can depend on the same
|
|
4
|
+
* vocabulary without import cycles — mirroring `src/tools/types.ts`.
|
|
5
|
+
*/
|
|
6
|
+
/**
 * A contiguous slice of a source file: the unit that gets embedded and stored.
 * `startLine`/`endLine` are both 1-based and inclusive.
 */
export interface Chunk {
    /** Project-relative POSIX path of the source file. */
    path: string;
    /** 1-based line where the chunk begins (inclusive). */
    startLine: number;
    /** 1-based line where the chunk ends (inclusive). */
    endLine: number;
    /** The chunk's raw text (the joined source lines). */
    text: string;
}
|
|
17
|
+
/** A chunk paired with its embedding, ready to persist via `VectorStore.upsertFile`. */
export interface EmbeddedChunk {
    /** The source slice that was embedded. */
    chunk: Chunk;
    /** L2-normalized embedding; length equals the embedder's `dim`. */
    vector: Float32Array;
}
|
|
23
|
+
/** A stored chunk scored against a query, as returned by `VectorStore.search`. */
export interface ScoredRecord {
    /** Path of the file the chunk came from. */
    path: string;
    /** 1-based first line of the chunk (inclusive). */
    startLine: number;
    /** 1-based last line of the chunk (inclusive). */
    endLine: number;
    /** The chunk's stored text. */
    text: string;
    /**
     * Cosine similarity in [-1, 1]; higher is more relevant. The stores compute
     * it as a plain dot product, which equals cosine similarity only when both
     * the query and the stored vector are L2-normalized.
     */
    score: number;
}
|
|
32
|
+
/** One ranked search result handed back to the agent. */
export interface SearchHit {
    /** Path of the file the hit came from. */
    path: string;
    /** First line of the matching chunk. */
    startLine: number;
    /** Last line of the matching chunk. */
    endLine: number;
    /** Cosine similarity in [-1, 1], rounded for display. */
    score: number;
    /** The (possibly trimmed) source snippet for the chunk. */
    snippet: string;
}
|
|
42
|
+
/** Outcome of an indexing run, surfaced by `cruxy index`. All counts are per run. */
export interface IndexStats {
    /** Candidate text files remaining after the walk's ignore/size/binary filters. */
    filesSeen: number;
    /** Files added or re-embedded this run (content changed, new, or forced). */
    filesIndexed: number;
    /** Files skipped because their content hash was unchanged. */
    filesSkipped: number;
    /** Files removed from the index (deleted on disk or newly ignored). */
    filesPurged: number;
    /** Chunks embedded this run. */
    chunksIndexed: number;
    /** Wall-clock duration in milliseconds. */
    durationMs: number;
}
|
|
57
|
+
/** Snapshot of the on-disk index for `cruxy index --status`. */
export interface IndexStatus {
    /** Whether a populated index already exists. */
    exists: boolean;
    /**
     * Embedder identity the index was built with (e.g. `fastembed:bge-small-en-v1.5`),
     * or null when none is recorded.
     */
    embedderId: string | null;
    /** Embedding dimensionality recorded in the index, or null when none is recorded. */
    dim: number | null;
    /** Number of indexed files. */
    files: number;
    /** Number of stored chunks. */
    chunks: number;
    /** Absolute path of the index store, or null for an in-memory store. */
    storePath: string | null;
}
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Small shared helpers for the index pipeline: content hashing, vector
|
|
3
|
+
* normalization, a bounded-concurrency map, a token estimate, and glob→RegExp
|
|
4
|
+
* compilation (reused by both the `pathGlob` search filter and the gitignore
|
|
5
|
+
* matcher in `walker.ts`).
|
|
6
|
+
*/
|
|
7
|
+
/**
 * Stable content hash of a file's bytes — sha256, hex. Drives incremental indexing.
 *
 * @param data - File contents, as raw bytes or as a string.
 * @returns The SHA-256 digest as a hex string.
 */
export declare function contentHash(data: Buffer | string): string;
|
|
9
|
+
/**
 * L2-normalize `v` in place and return it. A zero vector is left untouched (its
 * norm is 0). Normalizing at embed time lets the store treat cosine similarity
 * as a plain dot product.
 *
 * @param v - Vector to normalize; its contents are overwritten.
 * @returns The same `v` instance, for chaining.
 */
export declare function l2normalize(v: Float32Array): Float32Array;
|
|
15
|
+
/**
 * Approximate token count via the chars/4 heuristic used elsewhere in the CLI
 * (see `agent/session.ts`). Good enough to budget snippet sizes.
 *
 * @param text - Text to measure.
 * @returns `text.length / 4`, rounded up; `0` for an empty string.
 */
export declare function estimateTokens(text: string): number;
|
|
20
|
+
/**
 * Run `fn` over `items` with at most `limit` calls in flight, preserving input
 * order in the result. Used to bound filesystem/CPU concurrency during indexing.
 *
 * @param items - Inputs to process.
 * @param limit - Maximum number of concurrent `fn` calls; values below 1 are
 *   treated as 1.
 * @param fn - Async mapper, called with each item and its index.
 * @returns Results in the same order as `items`. The promise rejects if any
 *   `fn` call rejects.
 */
export declare function mapLimit<T, R>(items: readonly T[], limit: number, fn: (item: T, index: number) => Promise<R>): Promise<R[]>;
|
|
25
|
+
/**
 * Convert a glob into the *body* of a RegExp (no anchors), supporting `*`
 * (within a path segment), `**` (across segments, including zero), and `?`
 * (a single non-separator char). Backslashes are normalized to `/` first.
 * Regex metacharacters in the glob are escaped so they match literally.
 *
 * @param glob - Glob pattern.
 * @returns RegExp source text without `^`/`$` anchors.
 */
export declare function globToRegexBody(glob: string): string;
|
|
31
|
+
/**
 * Compile a glob into a RegExp anchored to a full POSIX path, i.e. the output
 * of {@link globToRegexBody} wrapped in `^…$`.
 */
export declare function globToRegExp(glob: string): RegExp;
|
|
33
|
+
/**
 * Sniff the head of a buffer for a NUL byte — the cheap "is this binary?" test.
 *
 * @param buf - File contents (or at least their beginning).
 * @param sniffBytes - How many leading bytes to inspect; defaults to 4096.
 * @returns True if a zero byte occurs within the inspected prefix.
 */
export declare function isBinary(buf: Buffer, sniffBytes?: number): boolean;
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
import { createHash } from "node:crypto";
|
|
2
|
+
/**
|
|
3
|
+
* Small shared helpers for the index pipeline: content hashing, vector
|
|
4
|
+
* normalization, a bounded-concurrency map, a token estimate, and glob→RegExp
|
|
5
|
+
* compilation (reused by both the `pathGlob` search filter and the gitignore
|
|
6
|
+
* matcher in `walker.ts`).
|
|
7
|
+
*/
|
|
8
|
+
/**
 * Hex-encoded SHA-256 digest of `data`. The same content always yields the same
 * string, which is what lets the indexer detect unchanged files.
 */
export function contentHash(data) {
    const hasher = createHash("sha256");
    hasher.update(data);
    return hasher.digest("hex");
}
|
|
12
|
+
/**
 * Scale `v` in place so its Euclidean length becomes 1, then hand the same
 * array back. An all-zero vector has norm 0 and is returned unchanged. Because
 * embeddings are normalized up front, the store can use a dot product wherever
 * it needs cosine similarity.
 */
export function l2normalize(v) {
    let squared = 0;
    for (const component of v)
        squared += component * component;
    const length = Math.sqrt(squared);
    if (!(length > 0))
        return v;
    let i = 0;
    while (i < v.length) {
        v[i] = v[i] / length;
        i += 1;
    }
    return v;
}
|
|
28
|
+
/**
 * Rough token count for `text`, using the same four-characters-per-token rule
 * of thumb as the rest of the CLI (see `agent/session.ts`). Rounded up; meant
 * only for budgeting snippet sizes.
 */
export function estimateTokens(text) {
    const CHARS_PER_TOKEN = 4;
    const chars = text.length;
    return Math.ceil(chars / CHARS_PER_TOKEN);
}
|
|
35
|
+
/**
|
|
36
|
+
* Run `fn` over `items` with at most `limit` calls in flight, preserving input
|
|
37
|
+
* order in the result. Used to bound filesystem/CPU concurrency during indexing.
|
|
38
|
+
*/
|
|
39
|
+
export async function mapLimit(items, limit, fn) {
|
|
40
|
+
const results = new Array(items.length);
|
|
41
|
+
if (items.length === 0)
|
|
42
|
+
return results;
|
|
43
|
+
let next = 0;
|
|
44
|
+
const workers = Math.max(1, Math.min(limit, items.length));
|
|
45
|
+
async function run() {
|
|
46
|
+
for (;;) {
|
|
47
|
+
const i = next++;
|
|
48
|
+
if (i >= items.length)
|
|
49
|
+
return;
|
|
50
|
+
results[i] = await fn(items[i], i);
|
|
51
|
+
}
|
|
52
|
+
}
|
|
53
|
+
await Promise.all(Array.from({ length: workers }, () => run()));
|
|
54
|
+
return results;
|
|
55
|
+
}
|
|
56
|
+
/**
 * Translate a glob into unanchored RegExp source text. Backslashes are turned
 * into `/` first, then the pattern is read left to right:
 *
 * - `**` followed by `/` matches zero or more whole leading directories;
 * - any other `**` matches anything, separators included;
 * - a single `*` matches within one path segment;
 * - `?` matches exactly one non-separator character;
 * - regex metacharacters are escaped; everything else is copied as-is.
 */
export function globToRegexBody(glob) {
    const META = ".+^${}()|[]\\";
    const pattern = glob.replace(/\\/g, "/");
    const pieces = [];
    let pos = 0;
    while (pos < pattern.length) {
        const ch = pattern[pos];
        if (ch === "*" && pattern[pos + 1] === "*") {
            // Double star: consume the following slash too when there is one.
            if (pattern[pos + 2] === "/") {
                pieces.push("(?:.*/)?");
                pos += 3;
            }
            else {
                pieces.push(".*");
                pos += 2;
            }
            continue;
        }
        if (ch === "*")
            pieces.push("[^/]*");
        else if (ch === "?")
            pieces.push("[^/]");
        else if (META.includes(ch))
            pieces.push(`\\${ch}`);
        else
            pieces.push(ch);
        pos += 1;
    }
    return pieces.join("");
}
|
|
90
|
+
/**
 * Build a RegExp from a glob that must match a POSIX path from its first
 * character to its last.
 */
export function globToRegExp(glob) {
    const body = globToRegexBody(glob);
    return new RegExp("^" + body + "$");
}
|
|
94
|
+
/**
 * Cheap binary-file heuristic: report whether a NUL byte appears within the
 * first `sniffBytes` bytes of `buf` (4096 by default).
 */
export function isBinary(buf, sniffBytes = 4096) {
    const head = buf.subarray(0, sniffBytes);
    return head.indexOf(0) !== -1;
}
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Repository walker for the index. Enumerates indexable text files under a root,
|
|
3
|
+
* honoring `.gitignore` and `.cruxyignore` (nested files included), an
|
|
4
|
+
* always-ignore set, a hard secrets denylist, a per-file size cap, and a binary
|
|
5
|
+
* sniff. Yielded paths are project-relative and POSIX-separated.
|
|
6
|
+
*/
|
|
7
|
+
/** A file the walker considers indexable; one is yielded per file by `walkRepo`. */
export interface WalkEntry {
    /** Project-relative POSIX path. */
    relPath: string;
    /** Absolute path on disk. */
    absPath: string;
    /** File size in bytes. */
    size: number;
}
|
|
16
|
+
/** Options accepted by `walkRepo`. */
export interface WalkOptions {
    /** Hard per-file size cap, in bytes. Larger files are skipped. */
    maxFileBytes: number;
    /**
     * Ignore-file names to honor, relative to each directory (gitignore syntax).
     * NOTE(review): the default used when omitted is not visible in this
     * declaration — confirm against the walker implementation.
     */
    ignoreFileNames?: string[];
}
|
|
22
|
+
/**
 * A compiled set of gitignore-style patterns, matched against paths relative to
 * the file's own directory. Supports comments, blank lines, `!` negation,
 * leading-`/` anchoring, trailing-`/` directory-only rules, and `*`/`**`/`?`.
 */
export declare class IgnoreMatcher {
    /** Compiled rules, built from the constructor's patterns. */
    private readonly rules;
    /** @param patterns - Raw pattern lines, in gitignore syntax. */
    constructor(patterns: string[]);
    /** Compiles and registers a single pattern. */
    private add;
    /**
     * Decide whether `relPath` is ignored: `true`/`false` when a rule matches
     * (last match wins), or `undefined` when no rule applies.
     *
     * @param relPath - Path to test, relative to the ignore file's directory.
     * @param isDir - Whether `relPath` names a directory (directory-only rules
     *   depend on this).
     */
    decide(relPath: string, isDir: boolean): boolean | undefined;
}
|
|
37
|
+
/**
 * Walk `root` and yield every indexable text file. Directories in the
 * always-ignore set are skipped wholesale; ignore files accumulate down the
 * tree; secret-bearing, oversized, and binary files are filtered out.
 *
 * @param root - Directory to walk.
 * @param opts - Size cap and ignore-file names to honor.
 * @returns An async generator of {@link WalkEntry} values, one per indexable file.
 */
export declare function walkRepo(root: string, opts: WalkOptions): AsyncGenerator<WalkEntry>;
|