@cruxy/cli 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,140 @@
1
+ import { promises as fs } from "node:fs";
2
+ import { l2normalize } from "./util.js";
3
+ /**
4
+ * Output dimensionality of bge-small-en-v1.5, and the default size of the
5
+ * hashing embedder's feature space, so the two backends are interchangeable in
6
+ * the store schema. Used as `FastEmbedEmbedder.dim` and as the fallback when `HashingEmbedder` is built without `opts.dim`.
7
+ */
8
+ export const DEFAULT_DIM = 384;
9
+ // ── Hashing embedder ────────────────────────────────────────────────────────
10
/**
 * Compute the 32-bit FNV-1a hash of a string, consuming one UTF-16 code unit
 * per step. Deterministic and dependency-free.
 *
 * @param str Text to hash.
 * @returns The hash as an unsigned 32-bit integer.
 */
function fnv1a(str) {
    const FNV_OFFSET_BASIS = 0x811c9dc5;
    const FNV_PRIME = 0x01000193; // 16777619
    let hash = FNV_OFFSET_BASIS;
    let pos = 0;
    while (pos < str.length) {
        // XOR in the next code unit, then multiply by the prime. Math.imul keeps
        // the product within 32-bit integer range.
        hash = Math.imul(hash ^ str.charCodeAt(pos), FNV_PRIME);
        pos += 1;
    }
    // Reinterpret the signed 32-bit result as unsigned.
    return hash >>> 0;
}
20
/**
 * Break text into lowercase lexical tokens.
 *
 * Words are maximal runs of ASCII letters and digits, so snake_case and
 * kebab-case separators already act as boundaries. Each word is then split
 * where a lowercase letter or digit is followed by an uppercase letter, and
 * where a letter is followed by a digit — `getUserById` therefore yields
 * `get`, `user`, `by`, `id`.
 *
 * @param text Arbitrary source text.
 * @returns Tokens in order of appearance; empty when there are no alphanumerics.
 */
export function tokenize(text) {
    const words = text.match(/[A-Za-z0-9]+/g);
    if (words == null) {
        return [];
    }
    const boundary = /(?<=[a-z0-9])(?=[A-Z])|(?<=[A-Za-z])(?=[0-9])/;
    return words
        .flatMap((word) => word.split(boundary))
        .filter((piece) => Boolean(piece))
        .map((piece) => piece.toLowerCase());
}
34
+ /**
35
+ * A deterministic, dependency-free embedder: signed feature hashing
36
+ * (bag-of-words) over code-aware tokens, L2-normalized. It captures lexical
37
+ * overlap rather than true semantics, but it is offline, instant, and stable —
38
+ * which makes it the right default for tests and a sane fallback when the native
39
+ * embedding model is unavailable.
40
+ */
41
+ export class HashingEmbedder {
42
+ id;
43
+ dim;
44
+ constructor(opts = {}) {
45
+ this.dim = opts.dim ?? DEFAULT_DIM;
46
+ this.id = `hash:v1:${this.dim}`;
47
+ }
48
+ async embed(texts) {
49
+ return texts.map((t) => this.embedOne(t));
50
+ }
51
+ async embedQuery(text) {
52
+ return this.embedOne(text);
53
+ }
54
+ embedOne(text) {
55
+ const v = new Float32Array(this.dim);
56
+ for (const tok of tokenize(text)) {
57
+ const bucket = fnv1a(tok) % this.dim;
58
+ // A second hash supplies a sign, halving the bias from bucket collisions.
59
+ const sign = (fnv1a(`#${tok}`) & 1) === 0 ? 1 : -1;
60
+ v[bucket] += sign;
61
+ }
62
+ return l2normalize(v);
63
+ }
64
+ }
65
/**
 * Local semantic embeddings via fastembed (ONNX), model bge-small-en-v1.5
 * (384-dim). The native dependency is imported lazily on first use, so merely
 * registering the `search_codebase` tool stays cheap and the heavy ONNX runtime
 * only loads when an index is actually built or queried.
 *
 * Embedding is CPU-bound and single-threaded inside ONNX, so throughput is
 * bounded by `batchSize` (fed sequentially through fastembed's batching
 * generator) rather than by JS-level concurrency.
 */
export class FastEmbedEmbedder {
    id = "fastembed:bge-small-en-v1.5";
    dim = DEFAULT_DIM;
    /** Options as given: `cacheDir`, `maxLength`, `showDownloadProgress`, `batchSize`. */
    opts;
    /** Pending or resolved model initialization; null before first use and after a failed load. */
    model = null;
    constructor(opts = {}) {
        this.opts = opts;
    }
    /**
     * Load the model once and share the same promise between callers.
     *
     * A failed load is not kept: the cached promise is cleared on rejection so
     * the next call starts a fresh attempt instead of replaying a stale error
     * for the rest of the process.
     *
     * @returns A promise for the initialized fastembed model.
     */
    getModel() {
        if (!this.model) {
            const loading = (async () => {
                const mod = await import("fastembed");
                // fastembed's init does a non-recursive mkdir of the cache dir, so it
                // fails if an ancestor (e.g. ~/.cruxy) doesn't exist yet. Create it first.
                if (this.opts.cacheDir) {
                    await fs.mkdir(this.opts.cacheDir, { recursive: true });
                }
                return (await mod.FlagEmbedding.init({
                    model: mod.EmbeddingModel.BGESmallENV15,
                    maxLength: this.opts.maxLength ?? 512,
                    cacheDir: this.opts.cacheDir,
                    showDownloadProgress: this.opts.showDownloadProgress ?? false,
                }));
            })();
            this.model = loading;
            // Callers still observe the rejection through `loading`; this handler
            // only drops the cache entry, and only if it is still this attempt.
            loading.catch(() => {
                if (this.model === loading) {
                    this.model = null;
                }
            });
        }
        return this.model;
    }
    /**
     * Embed many texts, `batchSize` (default 32) at a time.
     *
     * @returns One vector per input, in input order, each passed through
     * `l2normalize`. An empty input returns `[]` without loading the model.
     */
    async embed(texts) {
        if (texts.length === 0)
            return [];
        const model = await this.getModel();
        const out = [];
        for await (const batch of model.embed(texts, this.opts.batchSize ?? 32)) {
            for (const vec of batch)
                out.push(l2normalize(Float32Array.from(vec)));
        }
        return out;
    }
    /** Embed a single query through the model's `queryEmbed` entry point. */
    async embedQuery(text) {
        const model = await this.getModel();
        return l2normalize(Float32Array.from(await model.queryEmbed(text)));
    }
}
118
/**
 * Build the **production** embedder: fastembed / bge-small-en-v1.5.
 *
 * The native dependency is loaded eagerly here so a missing or broken install
 * fails *now*, loudly, with an actionable message — rather than silently
 * degrading search quality at query time. There is deliberately **no** fallback
 * to {@link HashingEmbedder}: this factory only ever returns a
 * {@link FastEmbedEmbedder}.
 *
 * @param opts Only `opts.cacheDir` is read; it is forwarded to the embedder.
 * @returns A `FastEmbedEmbedder` whose model has not been initialized yet.
 * @throws Error if the fastembed module cannot be loaded. The underlying
 * failure is attached as `cause`.
 */
export async function createEmbedder(opts = {}) {
    try {
        await import("fastembed");
    }
    catch (err) {
        // A rejection is not guaranteed to be an Error; avoid printing "undefined".
        const reason = err instanceof Error ? err.message : String(err);
        const failure = new Error(`the local embedding model (fastembed) could not be loaded: ${reason}. ` +
            "It requires the onnxruntime-node native addon — reinstall with `pnpm install` and " +
            "ensure the native build completed. The codebase index is unavailable until this is fixed.");
        // Keep the original error (and its stack) reachable for debugging.
        throw Object.assign(failure, { cause: err });
    }
    return new FastEmbedEmbedder({ cacheDir: opts.cacheDir });
}
@@ -0,0 +1,9 @@
1
+ export * from "./types.js";
2
+ export * from "./util.js";
3
+ export * from "./chunker.js";
4
+ export * from "./embedder.js";
5
+ export * from "./store.js";
6
+ export * from "./walker.js";
7
+ export * from "./indexer.js";
8
+ export * from "./retriever.js";
9
+ export * from "./service.js";
@@ -0,0 +1,9 @@
1
+ export * from "./types.js";
2
+ export * from "./util.js";
3
+ export * from "./chunker.js";
4
+ export * from "./embedder.js";
5
+ export * from "./store.js";
6
+ export * from "./walker.js";
7
+ export * from "./indexer.js";
8
+ export * from "./retriever.js";
9
+ export * from "./service.js";
@@ -0,0 +1,45 @@
1
+ import { type ChunkOptions } from "./chunker.js";
2
+ import type { Embedder } from "./embedder.js";
3
+ import type { VectorStore } from "./store.js";
4
+ import type { IndexStats } from "./types.js";
5
+ /** Minimal logger surface the indexer reports progress to. Module-private: it is not exported. */
6
+ interface IndexerLogger {
7
+ debug(message: string): void; // fine-grained progress messages
8
+ info(message: string): void; // notable events worth surfacing
9
+ }
10
+ export interface IndexerOptions { // construction-time configuration for Indexer
11
+ /** Absolute project root to index; the repository walk starts here. */
12
+ root: string;
13
+ /** Where embeddings, per-file content hashes, and the embedder signature are stored. */
14
+ store: VectorStore;
15
+ /** How text is turned into vectors; its `id` and `dim` form the index signature. */
16
+ embedder: Embedder;
17
+ /** Hard per-file size cap, in bytes; forwarded to the repository walker. */
18
+ maxFileBytes: number;
19
+ /** Chunking parameters, applied to every file that is (re)indexed. */
20
+ chunk: ChunkOptions;
21
+ /** Concurrency for reading/hashing files. Defaults to 8. */
22
+ readConcurrency?: number;
23
+ /** Files chunked + embedded + upserted per batch (bounds peak memory). Defaults to 64. */
24
+ fileBatchSize?: number;
25
+ /** Optional progress logger; nothing is logged when it is omitted. */
26
+ logger?: IndexerLogger;
27
+ }
28
+ /**
29
+ * Orchestrates the index: walk → hash → (skip unchanged | chunk → embed →
30
+ * upsert) → purge removed.
31
+ *
32
+ * Incrementality is content-hash based: a file is re-embedded only when its
33
+ * bytes change; unchanged files are skipped; files that disappeared (deleted or
34
+ * newly ignored) are purged. Changing the embedder (a new model or dimension)
35
+ * invalidates the whole index — its signature is stored, and a mismatch forces a
36
+ * full re-embed so vectors never mix across models. A store with no recorded signature is likewise indexed in full.
37
+ */
38
+ export declare class Indexer {
39
+ private readonly opts;
40
+ constructor(opts: IndexerOptions);
41
+ index(runOpts?: { // runs one indexing pass and resolves with its statistics
42
+ force?: boolean; // true: re-embed every file regardless of stored hashes
43
+ }): Promise<IndexStats>;
44
+ }
45
+ export {};
@@ -0,0 +1,104 @@
1
+ import { promises as fs } from "node:fs";
2
+ import { chunkFile } from "./chunker.js";
3
+ import { contentHash, isBinary, mapLimit } from "./util.js";
4
+ import { walkRepo } from "./walker.js";
5
/** Files read and hashed in parallel when `readConcurrency` is not given. */
const DEFAULT_READ_CONCURRENCY = 8;
/** Files chunked, embedded, and upserted per batch when `fileBatchSize` is not given. */
const DEFAULT_FILE_BATCH = 64;
/**
 * Orchestrates the index: walk → hash → (skip unchanged | chunk → embed →
 * upsert) → purge removed.
 *
 * Incrementality is content-hash based: a file is re-embedded only when its
 * bytes change; unchanged files are skipped; files that disappeared (deleted or
 * newly ignored) are purged. Changing the embedder (a new model or dimension)
 * invalidates the whole index — its signature is stored, and a mismatch forces a
 * full re-embed so vectors never mix across models.
 */
export class Indexer {
    opts;
    constructor(opts) {
        this.opts = opts;
    }
    /**
     * Run one indexing pass over the project root.
     *
     * @param runOpts `force: true` re-embeds every file regardless of stored hashes.
     * @returns Statistics for the pass: `filesSeen` (readable text files),
     * `filesIndexed` (re-embedded), `filesSkipped` (unchanged), `filesPurged`,
     * `chunksIndexed`, and `durationMs`.
     * @throws Error if the embedder returns a different number of vectors than
     * the number of chunks it was given.
     */
    async index(runOpts = {}) {
        const started = Date.now();
        const { store, embedder, root, logger } = this.opts;
        // Embedder-signature gate: a change (or an explicit --force) re-embeds all.
        const meta = store.getMeta();
        const signatureChanged = meta !== null &&
            (meta.embedderId !== embedder.id || meta.dim !== embedder.dim);
        const force = Boolean(runOpts.force) || meta === null || signatureChanged;
        if (signatureChanged) {
            logger?.info(`embedder changed (${meta.embedderId} → ${embedder.id}); re-embedding the whole index`);
            for (const path of store.getFileHashes().keys())
                store.deleteByPath(path);
        }
        store.setMeta({ embedderId: embedder.id, dim: embedder.dim });
        const previousHashes = store.getFileHashes();
        const seen = new Set();
        // 1. Walk the repo into a candidate list.
        const entries = [];
        for await (const entry of walkRepo(root, {
            maxFileBytes: this.opts.maxFileBytes,
        })) {
            entries.push({ relPath: entry.relPath, absPath: entry.absPath });
        }
        // 2. Read + hash candidates, deciding which changed (bounded concurrency).
        const results = await mapLimit(entries, this.opts.readConcurrency ?? DEFAULT_READ_CONCURRENCY, async (entry) => {
            let buf;
            try {
                buf = await fs.readFile(entry.absPath);
            }
            catch (err) {
                // The file vanished or became unreadable between walk and read. Leave
                // it out of `seen` so it is purged below instead of failing the pass.
                const reason = err instanceof Error ? err.message : String(err);
                logger?.debug(`skipping unreadable file ${entry.relPath}: ${reason}`);
                return null;
            }
            // The walker already sniffed binaries; re-check in case the file changed
            // between walk and read.
            if (isBinary(buf))
                return null;
            seen.add(entry.relPath);
            const hash = contentHash(buf);
            if (!force && previousHashes.get(entry.relPath) === hash) {
                return null; // unchanged
            }
            return { relPath: entry.relPath, text: buf.toString("utf8"), hash };
        });
        const changed = results.filter((r) => r !== null);
        // `seen` holds readable text candidates (changed + unchanged); the rest are
        // unchanged. Files that turned binary/unreadable aren't seen and collapse
        // into purges below.
        const filesSkipped = seen.size - changed.length;
        // 3. Chunk → embed → upsert, batched by file to bound peak memory.
        let chunksIndexed = 0;
        // A batch size of 0 or NaN would never advance the loop below; fall back
        // to one file per batch.
        const batchSize = Math.max(1, Math.trunc(this.opts.fileBatchSize ?? DEFAULT_FILE_BATCH) || 1);
        for (let i = 0; i < changed.length; i += batchSize) {
            const batch = changed.slice(i, i + batchSize);
            const chunked = batch.map((f) => ({
                relPath: f.relPath,
                hash: f.hash,
                chunks: chunkFile(f.relPath, f.text, this.opts.chunk),
            }));
            const texts = chunked.flatMap((f) => f.chunks.map((c) => c.text));
            const vectors = texts.length > 0 ? await embedder.embed(texts) : [];
            // Vectors are matched to chunks purely by position, so a count
            // mismatch would store missing or misaligned vectors.
            if (vectors.length !== texts.length) {
                throw new Error(`embedder ${embedder.id} returned ${vectors.length} vectors for ${texts.length} chunks`);
            }
            let v = 0;
            for (const file of chunked) {
                const embedded = file.chunks.map((chunk) => ({
                    chunk,
                    vector: vectors[v++],
                }));
                store.upsertFile(file.relPath, file.hash, embedded);
                chunksIndexed += embedded.length;
            }
            logger?.debug(`indexed ${Math.min(i + batchSize, changed.length)}/${changed.length} changed files`);
        }
        // 4. Purge anything previously indexed that we no longer saw.
        let filesPurged = 0;
        for (const path of store.getFileHashes().keys()) {
            if (!seen.has(path)) {
                store.deleteByPath(path);
                filesPurged++;
            }
        }
        return {
            filesSeen: seen.size,
            filesIndexed: changed.length,
            filesSkipped,
            filesPurged,
            chunksIndexed,
            durationMs: Date.now() - started,
        };
    }
}
@@ -0,0 +1,32 @@
1
+ import type { Embedder } from "./embedder.js";
2
+ import type { VectorStore } from "./store.js";
3
+ import type { SearchHit } from "./types.js";
4
+ /** Inputs to {@link searchCodebase}. */
5
+ export interface SearchOptions {
6
+ query: string; // free-text query; embedded with the embedder's `embedQuery`
7
+ /** Number of hits to return; falls back to `defaultK`. Truncated toward zero, then clamped to [1, 50]. */
8
+ k?: number;
9
+ /** Optional glob restricting results by path, e.g. `src/**\/*.ts`. An empty string means no filter. */
10
+ pathGlob?: string;
11
+ }
12
+ /** Collaborators + budget knobs for the retriever. */
13
+ export interface RetrieverDeps {
14
+ store: VectorStore; // searched for the top-k records
15
+ embedder: Embedder; // turns the query text into a vector
16
+ /** `k` used when the caller omits it; subject to the same [1, 50] clamp. */
17
+ defaultK: number;
18
+ /** Approximate combined-snippet token budget (chars/4 heuristic). The top hit is exempt. */
19
+ tokenBudget: number;
20
+ /** Maximum lines kept in any single snippet; longer ones are cut and end with a `… (+N more lines)` marker. */
21
+ maxSnippetLines: number;
22
+ }
23
+ /**
24
+ * Embed the query, run a cosine top-k over the index, and return ranked hits.
25
+ *
26
+ * Results are token-budgeted: snippets are trimmed to `maxSnippetLines`, then
27
+ * hits are appended only while the combined snippet estimate stays under
28
+ * `tokenBudget` — except the top hit, which is always included so a single large
29
+ * match is never dropped. This keeps tool output from flooding the agent's
30
+ * context window. Collection stops at the first hit that does not fit; lower-ranked hits are not considered after it. Scores are rounded to four decimal places.
31
+ */
32
+ export declare function searchCodebase(deps: RetrieverDeps, opts: SearchOptions): Promise<SearchHit[]>;
@@ -0,0 +1,53 @@
1
+ import { estimateTokens, globToRegExp } from "./util.js";
2
/** Upper bound on the number of hits a single search may request. */
const MAX_K = 50;
/**
 * Embed the query, ask the store for the top-k records, and turn them into
 * ranked hits.
 *
 * Output is token-budgeted. Every snippet is first trimmed to
 * `deps.maxSnippetLines`; hits are then collected in rank order until one
 * would exceed what is left of `deps.tokenBudget`, at which point collection
 * stops. The top hit is exempt from the budget, so one large match is never
 * dropped. This keeps tool output from flooding the agent's context window.
 *
 * @param deps Store, embedder, and budget settings.
 * @param opts Query text plus optional `k` and `pathGlob`.
 * @returns Hits in rank order, scores rounded to four decimals.
 */
export async function searchCodebase(deps, opts) {
    const requestedK = opts.k ?? deps.defaultK;
    const limit = clamp(requestedK, 1, MAX_K);
    let pathFilter;
    if (opts.pathGlob) {
        pathFilter = buildPathFilter(opts.pathGlob);
    }
    const queryVector = await deps.embedder.embedQuery(opts.query);
    const ranked = deps.store.search(queryVector, limit, pathFilter);
    const hits = [];
    let budgetLeft = deps.tokenBudget;
    for (const record of ranked) {
        const snippet = trimSnippet(record.text, deps.maxSnippetLines);
        const cost = estimateTokens(snippet);
        const isTopHit = hits.length === 0;
        // Only the top hit may overshoot the budget.
        if (!isTopHit && cost > budgetLeft) {
            break;
        }
        budgetLeft -= cost;
        const { path, startLine, endLine } = record;
        hits.push({ path, startLine, endLine, score: roundScore(record.score), snippet });
    }
    return hits;
}
35
/**
 * Turn a glob into a path predicate. The glob is compiled to a RegExp once,
 * here, and the returned function reuses it for every path.
 *
 * @param glob Glob pattern, converted with `globToRegExp`.
 * @returns A function that tests a path against the compiled pattern.
 */
function buildPathFilter(glob) {
    const pattern = globToRegExp(glob);
    return function matchesGlob(p) {
        return pattern.test(p);
    };
}
40
/**
 * Cap a snippet at `maxLines` lines.
 *
 * @param text Snippet text, with lines separated by "\n".
 * @param maxLines Maximum number of lines to keep.
 * @returns `text` unchanged when it fits; otherwise the first `maxLines` lines
 * followed by a marker line stating how many lines were dropped.
 */
function trimSnippet(text, maxLines) {
    const allLines = text.split("\n");
    const dropped = allLines.length - maxLines;
    if (dropped <= 0) {
        return text;
    }
    const head = allLines.slice(0, maxLines);
    return head.join("\n") + `\n… (+${dropped} more lines)`;
}
48
/**
 * Round a similarity score to four decimal places.
 *
 * @param score Raw score.
 * @returns The score rounded to the nearest 0.0001.
 */
function roundScore(score) {
    const SCALE = 10000;
    const scaled = Math.round(score * SCALE);
    return scaled / SCALE;
}
51
/**
 * Truncate `value` toward zero and confine it to `[min, max]`.
 *
 * A value that is not a number (NaN, `undefined`, a non-numeric string) maps
 * to `min`. `Math.min`/`Math.max` would otherwise propagate NaN to the caller.
 *
 * @param value Requested value; may be fractional, infinite, or NaN.
 * @param min Lower bound, returned for NaN input.
 * @param max Upper bound.
 * @returns An integer within `[min, max]` (given integer bounds).
 */
function clamp(value, min, max) {
    const whole = Math.trunc(value);
    if (Number.isNaN(whole)) {
        return min;
    }
    return Math.min(max, Math.max(min, whole));
}
@@ -0,0 +1,49 @@
1
+ import type { CruxyConfig } from "../config/index.js";
2
+ import { type Embedder } from "./embedder.js";
3
+ import { type SearchOptions } from "./retriever.js";
4
+ import type { IndexStats, IndexStatus, SearchHit } from "./types.js";
5
+ /** Logger surface the service reports through. Module-private: it is not exported. */
6
+ interface ServiceLogger {
7
+ debug(message: string): void; // e.g. index-refresh progress before the first search
8
+ info(message: string): void;
9
+ warn(message: string): void; // e.g. falling back to an in-memory index
10
+ }
11
+ /**
12
+ * A ready-to-use index for one project root: it owns the store, embedder, and
13
+ * indexer, exposes `search`/`index`/`status`, and refreshes itself lazily on the
14
+ * first search. Build one with {@link getIndexService}, which caches per cwd so
15
+ * the heavy backends are constructed at most once per process.
16
+ */
17
+ export interface IndexService {
18
+ /** Search the index, first running an index pass if none has completed in this process. */
19
+ search(opts: SearchOptions): Promise<SearchHit[]>;
20
+ /** Build/refresh the index (incremental unless `force`). A successful call also counts as the refresh `search` would otherwise perform. */
21
+ index(runOpts?: {
22
+ force?: boolean;
23
+ }): Promise<IndexStats>;
24
+ /** Read-only snapshot for `cruxy index --status` (does not modify the index). */
25
+ status(): IndexStatus;
26
+ /** Release backend resources by closing the underlying store. */
27
+ close(): void;
28
+ }
29
+ /** Absolute path of the SQLite index for a project root: `index.db` inside the `GLOBAL_DIR_NAME` directory under the resolved `cwd`. */
30
+ export declare function indexDbPath(cwd: string): string;
31
+ /**
32
+ * Explicit dependency overrides. The sole supported override is `embedder`: it
33
+ * is the *only* way to substitute a non-production embedder (e.g.
34
+ * `HashingEmbedder`) and exists for tests. Production callers (the
35
+ * `search_codebase` tool, `cruxy index`) never pass it, so they always go
36
+ * through {@link createEmbedder} — fastembed or a loud failure. Overrides only take effect when the service is built; they are ignored if one is already cached for the same cwd.
37
+ */
38
+ export interface IndexServiceDeps {
39
+ embedder?: Embedder; // used instead of the embedder from createEmbedder when provided
40
+ }
41
+ /**
42
+ * Get (or build) the {@link IndexService} for a project root, cached per
43
+ * resolved cwd so the embedder + store are created at most once per process.
44
+ * `deps` is for explicit test injection only (see {@link IndexServiceDeps}). A build that fails is not cached, so the next call retries.
45
+ */
46
+ export declare function getIndexService(cwd: string, config: CruxyConfig, logger: ServiceLogger, deps?: IndexServiceDeps): Promise<IndexService>;
47
+ /** Drop all cached services (closing them). For tests and process teardown. Services that failed to build, or fail while closing, are ignored. */
48
+ export declare function resetIndexServices(): Promise<void>;
49
+ export {};
@@ -0,0 +1,132 @@
1
+ import { promises as fsp } from "node:fs";
2
+ import path from "node:path";
3
+ import { globalDir } from "../config/index.js";
4
+ import { GLOBAL_DIR_NAME } from "../constants.js";
5
+ import { createEmbedder } from "./embedder.js";
6
+ import { Indexer } from "./indexer.js";
7
+ import { searchCodebase } from "./retriever.js";
8
+ import { createSqliteVectorStore, InMemoryVectorStore, } from "./store.js";
9
/**
 * Locate the SQLite index file that belongs to a project root.
 *
 * @param cwd Project directory; resolved to an absolute path first.
 * @returns `<resolved cwd>/<GLOBAL_DIR_NAME>/index.db`.
 */
export function indexDbPath(cwd) {
    const projectRoot = path.resolve(cwd);
    return path.join(projectRoot, GLOBAL_DIR_NAME, "index.db");
}
13
/**
 * Index service for one project root. It wires the store and embedder into an
 * `Indexer`, and makes sure an index pass has completed in this process before
 * the first search is answered.
 */
class IndexServiceImpl {
    store;
    embedder;
    config;
    /** Path of the backing database file, or null for an in-memory store. */
    storePath;
    logger;
    indexer;
    /** True once any index pass has completed successfully in this process. */
    refreshed = false;
    /** The lazy refresh currently in flight, shared by concurrent searches; otherwise null. */
    refreshing = null;
    constructor(store, embedder, config, storePath, logger, root) {
        this.store = store;
        this.embedder = embedder;
        this.config = config;
        this.storePath = storePath;
        this.logger = logger;
        this.indexer = new Indexer({
            root,
            store,
            embedder,
            maxFileBytes: config.maxFileBytes,
            chunk: config.chunk,
            logger,
        });
    }
    /**
     * Build or refresh the index and mark the service as refreshed.
     *
     * @param runOpts Forwarded to `Indexer.index` (`force` re-embeds everything).
     * @returns Statistics of the pass.
     */
    async index(runOpts = {}) {
        const stats = await this.indexer.index(runOpts);
        this.refreshed = true;
        return stats;
    }
    /**
     * Search the index, bringing it up to date first if no index pass has
     * completed in this process yet.
     *
     * Concurrent first searches share a single refresh instead of each starting
     * their own pass against the same store. If that refresh fails, every
     * waiting search rejects and a later search tries again.
     */
    async search(opts) {
        // Lazily bring the index up to date once per process so searches never read
        // stale (or, for a fresh/in-memory store, empty) results.
        if (!this.refreshed) {
            await this.ensureRefreshed();
        }
        return searchCodebase({
            store: this.store,
            embedder: this.embedder,
            defaultK: this.config.search.defaultK,
            tokenBudget: this.config.search.tokenBudget,
            maxSnippetLines: this.config.search.maxSnippetLines,
        }, opts);
    }
    /** Return the in-flight lazy refresh, starting one if none is running. */
    ensureRefreshed() {
        if (!this.refreshing) {
            const run = this.refreshBeforeSearch();
            this.refreshing = run;
            // Forget the attempt once it settles, whatever the outcome; a failed
            // refresh must not block later retries.
            const forget = () => {
                if (this.refreshing === run) {
                    this.refreshing = null;
                }
            };
            run.then(forget, forget);
        }
        return this.refreshing;
    }
    /** Run the pre-search index pass and log a one-line summary of it. */
    async refreshBeforeSearch() {
        this.logger.debug("refreshing codebase index before first search");
        const stats = await this.index();
        this.logger.debug(`index refresh: +${stats.filesIndexed} ~${stats.filesSkipped} -${stats.filesPurged} ` +
            `(${stats.chunksIndexed} chunks, ${stats.durationMs}ms)`);
    }
    /**
     * Read-only snapshot of the index. `exists` is true when at least one file
     * is indexed; `embedderId` and `dim` are null when the store has no metadata.
     */
    status() {
        const meta = this.store.getMeta();
        const files = this.store.countFiles();
        return {
            exists: files > 0,
            embedderId: meta?.embedderId ?? null,
            dim: meta?.dim ?? null,
            files,
            chunks: this.store.countChunks(),
            storePath: this.storePath,
        };
    }
    /** Release backend resources by closing the store. */
    close() {
        this.store.close();
    }
}
74
+ // ── per-cwd cache ─────────────────────────────────────────────────────────────
75
+ const cache = new Map(); // resolved project root → promise of its IndexService
76
/**
 * Get (or build) the {@link IndexService} for a project root, cached per
 * resolved cwd so the embedder + store are created at most once per process.
 *
 * The cache holds the pending promise, so concurrent callers share one build.
 * `config`, `logger`, and `deps` are only used when a build is started; they
 * are ignored when a service for the same root is already cached. `deps` is for
 * explicit test injection only (see {@link IndexServiceDeps}).
 *
 * @returns A promise for the service. It rejects if the build fails, in which
 * case nothing stays cached and the next call retries.
 */
export function getIndexService(cwd, config, logger, deps) {
    const root = path.resolve(cwd);
    const cached = cache.get(root);
    if (cached) {
        return cached;
    }
    const pending = buildService(root, config, logger, deps).catch((err) => {
        // Don't cache a failed construction — let the next call retry. Only
        // evict our own entry: the cache may have been reset and repopulated
        // with a newer build for this root while this one was still running.
        if (cache.get(root) === pending) {
            cache.delete(root);
        }
        throw err;
    });
    cache.set(root, pending);
    return pending;
}
94
/**
 * Empty the service cache and close every service that was in it. Intended for
 * tests and process teardown.
 *
 * The cache is cleared before anything is awaited. Services are then awaited
 * and closed one after another; a service that failed to build, or whose
 * `close()` throws, is skipped silently.
 */
export async function resetIndexServices() {
    const snapshot = Array.from(cache.values());
    cache.clear();
    for (const entry of snapshot) {
        try {
            const service = await entry;
            service.close();
        }
        catch {
            /* already closed or failed to build */
        }
    }
}
107
/**
 * Construct the service for an already-resolved project root.
 *
 * The embedder is `deps.embedder` when one was injected (tests); otherwise it
 * comes from `createEmbedder`, with models cached under `<globalDir()>/models`.
 * The store is opened afterwards, according to `config.index.store`.
 */
async function buildService(root, config, logger, deps) {
    const indexConfig = config.index;
    // Production always builds the fastembed embedder (or fails loudly). An
    // injected embedder is honored only when a test passes one explicitly.
    let embedder = deps?.embedder;
    if (embedder === undefined || embedder === null) {
        const cacheDir = path.join(globalDir(), "models");
        embedder = await createEmbedder({ cacheDir });
    }
    const opened = await openStore(root, indexConfig.store, logger);
    return new IndexServiceImpl(opened.store, embedder, indexConfig, opened.storePath, logger, root);
}
116
/**
 * Open the vector store selected by `kind`.
 *
 * - `"memory"`: always an in-memory store.
 * - `"sqlite"`: the SQLite store at `indexDbPath(root)`; failures are rethrown.
 * - anything else: try SQLite and, if that fails, warn and use an in-memory store.
 *
 * @returns The store and its database path (`null` for an in-memory store).
 */
async function openStore(root, kind, logger) {
    const inMemory = () => ({ store: new InMemoryVectorStore(), storePath: null });
    if (kind === "memory") {
        return inMemory();
    }
    const dbPath = indexDbPath(root);
    try {
        // The SQLite store expects its parent directory to exist already.
        await fsp.mkdir(path.dirname(dbPath), { recursive: true });
        return { store: await createSqliteVectorStore(dbPath), storePath: dbPath };
    }
    catch (err) {
        if (kind !== "sqlite") {
            logger.warn(`sqlite index unavailable (${err.message}); using an in-memory index`);
            return inMemory();
        }
        throw err;
    }
}
@@ -0,0 +1,103 @@
1
+ import type DatabaseNs from "better-sqlite3";
2
+ import type { EmbeddedChunk, ScoredRecord } from "./types.js";
3
+ /**
4
+ * Vector storage for the codebase index. The {@link VectorStore} interface hides
5
+ * the backend so the brute-force cosine search here can be swapped for an ANN
6
+ * index later without touching the indexer, retriever, or tool. Two backends
7
+ * ship: {@link SqliteVectorStore} (persistent, the default) and
8
+ * {@link InMemoryVectorStore} (ephemeral, for tests).
9
+ *
10
+ * Vectors are stored L2-normalized, so cosine similarity reduces to a dot
11
+ * product (see {@link dot}).
12
+ */
13
+ /** Embedder identity + dimensionality the index was built with. A mismatch against the active embedder makes the indexer re-embed everything. */
14
+ export interface VectorStoreMeta {
15
+ embedderId: string; // the embedder's `id`
16
+ dim: number; // the embedder's `dim`
17
+ }
18
+ export interface VectorStore { // backend-agnostic storage contract used by the indexer and retriever
19
+ /** Embedder identity + dim recorded for this index, or null if never set. The indexer treats null as "index everything". */
20
+ getMeta(): VectorStoreMeta | null;
21
+ /** Record the embedder identity + dim. Called when the index is (re)initialized. */
22
+ setMeta(meta: VectorStoreMeta): void;
23
+ /** Map of indexed file path → content hash, for incremental decisions. */
24
+ getFileHashes(): Map<string, string>;
25
+ /** Replace every chunk for `path` with `chunks` and record its hash. Atomic. */
26
+ upsertFile(path: string, hash: string, chunks: EmbeddedChunk[]): void;
27
+ /** Remove a file and all of its chunks. */
28
+ deleteByPath(path: string): void;
29
+ /**
30
+ * Brute-force cosine top-k over stored vectors, optionally restricted to paths
31
+ * for which `pathFilter` returns true. Results are sorted by score, descending.
32
+ */
33
+ search(query: Float32Array, k: number, pathFilter?: (path: string) => boolean): ScoredRecord[];
34
+ /** Number of indexed files. */
35
+ countFiles(): number;
36
+ /** Number of stored chunks. */
37
+ countChunks(): number;
38
+ /** Release any held resources (e.g. the SQLite handle). Idempotent. */
39
+ close(): void;
40
+ }
41
+ /** Dot product over the overlapping prefix of two vectors. Equals cosine similarity when both are L2-normalized. */
42
+ export declare function dot(a: Float32Array, b: Float32Array): number;
43
+ /** Cosine similarity in [-1, 1]. Returns 0 if either vector is all-zero. */
44
+ export declare function cosineSimilarity(a: Float32Array, b: Float32Array): number;
45
+ /** Serialize a vector to a little-endian Float32 BLOB (an owning copy). Counterpart of `blobToVector`. */
46
+ export declare function vectorToBlob(v: Float32Array): Buffer;
47
+ /** Deserialize a little-endian Float32 BLOB back into a vector. Counterpart of `vectorToBlob`. */
48
+ export declare function blobToVector(buf: Buffer): Float32Array;
49
+ /** Indices of the `k` largest scores, sorted descending. */
50
+ export declare function topKIndices(scores: ArrayLike<number>, k: number): number[];
51
+ /**
52
+ * A non-persistent {@link VectorStore} backed by plain JS maps. Used by the test
53
+ * suite and as the `auto` fallback when the SQLite native dependency is
54
+ * unavailable. Holds everything in memory; rebuilt on each process. Also used when the store kind is configured as "memory".
55
+ */
56
+ export declare class InMemoryVectorStore implements VectorStore {
57
+ private meta;
58
+ private readonly fileHashes;
59
+ private readonly chunksByPath; // presumably chunks grouped by file path — confirm in the implementation
60
+ getMeta(): VectorStoreMeta | null;
61
+ setMeta(meta: VectorStoreMeta): void;
62
+ getFileHashes(): Map<string, string>;
63
+ upsertFile(path: string, hash: string, chunks: EmbeddedChunk[]): void;
64
+ deleteByPath(path: string): void;
65
+ countFiles(): number;
66
+ countChunks(): number;
67
+ search(query: Float32Array, k: number, pathFilter?: (path: string) => boolean): ScoredRecord[];
68
+ close(): void;
69
+ }
70
+ type DB = DatabaseNs.Database; // database handle type from better-sqlite3
71
+ /**
72
+ * Persistent {@link VectorStore} backed by better-sqlite3 at `.cruxy/index.db`.
73
+ * Vectors are stored as Float32 BLOBs; search lazily loads them into one
74
+ * contiguous matrix (cached until the next write) so warm queries are a tight
75
+ * dot-product scan rather than per-row deserialization.
76
+ *
77
+ * Construct via {@link createSqliteVectorStore}, which loads the native module.
78
+ */
79
+ export declare class SqliteVectorStore implements VectorStore {
80
+ private readonly db;
81
+ private cache; // presumably the lazily built vector matrix described above — confirm in the implementation
82
+ private closed; // presumably what makes close() idempotent — confirm in the implementation
83
+ constructor(db: DB);
84
+ getMeta(): VectorStoreMeta | null;
85
+ setMeta(meta: VectorStoreMeta): void;
86
+ getFileHashes(): Map<string, string>;
87
+ upsertFile(path: string, hash: string, chunks: EmbeddedChunk[]): void;
88
+ deleteByPath(path: string): void;
89
+ countFiles(): number;
90
+ countChunks(): number;
91
+ search(query: Float32Array, k: number, pathFilter?: (path: string) => boolean): ScoredRecord[];
92
+ close(): void;
93
+ private ensureCache;
94
+ private getMetaValue;
95
+ private setMetaValue;
96
+ }
97
+ /**
98
+ * Open (or create) a {@link SqliteVectorStore} at `dbPath`. The native module is
99
+ * imported lazily here, so nothing loads it until an index is actually used.
100
+ * The parent directory must already exist; this function is not documented to create it, so callers should `mkdir` it first.
101
+ */
102
+ export declare function createSqliteVectorStore(dbPath: string): Promise<SqliteVectorStore>;
103
+ export {};