@cruxy/cli 0.1.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. package/README.md +35 -1
  2. package/dist/cli/commands/index.d.ts +7 -0
  3. package/dist/cli/commands/index.js +59 -0
  4. package/dist/cli/commands/skills.d.ts +8 -0
  5. package/dist/cli/commands/skills.js +51 -0
  6. package/dist/cli/program.js +4 -0
  7. package/dist/config/schema.d.ts +199 -0
  8. package/dist/config/schema.js +55 -0
  9. package/dist/constants.d.ts +13 -0
  10. package/dist/constants.js +13 -0
  11. package/dist/indexing/chunker.d.ts +28 -0
  12. package/dist/indexing/chunker.js +65 -0
  13. package/dist/indexing/embedder.d.ts +98 -0
  14. package/dist/indexing/embedder.js +140 -0
  15. package/dist/indexing/index.d.ts +9 -0
  16. package/dist/indexing/index.js +9 -0
  17. package/dist/indexing/indexer.d.ts +45 -0
  18. package/dist/indexing/indexer.js +104 -0
  19. package/dist/indexing/retriever.d.ts +32 -0
  20. package/dist/indexing/retriever.js +53 -0
  21. package/dist/indexing/service.d.ts +49 -0
  22. package/dist/indexing/service.js +132 -0
  23. package/dist/indexing/store.d.ts +103 -0
  24. package/dist/indexing/store.js +279 -0
  25. package/dist/indexing/types.d.ts +71 -0
  26. package/dist/indexing/types.js +6 -0
  27. package/dist/indexing/util.d.ts +34 -0
  28. package/dist/indexing/util.js +97 -0
  29. package/dist/indexing/walker.d.ts +42 -0
  30. package/dist/indexing/walker.js +166 -0
  31. package/dist/skills/index.d.ts +4 -0
  32. package/dist/skills/index.js +4 -0
  33. package/dist/skills/loader.d.ts +42 -0
  34. package/dist/skills/loader.js +0 -0
  35. package/dist/skills/parser.d.ts +29 -0
  36. package/dist/skills/parser.js +90 -0
  37. package/dist/skills/service.d.ts +41 -0
  38. package/dist/skills/service.js +92 -0
  39. package/dist/skills/types.d.ts +94 -0
  40. package/dist/skills/types.js +21 -0
  41. package/dist/tools/index.d.ts +3 -0
  42. package/dist/tools/index.js +3 -0
  43. package/dist/tools/list-skills.d.ts +9 -0
  44. package/dist/tools/list-skills.js +34 -0
  45. package/dist/tools/load-skill.d.ts +21 -0
  46. package/dist/tools/load-skill.js +49 -0
  47. package/dist/tools/registry.js +6 -0
  48. package/dist/tools/search-codebase.d.ts +25 -0
  49. package/dist/tools/search-codebase.js +70 -0
  50. package/package.json +6 -2
  51. package/skills/git-commit/SKILL.md +60 -0
  52. package/skills/using-skills/SKILL.md +62 -0
@@ -0,0 +1,98 @@
1
+ /**
2
+ * Output dimensionality of bge-small-en-v1.5, and the default size of the
3
+ * hashing embedder's feature space, so the two backends are interchangeable in
4
+ * the store schema.
5
+ */
6
+ export declare const DEFAULT_DIM = 384;
7
+ /**
8
+ * Turns text into vectors. Two implementations ship: {@link FastEmbedEmbedder}
9
+ * (local ONNX, semantic) and {@link HashingEmbedder} (dependency-free,
10
+ * deterministic, lexical). The store records `id`; if it changes, the index is
11
+ * re-embedded from scratch (see `indexer.ts`).
12
+ */
13
+ export interface Embedder {
14
+ /** Stable identity of this embedder + model. Persisted with the index. */
15
+ readonly id: string;
16
+ /** Embedding dimensionality. */
17
+ readonly dim: number;
18
+ /**
19
+ * Embed document/passage texts. Returns one L2-normalized vector per input,
20
+ * in the same order. An empty input yields an empty array.
21
+ */
22
+ embed(texts: string[]): Promise<Float32Array[]>;
23
+ /**
24
+ * Embed a search query. May use a different representation than {@link embed}
25
+ * (e.g. bge prepends a retrieval instruction to queries).
26
+ */
27
+ embedQuery(text: string): Promise<Float32Array>;
28
+ }
29
+ /**
30
+ * Split text into lowercase lexical tokens, breaking identifiers on camelCase
31
+ * and snake/kebab boundaries so `getUserById` contributes `get`/`user`/`by`/`id`.
32
+ */
33
+ export declare function tokenize(text: string): string[];
34
+ export interface HashingEmbedderOptions {
35
+ dim?: number;
36
+ }
37
+ /**
38
+ * A deterministic, dependency-free embedder: signed feature hashing
39
+ * (bag-of-words) over code-aware tokens, L2-normalized. It captures lexical
40
+ * overlap rather than true semantics, but it is offline, instant, and stable —
41
+ * which makes it the right default for tests. It is never selected
42
+ * automatically; production code always goes through `createEmbedder`.
43
+ */
44
+ export declare class HashingEmbedder implements Embedder {
45
+ readonly id: string;
46
+ readonly dim: number;
47
+ constructor(opts?: HashingEmbedderOptions);
48
+ embed(texts: string[]): Promise<Float32Array[]>;
49
+ embedQuery(text: string): Promise<Float32Array>;
50
+ private embedOne;
51
+ }
52
+ export interface FastEmbedEmbedderOptions {
53
+ /** Where the ONNX model is downloaded/cached. Defaults to fastembed's own dir. */
54
+ cacheDir?: string;
55
+ /** Max model input length, in tokens. */
56
+ maxLength?: number;
57
+ /** Texts per inference batch. */
58
+ batchSize?: number;
59
+ /** Print the model download progress on first run. */
60
+ showDownloadProgress?: boolean;
61
+ }
62
+ /**
63
+ * Local semantic embeddings via fastembed (ONNX), model bge-small-en-v1.5
64
+ * (384-dim). The native dependency is imported lazily on first use, so merely
65
+ * registering the `search_codebase` tool stays cheap and the heavy ONNX runtime
66
+ * only loads when an index is actually built or queried.
67
+ *
68
+ * Embedding is CPU-bound and single-threaded inside ONNX, so throughput is
69
+ * bounded by `batchSize` (fed sequentially through fastembed's batching
70
+ * generator) rather than by JS-level concurrency.
71
+ */
72
+ export declare class FastEmbedEmbedder implements Embedder {
73
+ readonly id = "fastembed:bge-small-en-v1.5";
74
+ readonly dim = 384;
75
+ private readonly opts;
76
+ private model;
77
+ constructor(opts?: FastEmbedEmbedderOptions);
78
+ private getModel;
79
+ embed(texts: string[]): Promise<Float32Array[]>;
80
+ embedQuery(text: string): Promise<Float32Array>;
81
+ }
82
+ export interface CreateEmbedderOptions {
83
+ /** Cache dir for the fastembed model. */
84
+ cacheDir?: string;
85
+ }
86
+ /**
87
+ * Build the **production** embedder: fastembed / bge-small-en-v1.5.
88
+ *
89
+ * The native dependency is loaded eagerly here so a missing or broken install
90
+ * fails *now*, loudly, with an actionable message — rather than silently
91
+ * degrading search quality at query time. There is deliberately **no** fallback
92
+ * to {@link HashingEmbedder}: that backend is reachable only by explicit
93
+ * injection in tests (see `getIndexService`'s `deps.embedder`), never selected
94
+ * automatically by this factory or by config.
95
+ *
96
+ * @throws if the fastembed native module cannot be loaded.
97
+ */
98
+ export declare function createEmbedder(opts?: CreateEmbedderOptions): Promise<Embedder>;
@@ -0,0 +1,140 @@
1
+ import { promises as fs } from "node:fs";
2
+ import { l2normalize } from "./util.js";
3
+ /**
4
+ * Output dimensionality of bge-small-en-v1.5, and the default size of the
5
+ * hashing embedder's feature space, so the two backends are interchangeable in
6
+ * the store schema.
7
+ */
8
+ export const DEFAULT_DIM = 384;
9
+ // ── Hashing embedder ────────────────────────────────────────────────────────
10
/**
 * 32-bit FNV-1a hash of a string, returned as an unsigned integer.
 *
 * The string is consumed one UTF-16 code unit at a time. The function is
 * deterministic and has no dependencies.
 */
function fnv1a(str) {
    const FNV_PRIME = 0x01000193; // 16777619
    let hash = 0x811c9dc5; // FNV offset basis
    let pos = 0;
    while (pos < str.length) {
        // XOR the next code unit in, then multiply by the prime. Math.imul keeps
        // the product inside 32 bits.
        hash = Math.imul(hash ^ str.charCodeAt(pos), FNV_PRIME);
        pos += 1;
    }
    // Reinterpret the signed 32-bit result as unsigned.
    return hash >>> 0;
}
20
/**
 * Break text into lowercase lexical tokens.
 *
 * Runs of ASCII letters and digits are extracted first, so snake_case and
 * kebab-case separators act as delimiters. Each run is then split on
 * lower/digit → upper transitions (camelCase) and on letter → digit
 * transitions, so `getUserById` yields `get`, `user`, `by`, `id`.
 */
export function tokenize(text) {
    const words = text.match(/[A-Za-z0-9]+/g);
    if (words === null)
        return [];
    return words
        .flatMap((word) => word.split(/(?<=[a-z0-9])(?=[A-Z])|(?<=[A-Za-z])(?=[0-9])/))
        .filter((piece) => piece.length > 0)
        .map((piece) => piece.toLowerCase());
}
34
+ /**
35
+ * A deterministic, dependency-free embedder: signed feature hashing
36
+ * (bag-of-words) over code-aware tokens, L2-normalized. It captures lexical
37
+ * overlap rather than true semantics, but it is offline, instant, and stable —
38
+ * which makes it the right default for tests and a sane fallback when the native
39
+ * embedding model is unavailable.
40
+ */
41
+ export class HashingEmbedder {
42
+ id;
43
+ dim;
44
+ constructor(opts = {}) {
45
+ this.dim = opts.dim ?? DEFAULT_DIM;
46
+ this.id = `hash:v1:${this.dim}`;
47
+ }
48
+ async embed(texts) {
49
+ return texts.map((t) => this.embedOne(t));
50
+ }
51
+ async embedQuery(text) {
52
+ return this.embedOne(text);
53
+ }
54
+ embedOne(text) {
55
+ const v = new Float32Array(this.dim);
56
+ for (const tok of tokenize(text)) {
57
+ const bucket = fnv1a(tok) % this.dim;
58
+ // A second hash supplies a sign, halving the bias from bucket collisions.
59
+ const sign = (fnv1a(`#${tok}`) & 1) === 0 ? 1 : -1;
60
+ v[bucket] += sign;
61
+ }
62
+ return l2normalize(v);
63
+ }
64
+ }
65
/**
 * Local semantic embeddings via fastembed (ONNX), model bge-small-en-v1.5
 * (384-dim). The native dependency is imported lazily on first use, so merely
 * registering the `search_codebase` tool stays cheap and the heavy ONNX runtime
 * only loads when an index is actually built or queried.
 *
 * Embedding is CPU-bound and single-threaded inside ONNX, so throughput is
 * bounded by `batchSize` (fed sequentially through fastembed's batching
 * generator) rather than by JS-level concurrency.
 */
export class FastEmbedEmbedder {
    id = "fastembed:bge-small-en-v1.5";
    dim = DEFAULT_DIM;
    opts;
    /**
     * Memoized promise for the loaded model. `null` until first use, and reset
     * to `null` when a load attempt fails so a later call can retry.
     */
    model = null;
    constructor(opts = {}) {
        this.opts = opts;
    }
    /**
     * Load the fastembed model once and share the in-flight promise between
     * concurrent callers.
     *
     * A rejected load is not kept: a transient failure must not disable the
     * embedder for the rest of the process.
     */
    getModel() {
        if (!this.model) {
            const loading = (async () => {
                const mod = await import("fastembed");
                // fastembed's init does a non-recursive mkdir of the cache dir, so it
                // fails if an ancestor (e.g. ~/.cruxy) doesn't exist yet. Create it first.
                if (this.opts.cacheDir) {
                    await fs.mkdir(this.opts.cacheDir, { recursive: true });
                }
                return (await mod.FlagEmbedding.init({
                    model: mod.EmbeddingModel.BGESmallENV15,
                    maxLength: this.opts.maxLength ?? 512,
                    cacheDir: this.opts.cacheDir,
                    showDownloadProgress: this.opts.showDownloadProgress ?? false,
                }));
            })();
            this.model = loading;
            // Drop the memoized promise on failure, but only if it is still the one
            // we stored. The rejection itself still reaches whoever awaits `loading`.
            loading.catch(() => {
                if (this.model === loading)
                    this.model = null;
            });
        }
        return this.model;
    }
    /**
     * Embed document texts in batches of `opts.batchSize` (default 32).
     * Returns one L2-normalized vector per input, in input order; an empty
     * input returns `[]` without loading the model.
     */
    async embed(texts) {
        if (texts.length === 0)
            return [];
        const model = await this.getModel();
        const out = [];
        for await (const batch of model.embed(texts, this.opts.batchSize ?? 32)) {
            for (const vec of batch)
                out.push(l2normalize(Float32Array.from(vec)));
        }
        return out;
    }
    /** Embed a search query through the model's `queryEmbed`, L2-normalized. */
    async embedQuery(text) {
        const model = await this.getModel();
        return l2normalize(Float32Array.from(await model.queryEmbed(text)));
    }
}
118
/**
 * Build the **production** embedder: fastembed / bge-small-en-v1.5.
 *
 * The native dependency is loaded eagerly here so a missing or broken install
 * fails *now*, loudly, with an actionable message — rather than silently
 * degrading search quality at query time. There is deliberately **no** fallback
 * to {@link HashingEmbedder}: that backend is reachable only by explicit
 * injection in tests (see `getIndexService`'s `deps.embedder`), never selected
 * automatically by this factory or by config.
 *
 * @param opts Optional settings; `opts.cacheDir` is forwarded to the embedder.
 * @returns A {@link FastEmbedEmbedder}; the model itself is still loaded lazily.
 * @throws if the fastembed native module cannot be loaded.
 */
export async function createEmbedder(opts = {}) {
    try {
        await import("fastembed");
    }
    catch (err) {
        // A failing module can throw anything, not only Error instances; avoid
        // printing "undefined" as the reason.
        const reason = err instanceof Error ? err.message : String(err);
        throw new Error(`the local embedding model (fastembed) could not be loaded: ${reason}. ` +
            "It requires the onnxruntime-node native addon — reinstall with `pnpm install` and " +
            "ensure the native build completed. The codebase index is unavailable until this is fixed.");
    }
    return new FastEmbedEmbedder({ cacheDir: opts.cacheDir });
}
@@ -0,0 +1,9 @@
1
+ export * from "./types.js";
2
+ export * from "./util.js";
3
+ export * from "./chunker.js";
4
+ export * from "./embedder.js";
5
+ export * from "./store.js";
6
+ export * from "./walker.js";
7
+ export * from "./indexer.js";
8
+ export * from "./retriever.js";
9
+ export * from "./service.js";
@@ -0,0 +1,9 @@
1
+ export * from "./types.js";
2
+ export * from "./util.js";
3
+ export * from "./chunker.js";
4
+ export * from "./embedder.js";
5
+ export * from "./store.js";
6
+ export * from "./walker.js";
7
+ export * from "./indexer.js";
8
+ export * from "./retriever.js";
9
+ export * from "./service.js";
@@ -0,0 +1,45 @@
1
+ import { type ChunkOptions } from "./chunker.js";
2
+ import type { Embedder } from "./embedder.js";
3
+ import type { VectorStore } from "./store.js";
4
+ import type { IndexStats } from "./types.js";
5
+ /** Minimal logger surface the indexer reports progress to. */
6
+ interface IndexerLogger {
7
+ debug(message: string): void;
8
+ info(message: string): void;
9
+ }
10
+ export interface IndexerOptions {
11
+ /** Absolute project root to index. */
12
+ root: string;
13
+ /** Where embeddings are stored. */
14
+ store: VectorStore;
15
+ /** How text is turned into vectors. */
16
+ embedder: Embedder;
17
+ /** Hard per-file size cap, in bytes. */
18
+ maxFileBytes: number;
19
+ /** Chunking parameters. */
20
+ chunk: ChunkOptions;
21
+ /** Concurrency for reading/hashing files. */
22
+ readConcurrency?: number;
23
+ /** Files chunked + embedded + upserted per batch (bounds peak memory). */
24
+ fileBatchSize?: number;
25
+ /** Optional progress logger. */
26
+ logger?: IndexerLogger;
27
+ }
28
+ /**
29
+ * Orchestrates the index: walk → hash → (skip unchanged | chunk → embed →
30
+ * upsert) → purge removed.
31
+ *
32
+ * Incrementality is content-hash based: a file is re-embedded only when its
33
+ * bytes change; unchanged files are skipped; files that disappeared (deleted or
34
+ * newly ignored) are purged. Changing the embedder (a new model or dimension)
35
+ * invalidates the whole index — its signature is stored, and a mismatch forces a
36
+ * full re-embed so vectors never mix across models.
37
+ */
38
+ export declare class Indexer {
39
+ private readonly opts;
40
+ constructor(opts: IndexerOptions);
41
+ index(runOpts?: {
42
+ force?: boolean;
43
+ }): Promise<IndexStats>;
44
+ }
45
+ export {};
@@ -0,0 +1,104 @@
1
+ import { promises as fs } from "node:fs";
2
+ import { chunkFile } from "./chunker.js";
3
+ import { contentHash, isBinary, mapLimit } from "./util.js";
4
+ import { walkRepo } from "./walker.js";
5
+ const DEFAULT_READ_CONCURRENCY = 8;
6
+ const DEFAULT_FILE_BATCH = 64;
7
/**
 * Orchestrates the index: walk → hash → (skip unchanged | chunk → embed →
 * upsert) → purge removed.
 *
 * Incrementality is content-hash based: a file is re-embedded only when its
 * bytes change; unchanged files are skipped; files that disappeared (deleted or
 * newly ignored) are purged. Changing the embedder (a new model or dimension)
 * invalidates the whole index — its signature is stored, and a mismatch forces a
 * full re-embed so vectors never mix across models.
 */
export class Indexer {
    opts;
    constructor(opts) {
        this.opts = opts;
    }
    /**
     * Run one indexing pass over `opts.root`.
     *
     * @param runOpts `force: true` re-embeds every file regardless of its hash.
     * @returns Counts of files seen / indexed / skipped / purged, the number of
     *   chunks written, and the elapsed time in milliseconds.
     * @throws if the embedder returns a different number of vectors than the
     *   number of chunk texts it was given, or if the walker, embedder or store
     *   throws.
     */
    async index(runOpts = {}) {
        const started = Date.now();
        const { store, embedder, root, logger } = this.opts;
        // Embedder-signature gate: a change (or an explicit --force) re-embeds all.
        const meta = store.getMeta();
        const signatureChanged = meta !== null &&
            (meta.embedderId !== embedder.id || meta.dim !== embedder.dim);
        const force = Boolean(runOpts.force) || meta === null || signatureChanged;
        if (signatureChanged) {
            logger?.info(`embedder changed (${meta.embedderId} → ${embedder.id}); re-embedding the whole index`);
            for (const path of store.getFileHashes().keys())
                store.deleteByPath(path);
        }
        store.setMeta({ embedderId: embedder.id, dim: embedder.dim });
        const previousHashes = store.getFileHashes();
        const seen = new Set();
        // 1. Walk the repo into a candidate list.
        const entries = [];
        for await (const entry of walkRepo(root, {
            maxFileBytes: this.opts.maxFileBytes,
        })) {
            entries.push({ relPath: entry.relPath, absPath: entry.absPath });
        }
        // 2. Read + hash candidates, deciding which changed (bounded concurrency).
        const results = await mapLimit(entries, this.opts.readConcurrency ?? DEFAULT_READ_CONCURRENCY, async (entry) => {
            let buf;
            try {
                buf = await fs.readFile(entry.absPath);
            }
            catch (err) {
                // The file vanished or became unreadable between walk and read.
                // Leave it out of `seen` so it is purged below, rather than
                // aborting the whole run.
                const reason = err instanceof Error ? err.message : String(err);
                logger?.debug(`skipping unreadable file ${entry.relPath}: ${reason}`);
                return null;
            }
            // The walker already sniffed binaries; re-check in case the file changed
            // between walk and read.
            if (isBinary(buf))
                return null;
            seen.add(entry.relPath);
            const hash = contentHash(buf);
            if (!force && previousHashes.get(entry.relPath) === hash) {
                return null; // unchanged
            }
            return { relPath: entry.relPath, text: buf.toString("utf8"), hash };
        });
        const changed = results.filter((r) => r !== null);
        // `seen` holds every readable text candidate (changed + unchanged), so the
        // difference is the unchanged count. Files that turned binary or unreadable
        // are not in `seen` and collapse into purges below.
        const filesSkipped = seen.size - changed.length;
        // 3. Chunk → embed → upsert, batched by file to bound peak memory.
        let chunksIndexed = 0;
        const batchSize = this.opts.fileBatchSize ?? DEFAULT_FILE_BATCH;
        for (let i = 0; i < changed.length; i += batchSize) {
            const batch = changed.slice(i, i + batchSize);
            const chunked = batch.map((f) => ({
                relPath: f.relPath,
                hash: f.hash,
                chunks: chunkFile(f.relPath, f.text, this.opts.chunk),
            }));
            const texts = chunked.flatMap((f) => f.chunks.map((c) => c.text));
            const vectors = texts.length > 0 ? await embedder.embed(texts) : [];
            // Vectors are matched to chunks purely by position, so a short result
            // would store `undefined` vectors. Fail before anything is written.
            if (vectors.length !== texts.length) {
                throw new Error(`embedder ${embedder.id} returned ${vectors.length} vectors for ${texts.length} chunks`);
            }
            let v = 0;
            for (const file of chunked) {
                const embedded = file.chunks.map((chunk) => ({
                    chunk,
                    vector: vectors[v++],
                }));
                store.upsertFile(file.relPath, file.hash, embedded);
                chunksIndexed += embedded.length;
            }
            logger?.debug(`indexed ${Math.min(i + batchSize, changed.length)}/${changed.length} changed files`);
        }
        // 4. Purge anything previously indexed that we no longer saw.
        let filesPurged = 0;
        for (const path of store.getFileHashes().keys()) {
            if (!seen.has(path)) {
                store.deleteByPath(path);
                filesPurged++;
            }
        }
        return {
            filesSeen: seen.size,
            filesIndexed: changed.length,
            filesSkipped,
            filesPurged,
            chunksIndexed,
            durationMs: Date.now() - started,
        };
    }
}
@@ -0,0 +1,32 @@
1
+ import type { Embedder } from "./embedder.js";
2
+ import type { VectorStore } from "./store.js";
3
+ import type { SearchHit } from "./types.js";
4
+ /** Inputs to {@link searchCodebase}. */
5
+ export interface SearchOptions {
6
+ query: string;
7
+ /** Number of hits to return; falls back to `defaultK`. Clamped to [1, 50]. */
8
+ k?: number;
9
+ /** Optional glob restricting results by path, e.g. `src/**\/*.ts`. */
10
+ pathGlob?: string;
11
+ }
12
+ /** Collaborators + budget knobs for the retriever. */
13
+ export interface RetrieverDeps {
14
+ store: VectorStore;
15
+ embedder: Embedder;
16
+ /** `k` used when the caller omits it. */
17
+ defaultK: number;
18
+ /** Approximate combined-snippet token budget (chars/4 heuristic). */
19
+ tokenBudget: number;
20
+ /** Maximum lines kept in any single snippet. */
21
+ maxSnippetLines: number;
22
+ }
23
+ /**
24
+ * Embed the query, run a cosine top-k over the index, and return ranked hits.
25
+ *
26
+ * Results are token-budgeted: snippets are trimmed to `maxSnippetLines`, then
27
+ * hits are appended only while the combined snippet estimate stays under
28
+ * `tokenBudget` — except the top hit, which is always included so a single large
29
+ * match is never dropped. This keeps tool output from flooding the agent's
30
+ * context window.
31
+ */
32
+ export declare function searchCodebase(deps: RetrieverDeps, opts: SearchOptions): Promise<SearchHit[]>;
@@ -0,0 +1,53 @@
1
+ import { estimateTokens, globToRegExp } from "./util.js";
2
+ const MAX_K = 50;
3
/**
 * Embed the query, ask the store for its `k` best matches, and return them as
 * ranked hits.
 *
 * `k` is `opts.k`, or `deps.defaultK` when omitted, clamped to [1, MAX_K]. When
 * `opts.pathGlob` is set, a path predicate built from it is passed to the store.
 *
 * Output is token-budgeted: every snippet is first trimmed to
 * `deps.maxSnippetLines`, then hits are added in rank order until one would not
 * fit in what is left of `deps.tokenBudget`. The top hit is exempt from the
 * check, so a single oversized match is still returned.
 */
export async function searchCodebase(deps, opts) {
    const limit = clamp(opts.k ?? deps.defaultK, 1, MAX_K);
    let pathFilter;
    if (opts.pathGlob) {
        pathFilter = buildPathFilter(opts.pathGlob);
    }
    const queryVector = await deps.embedder.embedQuery(opts.query);
    const hits = [];
    let budgetLeft = deps.tokenBudget;
    for (const record of deps.store.search(queryVector, limit, pathFilter)) {
        const snippet = trimSnippet(record.text, deps.maxSnippetLines);
        const cost = estimateTokens(snippet);
        // Only the first hit may overshoot the budget.
        const isFirstHit = hits.length === 0;
        if (!isFirstHit && cost > budgetLeft) {
            break;
        }
        budgetLeft -= cost;
        const hit = {
            path: record.path,
            startLine: record.startLine,
            endLine: record.endLine,
            score: roundScore(record.score),
            snippet,
        };
        hits.push(hit);
    }
    return hits;
}
35
/**
 * Turn a glob into a path predicate. The glob is compiled to a RegExp once,
 * here, and the returned function reuses it for every path it tests.
 */
function buildPathFilter(glob) {
    const pattern = globToRegExp(glob);
    return function matchesGlob(p) {
        return pattern.test(p);
    };
}
40
/**
 * Cap a snippet at `maxLines` lines. Text that already fits is returned
 * untouched; otherwise the first `maxLines` lines are kept and a trailing
 * marker line reports how many lines were dropped.
 */
function trimSnippet(text, maxLines) {
    const allLines = text.split("\n");
    const dropped = allLines.length - maxLines;
    if (dropped <= 0) {
        return text;
    }
    const head = allLines.slice(0, maxLines);
    return `${head.join("\n")}\n… (+${dropped} more lines)`;
}
48
/** Round a similarity score to four decimal places. */
function roundScore(score) {
    const SCALE = 10000;
    const scaled = Math.round(score * SCALE);
    return scaled / SCALE;
}
51
/**
 * Truncate `value` toward zero and clamp it to the inclusive range [min, max].
 *
 * NaN has no position in the range and would otherwise pass straight through
 * `Math.min`/`Math.max`, so it falls back to `min`. ±Infinity clamps normally.
 */
function clamp(value, min, max) {
    const whole = Math.trunc(value);
    if (Number.isNaN(whole))
        return min;
    return Math.min(max, Math.max(min, whole));
}
@@ -0,0 +1,49 @@
1
+ import type { CruxyConfig } from "../config/index.js";
2
+ import { type Embedder } from "./embedder.js";
3
+ import { type SearchOptions } from "./retriever.js";
4
+ import type { IndexStats, IndexStatus, SearchHit } from "./types.js";
5
+ /** Logger surface the service reports through. */
6
+ interface ServiceLogger {
7
+ debug(message: string): void;
8
+ info(message: string): void;
9
+ warn(message: string): void;
10
+ }
11
+ /**
12
+ * A ready-to-use index for one project root: it owns the store, embedder, and
13
+ * indexer, exposes `search`/`index`/`status`, and refreshes itself lazily on the
14
+ * first search. Build one with {@link getIndexService}, which caches per cwd so
15
+ * the heavy backends are constructed at most once per process.
16
+ */
17
+ export interface IndexService {
18
+ /** Search the index, refreshing it first if it hasn't been refreshed yet in this process. */
19
+ search(opts: SearchOptions): Promise<SearchHit[]>;
20
+ /** Build/refresh the index (incremental unless `force`). */
21
+ index(runOpts?: {
22
+ force?: boolean;
23
+ }): Promise<IndexStats>;
24
+ /** Read-only snapshot for `cruxy index --status` (does not modify the index). */
25
+ status(): IndexStatus;
26
+ /** Release backend resources. */
27
+ close(): void;
28
+ }
29
+ /** Absolute path of the SQLite index for a project root. */
30
+ export declare function indexDbPath(cwd: string): string;
31
+ /**
32
+ * Explicit dependency overrides. The sole supported override is `embedder`: it
33
+ * is the *only* way to substitute a non-production embedder (e.g.
34
+ * `HashingEmbedder`) and exists for tests. Production callers (the
35
+ * `search_codebase` tool, `cruxy index`) never pass it, so they always go
36
+ * through {@link createEmbedder} — fastembed or a loud failure.
37
+ */
38
+ export interface IndexServiceDeps {
39
+ embedder?: Embedder;
40
+ }
41
+ /**
42
+ * Get (or build) the {@link IndexService} for a project root, cached per
43
+ * resolved cwd so the embedder + store are created at most once per process.
44
+ * `deps` is for explicit test injection only (see {@link IndexServiceDeps}).
45
+ */
46
+ export declare function getIndexService(cwd: string, config: CruxyConfig, logger: ServiceLogger, deps?: IndexServiceDeps): Promise<IndexService>;
47
+ /** Drop all cached services (closing them). For tests and process teardown. */
48
+ export declare function resetIndexServices(): Promise<void>;
49
+ export {};
@@ -0,0 +1,132 @@
1
+ import { promises as fsp } from "node:fs";
2
+ import path from "node:path";
3
+ import { globalDir } from "../config/index.js";
4
+ import { GLOBAL_DIR_NAME } from "../constants.js";
5
+ import { createEmbedder } from "./embedder.js";
6
+ import { Indexer } from "./indexer.js";
7
+ import { searchCodebase } from "./retriever.js";
8
+ import { createSqliteVectorStore, InMemoryVectorStore, } from "./store.js";
9
/**
 * Absolute path of the SQLite index file for a project root:
 * `<resolved cwd>/<GLOBAL_DIR_NAME>/index.db`.
 */
export function indexDbPath(cwd) {
    const projectRoot = path.resolve(cwd);
    return path.join(projectRoot, GLOBAL_DIR_NAME, "index.db");
}
13
/**
 * Concrete index service for one project root. It wires a store and an embedder
 * into an {@link Indexer} and exposes `index`, `search`, `status` and `close`.
 *
 * `config` is the index section of the configuration: it supplies
 * `maxFileBytes`, `chunk` and the `search.*` knobs read below.
 */
class IndexServiceImpl {
    store;
    embedder;
    config;
    storePath;
    logger;
    indexer;
    /** True once an index pass has completed in this process. */
    refreshed = false;
    /** In-flight lazy refresh started by `search`, shared by concurrent callers. */
    refreshing = null;
    constructor(store, embedder, config, storePath, logger, root) {
        this.store = store;
        this.embedder = embedder;
        this.config = config;
        this.storePath = storePath;
        this.logger = logger;
        this.indexer = new Indexer({
            root,
            store,
            embedder,
            maxFileBytes: config.maxFileBytes,
            chunk: config.chunk,
            logger,
        });
    }
    /** Build/refresh the index (incremental unless `runOpts.force`). */
    async index(runOpts = {}) {
        const stats = await this.indexer.index(runOpts);
        this.refreshed = true;
        return stats;
    }
    /**
     * Search the index. The first search in a process brings the index up to
     * date first, so results are never stale (or, for a fresh/in-memory store,
     * empty).
     */
    async search(opts) {
        if (!this.refreshed) {
            await this.refreshOnce();
        }
        return searchCodebase({
            store: this.store,
            embedder: this.embedder,
            defaultK: this.config.search.defaultK,
            tokenBudget: this.config.search.tokenBudget,
            maxSnippetLines: this.config.search.maxSnippetLines,
        }, opts);
    }
    /**
     * Run the lazy refresh at most once at a time. Searches that arrive while a
     * refresh is running await that same pass instead of starting another
     * indexer against the same store. A failed refresh is forgotten so the next
     * search retries.
     */
    refreshOnce() {
        if (this.refreshing === null) {
            const run = (async () => {
                this.logger.debug("refreshing codebase index before first search");
                const stats = await this.index();
                this.logger.debug(`index refresh: +${stats.filesIndexed} ~${stats.filesSkipped} -${stats.filesPurged} ` +
                    `(${stats.chunksIndexed} chunks, ${stats.durationMs}ms)`);
            })();
            this.refreshing = run;
            const release = () => {
                if (this.refreshing === run)
                    this.refreshing = null;
            };
            // Release on both outcomes; callers awaiting `run` still see a rejection.
            run.then(release, release);
        }
        return this.refreshing;
    }
    /** Read-only snapshot of the index; does not modify it. */
    status() {
        const meta = this.store.getMeta();
        const files = this.store.countFiles();
        return {
            exists: files > 0,
            embedderId: meta?.embedderId ?? null,
            dim: meta?.dim ?? null,
            files,
            chunks: this.store.countChunks(),
            storePath: this.storePath,
        };
    }
    /** Release backend resources by closing the store. */
    close() {
        this.store.close();
    }
}
74
+ // ── per-cwd cache ─────────────────────────────────────────────────────────────
75
+ const cache = new Map();
76
/**
 * Get (or build) the {@link IndexService} for a project root, cached per
 * resolved cwd so the embedder + store are created at most once per process.
 * `deps` is for explicit test injection only (see {@link IndexServiceDeps}).
 *
 * The cache holds the pending promise, so concurrent callers share one build.
 * `config`, `logger` and `deps` are used only by the call that starts the
 * build; later calls for the same root get the cached service.
 */
export function getIndexService(cwd, config, logger, deps) {
    const root = path.resolve(cwd);
    const cached = cache.get(root);
    if (cached) {
        return cached;
    }
    const pending = buildService(root, config, logger, deps).catch((err) => {
        // Don't cache a failed construction — let the next call retry. Evict only
        // our own entry: after resetIndexServices() a newer build may already be
        // registered for this root and must not be dropped.
        if (cache.get(root) === pending) {
            cache.delete(root);
        }
        throw err;
    });
    cache.set(root, pending);
    return pending;
}
94
/**
 * Empty the service cache and close every service that was in it. Intended for
 * tests and process teardown.
 *
 * The cache is cleared before anything is awaited. Services are closed one at a
 * time, and any failure (a build that rejected, or a close that throws) is
 * ignored.
 */
export async function resetIndexServices() {
    const pendingServices = Array.from(cache.values());
    cache.clear();
    for (const pendingService of pendingServices) {
        try {
            const service = await pendingService;
            service.close();
        }
        catch {
            /* already closed or failed to build */
        }
    }
}
107
/**
 * Assemble an index service for `root`: pick the embedder, open the store, and
 * hand both to {@link IndexServiceImpl} together with the index config section.
 */
async function buildService(root, config, logger, deps) {
    const indexConfig = config.index;
    // Production always builds the fastembed embedder (or fails loudly). An
    // injected embedder is honored only when a test passes one explicitly.
    let embedder = deps?.embedder;
    if (embedder === undefined || embedder === null) {
        const modelCacheDir = path.join(globalDir(), "models");
        embedder = await createEmbedder({ cacheDir: modelCacheDir });
    }
    const opened = await openStore(root, indexConfig.store, logger);
    return new IndexServiceImpl(opened.store, embedder, indexConfig, opened.storePath, logger, root);
}
116
/**
 * Open the vector store for `root` according to `kind`.
 *
 * - `"memory"`: always an in-memory store (`storePath` is `null`).
 * - `"sqlite"`: the SQLite store at {@link indexDbPath}; failures are rethrown.
 * - any other value: try SQLite, and on failure log a warning and fall back to
 *   an in-memory store.
 *
 * @returns The opened store and the path of its database file, or `null` for
 *   an in-memory store.
 */
async function openStore(root, kind, logger) {
    if (kind === "memory") {
        return { store: new InMemoryVectorStore(), storePath: null };
    }
    const dbPath = indexDbPath(root);
    try {
        // The index directory may not exist yet on a first run.
        await fsp.mkdir(path.dirname(dbPath), { recursive: true });
        const store = await createSqliteVectorStore(dbPath);
        return { store, storePath: dbPath };
    }
    catch (err) {
        // SQLite was requested explicitly: surface the failure instead of degrading.
        if (kind === "sqlite")
            throw err;
        // Thrown values are not guaranteed to be Error instances.
        const reason = err instanceof Error ? err.message : String(err);
        logger.warn(`sqlite index unavailable (${reason}); using an in-memory index`);
        return { store: new InMemoryVectorStore(), storePath: null };
    }
}