@ez-corp/ez-search 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE ADDED
@@ -0,0 +1,15 @@
1
+ ISC License
2
+
3
+ Copyright (c) 2026 ez-search contributors
4
+
5
+ Permission to use, copy, modify, and/or distribute this software for any
6
+ purpose with or without fee is hereby granted, provided that the above
7
+ copyright notice and this permission notice appear in all copies.
8
+
9
+ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
10
+ REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
11
+ AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
12
+ INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
13
+ LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
14
+ OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
15
+ PERFORMANCE OF THIS SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,207 @@
1
+ # ez-search
2
+
3
+ Semantic codebase search with zero cloud dependencies.
4
+
5
+ `ez-search` is a local, privacy-first CLI tool that provides semantic search over codebases, documents, and image libraries. It uses ML inference (WebGPU with CPU fallback) to generate embeddings and stores them in a local vector database. No cloud services, no API keys, no data leaves your machine.
6
+
7
+ Built as a contextual retrieval engine for AI coding assistants like Claude Code.
8
+
9
+ ## Features
10
+
11
+ - **Three search pipelines** -- code, text/documents, and images, each with a specialized embedding model
12
+ - **Incremental indexing** -- only re-embeds files that have changed (mtime + content hash)
13
+ - **WebGPU acceleration** with automatic CPU fallback
14
+ - **Respects .gitignore and .cursorignore** -- skips `node_modules`, `dist`, lockfiles, etc. by default
15
+ - **Machine-readable JSON output** -- designed for AI assistant consumption
16
+ - **Project-scoped storage** -- all index data lives in `.ez-search/` within your project
17
+
18
+ ## Requirements
19
+
20
+ - **Node.js v20+** (v22+ recommended for WebGPU support)
21
+ - Models are downloaded automatically on first run (~500MB total for all three)
22
+
23
+ ## Installation
24
+
25
+ ```bash
26
+ npm install -g @ez-corp/ez-search
27
+ ```
28
+
29
+ Or from source:
30
+
31
+ ```bash
32
+ git clone https://github.com/ezcorp-org/ez-search.git
33
+ cd ez-search
34
+ npm install
35
+ npm run build
36
+ ```
37
+
38
+ ## Quick Start
39
+
40
+ ```bash
41
+ # Index the current directory
42
+ ez-search index .
43
+
44
+ # Search your code
45
+ ez-search query "error handling in the auth module"
46
+
47
+ # Check index status
48
+ ez-search status
49
+ ```
50
+
51
+ ## CLI Reference
52
+
53
+ ### `ez-search index <path>`
54
+
55
+ Scan a directory, chunk files, generate embeddings, and store them in the local vector database.
56
+
57
+ ```bash
58
+ ez-search index .
59
+ ez-search index ./src --type code
60
+ ez-search index . --clear --format text
61
+ ```
62
+
63
+ | Flag | Description |
64
+ |------|-------------|
65
+ | `--type <code\|text\|image>` | Index only files of a specific type. If omitted, all types are auto-detected by file extension. |
66
+ | `--clear` | Delete the existing `.ez-search/` index before indexing. |
67
+ | `--no-ignore` | Disable `.gitignore` and `.cursorignore` filtering. |
68
+ | `-q, --quiet` | Suppress status output. |
69
+ | `--format <json\|text>` | Output format. Default: `json`. |
70
+
71
+ ### `ez-search query <text>`
72
+
73
+ Search the index with a natural language query. Run from inside an indexed directory.
74
+
75
+ ```bash
76
+ ez-search query "database connection pooling"
77
+ ez-search query "how are users authenticated" --format text
78
+ ez-search query "parse config" -k 5 --dir src/config
79
+ ez-search query "validation logic" --threshold 0.7
80
+ ```
81
+
82
+ | Flag | Description |
83
+ |------|-------------|
84
+ | `-k, --top-k <n>` | Number of results to return. Default: `10`. |
85
+ | `--type <code\|text>` | Search a specific pipeline only. |
86
+ | `--dir <path>` | Scope results to a subdirectory. |
87
+ | `--threshold <score>` | Minimum relevance score (0-1) to include in results. |
88
+ | `--format <json\|text>` | Output format. Default: `json`. |
89
+
90
+ JSON output returns a grouped envelope:
91
+
92
+ ```json
93
+ {
94
+ "query": "database connection",
95
+ "totalIndexed": 142,
96
+ "searchScope": ".",
97
+ "code": [
98
+ {
99
+ "file": "src/db/pool.ts",
100
+ "lines": { "start": 12, "end": 45 },
101
+ "score": 0.87,
102
+ "text": "..."
103
+ }
104
+ ],
105
+ "text": [
106
+ {
107
+ "file": "docs/architecture.md",
108
+ "score": 0.72,
109
+ "text": "..."
110
+ }
111
+ ]
112
+ }
113
+ ```
114
+
115
+ Text output uses a human-readable format:
116
+
117
+ ```
118
+ ## Code
119
+
120
+ File: src/db/pool.ts | Lines: 12-45 | Relevance: 0.87
121
+ <chunk text>
122
+
123
+ ## Text
124
+
125
+ File: docs/architecture.md | Relevance: 0.72
126
+ <chunk text>
127
+ ```
128
+
129
+ ### `ez-search status`
130
+
131
+ Show indexing status for the current directory.
132
+
133
+ ```bash
134
+ ez-search status
135
+ ez-search status --format text
136
+ ```
137
+
138
+ | Flag | Description |
139
+ |------|-------------|
140
+ | `--format <json\|text>` | Output format. Default: `json`. |
141
+ | `--no-ignore` | Disable `.gitignore` and `.cursorignore` filtering when computing stale file count. |
142
+
143
+ Reports file count, chunk count, per-type breakdown, index size, last indexed time, and number of stale (unindexed) files.
144
+
145
+ ## Supported File Types
146
+
147
+ | Type | Extensions |
148
+ |------|------------|
149
+ | Code | `.ts` `.tsx` `.js` `.jsx` `.py` `.go` `.rs` `.java` `.c` `.cpp` `.h` `.hpp` `.rb` `.php` `.swift` `.kt` `.scala` `.sh` `.bash` `.zsh` `.css` `.scss` `.html` `.json` `.yaml` `.yml` `.toml` |
150
+ | Text | `.md` `.mdx` `.txt` `.rst` `.csv` `.pdf` |
151
+ | Image | `.jpg` `.jpeg` `.png` `.gif` `.webp` `.svg` |
152
+
153
+ ## How It Works
154
+
155
+ ez-search uses three specialized embedding models, each optimized for a different data type:
156
+
157
+ | Pipeline | Model | Dimensions | Chunking |
158
+ |----------|-------|------------|----------|
159
+ | Code | `jinaai/jina-embeddings-v2-base-code` | 768 | 500-token sliding window, 50-token overlap |
160
+ | Text | `nomic-ai/nomic-embed-text-v1.5` | 768 | Paragraph-boundary splitting, ~1600 chars per chunk |
161
+ | Image | `Xenova/clip-vit-base-patch32` | 512 | One vector per image (no chunking) |
162
+
163
+ Models are lazy-loaded -- only the model needed for the current operation is loaded. On first run, model weights are downloaded and cached in `~/.ez-search/models/`.
164
+
165
+ **Incremental indexing:** A manifest at `.ez-search/manifest.json` tracks file size, mtime, and content hash (SHA-256). On subsequent runs, only changed files are re-embedded. Chunk-level deduplication further reduces work when only part of a file changes.
166
+
167
+ **Vector storage:** Embeddings are stored in Zvec (`@zvec/zvec`), an in-process C++ vector database. Code and text share a 768-dimension collection; images use a separate 512-dimension collection. Both use cosine similarity for search.
168
+
169
+ ## Configuration
170
+
171
+ ez-search uses convention over configuration. There are no config files.
172
+
173
+ - **Project index:** stored in `<project>/.ez-search/` (add to `.gitignore`)
174
+ - **Model cache:** stored in `~/.ez-search/models/` (shared across projects)
175
+ - **File filtering:** respects `.gitignore` and `.cursorignore` by default; disable with `--no-ignore`
176
+ - **Built-in exclusions:** `node_modules`, `.git`, `dist`, `build`, lockfiles, `.min.js`, `.map`, and other common noise are always excluded
177
+
178
+ ## Troubleshooting
179
+
180
+ **WebGPU not available / falling back to CPU**
181
+
182
+ WebGPU requires Node.js v22+ and a Vulkan-capable GPU. On systems without GPU support (or on NixOS where `vulkan-loader` may not be in the default environment), ez-search falls back to CPU with q8 quantization automatically. CPU mode is slower but functionally identical.
183
+
184
+ On NixOS, you can enable Vulkan with:
185
+ ```bash
186
+ nix-shell -p vulkan-loader
187
+ ```
188
+
189
+ **Model download is slow or fails**
190
+
191
+ Models are downloaded from Hugging Face on first use. If downloads fail, check your internet connection. Models are cached in `~/.ez-search/models/` -- you can delete this directory to force re-download.
192
+
193
+ **"No index found" error**
194
+
195
+ You need to index before querying. Run `ez-search index .` in your project directory first.
196
+
197
+ **"No supported files found" error**
198
+
199
+ The target directory contains no files with recognized extensions. Check the supported file types table above.
200
+
201
+ **Large index size**
202
+
203
+ Use `ez-search index --clear .` to rebuild the index from scratch. This removes stale entries from deleted files.
204
+
205
+ ## License
206
+
207
+ [ISC](LICENSE)
@@ -0,0 +1,450 @@
1
+ /**
2
+ * Index command — end-to-end pipeline: scan -> manifest check -> chunk -> embed -> store.
3
+ *
4
+ * Pipeline flow (per type):
5
+ * 1. Resolve path and open vector collections
6
+ * 2. Handle --clear (wipe storage + manifest)
7
+ * 3. Load manifest (incremental cache)
8
+ * 4. For each type in [code, text, image]:
9
+ * a. Scan files of that type
10
+ * b. Detect changed/new/deleted files against manifest
11
+ * c. Remove deleted files' chunks from the appropriate collection
12
+ * d. Chunk changed/new files
13
+ * e. Batch embed with the correct model
14
+ * f. Insert embeddings into the appropriate collection
15
+ * 5. Optimize collections THEN save manifest (order matters)
16
+ * 6. Dispose pipelines and output results
17
+ *
18
+ * Model routing:
19
+ * code -> jinaai/jina-embeddings-v2-base-code, col-768
20
+ * text -> nomic-ai/nomic-embed-text-v1.5, col-768 (prefix: "search_document: ")
21
+ * image -> Xenova/clip-vit-base-patch32, col-512 (one vector per file)
22
+ */
23
+ import * as path from 'path';
24
+ import * as fsp from 'fs/promises';
25
+ import { rmSync } from 'fs';
const BATCH_SIZE = 32;
// ── Shared pipeline helper ─────────────────────────────────────────────────────
/**
 * Reconcile one file's freshly computed chunks against its previous manifest
 * entry: drop trailing chunks that no longer exist, build the new chunk records
 * for the manifest, and queue any chunk whose text changed for (re-)embedding.
 *
 * @param {object} args
 * @param {string} args.relPath - File path relative to the project root.
 * @param {Array}  args.chunks - Newly computed chunks ({ text, chunkIndex, ... }).
 * @param {Array}  args.existingChunks - Chunk records from the previous manifest entry.
 * @param {object} args.col768 - Vector collection; needs `.remove(id)`.
 * @param {Function} args.makeChunkId - (relPath, chunkIndex) -> stable chunk id.
 * @param {Function} args.hashText - (text) -> content hash used for chunk-level dedup.
 * @param {Array}  args.pending - Shared queue of chunks awaiting embedding (mutated).
 * @param {Function} args.meta - (chunk) -> { lineStart, lineEnd, tokenCount } metadata.
 * @returns {{ records: Array, reused: number, removed: number }}
 */
function reconcileFileChunks({ relPath, chunks, existingChunks, col768, makeChunkId, hashText, pending, meta }) {
  let reused = 0;
  let removed = 0;
  // The new chunking produced fewer chunks than before: purge the stale tail.
  for (let i = chunks.length; i < existingChunks.length; i++) {
    col768.remove(existingChunks[i].id);
    removed++;
  }
  const records = [];
  for (const chunk of chunks) {
    const chunkId = makeChunkId(relPath, chunk.chunkIndex);
    const textHash = hashText(chunk.text);
    const { lineStart, lineEnd, tokenCount } = meta(chunk);
    records.push({ id: chunkId, lineStart, lineEnd, tokenCount, textHash });
    const prior = existingChunks[chunk.chunkIndex];
    if (prior && prior.textHash === textHash) {
      // Same slot, same text: the embedding already stored under this id is still valid.
      reused++;
    } else {
      pending.push({
        relPath,
        chunkId,
        text: chunk.text,
        lineStart,
        lineEnd,
        chunkIndex: chunk.chunkIndex,
        tokenCount,
        textHash,
      });
    }
  }
  return { records, reused, removed };
}
/**
 * Shared embedding pipeline: diff files against the manifest, chunk, embed, and
 * insert into col768. Used by both the code and text pipelines — they differ
 * only in chunker, model, and document prefix.
 *
 * @param {object} opts
 * @param {'code'|'text'} opts.type - Which pipeline to run (selects chunker/model).
 * @param {Array} opts.files - Scanned files ({ relativePath, absolutePath, mtimeMs, sizeBytes }).
 * @param {object} opts.col768 - Shared 768-dimension vector collection.
 * @param {object} opts.manifest - Loaded manifest; `manifest.files` is mutated in place.
 * @param {Function} opts.hashContent - (Buffer) -> file content hash.
 * @param {Function} opts.hashText - (string) -> chunk text hash.
 * @param {Function} opts.makeChunkId - (relPath, chunkIndex) -> stable chunk id.
 * @returns {Promise<{filesIndexed:number, filesSkipped:number, chunksCreated:number, chunksReused:number, chunksRemoved:number}>}
 */
async function runTextEmbeddingPipeline(opts) {
  const { type, files, col768, manifest, hashContent, hashText, makeChunkId } = opts;
  let filesIndexed = 0;
  let filesSkipped = 0;
  let chunksCreated = 0;
  let chunksReused = 0;
  let chunksRemoved = 0;
  // 1. Decide which files need processing: mtime+size fast path first, then a
  //    content-hash check so touched-but-unchanged files are still skipped.
  const filesToProcess = [];
  for (const file of files) {
    const existing = manifest.files[file.relativePath];
    if (existing && existing.mtime === file.mtimeMs && existing.size === file.sizeBytes) {
      filesSkipped++;
      chunksReused += existing.chunks.length;
      continue;
    }
    if (existing) {
      const buf = await fsp.readFile(file.absolutePath);
      if (hashContent(buf) === existing.hash) {
        // Content identical; refresh the stat fields so the fast path hits next run.
        manifest.files[file.relativePath] = { ...existing, mtime: file.mtimeMs, size: file.sizeBytes };
        filesSkipped++;
        chunksReused += existing.chunks.length;
        continue;
      }
    }
    filesToProcess.push(file);
  }
  if (filesToProcess.length === 0) {
    return { filesIndexed, filesSkipped, chunksCreated, chunksReused, chunksRemoved };
  }
  // 2. Chunk every changed/new file. Chunker modules and the tokenizer are
  //    lazy-loaded at most once per pipeline run.
  let tokenizer = null;
  const allPendingChunks = [];
  const newFileEntries = [];
  // Text/PDF chunks carry no line or token metadata (paragraph chunker).
  const zeroMeta = () => ({ lineStart: 0, lineEnd: 0, tokenCount: 0 });
  for (const file of filesToProcess) {
    const ext = path.extname(file.absolutePath).toLowerCase();
    const existingChunks = manifest.files[file.relativePath]?.chunks ?? [];
    let fileHash;
    let result;
    if (ext === '.pdf') {
      // PDF: read as buffer, extract text via pdf-parse, then paragraph-chunk.
      const buf = await fsp.readFile(file.absolutePath);
      fileHash = hashContent(buf);
      const { extractPdfText, chunkTextFile } = await import('../../services/text-chunker.js');
      const chunks = chunkTextFile(await extractPdfText(buf));
      result = reconcileFileChunks({
        relPath: file.relativePath, chunks, existingChunks, col768,
        makeChunkId, hashText, pending: allPendingChunks, meta: zeroMeta,
      });
    } else {
      // Non-PDF: read as UTF-8.
      const content = await fsp.readFile(file.absolutePath, 'utf8');
      fileHash = hashContent(Buffer.from(content));
      if (type === 'code') {
        // Code: Jina tokenizer + sliding-window chunker; chunks carry real
        // line ranges and token counts.
        if (!tokenizer) {
          const { loadTokenizer } = await import('../../services/chunker.js');
          tokenizer = await loadTokenizer();
        }
        const { chunkFile } = await import('../../services/chunker.js');
        const chunks = chunkFile(content, tokenizer);
        result = reconcileFileChunks({
          relPath: file.relativePath, chunks, existingChunks, col768,
          makeChunkId, hashText, pending: allPendingChunks,
          meta: (c) => ({ lineStart: c.lineStart, lineEnd: c.lineEnd, tokenCount: c.tokenCount }),
        });
      } else {
        // Text (non-PDF): paragraph-boundary chunking for Nomic.
        const { chunkTextFile } = await import('../../services/text-chunker.js');
        const chunks = chunkTextFile(content);
        result = reconcileFileChunks({
          relPath: file.relativePath, chunks, existingChunks, col768,
          makeChunkId, hashText, pending: allPendingChunks, meta: zeroMeta,
        });
      }
    }
    chunksReused += result.reused;
    chunksRemoved += result.removed;
    newFileEntries.push({ relPath: file.relativePath, mtime: file.mtimeMs, size: file.sizeBytes, hash: fileHash, chunks: result.records });
    filesIndexed++;
  }
  // 3. Embed all pending chunks in batches and insert into col768.
  if (allPendingChunks.length > 0) {
    const { createEmbeddingPipeline } = await import('../../services/model-router.js');
    const pipe = await createEmbeddingPipeline(type);
    try {
      // Nomic requires "search_document: " prefix on indexed documents.
      const prefix = type === 'text' ? 'search_document: ' : '';
      for (let batchStart = 0; batchStart < allPendingChunks.length; batchStart += BATCH_SIZE) {
        const batch = allPendingChunks.slice(batchStart, batchStart + BATCH_SIZE);
        const embeddings = await pipe.embed(batch.map((c) => prefix + c.text));
        for (let i = 0; i < batch.length; i++) {
          const chunk = batch[i];
          col768.insert(chunk.chunkId, embeddings[i], {
            filePath: chunk.relPath,
            chunkIndex: chunk.chunkIndex,
            modelId: pipe.modelId,
            lineStart: chunk.lineStart,
            lineEnd: chunk.lineEnd,
            chunkText: chunk.text, // store without prefix
          });
          chunksCreated++;
        }
      }
    } finally {
      // Always release model resources, even if an embed batch throws.
      await pipe.dispose();
    }
  }
  // 4. Commit new file entries to the manifest (caller persists it later).
  for (const entry of newFileEntries) {
    manifest.files[entry.relPath] = {
      mtime: entry.mtime,
      size: entry.size,
      hash: entry.hash,
      chunks: entry.chunks,
    };
  }
  return { filesIndexed, filesSkipped, chunksCreated, chunksReused, chunksRemoved };
}
/**
 * Detect files of `fileType` that were deleted since the last index run and
 * purge their chunks from the given vector collection and the manifest.
 *
 * The manifest mixes all types in one map, so deletions are scoped by
 * extension: an entry counts as deleted only when it is absent from the
 * current scan AND its extension maps to `fileType` in EXTENSION_MAP.
 *
 * @param {object} args
 * @param {object} args.manifest - Loaded manifest; entries are deleted in place.
 * @param {Set<string>} args.scannedSet - Relative paths present in the current scan.
 * @param {string} args.fileType - 'code' | 'text' | 'image'.
 * @param {object} args.collection - Vector collection holding this type's chunks.
 * @returns {Promise<{ deletedPaths: string[], chunksRemoved: number }>}
 */
async function pruneDeletedFiles({ manifest, scannedSet, fileType, collection }) {
  const { EXTENSION_MAP } = await import('../../types.js');
  const deletedPaths = Object.keys(manifest.files).filter((relPath) => {
    if (scannedSet.has(relPath)) {
      return false;
    }
    const ext = path.extname(relPath).toLowerCase();
    return EXTENSION_MAP[ext] === fileType;
  });
  let chunksRemoved = 0;
  for (const deletedPath of deletedPaths) {
    const entry = manifest.files[deletedPath];
    for (const chunk of entry.chunks) {
      collection.remove(chunk.id);
      chunksRemoved++;
    }
    delete manifest.files[deletedPath];
  }
  return { deletedPaths, chunksRemoved };
}
/**
 * Run the `index` command: scan `targetPath`, diff against the manifest, chunk
 * and embed changed files, and persist the index under `.ez-search/`.
 *
 * @param {string} targetPath - Directory to index (relative or absolute).
 * @param {object} options - CLI flags.
 * @param {string} [options.type] - Restrict to one pipeline: 'code' | 'text' | 'image'.
 * @param {boolean} [options.clear] - Wipe the existing index before indexing.
 * @param {boolean} [options.ignore] - Use ignore files when truthy (cleared by --no-ignore).
 * @param {boolean} [options.quiet] - Suppress console output.
 * @param {string} [options.format] - 'json' (default) | 'text'.
 * @returns {Promise<object>} Result envelope (also printed unless quiet), or
 *   the `emitError` result on failure / empty directory.
 */
export async function runIndex(targetPath, options) {
  const startTime = Date.now();
  try {
    // 1. Resolve path and open vector collections.
    const absPath = path.resolve(targetPath);
    const { openProjectCollections } = await import('../../services/vector-db.js');
    let { col768, col512, storagePath } = openProjectCollections(absPath);
    // 2. --clear: rmSync removes .ez-search/ entirely (including manifest.json
    //    inside it), then reopen fresh, empty collections.
    if (options.clear) {
      rmSync(storagePath, { recursive: true, force: true });
      ({ col768, col512, storagePath } = openProjectCollections(absPath));
    }
    // 3. Load manifest (incremental cache) and hashing helpers.
    const { loadManifest, saveManifest, hashContent, hashText, makeChunkId } = await import('../../services/manifest-cache.js');
    const manifest = loadManifest(absPath);
    // 4. Determine which types to index (--type narrows to one pipeline).
    const typesToIndex = options.type ? [options.type] : ['code', 'text', 'image'];
    const { scanFiles } = await import('../../services/file-scanner.js');
    // Aggregate stats across all pipelines.
    let totalFilesScanned = 0;
    let totalFilesIndexed = 0;
    let totalFilesSkipped = 0;
    let totalChunksCreated = 0;
    let totalChunksReused = 0;
    let totalChunksRemoved = 0;
    const allDeletedPaths = [];
    // Per-type indexed-file counts for the text-format summary line.
    const typeFileCounts = {};
    let imageFilesProcessed = false;
    for (const fileType of typesToIndex) {
      // Scan files of this type only.
      const scannedFiles = [];
      for await (const file of scanFiles(absPath, { useIgnoreFiles: options.ignore, typeFilter: fileType })) {
        scannedFiles.push(file);
      }
      totalFilesScanned += scannedFiles.length;
      if (scannedFiles.length === 0) {
        continue;
      }
      const scannedSet = new Set(scannedFiles.map((f) => f.relativePath));
      if (fileType === 'code' || fileType === 'text') {
        // Purge chunks of deleted code/text files (stored in col-768).
        const pruned = await pruneDeletedFiles({ manifest, scannedSet, fileType, collection: col768 });
        totalChunksRemoved += pruned.chunksRemoved;
        allDeletedPaths.push(...pruned.deletedPaths);
        // Run the shared code/text embedding pipeline.
        const result = await runTextEmbeddingPipeline({
          type: fileType,
          files: scannedFiles,
          col768,
          manifest,
          hashContent,
          hashText,
          makeChunkId,
        });
        totalFilesIndexed += result.filesIndexed;
        totalFilesSkipped += result.filesSkipped;
        totalChunksCreated += result.chunksCreated;
        totalChunksReused += result.chunksReused;
        totalChunksRemoved += result.chunksRemoved;
        if (result.filesIndexed > 0) {
          typeFileCounts[fileType] = (typeFileCounts[fileType] ?? 0) + result.filesIndexed;
        }
      } else if (fileType === 'image') {
        // Image pipeline: one vector per file, stored in col-512.
        const pruned = await pruneDeletedFiles({ manifest, scannedSet, fileType: 'image', collection: col512 });
        totalChunksRemoved += pruned.chunksRemoved;
        allDeletedPaths.push(...pruned.deletedPaths);
        // Decide which images need (re-)embedding: mtime+size fast path,
        // then a content-hash check for touched-but-unchanged files.
        const filesToProcess = [];
        for (const file of scannedFiles) {
          const existing = manifest.files[file.relativePath];
          if (existing && existing.mtime === file.mtimeMs && existing.size === file.sizeBytes) {
            totalFilesSkipped++;
            totalChunksReused += existing.chunks.length;
            continue;
          }
          if (existing) {
            const buf = await fsp.readFile(file.absolutePath);
            if (hashContent(buf) === existing.hash) {
              // Content unchanged; refresh stat fields so the fast path hits next run.
              manifest.files[file.relativePath] = { ...existing, mtime: file.mtimeMs, size: file.sizeBytes };
              totalFilesSkipped++;
              totalChunksReused += existing.chunks.length;
              continue;
            }
          }
          filesToProcess.push(file);
        }
        if (filesToProcess.length > 0) {
          // Load the CLIP pipeline once for the whole batch.
          const { createImageEmbeddingPipeline } = await import('../../services/image-embedder.js');
          const imagePipeline = await createImageEmbeddingPipeline();
          try {
            for (const file of filesToProcess) {
              const buf = await fsp.readFile(file.absolutePath);
              const fileHash = hashContent(buf);
              const embedding = await imagePipeline.embedImage(file.absolutePath);
              const chunkId = makeChunkId(file.relativePath, 0);
              col512.insert(chunkId, embedding, {
                filePath: file.relativePath,
                chunkIndex: 0,
                modelId: imagePipeline.modelId,
                lineStart: 0,
                lineEnd: 0,
                chunkText: '',
              });
              totalChunksCreated++;
              manifest.files[file.relativePath] = {
                mtime: file.mtimeMs,
                size: file.sizeBytes,
                hash: fileHash,
                chunks: [{ id: chunkId, lineStart: 0, lineEnd: 0, tokenCount: 0, textHash: '' }],
              };
              totalFilesIndexed++;
            }
          } finally {
            // Release the model even if an embed fails mid-batch.
            await imagePipeline.dispose();
          }
          imageFilesProcessed = true;
          typeFileCounts['image'] = (typeFileCounts['image'] ?? 0) + filesToProcess.length;
        }
      }
    }
    // 5. No supported files anywhere: report a structured error and stop.
    //    Returning here matches the catch branch and avoids emitting both an
    //    error and a results envelope for the same invocation.
    if (totalFilesScanned === 0) {
      const { emitError } = await import('../errors.js');
      const format = options.format === 'text' ? 'text' : 'json';
      return emitError({ code: 'EMPTY_DIR', message: 'No supported files found in directory', suggestion: 'Ensure the directory contains supported file types (.ts, .js, .py, .go, .rs, .c, .cpp, .md, .txt, .jpg, .png, .webp)' }, format);
    }
    // 6. Optimize collections THEN save manifest (order matters — see module header).
    col768.optimize();
    if (imageFilesProcessed) {
      col512.optimize();
    }
    saveManifest(absPath, manifest);
    // 7. Build and emit the result envelope.
    const durationMs = Date.now() - startTime;
    const hasChanges = totalFilesIndexed > 0 || allDeletedPaths.length > 0;
    const output = {
      status: hasChanges ? 'ok' : 'no_changes',
      path: absPath,
      filesScanned: totalFilesScanned,
      filesIndexed: totalFilesIndexed,
      filesSkipped: totalFilesSkipped,
      chunksCreated: totalChunksCreated,
      chunksReused: totalChunksReused,
      chunksRemoved: totalChunksRemoved,
      durationMs,
      storageDir: storagePath,
    };
    if (!options.quiet) {
      if (options.format === 'text') {
        const secs = (durationMs / 1000).toFixed(1);
        if (!hasChanges) {
          console.log(`No changes detected. ${totalFilesScanned} files scanned in ${secs}s.`);
        } else {
          const parts = [];
          // Show per-type breakdown only when multiple types had new files.
          if (Object.keys(typeFileCounts).length > 1) {
            const typeParts = Object.entries(typeFileCounts).map(([t, count]) => `${count} ${t} file${count !== 1 ? 's' : ''}`);
            parts.push(`indexed ${typeParts.join(', ')}`);
          } else if (totalFilesIndexed > 0) {
            parts.push(`indexed ${totalFilesIndexed} file${totalFilesIndexed !== 1 ? 's' : ''}`);
          }
          if (allDeletedPaths.length > 0) {
            parts.push(`removed ${allDeletedPaths.length} deleted file${allDeletedPaths.length !== 1 ? 's' : ''}`);
          }
          if (totalChunksCreated > 0) {
            parts.push(`${totalChunksCreated} new chunk${totalChunksCreated !== 1 ? 's' : ''}`);
          }
          console.log(`${parts.join(', ')} in ${secs}s (${totalChunksReused} chunks reused, ${storagePath})`);
        }
      } else {
        console.log(JSON.stringify(output));
      }
    }
    return output;
  } catch (err) {
    const { emitError } = await import('../errors.js');
    const message = err instanceof Error ? err.message : String(err);
    return emitError({ code: 'GENERAL_ERROR', message, suggestion: 'Check the error above and retry' }, options.format === 'text' ? 'text' : 'json');
  }
}