@ez-corp/ez-search 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +15 -0
- package/README.md +207 -0
- package/dist/cli/commands/index-cmd.js +450 -0
- package/dist/cli/commands/query-cmd.js +233 -0
- package/dist/cli/commands/status-cmd.js +154 -0
- package/dist/cli/errors.js +25 -0
- package/dist/cli/index.js +62 -0
- package/dist/config/paths.js +16 -0
- package/dist/services/chunker.js +96 -0
- package/dist/services/file-scanner.js +62 -0
- package/dist/services/image-embedder.js +64 -0
- package/dist/services/manifest-cache.js +85 -0
- package/dist/services/model-router.js +108 -0
- package/dist/services/query-utils.js +74 -0
- package/dist/services/staleness.js +36 -0
- package/dist/services/text-chunker.js +138 -0
- package/dist/services/vector-db.js +161 -0
- package/dist/types.js +67 -0
- package/package.json +56 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
ISC License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 ez-search contributors
|
|
4
|
+
|
|
5
|
+
Permission to use, copy, modify, and/or distribute this software for any
|
|
6
|
+
purpose with or without fee is hereby granted, provided that the above
|
|
7
|
+
copyright notice and this permission notice appear in all copies.
|
|
8
|
+
|
|
9
|
+
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
|
|
10
|
+
REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
|
|
11
|
+
AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
|
|
12
|
+
INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
|
|
13
|
+
LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
|
|
14
|
+
OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
|
|
15
|
+
PERFORMANCE OF THIS SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
# ez-search
|
|
2
|
+
|
|
3
|
+
Semantic codebase search with zero cloud dependencies.
|
|
4
|
+
|
|
5
|
+
`ez-search` is a local, privacy-first CLI tool that provides semantic search over codebases, documents, and image libraries. It uses ML inference (WebGPU with CPU fallback) to generate embeddings and stores them in a local vector database. No cloud services, no API keys, no data leaves your machine.
|
|
6
|
+
|
|
7
|
+
Built as a contextual retrieval engine for AI coding assistants like Claude Code.
|
|
8
|
+
|
|
9
|
+
## Features
|
|
10
|
+
|
|
11
|
+
- **Three search pipelines** -- code, text/documents, and images, each with a specialized embedding model
|
|
12
|
+
- **Incremental indexing** -- only re-embeds files that have changed (mtime + content hash)
|
|
13
|
+
- **WebGPU acceleration** with automatic CPU fallback
|
|
14
|
+
- **Respects .gitignore and .cursorignore** -- skips `node_modules`, `dist`, lockfiles, etc. by default
|
|
15
|
+
- **Machine-readable JSON output** -- designed for AI assistant consumption
|
|
16
|
+
- **Project-scoped storage** -- all index data lives in `.ez-search/` within your project
|
|
17
|
+
|
|
18
|
+
## Requirements
|
|
19
|
+
|
|
20
|
+
- **Node.js v20+** (v22+ recommended for WebGPU support)
|
|
21
|
+
- Models are downloaded automatically on first run (~500MB total for all three)
|
|
22
|
+
|
|
23
|
+
## Installation
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
npm install -g ez-search
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
Or from source:
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
git clone https://github.com/ezcorp-org/ez-search.git
|
|
33
|
+
cd ez-search
|
|
34
|
+
npm install
|
|
35
|
+
npm run build
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## Quick Start
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
# Index the current directory
|
|
42
|
+
ez-search index .
|
|
43
|
+
|
|
44
|
+
# Search your code
|
|
45
|
+
ez-search query "error handling in the auth module"
|
|
46
|
+
|
|
47
|
+
# Check index status
|
|
48
|
+
ez-search status
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
## CLI Reference
|
|
52
|
+
|
|
53
|
+
### `ez-search index <path>`
|
|
54
|
+
|
|
55
|
+
Scan a directory, chunk files, generate embeddings, and store them in the local vector database.
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
ez-search index .
|
|
59
|
+
ez-search index ./src --type code
|
|
60
|
+
ez-search index . --clear --format text
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
| Flag | Description |
|
|
64
|
+
|------|-------------|
|
|
65
|
+
| `--type <code\|text\|image>` | Index only files of a specific type. If omitted, all types are auto-detected by file extension. |
|
|
66
|
+
| `--clear` | Delete the existing `.ez-search/` index before indexing. |
|
|
67
|
+
| `--no-ignore` | Disable `.gitignore` and `.cursorignore` filtering. |
|
|
68
|
+
| `-q, --quiet` | Suppress status output. |
|
|
69
|
+
| `--format <json\|text>` | Output format. Default: `json`. |
|
|
70
|
+
|
|
71
|
+
### `ez-search query <text>`
|
|
72
|
+
|
|
73
|
+
Search the index with a natural language query. Run from inside an indexed directory.
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
ez-search query "database connection pooling"
|
|
77
|
+
ez-search query "how are users authenticated" --format text
|
|
78
|
+
ez-search query "parse config" -k 5 --dir src/config
|
|
79
|
+
ez-search query "validation logic" --threshold 0.7
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
| Flag | Description |
|
|
83
|
+
|------|-------------|
|
|
84
|
+
| `-k, --top-k <n>` | Number of results to return. Default: `10`. |
|
|
85
|
+
| `--type <code\|text>` | Search a specific pipeline only. |
|
|
86
|
+
| `--dir <path>` | Scope results to a subdirectory. |
|
|
87
|
+
| `--threshold <score>` | Minimum relevance score (0-1) to include in results. |
|
|
88
|
+
| `--format <json\|text>` | Output format. Default: `json`. |
|
|
89
|
+
|
|
90
|
+
JSON output returns a grouped envelope:
|
|
91
|
+
|
|
92
|
+
```json
|
|
93
|
+
{
|
|
94
|
+
"query": "database connection",
|
|
95
|
+
"totalIndexed": 142,
|
|
96
|
+
"searchScope": ".",
|
|
97
|
+
"code": [
|
|
98
|
+
{
|
|
99
|
+
"file": "src/db/pool.ts",
|
|
100
|
+
"lines": { "start": 12, "end": 45 },
|
|
101
|
+
"score": 0.87,
|
|
102
|
+
"text": "..."
|
|
103
|
+
}
|
|
104
|
+
],
|
|
105
|
+
"text": [
|
|
106
|
+
{
|
|
107
|
+
"file": "docs/architecture.md",
|
|
108
|
+
"score": 0.72,
|
|
109
|
+
"text": "..."
|
|
110
|
+
}
|
|
111
|
+
]
|
|
112
|
+
}
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
Text output uses a human-readable format:
|
|
116
|
+
|
|
117
|
+
```
|
|
118
|
+
## Code
|
|
119
|
+
|
|
120
|
+
File: src/db/pool.ts | Lines: 12-45 | Relevance: 0.87
|
|
121
|
+
<chunk text>
|
|
122
|
+
|
|
123
|
+
## Text
|
|
124
|
+
|
|
125
|
+
File: docs/architecture.md | Relevance: 0.72
|
|
126
|
+
<chunk text>
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
### `ez-search status`
|
|
130
|
+
|
|
131
|
+
Show indexing status for the current directory.
|
|
132
|
+
|
|
133
|
+
```bash
|
|
134
|
+
ez-search status
|
|
135
|
+
ez-search status --format text
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
| Flag | Description |
|
|
139
|
+
|------|-------------|
|
|
140
|
+
| `--format <json\|text>` | Output format. Default: `json`. |
|
|
141
|
+
| `--no-ignore` | Disable `.gitignore` and `.cursorignore` filtering when computing stale file count. |
|
|
142
|
+
|
|
143
|
+
Reports file count, chunk count, per-type breakdown, index size, last indexed time, and number of stale (unindexed) files.
|
|
144
|
+
|
|
145
|
+
## Supported File Types
|
|
146
|
+
|
|
147
|
+
| Type | Extensions |
|
|
148
|
+
|------|------------|
|
|
149
|
+
| Code | `.ts` `.tsx` `.js` `.jsx` `.py` `.go` `.rs` `.java` `.c` `.cpp` `.h` `.hpp` `.rb` `.php` `.swift` `.kt` `.scala` `.sh` `.bash` `.zsh` `.css` `.scss` `.html` `.json` `.yaml` `.yml` `.toml` |
|
|
150
|
+
| Text | `.md` `.mdx` `.txt` `.rst` `.csv` `.pdf` |
|
|
151
|
+
| Image | `.jpg` `.jpeg` `.png` `.gif` `.webp` `.svg` |
|
|
152
|
+
|
|
153
|
+
## How It Works
|
|
154
|
+
|
|
155
|
+
ez-search uses three specialized embedding models, each optimized for a different data type:
|
|
156
|
+
|
|
157
|
+
| Pipeline | Model | Dimensions | Chunking |
|
|
158
|
+
|----------|-------|------------|----------|
|
|
159
|
+
| Code | `jinaai/jina-embeddings-v2-base-code` | 768 | 500-token sliding window, 50-token overlap |
|
|
160
|
+
| Text | `nomic-ai/nomic-embed-text-v1.5` | 768 | Paragraph-boundary splitting, ~1600 chars per chunk |
|
|
161
|
+
| Image | `Xenova/clip-vit-base-patch32` | 512 | One vector per image (no chunking) |
|
|
162
|
+
|
|
163
|
+
Models are lazy-loaded -- only the model needed for the current operation is loaded. On first run, model weights are downloaded and cached in `~/.ez-search/models/`.
|
|
164
|
+
|
|
165
|
+
**Incremental indexing:** A manifest at `.ez-search/manifest.json` tracks file size, mtime, and content hash (SHA-256). On subsequent runs, only changed files are re-embedded. Chunk-level deduplication further reduces work when only part of a file changes.
|
|
166
|
+
|
|
167
|
+
**Vector storage:** Embeddings are stored in Zvec (`@zvec/zvec`), an in-process C++ vector database. Code and text share a 768-dimension collection; images use a separate 512-dimension collection. Both use cosine similarity for search.
|
|
168
|
+
|
|
169
|
+
## Configuration
|
|
170
|
+
|
|
171
|
+
ez-search uses convention over configuration. There are no config files.
|
|
172
|
+
|
|
173
|
+
- **Project index:** stored in `<project>/.ez-search/` (add to `.gitignore`)
|
|
174
|
+
- **Model cache:** stored in `~/.ez-search/models/` (shared across projects)
|
|
175
|
+
- **File filtering:** respects `.gitignore` and `.cursorignore` by default; disable with `--no-ignore`
|
|
176
|
+
- **Built-in exclusions:** `node_modules`, `.git`, `dist`, `build`, lockfiles, `.min.js`, `.map`, and other common noise are always excluded
|
|
177
|
+
|
|
178
|
+
## Troubleshooting
|
|
179
|
+
|
|
180
|
+
**WebGPU not available / falling back to CPU**
|
|
181
|
+
|
|
182
|
+
WebGPU requires Node.js v22+ and a Vulkan-capable GPU. On systems without GPU support (or on NixOS where `vulkan-loader` may not be in the default environment), ez-search falls back to CPU with q8 quantization automatically. CPU mode is slower but functionally identical.
|
|
183
|
+
|
|
184
|
+
On NixOS, you can enable Vulkan with:
|
|
185
|
+
```bash
|
|
186
|
+
nix-shell -p vulkan-loader
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
**Model download is slow or fails**
|
|
190
|
+
|
|
191
|
+
Models are downloaded from Hugging Face on first use. If downloads fail, check your internet connection. Models are cached in `~/.ez-search/models/` -- you can delete this directory to force re-download.
|
|
192
|
+
|
|
193
|
+
**"No index found" error**
|
|
194
|
+
|
|
195
|
+
You need to index before querying. Run `ez-search index .` in your project directory first.
|
|
196
|
+
|
|
197
|
+
**"No supported files found" error**
|
|
198
|
+
|
|
199
|
+
The target directory contains no files with recognized extensions. Check the supported file types table above.
|
|
200
|
+
|
|
201
|
+
**Large index size**
|
|
202
|
+
|
|
203
|
+
Use `ez-search index --clear .` to rebuild the index from scratch. This removes stale entries from deleted files.
|
|
204
|
+
|
|
205
|
+
## License
|
|
206
|
+
|
|
207
|
+
[ISC](LICENSE)
|
|
@@ -0,0 +1,450 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Index command — end-to-end pipeline: scan -> manifest check -> chunk -> embed -> store.
|
|
3
|
+
*
|
|
4
|
+
* Pipeline flow (per type):
|
|
5
|
+
* 1. Resolve path and open vector collections
|
|
6
|
+
* 2. Handle --clear (wipe storage + manifest)
|
|
7
|
+
* 3. Load manifest (incremental cache)
|
|
8
|
+
* 4. For each type in [code, text, image]:
|
|
9
|
+
* a. Scan files of that type
|
|
10
|
+
* b. Detect changed/new/deleted files against manifest
|
|
11
|
+
* c. Remove deleted files' chunks from the appropriate collection
|
|
12
|
+
* d. Chunk changed/new files
|
|
13
|
+
* e. Batch embed with the correct model
|
|
14
|
+
* f. Insert embeddings into the appropriate collection
|
|
15
|
+
* 5. Optimize collections THEN save manifest (order matters)
|
|
16
|
+
* 6. Dispose pipelines and output results
|
|
17
|
+
*
|
|
18
|
+
* Model routing:
|
|
19
|
+
* code -> jinaai/jina-embeddings-v2-base-code, col-768
|
|
20
|
+
* text -> nomic-ai/nomic-embed-text-v1.5, col-768 (prefix: "search_document: ")
|
|
21
|
+
* image -> Xenova/clip-vit-base-patch32, col-512 (one vector per file)
|
|
22
|
+
*/
|
|
23
|
+
import * as path from 'path';
|
|
24
|
+
import * as fsp from 'fs/promises';
|
|
25
|
+
import { rmSync } from 'fs';
|
|
26
|
+
// Number of chunk texts sent to the embedding model per embed() call.
const BATCH_SIZE = 32;
|
|
27
|
+
// ── Shared pipeline helper ─────────────────────────────────────────────────────
|
|
28
|
+
/**
|
|
29
|
+
* Shared embedding pipeline: diff files against manifest, chunk, embed, insert into col768.
|
|
30
|
+
* Used by both code and text pipelines (they differ only in chunker, model, prefix, tokenizer).
|
|
31
|
+
*/
|
|
32
|
+
/**
 * Shared embedding pipeline for the code and text types: diff files against
 * the manifest, chunk changed/new files, embed only new/changed chunks, and
 * insert the vectors into the 768-dimension collection (`col768`).
 *
 * Used by both code and text pipelines (they differ only in chunker, model,
 * prefix, tokenizer).
 *
 * @param {object} opts
 * @param {'code'|'text'} opts.type selects the chunker and embedding model
 * @param {Array<{relativePath: string, absolutePath: string, mtimeMs: number, sizeBytes: number}>} opts.files
 *   scanned files of this type
 * @param {object} opts.col768 768-dim vector collection (insert/remove)
 * @param {object} opts.manifest incremental cache; MUTATED in place
 * @param {Function} opts.hashContent hashes a Buffer (whole-file hash)
 * @param {Function} opts.hashText hashes chunk text (chunk-level dedupe)
 * @param {Function} opts.makeChunkId stable id from (relPath, chunkIndex)
 * @returns {Promise<{filesIndexed: number, filesSkipped: number, chunksCreated: number, chunksReused: number, chunksRemoved: number}>}
 *   aggregate stats for this pipeline run
 */
async function runTextEmbeddingPipeline(opts) {
    const { type, files, col768, manifest, hashContent, hashText, makeChunkId } = opts;
    let filesIndexed = 0;
    let filesSkipped = 0;
    let chunksCreated = 0;
    let chunksReused = 0;
    let chunksRemoved = 0;
    // Determine which files need processing (mtime+size fast path, hash confirmation)
    const filesToProcess = [];
    for (const file of files) {
        const existing = manifest.files[file.relativePath];
        if (existing && existing.mtime === file.mtimeMs && existing.size === file.sizeBytes) {
            filesSkipped++;
            chunksReused += existing.chunks.length;
            continue;
        }
        if (existing) {
            // mtime/size changed: confirm with a content hash so a touched-but-
            // unchanged file only refreshes its manifest entry, not its vectors.
            const buf = await fsp.readFile(file.absolutePath);
            const newHash = hashContent(buf);
            if (newHash === existing.hash) {
                manifest.files[file.relativePath] = { ...existing, mtime: file.mtimeMs, size: file.sizeBytes };
                filesSkipped++;
                chunksReused += existing.chunks.length;
                continue;
            }
        }
        filesToProcess.push(file);
    }
    if (filesToProcess.length === 0) {
        return { filesIndexed, filesSkipped, chunksCreated, chunksReused, chunksRemoved };
    }
    // Load chunker and model (lazy, once per pipeline run)
    let tokenizer = null;
    let pipe = null;
    const allPendingChunks = [];
    const newFileEntries = [];
    for (const file of filesToProcess) {
        const ext = path.extname(file.absolutePath).toLowerCase();
        let content;
        if (ext === '.pdf') {
            // PDF: read as buffer, extract text, then reuse the text chunker.
            const buf = await fsp.readFile(file.absolutePath);
            const fileHash = hashContent(buf);
            const { extractPdfText, chunkTextFile } = await import('../../services/text-chunker.js');
            const rawText = await extractPdfText(buf);
            const chunks = chunkTextFile(rawText);
            const existingEntry = manifest.files[file.relativePath];
            const existingChunks = existingEntry?.chunks ?? [];
            // File shrank: drop trailing chunks that no longer exist.
            for (let i = chunks.length; i < existingChunks.length; i++) {
                col768.remove(existingChunks[i].id);
                chunksRemoved++;
            }
            const chunkRecords = [];
            for (const chunk of chunks) {
                const chunkId = makeChunkId(file.relativePath, chunk.chunkIndex);
                const chunkTextHash = hashText(chunk.text);
                const oldChunk = existingChunks[chunk.chunkIndex];
                // PDFs carry no line numbers or token counts, hence the zeros.
                chunkRecords.push({ id: chunkId, lineStart: 0, lineEnd: 0, tokenCount: 0, textHash: chunkTextHash });
                if (oldChunk && oldChunk.textHash === chunkTextHash) {
                    // Same text at the same index: keep the existing vector.
                    chunksReused++;
                }
                else {
                    allPendingChunks.push({
                        relPath: file.relativePath,
                        chunkId,
                        text: chunk.text,
                        lineStart: 0,
                        lineEnd: 0,
                        chunkIndex: chunk.chunkIndex,
                        tokenCount: 0,
                        textHash: chunkTextHash,
                    });
                }
            }
            newFileEntries.push({ relPath: file.relativePath, mtime: file.mtimeMs, size: file.sizeBytes, hash: fileHash, chunks: chunkRecords });
            filesIndexed++;
            continue;
        }
        // Non-PDF: read as UTF-8
        content = await fsp.readFile(file.absolutePath, 'utf8');
        const fileHash = hashContent(Buffer.from(content));
        const existingEntry = manifest.files[file.relativePath];
        const existingChunks = existingEntry?.chunks ?? [];
        if (type === 'code') {
            // Code: use Jina tokenizer + sliding-window chunker
            if (!tokenizer) {
                const { loadTokenizer } = await import('../../services/chunker.js');
                tokenizer = await loadTokenizer();
            }
            const { chunkFile } = await import('../../services/chunker.js');
            const chunks = chunkFile(content, tokenizer);
            // File shrank: drop trailing chunks that no longer exist.
            for (let i = chunks.length; i < existingChunks.length; i++) {
                col768.remove(existingChunks[i].id);
                chunksRemoved++;
            }
            const chunkRecords = [];
            for (const chunk of chunks) {
                const chunkId = makeChunkId(file.relativePath, chunk.chunkIndex);
                const chunkTextHash = hashText(chunk.text);
                const oldChunk = existingChunks[chunk.chunkIndex];
                chunkRecords.push({
                    id: chunkId,
                    lineStart: chunk.lineStart,
                    lineEnd: chunk.lineEnd,
                    tokenCount: chunk.tokenCount,
                    textHash: chunkTextHash,
                });
                if (oldChunk && oldChunk.textHash === chunkTextHash) {
                    // Same text at the same index: keep the existing vector.
                    chunksReused++;
                }
                else {
                    allPendingChunks.push({
                        relPath: file.relativePath,
                        chunkId,
                        text: chunk.text,
                        lineStart: chunk.lineStart,
                        lineEnd: chunk.lineEnd,
                        chunkIndex: chunk.chunkIndex,
                        tokenCount: chunk.tokenCount,
                        textHash: chunkTextHash,
                    });
                }
            }
            newFileEntries.push({ relPath: file.relativePath, mtime: file.mtimeMs, size: file.sizeBytes, hash: fileHash, chunks: chunkRecords });
        }
        else {
            // Text (non-PDF): paragraph-boundary chunking for Nomic
            const { chunkTextFile } = await import('../../services/text-chunker.js');
            const chunks = chunkTextFile(content);
            for (let i = chunks.length; i < existingChunks.length; i++) {
                col768.remove(existingChunks[i].id);
                chunksRemoved++;
            }
            const chunkRecords = [];
            for (const chunk of chunks) {
                const chunkId = makeChunkId(file.relativePath, chunk.chunkIndex);
                const chunkTextHash = hashText(chunk.text);
                const oldChunk = existingChunks[chunk.chunkIndex];
                chunkRecords.push({ id: chunkId, lineStart: 0, lineEnd: 0, tokenCount: 0, textHash: chunkTextHash });
                if (oldChunk && oldChunk.textHash === chunkTextHash) {
                    chunksReused++;
                }
                else {
                    allPendingChunks.push({
                        relPath: file.relativePath,
                        chunkId,
                        text: chunk.text,
                        lineStart: 0,
                        lineEnd: 0,
                        chunkIndex: chunk.chunkIndex,
                        tokenCount: 0,
                        textHash: chunkTextHash,
                    });
                }
            }
            newFileEntries.push({ relPath: file.relativePath, mtime: file.mtimeMs, size: file.sizeBytes, hash: fileHash, chunks: chunkRecords });
        }
        filesIndexed++;
    }
    // Embed all pending chunks in fixed-size batches.
    if (allPendingChunks.length > 0) {
        const { createEmbeddingPipeline } = await import('../../services/model-router.js');
        pipe = await createEmbeddingPipeline(type);
        try {
            // Nomic requires "search_document: " prefix on indexed documents
            const prefix = type === 'text' ? 'search_document: ' : '';
            for (let batchStart = 0; batchStart < allPendingChunks.length; batchStart += BATCH_SIZE) {
                const batch = allPendingChunks.slice(batchStart, batchStart + BATCH_SIZE);
                const texts = batch.map((c) => prefix + c.text);
                const embeddings = await pipe.embed(texts);
                for (let i = 0; i < batch.length; i++) {
                    const chunk = batch[i];
                    col768.insert(chunk.chunkId, embeddings[i], {
                        filePath: chunk.relPath,
                        chunkIndex: chunk.chunkIndex,
                        modelId: pipe.modelId,
                        lineStart: chunk.lineStart,
                        lineEnd: chunk.lineEnd,
                        chunkText: chunk.text, // store without prefix
                    });
                    chunksCreated++;
                }
            }
        }
        finally {
            // Fix: always release the model pipeline, even when embed()/insert()
            // throws mid-batch; previously a failure here leaked the pipeline.
            await pipe.dispose();
        }
    }
    // Commit new file entries to the manifest only after chunking completed.
    for (const entry of newFileEntries) {
        manifest.files[entry.relPath] = {
            mtime: entry.mtime,
            size: entry.size,
            hash: entry.hash,
            chunks: entry.chunks,
        };
    }
    return { filesIndexed, filesSkipped, chunksCreated, chunksReused, chunksRemoved };
}
|
|
227
|
+
/**
 * `ez-search index <path>` entry point: orchestrates the full indexing run.
 *
 * Opens the project's vector collections, optionally clears them (--clear),
 * loads the incremental manifest, then for each requested type (code, text,
 * image) scans files, prunes manifest entries for deleted files, and runs the
 * matching embedding pipeline. Finally optimizes collections, saves the
 * manifest (order matters — see step 6), and prints a summary.
 *
 * @param {string} targetPath directory to index (relative or absolute)
 * @param {{type?: string, clear?: boolean, ignore?: boolean, quiet?: boolean, format?: string}} options
 *   parsed CLI flags; `options.ignore` is the positive form of `--no-ignore`
 *   (true means .gitignore/.cursorignore filtering is applied)
 * @returns {Promise<object>} summary stats object (also printed unless
 *   --quiet); on failure, whatever `emitError` returns
 */
export async function runIndex(targetPath, options) {
    const startTime = Date.now();
    try {
        // 1. Resolve path
        const absPath = path.resolve(targetPath);
        // 2. Open vector collections
        const { openProjectCollections } = await import('../../services/vector-db.js');
        let { col768, col512, storagePath } = openProjectCollections(absPath);
        // 3. Handle --clear
        // rmSync removes .ez-search/ entirely (including manifest.json inside it),
        // then the collections are re-opened fresh so later inserts have a target.
        if (options.clear) {
            rmSync(storagePath, { recursive: true, force: true });
            const reopened = openProjectCollections(absPath);
            col768 = reopened.col768;
            col512 = reopened.col512;
            storagePath = reopened.storagePath;
        }
        // 4. Load manifest and helpers
        const { loadManifest, saveManifest, hashContent, hashText, makeChunkId } = await import('../../services/manifest-cache.js');
        const manifest = loadManifest(absPath);
        // 5. Determine which types to index (--type narrows to one pipeline)
        const typesToIndex = options.type
            ? [options.type]
            : ['code', 'text', 'image'];
        const { scanFiles } = await import('../../services/file-scanner.js');
        // Aggregate stats across all type pipelines
        let totalFilesScanned = 0;
        let totalFilesIndexed = 0;
        let totalFilesSkipped = 0;
        let totalChunksCreated = 0;
        let totalChunksReused = 0;
        let totalChunksRemoved = 0;
        const allDeletedPaths = [];
        // Per-type file counts for text output
        const typeFileCounts = {};
        let imageFilesProcessed = false;
        for (const fileType of typesToIndex) {
            // Scan files of this type (scanFiles is an async generator)
            const scannedFiles = [];
            for await (const file of scanFiles(absPath, { useIgnoreFiles: options.ignore, typeFilter: fileType })) {
                scannedFiles.push(file);
            }
            totalFilesScanned += scannedFiles.length;
            if (scannedFiles.length === 0) {
                continue;
            }
            // Deletion detection: manifest entries of this type that no longer
            // appear in the scan are treated as deleted files.
            const scannedSet = new Set(scannedFiles.map((f) => f.relativePath));
            if (fileType === 'code' || fileType === 'text') {
                // All types share one manifest, so deletions must be scoped to
                // entries that belonged to THIS type — classified by extension
                // via EXTENSION_MAP — and are absent from the current scan.
                const { EXTENSION_MAP } = await import('../../types.js');
                const deletedPaths = Object.keys(manifest.files).filter((relPath) => {
                    if (scannedSet.has(relPath))
                        return false;
                    const ext = path.extname(relPath).toLowerCase();
                    return EXTENSION_MAP[ext] === fileType;
                });
                for (const deletedPath of deletedPaths) {
                    const entry = manifest.files[deletedPath];
                    for (const chunk of entry.chunks) {
                        col768.remove(chunk.id);
                        totalChunksRemoved++;
                    }
                    delete manifest.files[deletedPath];
                    allDeletedPaths.push(deletedPath);
                }
                // Run code/text embedding pipeline (mutates manifest in place)
                const result = await runTextEmbeddingPipeline({
                    type: fileType,
                    files: scannedFiles,
                    col768,
                    manifest,
                    hashContent,
                    hashText,
                    makeChunkId,
                });
                totalFilesIndexed += result.filesIndexed;
                totalFilesSkipped += result.filesSkipped;
                totalChunksCreated += result.chunksCreated;
                totalChunksReused += result.chunksReused;
                totalChunksRemoved += result.chunksRemoved;
                if (result.filesIndexed > 0) {
                    typeFileCounts[fileType] = (typeFileCounts[fileType] ?? 0) + result.filesIndexed;
                }
            }
            else if (fileType === 'image') {
                // Image pipeline: one vector per file, goes into col-512
                const { EXTENSION_MAP } = await import('../../types.js');
                const deletedPaths = Object.keys(manifest.files).filter((relPath) => {
                    if (scannedSet.has(relPath))
                        return false;
                    const ext = path.extname(relPath).toLowerCase();
                    return EXTENSION_MAP[ext] === 'image';
                });
                for (const deletedPath of deletedPaths) {
                    const entry = manifest.files[deletedPath];
                    for (const chunk of entry.chunks) {
                        col512.remove(chunk.id);
                        totalChunksRemoved++;
                    }
                    delete manifest.files[deletedPath];
                    allDeletedPaths.push(deletedPath);
                }
                // Determine which image files need (re-)embedding:
                // mtime+size fast path first, then content-hash confirmation.
                const filesToProcess = [];
                for (const file of scannedFiles) {
                    const existing = manifest.files[file.relativePath];
                    if (existing && existing.mtime === file.mtimeMs && existing.size === file.sizeBytes) {
                        totalFilesSkipped++;
                        totalChunksReused += existing.chunks.length;
                        continue;
                    }
                    if (existing) {
                        const buf = await fsp.readFile(file.absolutePath);
                        const newHash = hashContent(buf);
                        if (newHash === existing.hash) {
                            manifest.files[file.relativePath] = { ...existing, mtime: file.mtimeMs, size: file.sizeBytes };
                            totalFilesSkipped++;
                            totalChunksReused += existing.chunks.length;
                            continue;
                        }
                    }
                    filesToProcess.push(file);
                }
                if (filesToProcess.length > 0) {
                    // Load CLIP pipeline once for the batch
                    const { createImageEmbeddingPipeline } = await import('../../services/image-embedder.js');
                    const imagePipeline = await createImageEmbeddingPipeline();
                    for (const file of filesToProcess) {
                        // buf is read only to compute the content hash; the
                        // embedder re-reads the image from disk by path.
                        const buf = await fsp.readFile(file.absolutePath);
                        const fileHash = hashContent(buf);
                        const embedding = await imagePipeline.embedImage(file.absolutePath);
                        const chunkId = makeChunkId(file.relativePath, 0);
                        col512.insert(chunkId, embedding, {
                            filePath: file.relativePath,
                            chunkIndex: 0,
                            modelId: imagePipeline.modelId,
                            lineStart: 0,
                            lineEnd: 0,
                            chunkText: '',
                        });
                        totalChunksCreated++;
                        // Images have no text chunks: one manifest chunk record
                        // per file with empty textHash.
                        manifest.files[file.relativePath] = {
                            mtime: file.mtimeMs,
                            size: file.sizeBytes,
                            hash: fileHash,
                            chunks: [{ id: chunkId, lineStart: 0, lineEnd: 0, tokenCount: 0, textHash: '' }],
                        };
                        totalFilesIndexed++;
                    }
                    await imagePipeline.dispose();
                    imageFilesProcessed = true;
                    typeFileCounts['image'] = (typeFileCounts['image'] ?? 0) + filesToProcess.length;
                }
            }
        }
        // 5b. Check for empty directory (no supported files found)
        // NOTE(review): there is no `return` after emitError here, so execution
        // falls through to optimize/save — presumably emitError terminates the
        // process (or the fall-through on an empty index is harmless); confirm.
        if (totalFilesScanned === 0) {
            const { emitError } = await import('../errors.js');
            const format = options.format === 'text' ? 'text' : 'json';
            emitError({ code: 'EMPTY_DIR', message: 'No supported files found in directory', suggestion: 'Ensure the directory contains supported file types (.ts, .js, .py, .go, .rs, .c, .cpp, .md, .txt, .jpg, .png, .webp)' }, format);
        }
        // 6. Optimize THEN save manifest (order matters: the manifest must not
        // claim chunks the collections have not durably committed)
        col768.optimize();
        if (imageFilesProcessed) {
            col512.optimize();
        }
        saveManifest(absPath, manifest);
        // 7. Output results
        const durationMs = Date.now() - startTime;
        const hasChanges = totalFilesIndexed > 0 || allDeletedPaths.length > 0;
        const output = {
            status: hasChanges ? 'ok' : 'no_changes',
            path: absPath,
            filesScanned: totalFilesScanned,
            filesIndexed: totalFilesIndexed,
            filesSkipped: totalFilesSkipped,
            chunksCreated: totalChunksCreated,
            chunksReused: totalChunksReused,
            chunksRemoved: totalChunksRemoved,
            durationMs,
            storageDir: storagePath,
        };
        if (!options.quiet) {
            if (options.format === 'text') {
                const secs = (durationMs / 1000).toFixed(1);
                if (!hasChanges) {
                    console.log(`No changes detected. ${totalFilesScanned} files scanned in ${secs}s.`);
                }
                else {
                    const parts = [];
                    // Show per-type breakdown if multiple types indexed
                    if (Object.keys(typeFileCounts).length > 1) {
                        const typeParts = Object.entries(typeFileCounts).map(([t, count]) => `${count} ${t} file${count !== 1 ? 's' : ''}`);
                        parts.push(`indexed ${typeParts.join(', ')}`);
                    }
                    else if (totalFilesIndexed > 0) {
                        parts.push(`indexed ${totalFilesIndexed} file${totalFilesIndexed !== 1 ? 's' : ''}`);
                    }
                    if (allDeletedPaths.length > 0) {
                        parts.push(`removed ${allDeletedPaths.length} deleted file${allDeletedPaths.length !== 1 ? 's' : ''}`);
                    }
                    if (totalChunksCreated > 0) {
                        parts.push(`${totalChunksCreated} new chunk${totalChunksCreated !== 1 ? 's' : ''}`);
                    }
                    console.log(`${parts.join(', ')} in ${secs}s (${totalChunksReused} chunks reused, ${storagePath})`);
                }
            }
            else {
                console.log(JSON.stringify(output));
            }
        }
        return output;
    }
    catch (err) {
        // Catch-all: normalize any failure into the CLI's error envelope.
        const { emitError } = await import('../errors.js');
        const message = err instanceof Error ? err.message : String(err);
        return emitError({ code: 'GENERAL_ERROR', message, suggestion: 'Check the error above and retry' }, options.format === 'text' ? 'text' : 'json');
    }
}
|