@ez-corp/ez-search 1.0.8 → 1.0.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -18,7 +18,7 @@
|
|
|
18
18
|
* Model routing:
|
|
19
19
|
* code -> jinaai/jina-embeddings-v2-base-code, col-768
|
|
20
20
|
* text -> nomic-ai/nomic-embed-text-v1.5, col-768 (prefix: "search_document: ")
|
|
21
|
-
* image -> Xenova/clip-vit-base-
|
|
21
|
+
* image -> Xenova/clip-vit-base-patch16, col-512 (one vector per file)
|
|
22
22
|
*/
|
|
23
23
|
import * as path from 'path';
|
|
24
24
|
import * as fsp from 'fs/promises';
|
|
@@ -30,7 +30,7 @@ const BATCH_SIZE = 32;
|
|
|
30
30
|
* Used by both code and text pipelines (they differ only in chunker, model, prefix, tokenizer).
|
|
31
31
|
*/
|
|
32
32
|
async function runTextEmbeddingPipeline(opts) {
|
|
33
|
-
const { type, files, col768, manifest, hashContent, hashText, makeChunkId } = opts;
|
|
33
|
+
const { type, files, col768, manifest, hashContent, hashText, makeChunkId, progress } = opts;
|
|
34
34
|
let filesIndexed = 0;
|
|
35
35
|
let filesSkipped = 0;
|
|
36
36
|
let chunksCreated = 0;
|
|
@@ -38,7 +38,9 @@ async function runTextEmbeddingPipeline(opts) {
|
|
|
38
38
|
let chunksRemoved = 0;
|
|
39
39
|
// Determine which files need processing (mtime+size fast path, hash confirmation)
|
|
40
40
|
const filesToProcess = [];
|
|
41
|
-
for (
|
|
41
|
+
for (let fi = 0; fi < files.length; fi++) {
|
|
42
|
+
const file = files[fi];
|
|
43
|
+
progress.update(`${type}: checking files`, fi + 1, files.length);
|
|
42
44
|
const existing = manifest.files[file.relativePath];
|
|
43
45
|
if (existing && existing.mtime === file.mtimeMs && existing.size === file.sizeBytes) {
|
|
44
46
|
filesSkipped++;
|
|
@@ -190,11 +192,15 @@ async function runTextEmbeddingPipeline(opts) {
|
|
|
190
192
|
}
|
|
191
193
|
// Embed all pending chunks
|
|
192
194
|
if (allPendingChunks.length > 0) {
|
|
195
|
+
progress.update(`${type}: loading model...`);
|
|
193
196
|
const { createEmbeddingPipeline } = await import('../../services/model-router.js');
|
|
194
197
|
pipe = await createEmbeddingPipeline(type);
|
|
195
198
|
// Nomic requires "search_document: " prefix on indexed documents
|
|
196
199
|
const prefix = type === 'text' ? 'search_document: ' : '';
|
|
200
|
+
const totalBatches = Math.ceil(allPendingChunks.length / BATCH_SIZE);
|
|
197
201
|
for (let batchStart = 0; batchStart < allPendingChunks.length; batchStart += BATCH_SIZE) {
|
|
202
|
+
const batchNum = Math.floor(batchStart / BATCH_SIZE) + 1;
|
|
203
|
+
progress.update(`${type}: embedding`, batchNum, totalBatches);
|
|
198
204
|
const batch = allPendingChunks.slice(batchStart, batchStart + BATCH_SIZE);
|
|
199
205
|
const texts = batch.map((c) => prefix + c.text);
|
|
200
206
|
const embeddings = await pipe.embed(texts);
|
|
@@ -226,6 +232,11 @@ async function runTextEmbeddingPipeline(opts) {
|
|
|
226
232
|
}
|
|
227
233
|
export async function runIndex(targetPath, options) {
|
|
228
234
|
const startTime = Date.now();
|
|
235
|
+
const { ProgressReporter } = await import('../progress.js');
|
|
236
|
+
const progress = new ProgressReporter({
|
|
237
|
+
quiet: options.quiet,
|
|
238
|
+
json: options.format !== 'text',
|
|
239
|
+
});
|
|
229
240
|
try {
|
|
230
241
|
// 1. Resolve path
|
|
231
242
|
const absPath = path.resolve(targetPath);
|
|
@@ -267,6 +278,7 @@ export async function runIndex(targetPath, options) {
|
|
|
267
278
|
const scannedFiles = [];
|
|
268
279
|
for await (const file of scanFiles(absPath, { useIgnoreFiles: options.ignore, typeFilter: fileType })) {
|
|
269
280
|
scannedFiles.push(file);
|
|
281
|
+
progress.update(`scanning ${fileType} files... ${scannedFiles.length} found`);
|
|
270
282
|
}
|
|
271
283
|
totalFilesScanned += scannedFiles.length;
|
|
272
284
|
if (scannedFiles.length === 0) {
|
|
@@ -305,6 +317,7 @@ export async function runIndex(targetPath, options) {
|
|
|
305
317
|
hashContent,
|
|
306
318
|
hashText,
|
|
307
319
|
makeChunkId,
|
|
320
|
+
progress,
|
|
308
321
|
});
|
|
309
322
|
totalFilesIndexed += result.filesIndexed;
|
|
310
323
|
totalFilesSkipped += result.filesSkipped;
|
|
@@ -356,9 +369,12 @@ export async function runIndex(targetPath, options) {
|
|
|
356
369
|
}
|
|
357
370
|
if (filesToProcess.length > 0) {
|
|
358
371
|
// Load CLIP pipeline once for the batch
|
|
372
|
+
progress.update('image: loading model...');
|
|
359
373
|
const { createImageEmbeddingPipeline } = await import('../../services/image-embedder.js');
|
|
360
374
|
const imagePipeline = await createImageEmbeddingPipeline();
|
|
361
|
-
for (
|
|
375
|
+
for (let imgIdx = 0; imgIdx < filesToProcess.length; imgIdx++) {
|
|
376
|
+
const file = filesToProcess[imgIdx];
|
|
377
|
+
progress.update('image: embedding', imgIdx + 1, filesToProcess.length);
|
|
362
378
|
const buf = await fsp.readFile(file.absolutePath);
|
|
363
379
|
const fileHash = hashContent(buf);
|
|
364
380
|
const embedding = await imagePipeline.embedImage(buf);
|
|
@@ -393,6 +409,7 @@ export async function runIndex(targetPath, options) {
|
|
|
393
409
|
emitError({ code: 'EMPTY_DIR', message: 'No supported files found in directory', suggestion: 'Ensure the directory contains supported file types (.ts, .js, .py, .go, .rs, .c, .cpp, .md, .txt, .jpg, .png, .webp)' }, format);
|
|
394
410
|
}
|
|
395
411
|
// 6. Optimize, close collections, THEN save manifest
|
|
412
|
+
progress.update('optimizing index...');
|
|
396
413
|
col768.optimize();
|
|
397
414
|
col768.close();
|
|
398
415
|
if (imageFilesProcessed) {
|
|
@@ -400,6 +417,7 @@ export async function runIndex(targetPath, options) {
|
|
|
400
417
|
}
|
|
401
418
|
col512.close();
|
|
402
419
|
saveManifest(absPath, manifest);
|
|
420
|
+
progress.done();
|
|
403
421
|
// 7. Output results
|
|
404
422
|
const durationMs = Date.now() - startTime;
|
|
405
423
|
const hasChanges = totalFilesIndexed > 0 || allDeletedPaths.length > 0;
|
|
@@ -447,6 +465,7 @@ export async function runIndex(targetPath, options) {
|
|
|
447
465
|
return output;
|
|
448
466
|
}
|
|
449
467
|
catch (err) {
|
|
468
|
+
progress.done();
|
|
450
469
|
const { emitError } = await import('../errors.js');
|
|
451
470
|
const message = err instanceof Error ? err.message : String(err);
|
|
452
471
|
return emitError({ code: 'GENERAL_ERROR', message, suggestion: 'Check the error above and retry' }, options.format === 'text' ? 'text' : 'json');
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Live progress reporter for CLI indexing.
|
|
3
|
+
*
|
|
4
|
+
* Writes a single updating line to stderr using \r + ANSI clear.
|
|
5
|
+
* Only active when stderr is a TTY and output isn't suppressed.
|
|
6
|
+
* Does not interfere with JSON/text output on stdout.
|
|
7
|
+
*/
|
|
8
|
+
/** Number of character cells used to draw the progress bar. */
const BAR_WIDTH = 20;

/**
 * Single-line progress reporter for the CLI.
 *
 * Each update rewrites the current stderr line using a carriage return followed
 * by the ANSI "erase to end of line" sequence. Nothing is written unless the
 * reporter is enabled (see the constructor).
 */
export class ProgressReporter {
    /** True when progress lines are actually written to stderr. */
    enabled;

    /**
     * @param {{ quiet?: boolean, json?: boolean }} [opts] - Output switches.
     *   Reporting is disabled when `quiet` or `json` is truthy, or when stderr
     *   is not a TTY. Omitting `opts` is treated as "no switches set".
     */
    constructor(opts = {}) {
        this.enabled = !opts.quiet && !opts.json && Boolean(process.stderr.isTTY);
    }

    /**
     * Overwrite the current line with a status message and an optional progress bar.
     *
     * The bar and the `current/total` counter are appended only when both
     * `current` and `total` are provided and `total` is greater than zero.
     *
     * @param {string} message - Status text shown at the start of the line.
     * @param {number} [current] - Units completed so far.
     * @param {number} [total] - Total units expected.
     */
    update(message, current, total) {
        if (!this.enabled) {
            return;
        }
        let line = message;
        if (total != null && current != null && total > 0) {
            const ratio = current / total;
            // Clamp to [0, 1]: String.prototype.repeat throws a RangeError on a
            // negative count, and a cosmetic progress line must never abort the
            // caller. A NaN ratio is drawn as an empty bar.
            const pct = Number.isNaN(ratio) ? 0 : Math.min(Math.max(ratio, 0), 1);
            const filled = Math.round(pct * BAR_WIDTH);
            const bar = '\u2588'.repeat(filled) + '\u2591'.repeat(BAR_WIDTH - filled);
            line = `${message} [${bar}] ${current}/${total}`;
        }
        process.stderr.write(`\r\x1b[K${line}`);
    }

    /** Clear the progress line. Call when indexing is complete. */
    done() {
        if (!this.enabled) {
            return;
        }
        process.stderr.write('\r\x1b[K');
    }
}
|
|
@@ -14,7 +14,7 @@
|
|
|
14
14
|
import { CLIPVisionModelWithProjection, CLIPTextModelWithProjection, AutoProcessor, AutoTokenizer, RawImage, env } from '@huggingface/transformers';
|
|
15
15
|
import { resolveModelCachePath } from '../config/paths.js';
|
|
16
16
|
// ── Constants ─────────────────────────────────────────────────────────────────
|
|
17
|
-
const CLIP_MODEL_ID = 'Xenova/clip-vit-base-
|
|
17
|
+
const CLIP_MODEL_ID = 'Xenova/clip-vit-base-patch16';
|
|
18
18
|
const CLIP_DIM = 512;
|
|
19
19
|
// ── Helpers ──────────────────────────────────────────────────────────────────
|
|
20
20
|
/**
|
|
@@ -36,7 +36,7 @@ function l2Normalize(vec) {
|
|
|
36
36
|
}
|
|
37
37
|
// ── Public API ────────────────────────────────────────────────────────────────
|
|
38
38
|
/**
|
|
39
|
-
* Create an ImageEmbeddingPipeline backed by CLIP ViT-B/
|
|
39
|
+
* Create an ImageEmbeddingPipeline backed by CLIP ViT-B/16 (fp32).
|
|
40
40
|
*
|
|
41
41
|
* Loads the AutoProcessor and CLIPVisionModelWithProjection in parallel.
|
|
42
42
|
* Model weights are cached in ~/.ez-search/models/.
|
|
@@ -80,7 +80,7 @@ export async function createImageEmbeddingPipeline() {
|
|
|
80
80
|
};
|
|
81
81
|
}
|
|
82
82
|
/**
|
|
83
|
-
* Create a ClipTextPipeline backed by CLIP ViT-B/
|
|
83
|
+
* Create a ClipTextPipeline backed by CLIP ViT-B/16 (fp32).
|
|
84
84
|
*
|
|
85
85
|
* Loads AutoTokenizer and CLIPTextModelWithProjection in parallel.
|
|
86
86
|
* Used for text-to-image search: encode query text into CLIP's 512-dim space,
|
|
@@ -13,7 +13,7 @@ import { readFileSync, writeFileSync, renameSync, existsSync, mkdirSync } from '
|
|
|
13
13
|
import * as path from 'path';
|
|
14
14
|
import { resolveProjectStoragePath } from '../config/paths.js';
|
|
15
15
|
// ── Constants ─────────────────────────────────────────────────────────────────
|
|
16
|
-
export const MANIFEST_VERSION =
|
|
16
|
+
export const MANIFEST_VERSION = 4;
|
|
17
17
|
export const MANIFEST_FILENAME = 'manifest.json';
|
|
18
18
|
// ── Helpers ───────────────────────────────────────────────────────────────────
|
|
19
19
|
function manifestPath(projectDir) {
|