@ez-corp/ez-search 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,233 @@
1
/**
 * Query command — multi-collection grouped semantic search.
 *
 * Pipeline (in execution order):
 *   1. Validate --top-k and --threshold
 *   2. Resolve the project directory (cwd) and load the manifest
 *   3. If the manifest lists no files: auto-index the project (unless
 *      options.autoIndex === false) and reload the manifest
 *   4. Count stale files (skipped when an auto-index just ran)
 *   5. Open the col-768 vector collection
 *   6. Pick the types to query: options.type, or every queryable type
 *      (code, text) whose extension appears in the manifest
 *   7. For each type, sequentially:
 *        a. code: embed the raw query with the 'code' pipeline and keep hits
 *           accepted by the jina predicate
 *        b. text: embed "search_query: <query>" with the 'text' pipeline and
 *           keep hits accepted by the nomic predicate
 *        c. image: rejected with UNSUPPORTED_TYPE
 *      Threshold, dir and topK are forwarded to filterAndCollapse().
 *   8. Output grouped JSON { code: [...], text: [...] } or text with ## headers
 *
 * col-768 holds BOTH code and text vectors; they are distinguished by modelId metadata.
 * The collection is over-fetched (topK*5, or topK*15 when --dir/--threshold are set)
 * so enough candidates remain after filtering.
 */
/**
 * Run one typed query against col-768 and return the filtered results.
 *
 * Any failure while creating the pipeline, embedding or filtering is reported
 * on stderr and yields an empty list, so one failing type does not abort the
 * other. The pipeline is always disposed.
 *
 * @param params.type - Pipeline type passed to createEmbeddingPipeline ('code' | 'text')
 * @param params.queryText - Text to embed (already prefixed where required)
 * @param params.matchesModel - Predicate handed to filterAndCollapse
 * @param params.col768 - Opened vector collection
 * @param params.fetchCount - Number of raw candidates to request
 * @param params.filterOptions - { threshold, dir, topK } forwarded to filterAndCollapse
 * @param params.services - { createEmbeddingPipeline, normalizeResults, filterAndCollapse }
 * @returns The list produced by filterAndCollapse, or [] on error
 */
async function queryCol768ForType({ type, queryText, matchesModel, col768, fetchCount, filterOptions, services }) {
    const { createEmbeddingPipeline, normalizeResults, filterAndCollapse } = services;
    let pipe = null;
    try {
        pipe = await createEmbeddingPipeline(type);
        const [queryEmbedding] = await pipe.embed([queryText]);
        let rawResults;
        try {
            rawResults = col768.query(queryEmbedding, fetchCount);
        }
        catch {
            // Best-effort, kept from the original: a failing collection query counts as "no hits".
            rawResults = [];
        }
        return filterAndCollapse(normalizeResults(rawResults), matchesModel, filterOptions);
    }
    catch (err) {
        process.stderr.write(`[query] ${type} pipeline error: ${err instanceof Error ? err.message : String(err)}\n`);
        return [];
    }
    finally {
        if (pipe)
            await pipe.dispose();
    }
}
/**
 * Execute a semantic query against the index of the current working directory
 * and print the grouped results.
 *
 * @param text - Natural-language query
 * @param options - CLI options: topK, threshold, dir, type, format, autoIndex.
 *                  topK defaults to 10 when omitted.
 * @returns Resolves once output has been printed; error paths go through
 *          emitError(), which exits the process.
 */
export async function runQuery(text, options) {
    const format = options.format === 'text' ? 'text' : 'json';
    // errors.js is imported lazily so it stays off the happy path.
    const fail = async (details) => {
        const { emitError } = await import('../errors.js');
        emitError(details, format);
    };
    try {
        // 1. Validate numeric options. Without this a non-numeric --top-k became NaN,
        //    produced a NaN fetch count and silently returned no results.
        const topK = options.topK === undefined ? 10 : Number.parseInt(options.topK, 10);
        if (!Number.isInteger(topK) || topK < 1) {
            await fail({
                code: 'INVALID_ARGUMENT',
                message: `Invalid --top-k value "${options.topK}"`,
                suggestion: 'Pass a positive integer, e.g. --top-k 10',
            });
        }
        const threshold = options.threshold !== undefined ? Number.parseFloat(options.threshold) : undefined;
        if (threshold !== undefined && Number.isNaN(threshold)) {
            await fail({
                code: 'INVALID_ARGUMENT',
                message: `Invalid --threshold value "${options.threshold}"`,
                suggestion: 'Pass a number between 0 and 1, e.g. --threshold 0.5',
            });
        }
        // 2. Resolve project directory and load manifest
        const projectDir = process.cwd();
        const { loadManifest } = await import('../../services/manifest-cache.js');
        let manifest = loadManifest(projectDir);
        let totalIndexed = Object.keys(manifest.files).length;
        let autoIndexResult;
        // 3. Guard: no indexed content — auto-index or fail
        if (totalIndexed === 0) {
            if (options.autoIndex === false) {
                await fail({ code: 'NO_INDEX', message: 'No indexed content found', suggestion: 'Run `ez-search index .` first' });
            }
            const { runIndex } = await import('./index-cmd.js');
            autoIndexResult = await runIndex('.', { ignore: true, quiet: true });
            // Reload manifest after indexing
            manifest = loadManifest(projectDir);
            totalIndexed = Object.keys(manifest.files).length;
            if (totalIndexed === 0) {
                await fail({ code: 'EMPTY_DIR', message: 'No supported files found to index', suggestion: 'Ensure the directory contains supported file types' });
            }
        }
        // 4. Stale index detection (skip if we just auto-indexed — it's fresh)
        let staleFileCount = 0;
        if (!autoIndexResult) {
            const { calcStaleness } = await import('../../services/staleness.js');
            staleFileCount = await calcStaleness(projectDir, manifest, true);
        }
        const isStale = staleFileCount > 0;
        // 5. Open vector collections (after auto-index so DB files exist)
        const { openProjectCollections } = await import('../../services/vector-db.js');
        const { col768 } = openProjectCollections(projectDir);
        // 6. Decide which types to query
        let typesToQuery;
        if (options.type) {
            typesToQuery = [options.type];
        }
        else {
            // Only load models for types that actually have indexed data.
            const { EXTENSION_MAP } = await import('../../types.js');
            const indexedTypes = new Set();
            for (const filePath of Object.keys(manifest.files)) {
                const ext = '.' + filePath.split('.').pop()?.toLowerCase();
                const fileType = EXTENSION_MAP[ext];
                if (fileType)
                    indexedTypes.add(fileType);
            }
            // Image queries from text are not supported — skipped even if images are indexed.
            typesToQuery = ['code', 'text'].filter((type) => indexedTypes.has(type));
        }
        // Manifest exists but has no queryable types
        if (typesToQuery.length === 0) {
            await fail({ code: 'NO_INDEX', message: 'No indexed content found', suggestion: 'Run `ez-search index .` first' });
        }
        if (options.type === 'image') {
            await fail({
                code: 'UNSUPPORTED_TYPE',
                message: 'Image search requires image query input (not yet supported)',
                suggestion: 'Omit --type image to search code and text',
            });
        }
        // 7. Execute per-type queries sequentially (memory conservation)
        const { normalizeResults, filterAndCollapse } = await import('../../services/query-utils.js');
        const { createEmbeddingPipeline } = await import('../../services/model-router.js');
        const services = { createEmbeddingPipeline, normalizeResults, filterAndCollapse };
        const hasPostFilters = options.dir !== undefined || threshold !== undefined;
        // Over-fetch for the mixed col-768, tripled when post-filters will drop more rows.
        const fetchCount = topK * 5 * (hasPostFilters ? 3 : 1);
        let codeResults = [];
        let textResults = [];
        if (typesToQuery.includes('code')) {
            codeResults = await queryCol768ForType({
                type: 'code',
                queryText: text,
                matchesModel: (id) => id.includes('jina') || id.startsWith('jinaai/'),
                col768,
                fetchCount,
                filterOptions: { threshold, dir: options.dir, topK },
                services,
            });
        }
        if (typesToQuery.includes('text')) {
            textResults = await queryCol768ForType({
                type: 'text',
                queryText: `search_query: ${text}`,
                matchesModel: (id) => id.includes('nomic'),
                col768,
                fetchCount,
                filterOptions: { threshold, dir: options.dir, topK },
                services,
            });
        }
        // 8. Output
        const hasCodeResults = codeResults.length > 0;
        const hasTextResults = textResults.length > 0;
        const hasResults = hasCodeResults || hasTextResults;
        if (options.format === 'text') {
            if (autoIndexResult) {
                console.log(`Auto-indexed ${autoIndexResult.filesIndexed} files in ${(autoIndexResult.durationMs / 1000).toFixed(1)}s\n`);
            }
            if (isStale) {
                console.log(`Warning: ${staleFileCount} file(s) changed since last index. Run \`ez-search index .\` to update.\n`);
            }
            if (!hasResults) {
                console.log('No results found.');
                return;
            }
            if (hasCodeResults) {
                console.log('## Code\n');
                for (const r of codeResults) {
                    console.log(`File: ${r.filePath} | Lines: ${r.lineStart}-${r.lineEnd} | Relevance: ${r.score}`);
                    for (const line of r.chunkText.split('\n')) {
                        console.log(` ${line}`);
                    }
                    console.log();
                }
            }
            if (hasTextResults) {
                console.log('## Text\n');
                for (const r of textResults) {
                    console.log(`File: ${r.filePath} | Relevance: ${r.score}`);
                    for (const line of r.chunkText.split('\n')) {
                        console.log(` ${line}`);
                    }
                    console.log();
                }
            }
        }
        else {
            // JSON grouped envelope; keys are added in a fixed order.
            const output = {
                query: text,
                totalIndexed,
                searchScope: options.dir ?? '.',
            };
            if (autoIndexResult) {
                output.indexing = {
                    status: autoIndexResult.status,
                    filesIndexed: autoIndexResult.filesIndexed,
                    durationMs: autoIndexResult.durationMs,
                };
            }
            if (isStale) {
                output.stale = true;
                output.staleFileCount = staleFileCount;
            }
            if (hasCodeResults) {
                output.code = codeResults.map((r) => ({
                    file: r.filePath,
                    lines: { start: r.lineStart, end: r.lineEnd },
                    score: r.score,
                    text: r.chunkText,
                }));
            }
            if (hasTextResults) {
                output.text = textResults.map((r) => ({
                    file: r.filePath,
                    score: r.score,
                    text: r.chunkText,
                }));
            }
            if (!hasResults) {
                output.message = 'No results found';
            }
            console.log(JSON.stringify(output));
        }
    }
    catch (err) {
        const message = err instanceof Error ? err.message : String(err);
        await fail({ code: 'GENERAL_ERROR', message, suggestion: 'Check the error above and retry' });
    }
}
@@ -0,0 +1,154 @@
1
+ /**
2
+ * Status command — shows index state for the current directory.
3
+ *
4
+ * Outputs:
5
+ * JSON (default): { fileCount, chunkCount, lastIndexed, modelTypes, indexSizeBytes,
6
+ * storagePath, staleFileCount, byType }
7
+ * Text (--format text): compact human-readable summary
8
+ *
9
+ * Error exits:
10
+ * code 2 — NO_INDEX: no manifest in current directory
11
+ * code 1 — CORRUPT_MANIFEST: manifest exists but vector storage is missing
12
+ */
13
+ import * as path from 'path';
14
+ import * as fsp from 'fs/promises';
15
+ import { existsSync, statSync } from 'fs';
16
+ import { emitError } from '../errors.js';
17
+ import { calcStaleness } from '../../services/staleness.js';
18
+ // ── Helpers ────────────────────────────────────────────────────────────────────
19
/**
 * Render a byte count as a short human-readable string.
 *
 * Values below 1024 are printed verbatim with a "B" suffix; larger values are
 * scaled to KB, MB or GB (powers of 1024) with one decimal place. Anything at
 * or above 1024^3 is reported in GB.
 *
 * @param bytes - Size in bytes
 * @returns Formatted size, e.g. "1.5 KB"
 */
function formatBytes(bytes) {
    const KIB = 1024;
    if (bytes < KIB) {
        return `${bytes} B`;
    }
    // Each entry applies to values strictly below `limit`.
    const scales = [
        { limit: KIB * KIB, divisor: KIB, unit: 'KB' },
        { limit: KIB * KIB * KIB, divisor: KIB * KIB, unit: 'MB' },
    ];
    const largest = { divisor: KIB * KIB * KIB, unit: 'GB' };
    const { divisor, unit } = scales.find((scale) => bytes < scale.limit) ?? largest;
    return `${(bytes / divisor).toFixed(1)} ${unit}`;
}
31
/**
 * Recursively sum the size of all regular files below a directory.
 *
 * Individual files that cannot be stat'ed are skipped. Returns 0 if the
 * directory doesn't exist or the listing itself fails.
 *
 * @param dir - Directory to measure
 * @returns Total size in bytes of every readable file under `dir`
 */
async function calcDirSize(dir) {
    try {
        if (!existsSync(dir))
            return 0;
        const entries = await fsp.readdir(dir, { recursive: true, withFileTypes: true });
        let total = 0;
        for (const entry of entries) {
            if (!entry.isFile())
                continue;
            try {
                // Older Node releases expose the containing directory as `path` rather than
                // `parentPath`. Falling back straight to `dir` would mis-resolve nested
                // files and silently drop them from the total.
                const parentDir = entry.parentPath ?? entry.path ?? dir;
                const stat = await fsp.stat(path.join(parentDir, entry.name));
                total += stat.size;
            }
            catch {
                // Skip unreadable files
            }
        }
        return total;
    }
    catch {
        return 0;
    }
}
59
+ // ── Main command ───────────────────────────────────────────────────────────────
60
/**
 * Print the index status of the current working directory.
 *
 * Emits NO_INDEX (exit 2) when no manifest file exists and CORRUPT_MANIFEST
 * (exit 1) when the storage directory is missing and no corruption warning
 * was already raised. Otherwise prints either a JSON object or, with
 * format === 'text', a line-based summary.
 *
 * @param options - { format?: 'json' | 'text', ignore?: boolean }; `ignore`
 *                  defaults to true and is forwarded to calcStaleness()
 */
export async function runStatus(options = {}) {
    const format = options.format === 'text' ? 'text' : 'json';
    // default true; --no-ignore sets false
    const useIgnoreFiles = options.ignore !== false;
    const projectDir = process.cwd();
    const { resolveProjectStoragePath } = await import('../../config/paths.js');
    const { MANIFEST_FILENAME, loadManifest } = await import('../../services/manifest-cache.js');
    const storagePath = resolveProjectStoragePath(projectDir);
    const manifestPath = path.join(storagePath, MANIFEST_FILENAME);
    // The manifest file must be present before anything else is inspected.
    if (!existsSync(manifestPath)) {
        emitError({
            code: 'NO_INDEX',
            message: 'No index found in current directory',
            suggestion: 'Run: ez-search index .',
        }, format, 2);
    }
    const manifest = loadManifest(projectDir);
    const manifestStat = statSync(manifestPath);
    // A non-trivial file that parses to zero entries is treated as corrupt or version-mismatched.
    const looksCorrupt = Object.keys(manifest.files).length === 0 && manifestStat.size > 10;
    const warning = looksCorrupt
        ? 'Manifest appears corrupt or version-mismatched. Reported data may be incomplete.'
        : undefined;
    const warningSuggestion = looksCorrupt ? 'Run: ez-search index --clear .' : undefined;
    // lastIndexed is the manifest file's modification time.
    const lastIndexed = new Date(manifestStat.mtimeMs).toISOString();
    const { EXTENSION_MAP } = await import('../../types.js');
    const byType = {
        code: { files: 0, chunks: 0 },
        text: { files: 0, chunks: 0 },
        image: { files: 0, chunks: 0 },
    };
    let chunkCount = 0;
    Object.entries(manifest.files).forEach(([relPath, entry]) => {
        const fileType = EXTENSION_MAP[path.extname(relPath).toLowerCase()];
        if (fileType && fileType in byType) {
            const bucket = byType[fileType];
            bucket.files++;
            bucket.chunks += entry.chunks.length;
        }
        // Chunks of unmapped extensions still count toward the grand total.
        chunkCount += entry.chunks.length;
    });
    const fileCount = Object.keys(manifest.files).length;
    const modelTypes = ['code', 'text', 'image'].filter((type) => byType[type].files > 0);
    const storageMissing = !existsSync(storagePath);
    if (storageMissing && !warning) {
        // Manifest exists but vector storage is missing — corrupt state
        emitError({
            code: 'CORRUPT_MANIFEST',
            message: 'Manifest exists but vector storage is missing',
            suggestion: 'Run: ez-search index --clear .',
        }, format, 1);
    }
    const indexSizeBytes = await calcDirSize(storagePath);
    const staleFileCount = await calcStaleness(projectDir, manifest, useIgnoreFiles);
    if (format !== 'text') {
        const output = {
            fileCount,
            chunkCount,
            lastIndexed,
            modelTypes,
            indexSizeBytes,
            storagePath,
            staleFileCount,
            byType,
            ...(warning ? { warning, suggestion: warningSuggestion } : {}),
        };
        console.log(JSON.stringify(output));
        return;
    }
    const summary = [
        `Index: ${storagePath}`,
        `Files: ${fileCount} (code: ${byType.code.files}, text: ${byType.text.files}, image: ${byType.image.files})`,
        `Chunks: ${chunkCount}`,
        `Last indexed: ${lastIndexed}`,
        `Index size: ${formatBytes(indexSizeBytes)}`,
        `Stale files: ${staleFileCount}`,
    ];
    if (warning) {
        summary.push(`Warning: ${warning}`);
    }
    console.log(summary.join('\n'));
}
@@ -0,0 +1,25 @@
1
+ /**
2
+ * Shared structured error utility for all CLI commands.
3
+ *
4
+ * JSON errors go to stdout (same channel as normal output for agent parsing).
5
+ * Text errors go to stderr.
6
+ */
7
/**
 * Emit a structured error and exit the process.
 *
 * In text mode the "Try: ..." hint is only appended when a suggestion is
 * supplied, so callers without one no longer print "Try: undefined".
 *
 * @param opts - Error details: { code, message, suggestion? }
 * @param format - 'text' writes a human-readable line to stderr; anything else
 *                 writes structured JSON to stdout
 * @param exitCode - Exit code (defaults to 1)
 * @returns never - process.exit() is called unconditionally
 */
export function emitError(opts, format, exitCode = 1) {
    const { code, message, suggestion } = opts;
    if (format === 'text') {
        const hint = suggestion ? ` Try: ${suggestion}` : '';
        process.stderr.write(`Error: ${message}.${hint}\n`);
    }
    else {
        // JSON.stringify drops `suggestion` by itself when it is undefined.
        const structured = { error: true, code, message, suggestion };
        process.stdout.write(JSON.stringify(structured) + '\n');
    }
    process.exit(exitCode);
}
@@ -0,0 +1,62 @@
1
+ #!/usr/bin/env node
2
+ import { createRequire } from 'node:module';
3
+ import { Command } from 'commander';
4
// ESM has no ambient `require`; one is created solely to read the version from package.json.
const require = createRequire(import.meta.url);
const { version } = require('../../package.json');
const program = new Command();
program
    .name('ez-search')
    .description('Semantic codebase search with zero cloud dependencies')
    .version(version);
// Command modules are imported inside the action handlers so that only the
// invoked command's module is loaded.
program
    .command('index <path>')
    .description('Index a directory for semantic search')
    .option('--no-ignore', 'disable .gitignore and .cursorignore filtering')
    .option('--type <type>', 'filter files by type: code|text|image')
    .option('-q, --quiet', 'suppress status output')
    .option('--clear', 'remove existing index before indexing')
    .option('--format <mode>', 'output format: json (default) or text')
    .addHelpText('after', `
Examples:
$ ez-search index . Index current directory
$ ez-search index . --format json Index and output JSON stats
$ ez-search index . --clear --type code Re-index only code files
$ ez-search index src/ --no-ignore Index src/ including gitignored files`)
    .action(async (targetPath, options) => {
        const { runIndex } = await import('./commands/index-cmd.js');
        await runIndex(targetPath, options);
    });
program
    .command('query <text>')
    .description('Search the index with a natural language query')
    .option('--format <mode>', 'output format: json (default) or text')
    .option('-k, --top-k <n>', 'number of results to return', '10')
    .option('--dir <path>', 'scope search to a subdirectory')
    .option('--threshold <score>', 'minimum relevance score (0-1) to include')
    .option('--type <type>', 'search specific type only: code|text|image')
    .option('--no-auto-index', 'disable automatic indexing when no index exists')
    .addHelpText('after', `
Examples:
$ ez-search query "authentication logic" Semantic search (auto-indexes if needed)
$ ez-search query "db connections" --format json --type code --top-k 5
$ ez-search query "error handling" --threshold 0.5 --dir src/
$ ez-search query "test" --no-auto-index Fail if no index exists`)
    .action(async (text, options) => {
        const { runQuery } = await import('./commands/query-cmd.js');
        await runQuery(text, options);
    });
program
    .command('status')
    .description('Show indexing status for the current directory')
    .option('--format <mode>', 'output format: json (default) or text')
    .option('--no-ignore', 'disable .gitignore and .cursorignore filtering')
    .addHelpText('after', `
Examples:
$ ez-search status Show index status as JSON
$ ez-search status --format text Show human-readable summary`)
    .action(async (options) => {
        const { runStatus } = await import('./commands/status-cmd.js');
        await runStatus(options);
    });
// Every action handler above is async. parseAsync() awaits the handler, so a
// rejection surfaces through this awaited promise instead of as a detached
// unhandled rejection (parse() does not wait for async handlers).
await program.parseAsync();
export { program };
@@ -0,0 +1,16 @@
1
+ import * as os from 'os';
2
+ import * as path from 'path';
3
/**
 * Compute where a project's index data is stored.
 * Layout: <absolute projectDir>/.ez-search
 *
 * @param projectDir - Project directory; relative paths are resolved against the cwd
 * @returns Absolute path of the project's storage directory
 */
export function resolveProjectStoragePath(projectDir) {
    const STORAGE_DIRNAME = '.ez-search';
    return path.resolve(projectDir, STORAGE_DIRNAME);
}
10
/**
 * Compute the location of the model cache shared by all projects.
 * Layout: <home directory>/.ez-search/models
 *
 * @returns Path of the shared model cache directory
 */
export function resolveModelCachePath() {
    const homeDir = os.homedir();
    const cacheSegments = ['.ez-search', 'models'];
    return path.join(homeDir, ...cacheSegments);
}
@@ -0,0 +1,96 @@
1
+ /**
2
+ * Chunker service — splits source code files into token-accurate chunks with line tracking.
3
+ *
4
+ * Uses the Jina code tokenizer (BPE, RobertaTokenizer) for accurate token counting.
5
+ * The tokenizer must be loaded once via loadTokenizer() and reused across all chunkFile() calls.
6
+ *
7
+ * Chunk windows: 500 tokens per chunk, 50 token overlap between consecutive chunks.
8
+ * Line numbers are tracked via cumulative token counts per line (1-indexed).
9
+ *
10
+ * NOTE: add_special_tokens: false is intentional — the embedding pipeline adds special tokens
11
+ * at inference time (pooling: 'mean', normalize: true). Double-adding them would corrupt embeddings.
12
+ */
13
+ import { AutoTokenizer, env } from '@huggingface/transformers';
14
+ import { resolveModelCachePath } from '../config/paths.js';
15
+ // ── Constants ─────────────────────────────────────────────────────────────────
16
/** Number of tokens in one chunk window. */
const CHUNK_SIZE = 500;
/** Number of tokens shared between two consecutive chunk windows. */
const OVERLAP = 50;
export { CHUNK_SIZE, OVERLAP };
18
+ // ── Public API ────────────────────────────────────────────────────────────────
19
/**
 * Load the 'jinaai/jina-embeddings-v2-base-code' tokenizer.
 *
 * Points the transformers environment at the shared model cache and allows
 * remote models before loading. Intended to be called once, with the result
 * reused for every chunkFile() call.
 *
 * @returns The tokenizer returned by AutoTokenizer.from_pretrained()
 */
export async function loadTokenizer() {
    const TOKENIZER_MODEL_ID = 'jinaai/jina-embeddings-v2-base-code';
    // Same assignment order as before: cache directory first, then the remote flag.
    Object.assign(env, {
        cacheDir: resolveModelCachePath(),
        allowRemoteModels: true,
    });
    const tokenizer = await AutoTokenizer.from_pretrained(TOKENIZER_MODEL_ID);
    return tokenizer;
}
28
/**
 * Split a source file into token-accurate chunks with line number tracking.
 *
 * Files of at most `chunkSize` tokens produce a single chunk spanning the
 * entire file. Larger files are split into overlapping windows of `chunkSize`
 * tokens, with `overlap` tokens shared between consecutive windows.
 *
 * NOTE(review): line numbers come from per-line token counts, which assumes
 * encoding each line separately yields the same token boundaries as encoding
 * the whole text — confirm for the tokenizer in use.
 *
 * @param text - Full text content of the file
 * @param tokenizer - Pre-loaded tokenizer exposing encode() and decode()
 * @param options - Optional overrides: { chunkSize = CHUNK_SIZE, overlap = OVERLAP }
 * @returns Array of chunks: { text, lineStart, lineEnd, chunkIndex, tokenCount }
 *          with 1-indexed line numbers
 * @throws RangeError when chunkSize is not a positive integer, or overlap is
 *         not an integer in [0, chunkSize) (the window would never advance)
 */
export function chunkFile(text, tokenizer, options = {}) {
    const { chunkSize = CHUNK_SIZE, overlap = OVERLAP } = options ?? {};
    if (!Number.isInteger(chunkSize) || chunkSize < 1) {
        throw new RangeError(`chunkSize must be a positive integer, got ${chunkSize}`);
    }
    if (!Number.isInteger(overlap) || overlap < 0 || overlap >= chunkSize) {
        throw new RangeError(`overlap must be an integer in [0, ${chunkSize}), got ${overlap}`);
    }
    const lines = text.split('\n');
    // Cumulative token count per line, used to map token indices to line numbers.
    // Each line includes its trailing newline except the last, to match how the
    // tokenizer sees the full text.
    const cumulative = [];
    let runningTotal = 0;
    lines.forEach((line, i) => {
        const lineText = i < lines.length - 1 ? `${line}\n` : line;
        // Only .length is used: encode() may return an array-like rather than a plain Array.
        runningTotal += tokenizer.encode(lineText, { add_special_tokens: false }).length;
        cumulative.push(runningTotal);
    });
    // Encode the full text without special tokens and materialise it ONCE as a
    // plain array; previously this copy was rebuilt for every window.
    const allIds = Array.from(tokenizer.encode(text, { add_special_tokens: false }));
    const totalTokens = allIds.length;
    // Single-chunk case: file fits within one window
    if (totalTokens <= chunkSize) {
        return [{
                text,
                lineStart: 1,
                lineEnd: lines.length,
                chunkIndex: 0,
                tokenCount: totalTokens,
            }];
    }
    // Sliding window with overlap
    const stride = chunkSize - overlap;
    const chunks = [];
    for (let start = 0; start < totalTokens; start += stride) {
        const end = Math.min(start + chunkSize, totalTokens);
        const chunkIds = allIds.slice(start, end);
        chunks.push({
            text: tokenizer.decode(chunkIds, { skip_special_tokens: true }),
            lineStart: tokenIndexToLine(start, cumulative),
            lineEnd: tokenIndexToLine(end - 1, cumulative),
            chunkIndex: chunks.length,
            tokenCount: chunkIds.length,
        });
        // The final window reached the end of the file; a further stride would
        // only re-emit part of its tail.
        if (end === totalTokens)
            break;
    }
    return chunks;
}
// ── Helpers ───────────────────────────────────────────────────────────────────
/**
 * Map a token index to a 1-indexed line number using cumulative token counts.
 *
 * Binary search for the first line whose cumulative token count exceeds
 * tokenIdx (the counts never decrease). When no line qualifies the last line
 * number is returned.
 *
 * @param tokenIdx - 0-based index into the full-text token sequence
 * @param cumulative - Running token totals, one entry per line
 * @returns 1-indexed line number
 */
function tokenIndexToLine(tokenIdx, cumulative) {
    let lo = 0;
    let hi = cumulative.length;
    while (lo < hi) {
        const mid = (lo + hi) >>> 1;
        if (tokenIdx < cumulative[mid]) {
            hi = mid;
        }
        else {
            lo = mid + 1;
        }
    }
    // lo is the first qualifying index, or cumulative.length when none exists.
    return Math.min(lo + 1, cumulative.length);
}
@@ -0,0 +1,62 @@
1
+ import { readFileSync, existsSync } from 'fs';
2
+ import * as fsp from 'fs/promises';
3
+ import * as path from 'path';
4
+ import ignore from 'ignore';
5
+ import { EXTENSION_MAP, BUILTIN_EXCLUSIONS, } from '../types.js';
6
/**
 * Walk a directory tree and yield every file with a recognised extension.
 *
 * Built-in exclusions always apply. When opts.useIgnoreFiles is truthy, the
 * root-level .gitignore and .cursorignore (if present) are added as well.
 *
 * @param rootDir - Directory to scan; resolved to an absolute path
 * @param opts - Scan options; `useIgnoreFiles` is read here, the whole object
 *               is forwarded to walkDir()
 * @yields File descriptors produced by walkDir()
 */
export async function* scanFiles(rootDir, opts) {
    const scanRoot = path.resolve(rootDir);
    const matcher = ignore();
    // Built-in exclusions are always active
    matcher.add(BUILTIN_EXCLUSIONS);
    const ignoreFileNames = opts.useIgnoreFiles ? ['.gitignore', '.cursorignore'] : [];
    for (const fileName of ignoreFileNames) {
        const candidate = path.join(scanRoot, fileName);
        if (!existsSync(candidate)) {
            continue;
        }
        matcher.add(readFileSync(candidate, 'utf8'));
    }
    yield* walkDir(scanRoot, scanRoot, matcher, opts);
}
22
/**
 * True for filesystem errors that mean "this entry vanished or cannot be
 * read", which a scan can safely step over.
 */
function isSkippableFsError(err) {
    return ['ENOENT', 'EACCES', 'EPERM'].includes(err?.code);
}
/**
 * Recursively yield descriptors for the files below `dir`.
 *
 * Symlinks are skipped entirely. Directories and files matched by `ig` are
 * skipped, as are files whose extension is not in EXTENSION_MAP or does not
 * match opts.typeFilter. A subdirectory that cannot be opened, or a file that
 * disappears or becomes unreadable before it is stat'ed, is skipped instead
 * of aborting the whole scan. Failure to open the root itself still throws.
 *
 * @param dir - Directory currently being walked
 * @param rootDir - Scan root; relative paths are computed against it
 * @param ig - Matcher exposing ignores(relativePath)
 * @param opts - Scan options; `typeFilter` restricts the yielded file type
 * @yields { absolutePath, relativePath, type, sizeBytes, mtimeMs }
 */
async function* walkDir(dir, rootDir, ig, opts) {
    let dirHandle;
    try {
        dirHandle = await fsp.opendir(dir);
    }
    catch (err) {
        // A missing or unreadable root is a caller error; nested ones are skipped.
        if (dir !== rootDir && isSkippableFsError(err))
            return;
        throw err;
    }
    for await (const entry of dirHandle) {
        const fullPath = path.join(dir, entry.name);
        const relPath = path.relative(rootDir, fullPath);
        // Skip symlinks entirely
        if (entry.isSymbolicLink()) {
            continue;
        }
        if (entry.isDirectory()) {
            // Check both with and without trailing slash (gitignore semantics)
            if (ig.ignores(relPath + '/') || ig.ignores(relPath)) {
                continue;
            }
            yield* walkDir(fullPath, rootDir, ig, opts);
        }
        else if (entry.isFile()) {
            if (ig.ignores(relPath)) {
                continue;
            }
            const ext = path.extname(entry.name).toLowerCase();
            const fileType = EXTENSION_MAP[ext];
            // Skip unknown extensions
            if (!fileType) {
                continue;
            }
            // Apply type filter if set
            if (opts.typeFilter && fileType !== opts.typeFilter) {
                continue;
            }
            let stat;
            try {
                stat = await fsp.stat(fullPath);
            }
            catch (err) {
                // The entry was listed but is gone or unreadable now.
                if (isSkippableFsError(err))
                    continue;
                throw err;
            }
            yield {
                absolutePath: fullPath,
                relativePath: relPath,
                type: fileType,
                sizeBytes: stat.size,
                mtimeMs: stat.mtimeMs,
            };
        }
    }
}