npm - @ez-corp/ez-search - Versions diffs - 1.0.0 - Mend

@ez-corp/ez-search 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

package/LICENSE +15 -0
package/README.md +207 -0
package/dist/cli/commands/index-cmd.js +450 -0
package/dist/cli/commands/query-cmd.js +233 -0
package/dist/cli/commands/status-cmd.js +154 -0
package/dist/cli/errors.js +25 -0
package/dist/cli/index.js +62 -0
package/dist/config/paths.js +16 -0
package/dist/services/chunker.js +96 -0
package/dist/services/file-scanner.js +62 -0
package/dist/services/image-embedder.js +64 -0
package/dist/services/manifest-cache.js +85 -0
package/dist/services/model-router.js +108 -0
package/dist/services/query-utils.js +74 -0
package/dist/services/staleness.js +36 -0
package/dist/services/text-chunker.js +138 -0
package/dist/services/vector-db.js +161 -0
package/dist/types.js +67 -0
package/package.json +56 -0

package/dist/cli/commands/query-cmd.js ADDED Viewed

@@ -0,0 +1,233 @@
+/**
+ * Query command — multi-collection grouped semantic search.
+ *
+ * Pipeline:
+ *   1. Resolve project directory (cwd)
+ *   2. Load manifest for totalIndexed count; if it is empty, auto-index the project
+ *      (or fail with NO_INDEX when auto-indexing is disabled)
+ *   3. Count stale files (skipped right after an auto-index), then open the col-768
+ *      collection (code/text); the col-512 image collection is not queried here
+ *   4. For each requested type:
+ *      a. code:  embed with Jina, over-fetch from col-768, filter by jina modelId
+ *      b. text:  embed with Nomic ("search_query: " prefix), over-fetch from col-768, filter by nomic modelId
+ *      c. image: not supported (text-to-image search deferred to Phase 6)
+ *   5. Apply --threshold and --dir filters per type
+ *   6. Collapse adjacent chunks per type
+ *   7. Sort by score desc, slice to topK per type
+ *   8. Output grouped JSON { code: [...], text: [...] } or text with ## headers
+ *
+ * col-768 holds BOTH code and text vectors; they are distinguished by modelId metadata.
+ * Over-fetching topK*5 candidates (topK*15 when --dir or --threshold is set) leaves enough
+ * results after modelId and post-filtering.
+ */
+export async function runQuery(text, options) {
+    const topK = parseInt(options.topK, 10);
+    const threshold = options.threshold !== undefined ? parseFloat(options.threshold) : undefined;
+    try {
+        // 1. Resolve project directory
+        const projectDir = process.cwd();
+        // 2. Load manifest
+        const { loadManifest } = await import('../../services/manifest-cache.js');
+        let manifest = loadManifest(projectDir);
+        let totalIndexed = Object.keys(manifest.files).length;
+        let autoIndexResult;
+        // Guard: no indexed content — auto-index or fail
+        if (totalIndexed === 0) {
+            if (options.autoIndex === false) {
+                const { emitError } = await import('../errors.js');
+                emitError({ code: 'NO_INDEX', message: 'No indexed content found', suggestion: 'Run `ez-search index .` first' }, options.format === 'text' ? 'text' : 'json');
+            }
+            // Auto-index the project
+            const { runIndex } = await import('./index-cmd.js');
+            autoIndexResult = await runIndex('.', { ignore: true, quiet: true });
+            // Reload manifest after indexing
+            manifest = loadManifest(projectDir);
+            totalIndexed = Object.keys(manifest.files).length;
+            // If still no content after indexing, error out
+            if (totalIndexed === 0) {
+                const { emitError } = await import('../errors.js');
+                emitError({ code: 'EMPTY_DIR', message: 'No supported files found to index', suggestion: 'Ensure the directory contains supported file types' }, options.format === 'text' ? 'text' : 'json');
+            }
+        }
+        // Stale index detection (skip if we just auto-indexed — it's fresh)
+        let staleFileCount = 0;
+        if (!autoIndexResult) {
+            const { calcStaleness } = await import('../../services/staleness.js');
+            staleFileCount = await calcStaleness(projectDir, manifest, true);
+        }
+        const isStale = staleFileCount > 0;
+        // 3. Open vector collections (after auto-index so DB files exist)
+        const { openProjectCollections } = await import('../../services/vector-db.js');
+        const { col768 } = openProjectCollections(projectDir);
+        let typesToQuery;
+        if (options.type) {
+            typesToQuery = [options.type];
+        }
+        else {
+            // Pre-detect indexed types from manifest: only load models for types that have data.
+            // This avoids loading Jina when only text is indexed (or Nomic when only code is indexed).
+            const { EXTENSION_MAP } = await import('../../types.js');
+            const indexedTypes = new Set();
+            for (const filePath of Object.keys(manifest.files)) {
+                const ext = '.' + filePath.split('.').pop()?.toLowerCase();
+                const fileType = EXTENSION_MAP[ext];
+                if (fileType)
+                    indexedTypes.add(fileType);
+            }
+            typesToQuery = [];
+            if (indexedTypes.has('code'))
+                typesToQuery.push('code');
+            if (indexedTypes.has('text'))
+                typesToQuery.push('text');
+            // image queries from text not supported — skip even if images are indexed
+        }
+        // Early exit when manifest exists but has no queryable types (e.g., after --clear without re-indexing)
+        if (typesToQuery.length === 0) {
+            const { emitError } = await import('../errors.js');
+            emitError({ code: 'NO_INDEX', message: 'No indexed content found', suggestion: 'Run `ez-search index .` first' }, options.format === 'text' ? 'text' : 'json');
+        }
+        // Handle unsupported image query
+        if (options.type === 'image') {
+            const { emitError } = await import('../errors.js');
+            emitError({
+                code: 'UNSUPPORTED_TYPE',
+                message: 'Image search requires image query input (not yet supported)',
+                suggestion: 'Omit --type image to search code and text',
+            }, options.format === 'text' ? 'text' : 'json');
+        }
+        // ── Helpers ──────────────────────────────────────────────────────────────
+        const { normalizeResults, filterAndCollapse } = await import('../../services/query-utils.js');
+        const hasPostFilters = options.dir !== undefined || threshold !== undefined;
+        // Over-fetch for mixed col-768 + optional post-filters
+        const fetchCount = topK * 5 * (hasPostFilters ? 3 : 1);
+        // ── Execute per-type queries sequentially (memory conservation) ──────────
+        const { createEmbeddingPipeline } = await import('../../services/model-router.js');
+        let codeResults = [];
+        let textResults = [];
+        if (typesToQuery.includes('code')) {
+            // Code: Jina embedding, filter for jina modelId
+            let pipe = null;
+            try {
+                pipe = await createEmbeddingPipeline('code');
+                const [queryEmbedding] = await pipe.embed([text]);
+                let rawResults;
+                try {
+                    rawResults = col768.query(queryEmbedding, fetchCount);
+                }
+                catch {
+                    rawResults = [];
+                }
+                const normalized = normalizeResults(rawResults);
+                codeResults = filterAndCollapse(normalized, (id) => id.includes('jina') || id.startsWith('jinaai/'), { threshold, dir: options.dir, topK });
+            }
+            catch (err) {
+                process.stderr.write(`[query] code pipeline error: ${err instanceof Error ? err.message : String(err)}\n`);
+            }
+            finally {
+                if (pipe)
+                    await pipe.dispose();
+            }
+        }
+        if (typesToQuery.includes('text')) {
+            // Text: Nomic embedding with "search_query: " prefix, filter for nomic modelId
+            let pipe = null;
+            try {
+                pipe = await createEmbeddingPipeline('text');
+                const prefixedQuery = `search_query: ${text}`;
+                const [queryEmbedding] = await pipe.embed([prefixedQuery]);
+                let rawResults;
+                try {
+                    rawResults = col768.query(queryEmbedding, fetchCount);
+                }
+                catch {
+                    rawResults = [];
+                }
+                const normalized = normalizeResults(rawResults);
+                textResults = filterAndCollapse(normalized, (id) => id.includes('nomic'), { threshold, dir: options.dir, topK });
+            }
+            catch (err) {
+                process.stderr.write(`[query] text pipeline error: ${err instanceof Error ? err.message : String(err)}\n`);
+            }
+            finally {
+                if (pipe)
+                    await pipe.dispose();
+            }
+        }
+        // ── Output ────────────────────────────────────────────────────────────────
+        const hasCodeResults = codeResults.length > 0;
+        const hasTextResults = textResults.length > 0;
+        const hasResults = hasCodeResults || hasTextResults;
+        if (options.format === 'text') {
+            if (autoIndexResult) {
+                console.log(`Auto-indexed ${autoIndexResult.filesIndexed} files in ${(autoIndexResult.durationMs / 1000).toFixed(1)}s\n`);
+            }
+            if (isStale) {
+                console.log(`Warning: ${staleFileCount} file(s) changed since last index. Run \`ez-search index .\` to update.\n`);
+            }
+            if (!hasResults) {
+                console.log('No results found.');
+                return;
+            }
+            if (hasCodeResults) {
+                console.log('## Code\n');
+                for (const r of codeResults) {
+                    console.log(`File: ${r.filePath} | Lines: ${r.lineStart}-${r.lineEnd} | Relevance: ${r.score}`);
+                    for (const line of r.chunkText.split('\n')) {
+                        console.log(`    ${line}`);
+                    }
+                    console.log();
+                }
+            }
+            if (hasTextResults) {
+                console.log('## Text\n');
+                for (const r of textResults) {
+                    console.log(`File: ${r.filePath} | Relevance: ${r.score}`);
+                    for (const line of r.chunkText.split('\n')) {
+                        console.log(`    ${line}`);
+                    }
+                    console.log();
+                }
+            }
+        }
+        else {
+            // JSON grouped envelope
+            const output = {
+                query: text,
+                totalIndexed,
+                searchScope: options.dir ?? '.',
+            };
+            if (autoIndexResult) {
+                output['indexing'] = {
+                    status: autoIndexResult.status,
+                    filesIndexed: autoIndexResult.filesIndexed,
+                    durationMs: autoIndexResult.durationMs,
+                };
+            }
+            if (isStale) {
+                output['stale'] = true;
+                output['staleFileCount'] = staleFileCount;
+            }
+            if (hasCodeResults) {
+                output['code'] = codeResults.map((r) => ({
+                    file: r.filePath,
+                    lines: { start: r.lineStart, end: r.lineEnd },
+                    score: r.score,
+                    text: r.chunkText,
+                }));
+            }
+            if (hasTextResults) {
+                output['text'] = textResults.map((r) => ({
+                    file: r.filePath,
+                    score: r.score,
+                    text: r.chunkText,
+                }));
+            }
+            if (!hasResults) {
+                output['message'] = 'No results found';
+            }
+            console.log(JSON.stringify(output));
+        }
+    }
+    catch (err) {
+        const { emitError } = await import('../errors.js');
+        const message = err instanceof Error ? err.message : String(err);
+        emitError({ code: 'GENERAL_ERROR', message, suggestion: 'Check the error above and retry' }, options.format === 'text' ? 'text' : 'json');
+    }
+}

package/dist/cli/commands/status-cmd.js ADDED Viewed

@@ -0,0 +1,154 @@
+/**
+ * Status command — shows index state for the current directory.
+ *
+ * Outputs:
+ *   JSON (default): { fileCount, chunkCount, lastIndexed, modelTypes, indexSizeBytes,
+ *                     storagePath, staleFileCount, byType }
+ *   Text (--format text): compact human-readable summary
+ *
+ * Error exits:
+ *   code 2 — NO_INDEX: no manifest in current directory
+ *   code 1 — CORRUPT_MANIFEST: manifest exists but vector storage is missing
+ */
+import * as path from 'path';
+import * as fsp from 'fs/promises';
+import { existsSync, statSync } from 'fs';
+import { emitError } from '../errors.js';
+import { calcStaleness } from '../../services/staleness.js';
+// ── Helpers ────────────────────────────────────────────────────────────────────
+/**
+ * Format bytes into a human-readable string.
+ */
+function formatBytes(bytes) {
+    if (bytes < 1024)
+        return `${bytes} B`;
+    if (bytes < 1024 * 1024)
+        return `${(bytes / 1024).toFixed(1)} KB`;
+    if (bytes < 1024 * 1024 * 1024)
+        return `${(bytes / (1024 * 1024)).toFixed(1)} MB`;
+    return `${(bytes / (1024 * 1024 * 1024)).toFixed(1)} GB`;
+}
+/**
+ * Recursively sum the size of all files in a directory.
+ * Returns 0 if the directory doesn't exist or on any error.
+ */
+async function calcDirSize(dir) {
+    try {
+        if (!existsSync(dir))
+            return 0;
+        const entries = await fsp.readdir(dir, { recursive: true, withFileTypes: true });
+        let total = 0;
+        for (const entry of entries) {
+            if (entry.isFile()) {
+                try {
+                    const fullPath = path.join(entry.parentPath ?? dir, entry.name);
+                    const stat = await fsp.stat(fullPath);
+                    total += stat.size;
+                }
+                catch {
+                    // Skip unreadable files
+                }
+            }
+        }
+        return total;
+    }
+    catch {
+        return 0;
+    }
+}
+// ── Main command ───────────────────────────────────────────────────────────────
+export async function runStatus(options = {}) {
+    const format = options.format === 'text' ? 'text' : 'json';
+    const useIgnoreFiles = options.ignore !== false; // default true; --no-ignore sets false
+    const projectDir = process.cwd();
+    // 1. Check manifest exists
+    const { resolveProjectStoragePath } = await import('../../config/paths.js');
+    const { MANIFEST_FILENAME, loadManifest } = await import('../../services/manifest-cache.js');
+    const manifestPath = path.join(resolveProjectStoragePath(projectDir), MANIFEST_FILENAME);
+    if (!existsSync(manifestPath)) {
+        emitError({
+            code: 'NO_INDEX',
+            message: 'No index found in current directory',
+            suggestion: 'Run: ez-search index .',
+        }, format, 2);
+    }
+    // 2. Load manifest
+    const manifest = loadManifest(projectDir);
+    // Detect corruption: file exists but parsed manifest has no entries and file is non-trivial
+    let warning;
+    let warningSuggestion;
+    const manifestStat = statSync(manifestPath);
+    if (Object.keys(manifest.files).length === 0 && manifestStat.size > 10) {
+        warning = 'Manifest appears corrupt or version-mismatched. Reported data may be incomplete.';
+        warningSuggestion = 'Run: ez-search index --clear .';
+    }
+    // 3. Get lastIndexed from manifest file mtime
+    const lastIndexed = new Date(manifestStat.mtimeMs).toISOString();
+    // 4. Per-type counts
+    const { EXTENSION_MAP } = await import('../../types.js');
+    const byType = {
+        code: { files: 0, chunks: 0 },
+        text: { files: 0, chunks: 0 },
+        image: { files: 0, chunks: 0 },
+    };
+    let totalChunkCount = 0;
+    for (const [relPath, entry] of Object.entries(manifest.files)) {
+        const ext = path.extname(relPath).toLowerCase();
+        const fileType = EXTENSION_MAP[ext];
+        if (fileType && fileType in byType) {
+            byType[fileType].files++;
+            byType[fileType].chunks += entry.chunks.length;
+        }
+        totalChunkCount += entry.chunks.length;
+    }
+    const fileCount = Object.keys(manifest.files).length;
+    const chunkCount = totalChunkCount;
+    // Derive modelTypes from non-zero types
+    const modelTypes = ['code', 'text', 'image'].filter((t) => byType[t].files > 0);
+    // 5. Resolve storage path and check it exists
+    const storagePath = resolveProjectStoragePath(projectDir);
+    if (!existsSync(storagePath) && !warning) {
+        // Manifest exists but vector storage is missing — corrupt state
+        emitError({
+            code: 'CORRUPT_MANIFEST',
+            message: 'Manifest exists but vector storage is missing',
+            suggestion: 'Run: ez-search index --clear .',
+        }, format, 1);
+    }
+    // 6. Calculate index size
+    const indexSizeBytes = await calcDirSize(storagePath);
+    // 7. Calculate staleness
+    const staleFileCount = await calcStaleness(projectDir, manifest, useIgnoreFiles);
+    // 8. Output
+    if (format === 'text') {
+        const lines = [
+            `Index: ${storagePath}`,
+            `Files: ${fileCount} (code: ${byType.code.files}, text: ${byType.text.files}, image: ${byType.image.files})`,
+            `Chunks: ${chunkCount}`,
+            `Last indexed: ${lastIndexed}`,
+            `Index size: ${formatBytes(indexSizeBytes)}`,
+            `Stale files: ${staleFileCount}`,
+        ];
+        if (warning) {
+            lines.push(`Warning: ${warning}`);
+        }
+        console.log(lines.join('\n'));
+    }
+    else {
+        const output = {
+            fileCount,
+            chunkCount,
+            lastIndexed,
+            modelTypes,
+            indexSizeBytes,
+            storagePath,
+            staleFileCount,
+            byType,
+        };
+        if (warning) {
+            output['warning'] = warning;
+            output['suggestion'] = warningSuggestion;
+        }
+        console.log(JSON.stringify(output));
+    }
+}

package/dist/cli/errors.js ADDED Viewed

@@ -0,0 +1,25 @@
+/**
+ * Shared structured error utility for all CLI commands.
+ *
+ * JSON errors go to stdout (same channel as normal output for agent parsing).
+ * Text errors go to stderr.
+ */
+/**
+ * Emit a structured error and exit the process.
+ *
+ * @param opts      - Error details
+ * @param format    - 'json' writes structured JSON to stdout; 'text' writes human-readable to stderr
+ * @param exitCode  - Exit code (defaults to 1)
+ * @returns never   - Control flow ends here
+ */
+export function emitError(opts, format, exitCode = 1) {
+    const { code, message, suggestion } = opts;
+    if (format === 'text') {
+        process.stderr.write(`Error: ${message}. Try: ${suggestion}\n`);
+    }
+    else {
+        const structured = { error: true, code, message, suggestion };
+        process.stdout.write(JSON.stringify(structured) + '\n');
+    }
+    process.exit(exitCode);
+}

package/dist/cli/index.js ADDED Viewed

@@ -0,0 +1,62 @@
+#!/usr/bin/env node
+import { createRequire } from 'node:module';
+import { Command } from 'commander';
+const require = createRequire(import.meta.url);
+const { version } = require('../../package.json');
+const program = new Command();
+program
+    .name('ez-search')
+    .description('Semantic codebase search with zero cloud dependencies')
+    .version(version);
+program
+    .command('index <path>')
+    .description('Index a directory for semantic search')
+    .option('--no-ignore', 'disable .gitignore and .cursorignore filtering')
+    .option('--type <type>', 'filter files by type: code|text|image')
+    .option('-q, --quiet', 'suppress status output')
+    .option('--clear', 'remove existing index before indexing')
+    .option('--format <mode>', 'output format: json (default) or text')
+    .addHelpText('after', `
+Examples:
+  $ ez-search index .                          Index current directory
+  $ ez-search index . --format json            Index and output JSON stats
+  $ ez-search index . --clear --type code      Re-index only code files
+  $ ez-search index src/ --no-ignore           Index src/ including gitignored files`)
+    .action(async (targetPath, options) => {
+    const { runIndex } = await import('./commands/index-cmd.js');
+    await runIndex(targetPath, options);
+});
+program
+    .command('query <text>')
+    .description('Search the index with a natural language query')
+    .option('--format <mode>', 'output format: json (default) or text')
+    .option('-k, --top-k <n>', 'number of results to return', '10')
+    .option('--dir <path>', 'scope search to a subdirectory')
+    .option('--threshold <score>', 'minimum relevance score (0-1) to include')
+    .option('--type <type>', 'search specific type only: code|text|image')
+    .option('--no-auto-index', 'disable automatic indexing when no index exists')
+    .addHelpText('after', `
+Examples:
+  $ ez-search query "authentication logic"     Semantic search (auto-indexes if needed)
+  $ ez-search query "db connections" --format json --type code --top-k 5
+  $ ez-search query "error handling" --threshold 0.5 --dir src/
+  $ ez-search query "test" --no-auto-index     Fail if no index exists`)
+    .action(async (text, options) => {
+    const { runQuery } = await import('./commands/query-cmd.js');
+    await runQuery(text, options);
+});
+program
+    .command('status')
+    .description('Show indexing status for the current directory')
+    .option('--format <mode>', 'output format: json (default) or text')
+    .option('--no-ignore', 'disable .gitignore and .cursorignore filtering')
+    .addHelpText('after', `
+Examples:
+  $ ez-search status                           Show index status as JSON
+  $ ez-search status --format text             Show human-readable summary`)
+    .action(async (options) => {
+    const { runStatus } = await import('./commands/status-cmd.js');
+    await runStatus(options);
+});
+program.parse();
+export { program };

package/dist/config/paths.js ADDED Viewed

@@ -0,0 +1,16 @@
+import * as os from 'os';
+import * as path from 'path';
+/**
+ * Resolve the storage path for a given project directory.
+ * Format: <projectDir>/.ez-search/
+ */
+export function resolveProjectStoragePath(projectDir) {
+    return path.join(path.resolve(projectDir), '.ez-search');
+}
+/**
+ * Resolve the shared model cache path.
+ * Format: ~/.ez-search/models/
+ */
+export function resolveModelCachePath() {
+    return path.join(os.homedir(), '.ez-search', 'models');
+}

package/dist/services/chunker.js ADDED Viewed

@@ -0,0 +1,96 @@
+/**
+ * Chunker service — splits source code files into token-accurate chunks with line tracking.
+ *
+ * Uses the Jina code tokenizer (BPE, RobertaTokenizer) for accurate token counting.
+ * The tokenizer must be loaded once via loadTokenizer() and reused across all chunkFile() calls.
+ *
+ * Chunk windows: 500 tokens per chunk, 50 token overlap between consecutive chunks.
+ * Line numbers are tracked via cumulative token counts per line (1-indexed).
+ *
+ * NOTE: add_special_tokens: false is intentional — the embedding pipeline adds special tokens
+ * at inference time (pooling: 'mean', normalize: true). Double-adding them would corrupt embeddings.
+ */
+import { AutoTokenizer, env } from '@huggingface/transformers';
+import { resolveModelCachePath } from '../config/paths.js';
+// ── Constants ─────────────────────────────────────────────────────────────────
+export const CHUNK_SIZE = 500; // tokens per chunk
+export const OVERLAP = 50; // token overlap between consecutive chunks
+// ── Public API ────────────────────────────────────────────────────────────────
+/**
+ * Load the Jina code tokenizer from the shared model cache.
+ * Call this once and pass the result to all chunkFile() calls.
+ */
+export async function loadTokenizer() {
+    env.cacheDir = resolveModelCachePath();
+    env.allowRemoteModels = true;
+    return AutoTokenizer.from_pretrained('jinaai/jina-embeddings-v2-base-code');
+}
+/**
+ * Split a source file into token-accurate chunks with line number tracking.
+ *
+ * Files under CHUNK_SIZE tokens produce a single chunk spanning the entire file.
+ * Larger files are split into overlapping CHUNK_SIZE-token windows with OVERLAP tokens
+ * shared between consecutive windows.
+ *
+ * @param text - Full text content of the file
+ * @param tokenizer - Pre-loaded tokenizer from loadTokenizer()
+ * @returns Array of Chunk objects with accurate line numbers and token counts
+ */
+export function chunkFile(text, tokenizer) {
+    const lines = text.split('\n');
+    // Build cumulative token count per line for O(n_lines) line-number lookup.
+    // Each line includes its trailing newline except the last, to match how the
+    // tokenizer sees the full text.
+    const cumulative = [];
+    let cum = 0;
+    for (let i = 0; i < lines.length; i++) {
+        const lineText = lines[i] + (i < lines.length - 1 ? '\n' : '');
+        const ids = tokenizer.encode(lineText, { add_special_tokens: false });
+        // encode() returns an array-like object — access .length directly (not a plain Array)
+        cum += ids.length;
+        cumulative.push(cum);
+    }
+    // Encode full text without special tokens (pipeline adds them at inference time)
+    const allIds = tokenizer.encode(text, { add_special_tokens: false });
+    const totalTokens = allIds.length;
+    // Single-chunk case: file fits within one window
+    if (totalTokens <= CHUNK_SIZE) {
+        return [{
+                text,
+                lineStart: 1,
+                lineEnd: lines.length,
+                chunkIndex: 0,
+                tokenCount: totalTokens,
+            }];
+    }
+    // Sliding window with overlap
+    const stride = CHUNK_SIZE - OVERLAP; // 450 tokens between window starts
+    const chunks = [];
+    for (let start = 0; start < totalTokens; start += stride) {
+        const end = Math.min(start + CHUNK_SIZE, totalTokens);
+        const chunkIds = Array.from(allIds).slice(start, end);
+        const chunkText = tokenizer.decode(chunkIds, { skip_special_tokens: true });
+        chunks.push({
+            text: chunkText,
+            lineStart: tokenIndexToLine(start, cumulative),
+            lineEnd: tokenIndexToLine(end - 1, cumulative),
+            chunkIndex: chunks.length,
+            tokenCount: chunkIds.length,
+        });
+        if (end === totalTokens)
+            break;
+    }
+    return chunks;
+}
+// ── Helpers ───────────────────────────────────────────────────────────────────
+/**
+ * Map a token index to a 1-indexed line number using cumulative token counts.
+ * Linear scan: returns the first line whose cumulative token count exceeds tokenIdx.
+ */
+function tokenIndexToLine(tokenIdx, cumulative) {
+    for (let i = 0; i < cumulative.length; i++) {
+        if (tokenIdx < cumulative[i])
+            return i + 1;
+    }
+    return cumulative.length;
+}

package/dist/services/file-scanner.js ADDED Viewed

@@ -0,0 +1,62 @@
+import { readFileSync, existsSync } from 'fs';
+import * as fsp from 'fs/promises';
+import * as path from 'path';
+import ignore from 'ignore';
+import { EXTENSION_MAP, BUILTIN_EXCLUSIONS, } from '../types.js';
+export async function* scanFiles(rootDir, opts) {
+    const absRoot = path.resolve(rootDir);
+    const ig = ignore();
+    // Built-in exclusions are always active
+    ig.add(BUILTIN_EXCLUSIONS);
+    if (opts.useIgnoreFiles) {
+        for (const ignoreFile of ['.gitignore', '.cursorignore']) {
+            const ignoreFilePath = path.join(absRoot, ignoreFile);
+            if (existsSync(ignoreFilePath)) {
+                const contents = readFileSync(ignoreFilePath, 'utf8');
+                ig.add(contents);
+            }
+        }
+    }
+    yield* walkDir(absRoot, absRoot, ig, opts);
+}
+async function* walkDir(dir, rootDir, ig, opts) {
+    const dirHandle = await fsp.opendir(dir);
+    for await (const entry of dirHandle) {
+        const fullPath = path.join(dir, entry.name);
+        const relPath = path.relative(rootDir, fullPath);
+        // Skip symlinks entirely
+        if (entry.isSymbolicLink()) {
+            continue;
+        }
+        if (entry.isDirectory()) {
+            // Check both with and without trailing slash (gitignore semantics)
+            if (ig.ignores(relPath + '/') || ig.ignores(relPath)) {
+                continue;
+            }
+            yield* walkDir(fullPath, rootDir, ig, opts);
+        }
+        else if (entry.isFile()) {
+            if (ig.ignores(relPath)) {
+                continue;
+            }
+            const ext = path.extname(entry.name).toLowerCase();
+            const fileType = EXTENSION_MAP[ext];
+            // Skip unknown extensions
+            if (!fileType) {
+                continue;
+            }
+            // Apply type filter if set
+            if (opts.typeFilter && fileType !== opts.typeFilter) {
+                continue;
+            }
+            const stat = await fsp.stat(fullPath);
+            yield {
+                absolutePath: fullPath,
+                relativePath: relPath,
+                type: fileType,
+                sizeBytes: stat.size,
+                mtimeMs: stat.mtimeMs,
+            };
+        }
+    }
+}