npm - sweet-search - Versions diffs - 2.5.13 → 2.6.0 - Mend

sweet-search 2.5.13 → 2.6.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (52)

package/README.md +36 -9
package/core/cli.js +41 -3
package/core/embedding/embedding-local-model.js +106 -10
package/core/embedding/embedding-service.js +59 -1
package/core/embedding/model-client.mjs +257 -0
package/core/embedding/model-server.mjs +217 -0
package/core/incremental-indexing/application/maintenance-handlers.mjs +19 -98
package/core/incremental-indexing/application/maintenance-worker.mjs +46 -9
package/core/incremental-indexing/application/operator-cli.mjs +14 -5
package/core/incremental-indexing/application/production-reconciler-helpers.mjs +40 -0
package/core/incremental-indexing/application/production-reconciler.mjs +718 -54
package/core/incremental-indexing/application/reconciler.mjs +87 -15
package/core/incremental-indexing/domain/cutoff-cache.mjs +191 -0
package/core/incremental-indexing/domain/interval-autotune.mjs +84 -1
package/core/incremental-indexing/domain/reconcile-counters.mjs +0 -4
package/core/incremental-indexing/domain/watermark-scheduler.mjs +0 -24
package/core/incremental-indexing/infrastructure/maintenance-state-reader.mjs +2 -26
package/core/incremental-indexing/infrastructure/manifest.mjs +1 -9
package/core/incremental-indexing/infrastructure/sqlite-fts5.mjs +72 -0
package/core/indexing/artifact-builder.js +1 -1
package/core/indexing/dedup/dedup-phase.js +36 -17
package/core/indexing/dedup/exemplar-selector.js +5 -0
package/core/indexing/index-codebase-v21.js +37 -14
package/core/indexing/index-maintainer.mjs +337 -6
package/core/indexing/indexer-ann.js +27 -434
package/core/indexing/indexer-build.js +30 -14
package/core/indexing/indexer-manifest.js +0 -3
package/core/indexing/indexer-phases.js +101 -25
package/core/indexing/maintainer-launcher.mjs +22 -0
package/core/indexing/maintainer-watcher.mjs +397 -0
package/core/indexing/os-priority.mjs +160 -0
package/core/indexing/rss-budget.mjs +425 -0
package/core/indexing/streaming-vectors.js +450 -0
package/core/infrastructure/config/platform.js +14 -10
package/core/infrastructure/onnx-session-utils.js +37 -0
package/core/infrastructure/sparse-gram-delta-reader.js +11 -1
package/core/ranking/late-interaction-index.js +58 -7
package/core/search/daemon-registry.js +199 -0
package/core/search/search-read-semantic.js +9 -3
package/core/search/search-semantic.js +6 -29
package/core/search/search-server.js +527 -27
package/core/search/session-daemon-prewarm.mjs +110 -1
package/core/search/sweet-search.js +0 -38
package/core/vector-store/binary-hnsw-index.js +692 -78
package/core/vector-store/index.js +1 -4
package/eval/agent-read-workflows/bin/_ss-argparse.mjs +51 -5
package/eval/agent-read-workflows/bin/_ss-helpers.mjs +95 -44
package/eval/agent-read-workflows/bin/ss-read +2 -0
package/mcp/tool-handlers.js +1 -2
package/package.json +11 -8
package/scripts/uninstall.js +2 -0
package/core/vector-store/hnsw-index.js +0 -751

package/core/indexing/indexer-ann.js CHANGED Viewed

@@ -3,25 +3,17 @@
  * Extracted from index-codebase-v21.js for file size compliance (<500 lines).
  */
-import { existsSync, openSync, fsyncSync, closeSync, writeFileSync, readFileSync, unlinkSync } from 'fs';
+import { existsSync } from 'fs';
 import path from 'path';
-import { DB_PATHS, HNSW_CONFIG, BINARY_HNSW_CONFIG } from '../infrastructure/config/index.js';
+import { DB_PATHS, BINARY_HNSW_CONFIG } from '../infrastructure/config/index.js';
 import { chunkedIn } from '../infrastructure/db-utils.js';
-import { HNSWIndex } from '../vector-store/hnsw-index.js';
 import { LateInteractionIndex } from '../ranking/late-interaction-index.js';
-import { truncateForHNSW, getEmbeddings, getModelInfo, fisherYatesShuffle } from '../embedding/embedding-service.js';
 import { buildFromCodebaseDb as buildQuantizedArtifacts, shouldSkipArtifactRebuild, updateArtifactState, ARTIFACT_THRESHOLDS } from './artifact-builder.js';
 import { log, logProgress } from './indexer-utils.js';
 import { JAVA_FAMILY } from './ast-chunker.js';
 import { isIndexAcceleratorAvailable } from './model-pool.js';
-// =============================================================================
-// DURABLE WRITE HELPERS (Phase E — fsync ordering for checkpoint safety)
-// =============================================================================
-const CHECKPOINT_INTERVAL_SEC = 30;
-const MIN_VECTORS_BETWEEN_SAVES = 1000;
 /**
  * v6.2: language-family-conditioned LI input routing.
@@ -84,38 +76,6 @@ function firstSafeRelativePath(...candidates) {
   return null;
 }
-function fsyncFile(filePath) {
-  const fd = openSync(filePath, 'r');
-  try { fsyncSync(fd); } finally { closeSync(fd); }
-}
-function fsyncDirectory(dirPath) {
-  try {
-    const fd = openSync(dirPath, 'r');
-    try { fsyncSync(fd); } finally { closeSync(fd); }
-  } catch (_err) {
-    // Directory fsync not supported on all platforms (Windows) — best effort
-  }
-}
-function writeCheckpointSidecar(sidecarPath, data) {
-  writeFileSync(sidecarPath, JSON.stringify(data, null, 2));
-}
-function readCheckpointSidecar(sidecarPath) {
-  if (!existsSync(sidecarPath)) return null;
-  try {
-    return JSON.parse(readFileSync(sidecarPath, 'utf-8'));
-  } catch (_err) { return null; }
-}
-function cleanupCheckpoint(indexPath) {
-  const checkpointPath = `${indexPath}.checkpoint`;
-  const sidecarPath = `${indexPath}.checkpoint.json`;
-  try { unlinkSync(checkpointPath); } catch (_e) { /* noop */ }
-  try { unlinkSync(sidecarPath); } catch (_e) { /* noop */ }
-}
 // =============================================================================
 // SQLITE VECTOR STREAMING (Phase B — eliminates O(n*d) in-memory arrays)
 // =============================================================================
@@ -156,68 +116,6 @@ function vectorIndexWhere(db, alias = '') {
   return `${aliasFilterSql(alias)} AND ${liveVectorSql(db, alias)}`;
 }
-function* streamVectorsFromDb(db, _dim, order = 'sequential') {
-  const vectorWhere = vectorIndexWhere(db);
-  if (order !== 'sequential') {
-    db.exec('CREATE TEMP TABLE IF NOT EXISTS hnsw_order (pos INTEGER PRIMARY KEY, vector_rowid INTEGER)');
-    db.exec('DELETE FROM hnsw_order');
-    const rowidRows = db
-      .prepare(`SELECT rowid FROM vectors WHERE ${vectorWhere} ORDER BY rowid`)
-      .all();
-    let indices = rowidRows.map((r) => r.rowid);
-    if (order === 'shuffle') {
-      fisherYatesShuffle(indices);
-    } else if (order === 'diversity') {
-      const pathRows = db
-        .prepare(`SELECT rowid, file_path FROM vectors WHERE ${vectorWhere} ORDER BY rowid`)
-        .all();
-      const filePaths = pathRows.map((r) => r.file_path);
-      const permutationPositions = diversityFirstPermutationRowids(filePaths);
-      indices = permutationPositions.map((pos) => pathRows[pos - 1]?.rowid).filter(Boolean);
-    }
-    const insertOrder = db.prepare('INSERT INTO hnsw_order (pos, vector_rowid) VALUES (?, ?)');
-    db.transaction(() => {
-      for (let pos = 0; pos < indices.length; pos++) {
-        insertOrder.run(pos, indices[pos]);
-      }
-    })();
-    const stmt = db.prepare(`
-      SELECT v.rowid as rowid, v.id, v.file_path, v.embedding, v.metadata
-      FROM hnsw_order o
-      JOIN vectors v ON v.rowid = o.vector_rowid
-      ORDER BY o.pos
-    `);
-    for (const row of stmt.iterate()) {
-      yield {
-        rowid: row.rowid,
-        id: row.id,
-        file: row.file_path,
-        embedding: new Float32Array(row.embedding.buffer, row.embedding.byteOffset, row.embedding.length / 4),
-        metadata: row.metadata ? JSON.parse(row.metadata) : {},
-      };
-    }
-    db.exec('DROP TABLE IF EXISTS temp.hnsw_order');
-  } else {
-    const stmt = db.prepare(
-      `SELECT rowid, id, file_path, embedding, metadata FROM vectors WHERE ${vectorWhere} ORDER BY rowid`,
-    );
-    for (const row of stmt.iterate()) {
-      yield {
-        rowid: row.rowid,
-        id: row.id,
-        file: row.file_path,
-        embedding: new Float32Array(row.embedding.buffer, row.embedding.byteOffset, row.embedding.length / 4),
-        metadata: row.metadata ? JSON.parse(row.metadata) : {},
-      };
-    }
-  }
-}
 /**
  * Pure decision function — should the hybrid CPU+GPU LI dispatcher arm?
  *
@@ -362,327 +260,6 @@ function buildLateInteractionBatches(chunks, options = {}) {
   return batches;
 }
-/** Diversity-first permutation returning 1-based rowid indices */
-function diversityFirstPermutationRowids(filePaths) {
-  const buckets = new Map();
-  for (let i = 0; i < filePaths.length; i++) {
-    const dir = filePaths[i] ? filePaths[i].replace(/\/[^/]+$/, '') : '_unknown';
-    if (!buckets.has(dir)) buckets.set(dir, []);
-    buckets.get(dir).push(i + 1); // 1-based rowid
-  }
-  const dirs = [...buckets.keys()];
-  fisherYatesShuffle(dirs);
-  const order = [];
-  let remaining = filePaths.length;
-  while (remaining > 0) {
-    for (const dir of dirs) {
-      const bucket = buckets.get(dir);
-      if (bucket.length > 0) { order.push(bucket.shift()); remaining--; }
-    }
-  }
-  return order;
-}
-// =============================================================================
-// INSERTION ORDER TUNING
-// =============================================================================
-// NOTE: applyInsertionOrder and diversityFirstPermutation (in-memory array permutation)
-// removed in Phase B. Insertion order is now handled via SQLite temp tables in
-// streamVectorsFromDb() and diversityFirstPermutationRowids().
-// =============================================================================
-// PHASE 3: HNSW INDEX (Incremental)
-// =============================================================================
-export async function incrementalUpdateHNSW(dbPath, changedFiles, dryRun = false) {
-  log('\n━━━ Phase 4: HNSW Index (Incremental) ━━━', 'bright');
-  if (dryRun) {
-    log('DRY RUN: Skipping HNSW incremental update', 'magenta');
-    return;
-  }
-  const modelInfo = getModelInfo();
-  const hnswDim = modelInfo.hnswDimension;
-  log('Loading existing HNSW index...', 'yellow');
-  const index = new HNSWIndex({
-    dimension: hnswDim,
-    M: HNSW_CONFIG.M,
-    efConstruction: HNSW_CONFIG.efConstruction,
-    efSearch: HNSW_CONFIG.efSearch,
-  });
-  let existingCount = 0;
-  try {
-    await index.load();
-    existingCount = index.nextKey;
-    log(`✓ Loaded existing index with ${existingCount} vectors`, 'green');
-  } catch (err) {
-    log(`No existing index found, creating new one`, 'yellow');
-    await index.init();
-  }
-  let removed = 0;
-  if (changedFiles && changedFiles.length > 0) {
-    log(`Removing entries for ${changedFiles.length} changed files...`, 'yellow');
-    const changedFileSet = new Set(changedFiles);
-    const idsToRemove = [];
-    for (const [id, metadata] of index.metadata.entries()) {
-      if (metadata.file && changedFileSet.has(metadata.file)) {
-        idsToRemove.push(id);
-      }
-    }
-    for (const id of idsToRemove) {
-      await index.remove(id);
-      removed++;
-    }
-    log(`✓ Removed ${removed} old entries`, 'green');
-  }
-  // Read new vectors for changed files from SQLite
-  const Database = (await import('better-sqlite3')).default;
-  const db = new Database(dbPath, { readonly: true });
-  const changedFileList = [...new Set(changedFiles || [])];
-  // Chunk the IN(?,?,...) clause to stay under SQLite's bound-parameter
-  // limit (default 32766, historic floor 999). Without chunking, a single
-  // indexing pass over >~32k changed files crashes with "too many SQL
-  // variables" — observed in production on CoSQA+ (51k docs) and BRIGHT
-  // (528k docs). See core/infrastructure/db-utils.js for the helper.
-  let rows = [];
-  if (changedFileList.length > 0) {
-    rows = chunkedIn(
-      db,
-      `SELECT rowid, id, file_path, embedding, metadata
-         FROM vectors
-        WHERE ${vectorIndexWhere(db)}
-          AND file_path IN (__IN_PLACEHOLDERS__)
-        ORDER BY rowid`,
-      changedFileList,
-    );
-    // Each batch is ORDER BY rowid internally, but batch boundaries break
-    // global monotonicity. The HNSW insertion loop below relies on rowid
-    // order for deterministic graph construction — re-sort explicitly.
-    rows.sort((a, b) => a.rowid - b.rowid);
-  }
-  const totalNew = rows.length;
-  log(`Adding ${totalNew} new entries...`, 'yellow');
-  let added = 0;
-  for (const row of rows) {
-    const embedding = new Float32Array(row.embedding.buffer, row.embedding.byteOffset, row.embedding.length / 4);
-    if (!embedding || embedding.length === 0) continue;
-    const truncatedEmbedding = truncateForHNSW(embedding);
-    const metadata = row.metadata ? JSON.parse(row.metadata) : {};
-    await index.add(row.id, truncatedEmbedding, {
-      file: row.file_path,
-      name: metadata?.symbol,
-      type: metadata?.chunk_type,
-    });
-    added++;
-    if (added % 500 === 0 || added === totalNew) {
-      logProgress(added, totalNew, 'Adding to HNSW');
-    }
-  }
-  db.close();
-  log('\nSaving merged HNSW index...', 'yellow');
-  await index.save();
-  const stats = index.getStats();
-  log(`✓ HNSW index saved (${stats.totalVectors} total vectors, +${added} -${removed})`, 'green');
-  log(`  Engine: ${stats.engine}, Dimension: ${hnswDim}d (Matryoshka)`, 'dim');
-}
-// =============================================================================
-// PHASE 3: HNSW INDEX (Full Rebuild)
-// =============================================================================
-export async function buildHNSWIndex(dbPath, dryRun = false) {
-  log('\n━━━ Phase 4: HNSW Index ━━━', 'bright');
-  if (dryRun) {
-    log('DRY RUN: Skipping HNSW index', 'magenta');
-    return;
-  }
-  const Database = (await import('better-sqlite3')).default;
-  const orderMode = BINARY_HNSW_CONFIG.insertionOrder || 'sequential';
-  // Non-sequential orders require temp tables → can't use readonly
-  const db = new Database(dbPath, orderMode === 'sequential' ? { readonly: true } : {});
-  const totalVectors = db
-    .prepare(`SELECT COUNT(*) as c FROM vectors WHERE ${vectorIndexWhere(db)}`)
-    .get().c;
-  if (totalVectors === 0) {
-    db.close();
-    log('No chunks to index', 'yellow');
-    return;
-  }
-  const modelInfo = getModelInfo();
-  const hnswDim = modelInfo.hnswDimension;
-  const index = new HNSWIndex({
-    dimension: hnswDim,
-    M: HNSW_CONFIG.M,
-    efConstruction: HNSW_CONFIG.efConstruction,
-    efSearch: HNSW_CONFIG.efSearch,
-    maxElements: Math.max(totalVectors * 2, HNSW_CONFIG.maxElements),
-  });
-  // Checkpoint resume is only safe with sequential order — non-sequential
-  // orders shuffle the stream so rowid is not a reliable resume boundary.
-  const canCheckpoint = orderMode === 'sequential';
-  const indexPath = DB_PATHS.hnswIndex;
-  const usearchPath = indexPath.replace('.idx', '.usearch');
-  const checkpointPath = `${usearchPath}.checkpoint`;
-  const sidecarPath = `${usearchPath}.checkpoint.json`;
-  const sidecar = canCheckpoint ? readCheckpointSidecar(sidecarPath) : null;
-  let resumeFromRowId = 0;
-  await index.init();
-  if (sidecar && existsSync(checkpointPath)) {
-    try {
-      if (index.index) {
-        // Load raw USearch graph from checkpoint
-        index.index.load(checkpointPath);
-        resumeFromRowId = sidecar.lastRowId || 0;
-        // Rebuild JS-side metadata (idMap, reverseMap, metadata, nextKey) for
-        // vectors already in the checkpoint. Without this, add() reuses keys
-        // from 0 and the final .meta.json would be incomplete.
-        const metaStmt = db.prepare(
-          `SELECT id, file_path, metadata
-             FROM vectors
-            WHERE rowid <= ? AND ${vectorIndexWhere(db)}
-            ORDER BY rowid`
-        );
-        let restoredKey = 0;
-        for (const row of metaStmt.iterate(resumeFromRowId)) {
-          const meta = row.metadata ? JSON.parse(row.metadata) : {};
-          const key = restoredKey++;
-          index.idMap.set(row.id, key);
-          index.reverseMap.set(key, row.id);
-          index.metadata.set(row.id, {
-            file: row.file_path,
-            name: meta?.symbol,
-            type: meta?.chunk_type,
-          });
-        }
-        index.nextKey = restoredKey;
-        log(`Resuming from checkpoint: ${sidecar.vectorsAdded} vectors, skipping rowid <= ${resumeFromRowId}`, 'green');
-      }
-    } catch (err) {
-      log(`Checkpoint found but could not load, starting fresh: ${err.message}`, 'yellow');
-      resumeFromRowId = 0;
-      // Reset any partial metadata restoration
-      index.idMap.clear();
-      index.reverseMap.clear();
-      index.metadata.clear();
-      index.nextKey = 0;
-    }
-  }
-  // Discard stale checkpoint from a previous non-sequential build
-  if (!canCheckpoint) {
-    cleanupCheckpoint(usearchPath);
-  }
-  log(`Building HNSW index (${modelInfo.dimension}d → ${hnswDim}d Matryoshka, M=${HNSW_CONFIG.M}, order=${orderMode})...`, 'yellow');
-  let added = resumeFromRowId > 0 ? (sidecar?.vectorsAdded || 0) : 0;
-  let lastCheckpointTime = Date.now();
-  let vectorsSinceCheckpoint = 0;
-  // try/finally guarantees the DB handle closes and stale checkpoint files
-  // get cleaned up even when the build loop throws. Without this, a failed
-  // build leaves .checkpoint + .checkpoint.json on disk and the NEXT run
-  // silently resumes from an indeterminate state (M5 fix).
-  let buildCompleted = false;
-  try {
-    for (const row of streamVectorsFromDb(db, hnswDim, orderMode)) {
-      // Skip already-checkpointed vectors on resume (only valid for sequential order)
-      if (resumeFromRowId > 0 && row.rowid <= resumeFromRowId) continue;
-      if (!row.embedding || row.embedding.length === 0) continue;
-      const truncatedEmbedding = truncateForHNSW(row.embedding);
-      await index.add(row.id, truncatedEmbedding, {
-        file: row.file,
-        name: row.metadata?.symbol,
-        type: row.metadata?.chunk_type,
-      });
-      added++;
-      vectorsSinceCheckpoint++;
-      // Time-based checkpoint: bounded data loss on crash (~30s max)
-      // Only for sequential order where rowid-based resume is valid.
-      if (canCheckpoint) {
-        const elapsed = (Date.now() - lastCheckpointTime) / 1000;
-        if (elapsed >= CHECKPOINT_INTERVAL_SEC && vectorsSinceCheckpoint >= MIN_VECTORS_BETWEEN_SAVES) {
-          if (!index.useFallback && index.index) {
-            index.index.save(checkpointPath);
-            fsyncFile(checkpointPath);
-            writeCheckpointSidecar(sidecarPath, {
-              vectorsAdded: added,
-              lastRowId: row.rowid,
-              version: row.rowid,
-              timestamp: new Date().toISOString(),
-              elapsedMs: Date.now() - lastCheckpointTime,
-            });
-            fsyncFile(sidecarPath);
-            fsyncDirectory(path.dirname(checkpointPath));
-            if (process.env.DEBUG) log(`  checkpoint: ${added}/${totalVectors} vectors`, 'dim');
-          }
-          lastCheckpointTime = Date.now();
-          vectorsSinceCheckpoint = 0;
-        }
-      }
-      if (added % 500 === 0 || added === totalVectors) {
-        logProgress(added, totalVectors, 'Building HNSW');
-      }
-    }
-    await index.save();
-    await index.clearStaleBitmap();
-    buildCompleted = true;
-    // Clean up checkpoint files after successful completion
-    cleanupCheckpoint(usearchPath);
-    const stats = index.getStats();
-    log(`\n✓ HNSW index built: ${stats.totalVectors} vectors (${hnswDim}d)`, 'green');
-    log(`  Using fallback: ${stats.useFallback}`, 'dim');
-  } finally {
-    try { db.close(); } catch (_err) { /* already closed */ }
-    if (!buildCompleted) {
-      // Build threw mid-stream. Remove stale checkpoint files so the next
-      // run starts from a known-good "no-resume" state rather than
-      // resuming against a different/new vector DB.
-      cleanupCheckpoint(usearchPath);
-    }
-  }
-}
 // =============================================================================
 // PHASE 4: LATE INTERACTION INDEX
 // =============================================================================
@@ -704,6 +281,14 @@ export async function buildLateInteractionIndex(chunks, dryRun = false, filesToR
     attentionBudget = null,
     segmentSize = null, // override SSLX-v3 segment threshold (default 10k)
     projectRoot,        // honored by LI skip policy for .sweet-search.config.json excludes
+    // Bounded-memory build (streaming path): evict each flushed segment's
+    // per-token slabs from the index's in-memory map so peak heap stays
+    // O(one segment) on huge repos. Safe only for from-scratch full rebuilds.
+    buildEvict = false,
+    // The streaming caller applies the LI skip policy once during its spill
+    // pass (where chunk content is in hand), so skip it here to avoid needing
+    // full chunk content resident a second time.
+    skipPolicyAlreadyApplied = false,
   } = options;
   log('\n━━━ Phase 3: Late Interaction Index (LateOn-Code) ━━━', 'bright');
@@ -718,7 +303,7 @@ export async function buildLateInteractionIndex(chunks, dryRun = false, filesToR
   // LI-specific check globs can't do: content-based @generated markers.
   // Disable via SWEET_SEARCH_LI_SKIP_DISABLE=1.
   let skippedSummary = null;
-  if (Array.isArray(chunks) && chunks.length > 0) {
+  if (!skipPolicyAlreadyApplied && Array.isArray(chunks) && chunks.length > 0) {
     const { applyIndexingChunkPolicy } = await import('./indexing-file-policy.js');
     const { kept, stats } = applyIndexingChunkPolicy(chunks, { projectRoot });
     if (stats.totalSkipped > 0) {
@@ -761,6 +346,7 @@ export async function buildLateInteractionIndex(chunks, dryRun = false, filesToR
     modelId: LATE_INTERACTION_CONFIG.model,
     indexPath: fullRebuild ? saveToPath : loadFromPath,
     loadExisting: !fullRebuild,
+    buildEvict: buildEvict && fullRebuild,
     ...(segmentSize ? { segmentSize } : {}),
   });
   if (quantBits !== defaultQuantBits || whtSeed !== 0) {
@@ -1091,7 +677,10 @@ export async function buildLateInteractionIndex(chunks, dryRun = false, filesToR
       const exemplarId = alias.metadata?.exemplarId;
       const clusterId = alias.metadata?.clusterId;
       if (!exemplarId || !clusterId) continue;
-      if (!liIndex.documents.has(exemplarId)) {
+      // hasDoc() (not documents.has()) so alias registration stays valid in
+      // bounded build mode, where the exemplar's per-token slab may already
+      // have been flushed to a segment and evicted from the live map.
+      if (!liIndex.hasDoc(exemplarId)) {
         orphaned++;
         continue;
       }
@@ -1148,18 +737,22 @@ export async function buildQuantizedArtifactsPhase(dryRun = false, options = {})
     const skipCheck = await shouldSkipArtifactRebuild({ changedFiles, force });
-    if (skipCheck.shouldSkip) {
-      log(`Skipping binary artifacts (only ${changedFiles} files changed, threshold is ${ARTIFACT_THRESHOLDS.skipThreshold})`, 'yellow');
-      log('  Float HNSW will serve search until next rebuild', 'dim');
-      log(`  Accumulated changes: ${skipCheck.accumulatedTotal || changedFiles}`, 'dim');
+    // usearch float HNSW was removed (commit c2a9817) — the binary HNSW is now
+    // the ONLY semantic search surface, and search dispatches to it whenever the
+    // artifact exists. So we can NO LONGER defer its rebuild on a sub-threshold
+    // change: that left vectors freshly committed to codebase.db invisible to
+    // 3-stage search until the next rebuild fired (the staleness Codex caught).
+    // Any actual change must rebuild the binary artifact to stay consistent with
+    // codebase.db; only a genuine no-op run (0 changed files) may skip. (The
+    // default daemon reconcile path maintains this per-tick via applyBinaryHNSWDelta.)
+    if (skipCheck.shouldSkip && (Number(changedFiles) || 0) === 0) {
+      log('Skipping binary artifacts: no files changed since last rebuild', 'dim');
       await updateArtifactState({
         rebuilt: false,
         changedFiles,
         previousState: skipCheck.state,
       });
-      return { binaryHnsw: null, int8: null, skipped: true, reason: skipCheck.reason };
+      return { binaryHnsw: null, int8: null, skipped: true, reason: 'no-changes' };
     }
     log('Building quantized artifacts from codebase.db...', 'yellow');

package/core/indexing/indexer-build.js CHANGED Viewed

@@ -160,8 +160,11 @@ export async function buildCodeGraph(files, dryRun = false) {
       const content = await fs.readFile(filePath, 'utf-8');
       const { entities, relationships } = await extractor.extractFromFile(files[i], content);
-      entityBatch.push(...entities);
-      relBatch.push(...relationships);
+      // Element-wise append, not push(...spread): a single generated mega-file
+      // (e.g. libsql's 250k-line SQLite amalgamation) can yield 65k+ entities,
+      // and spreading that many args into push() overflows the call stack.
+      for (let k = 0; k < entities.length; k++) entityBatch.push(entities[k]);
+      for (let k = 0; k < relationships.length; k++) relBatch.push(relationships[k]);
       processed++;
     } catch (err) {
       errors++;
@@ -424,7 +427,7 @@ function prepareVectorInsert(db) {
  * call this AFTER pipelinedEmbedAndInsert has written the exemplar rows.
  * Returns the number of alias rows inserted.
  */
-export function insertAliasVectors(db, aliases, modelInfo) {
+export function insertAliasVectors(db, aliases, modelInfo, options = {}) {
   if (!aliases || aliases.length === 0) return 0;
   const fetchExemplar = db.prepare(
@@ -443,16 +446,24 @@ export function insertAliasVectors(db, aliases, modelInfo) {
   // resolves to a live vectors row. This happens in incremental re-index
   // when a file containing an exemplar is deleted but alias files in
   // untouched paths still reference it.
-  const orphanDelete = db.prepare(`
-    DELETE FROM vectors
-    WHERE json_extract(metadata, '$.exemplarId') IS NOT NULL
-      AND json_extract(metadata, '$.exemplarId') NOT IN (
-        SELECT id FROM vectors WHERE json_extract(metadata, '$.exemplarId') IS NULL
-      )
-  `);
-  const orphansRemoved = orphanDelete.run().changes;
-  if (orphansRemoved > 0) {
-    log(`  ⚠ Purged ${orphansRemoved} orphan alias row(s) (exemplar absent)`, 'yellow');
+  //
+  // `skipOrphanPurge` is set by the streaming full-rebuild path, which calls
+  // this once per window into a FRESH temp db: there are no pre-existing rows
+  // to orphan, and the full-table json_extract scan would otherwise run once
+  // per window (O(windows × table)). A from-scratch build can never produce
+  // orphans, so skipping it is safe and keeps indexing fast.
+  if (!options.skipOrphanPurge) {
+    const orphanDelete = db.prepare(`
+      DELETE FROM vectors
+      WHERE json_extract(metadata, '$.exemplarId') IS NOT NULL
+        AND json_extract(metadata, '$.exemplarId') NOT IN (
+          SELECT id FROM vectors WHERE json_extract(metadata, '$.exemplarId') IS NULL
+        )
+    `);
+    const orphansRemoved = orphanDelete.run().changes;
+    if (orphansRemoved > 0) {
+      log(`  ⚠ Purged ${orphansRemoved} orphan alias row(s) (exemplar absent)`, 'yellow');
+    }
   }
   const items = [];
@@ -584,7 +595,12 @@ export async function pipelinedEmbedAndInsert(db, allChunks, texts, batchSize, m
     embeddingCount += batchEmbeddings.length;
     const batchItems = buildInsertItems(batchChunks, batchEmbeddings, modelInfo, batchAnnotations);
-    writeBuffer.push(...batchItems);
+    // NOT `writeBuffer.push(...batchItems)`: for local models batchSize ==
+    // texts.length, so batchItems holds the WHOLE corpus in one batch. Spreading
+    // 100k+ args into push() overflows the call stack (V8 caps spread args at
+    // ~65k-125k) and crashed indexing on large repos (swc ~133k chunks, libsql).
+    // Append element-by-element so it stays O(n) and stack-safe at any size.
+    for (let k = 0; k < batchItems.length; k++) writeBuffer.push(batchItems[k]);
     if (!useInternalProgress) {
       logProgressFn(Math.min(i + batchSize, texts.length), texts.length, 'Embedding');

package/core/indexing/indexer-manifest.js CHANGED Viewed

@@ -24,8 +24,6 @@ export function defaultIndexerManifestPaths() {
   return {
     codeGraph: basename(DB_PATHS.codeGraph),
     vectors: basename(DB_PATHS.codebase),
-    hnsw: basename(DB_PATHS.hnswIndex),
-    hnswStale: basename(DB_PATHS.hnswIndex) + '.stale.bin',
     binaryHnsw: basename(DB_PATHS.binaryHnswIndex),
     liManifest: `${liBase}.segments/manifest.json`,
     sparseBase: basename(DB_PATHS.sparseGramIndex),
@@ -45,7 +43,6 @@ export function publishIndexerManifest(options = {}) {
   const defaultTiers = {
     codeGraph: defaultManifest.codeGraph,
     vectors: defaultManifest.vectors,
-    hnsw: defaultManifest.hnsw,
     binaryHnsw: defaultManifest.binaryHnsw,
     lateInteraction: defaultManifest.lateInteraction,
     sparseGram: {