npm - sweet-search - Versions diffs - 2.5.2 → 2.5.4 - Mend

sweet-search 2.5.2 → 2.5.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (155) hide show

package/core/cli.js +24 -3
package/core/graph/graph-expansion.js +215 -36
package/core/graph/graph-extractor.js +196 -11
package/core/graph/graph-search.js +395 -92
package/core/graph/hcgs-generator.js +2 -1
package/core/graph/index.js +2 -0
package/core/graph/repo-map.js +28 -6
package/core/graph/structural-answer-cues.js +168 -0
package/core/graph/structural-callsite-hints.js +40 -0
package/core/graph/structural-context-format.js +40 -0
package/core/graph/structural-context.js +450 -0
package/core/graph/structural-forward-push.js +156 -0
package/core/graph/structural-header-context.js +19 -0
package/core/graph/structural-importance.js +148 -0
package/core/graph/structural-pagerank.js +197 -0
package/core/graph/summary-manager.js +13 -9
package/core/incremental-indexing/application/dirty-scan.mjs +236 -0
package/core/incremental-indexing/application/file-watcher.mjs +197 -0
package/core/incremental-indexing/application/maintenance-handlers.mjs +519 -0
package/core/incremental-indexing/application/maintenance-worker.mjs +380 -0
package/core/incremental-indexing/application/operator-cli.mjs +554 -0
package/core/incremental-indexing/application/production-li-delta.mjs +192 -0
package/core/incremental-indexing/application/production-reconciler-helpers.mjs +107 -0
package/core/incremental-indexing/application/production-reconciler.mjs +583 -0
package/core/incremental-indexing/application/reconciler.mjs +477 -0
package/core/incremental-indexing/application/tombstone-injector.mjs +148 -0
package/core/incremental-indexing/domain/chunk-identity.mjs +260 -0
package/core/incremental-indexing/domain/encoder-deps.mjs +193 -0
package/core/incremental-indexing/domain/encoder-input.mjs +225 -0
package/core/incremental-indexing/domain/interval-autotune.mjs +255 -0
package/core/incremental-indexing/domain/reconcile-counters.mjs +149 -0
package/core/incremental-indexing/domain/watermark-scheduler.mjs +239 -0
package/core/incremental-indexing/infrastructure/artifact-temp-sweep.mjs +163 -0
package/core/incremental-indexing/infrastructure/baseline-readiness.mjs +121 -0
package/core/incremental-indexing/infrastructure/dirty-set.mjs +233 -0
package/core/incremental-indexing/infrastructure/graph-gc.mjs +314 -0
package/core/incremental-indexing/infrastructure/hashing.mjs +298 -0
package/core/incremental-indexing/infrastructure/hcgs-invalidation.mjs +182 -0
package/core/incremental-indexing/infrastructure/li-segment-merge.mjs +278 -0
package/core/incremental-indexing/infrastructure/li-segment-state.mjs +173 -0
package/core/incremental-indexing/infrastructure/lockfile.mjs +119 -0
package/core/incremental-indexing/infrastructure/maintenance-state-reader.mjs +283 -0
package/core/incremental-indexing/infrastructure/manifest.mjs +194 -0
package/core/incremental-indexing/infrastructure/path-filter.mjs +190 -0
package/core/incremental-indexing/infrastructure/reader-heartbeat.mjs +201 -0
package/core/incremental-indexing/infrastructure/schema-migrations.mjs +257 -0
package/core/incremental-indexing/infrastructure/sparse-gram-delta.mjs +335 -0
package/core/incremental-indexing/infrastructure/sqlite-fts5.mjs +176 -0
package/core/incremental-indexing/infrastructure/staleness-display.mjs +105 -0
package/core/incremental-indexing/infrastructure/tombstone-bitmap.mjs +234 -0
package/core/incremental-indexing/infrastructure/vector-delta-writer.mjs +359 -0
package/core/incremental-indexing/infrastructure/vector-gc.mjs +133 -0
package/core/incremental-indexing/infrastructure/worktree-stamp.mjs +155 -0
package/core/incremental-indexing/infrastructure/wsl2-detect.mjs +115 -0
package/core/indexing/admission-policy.js +139 -0
package/core/indexing/artifact-builder.js +29 -12
package/core/indexing/ast-chunker.js +107 -30
package/core/indexing/dedup/exemplar-selector.js +19 -1
package/core/indexing/gitignore-filter.js +223 -0
package/core/indexing/incremental-tracker.js +99 -30
package/core/indexing/index-codebase-v21.js +6 -5
package/core/indexing/index-maintainer.mjs +698 -6
package/core/indexing/indexer-ann.js +99 -15
package/core/indexing/indexer-build.js +158 -45
package/core/indexing/indexer-empty-baseline.js +80 -0
package/core/indexing/indexer-manifest.js +66 -0
package/core/indexing/indexer-phases.js +56 -23
package/core/indexing/indexer-sparse-gram.js +54 -13
package/core/indexing/indexer-utils.js +26 -208
package/core/indexing/indexing-file-policy.js +32 -7
package/core/indexing/maintainer-launcher.mjs +137 -0
package/core/indexing/merkle-tracker.js +251 -244
package/core/indexing/model-pool.js +46 -5
package/core/infrastructure/code-graph-repository.js +758 -6
package/core/infrastructure/code-graph-visibility.js +157 -0
package/core/infrastructure/codebase-repository.js +100 -13
package/core/infrastructure/config/search.js +1 -1
package/core/infrastructure/db-utils.js +118 -0
package/core/infrastructure/dedup-hashing.js +10 -13
package/core/infrastructure/hardware-capability.js +17 -7
package/core/infrastructure/index.js +8 -2
package/core/infrastructure/language-patterns/maps.js +4 -1
package/core/infrastructure/language-patterns/registry-core.js +56 -17
package/core/infrastructure/language-patterns/registry-object-oriented.js +12 -5
package/core/infrastructure/language-patterns.js +69 -0
package/core/infrastructure/model-registry.js +20 -0
package/core/infrastructure/native-inference.js +7 -12
package/core/infrastructure/native-resolver.js +52 -37
package/core/infrastructure/native-sparse-gram.js +261 -20
package/core/infrastructure/native-tokenizer.js +6 -15
package/core/infrastructure/simd-distance.js +10 -16
package/core/infrastructure/sparse-gram-delta-reader.js +76 -0
package/core/infrastructure/structural-alias-resolver.js +122 -0
package/core/infrastructure/structural-candidate-ranker.js +34 -0
package/core/infrastructure/structural-context-repository.js +472 -0
package/core/infrastructure/structural-context-utils.js +51 -0
package/core/infrastructure/structural-graph-signals.js +121 -0
package/core/infrastructure/structural-qualified-resolution.js +15 -0
package/core/infrastructure/structural-source-definitions.js +100 -0
package/core/infrastructure/tombstone-bitmap-reader.js +139 -0
package/core/infrastructure/tree-sitter-provider.js +811 -37
package/core/prompt-optimization/data/p7-final/sweet-search-system-prompt.md +50 -0
package/core/query/query-router.js +55 -5
package/core/ranking/file-kind-ranking.js +2192 -15
package/core/ranking/late-interaction-index.js +87 -12
package/core/search/cli-decoration.js +290 -0
package/core/search/context-expander.js +988 -78
package/core/search/index.js +1 -0
package/core/search/output-policy.js +275 -0
package/core/search/search-anchor.js +499 -0
package/core/search/search-boost.js +93 -1
package/core/search/search-cli.js +61 -204
package/core/search/search-hybrid.js +250 -10
package/core/search/search-pattern-chunks.js +57 -8
package/core/search/search-pattern-planner.js +68 -9
package/core/search/search-pattern-prefilter.js +30 -10
package/core/search/search-pattern-ripgrep.js +40 -4
package/core/search/search-pattern-sparse-overlay.js +256 -0
package/core/search/search-pattern.js +117 -29
package/core/search/search-postprocess.js +479 -5
package/core/search/search-read-semantic.js +260 -23
package/core/search/search-read.js +82 -64
package/core/search/search-reader-pin.js +71 -0
package/core/search/search-rrf.js +279 -0
package/core/search/search-semantic.js +110 -5
package/core/search/search-server.js +130 -57
package/core/search/search-trace.js +107 -0
package/core/search/server-identity.js +93 -0
package/core/search/session-daemon-prewarm.mjs +33 -10
package/core/search/sweet-search.js +399 -7
package/core/skills/sweet-index/SKILL.md +8 -6
package/core/vector-store/binary-hnsw-index.js +194 -30
package/core/vector-store/float-vector-store.js +96 -6
package/core/vector-store/hnsw-index.js +220 -49
package/eval/agent-read-workflows/bin/_ss-helpers.mjs +471 -0
package/eval/agent-read-workflows/bin/ss-find +15 -0
package/eval/agent-read-workflows/bin/ss-grep +12 -0
package/eval/agent-read-workflows/bin/ss-read +14 -0
package/eval/agent-read-workflows/bin/ss-search +18 -0
package/eval/agent-read-workflows/bin/ss-semantic +12 -0
package/eval/agent-read-workflows/bin/ss-trace +11 -0
package/mcp/read-tool.js +109 -0
package/mcp/server.js +55 -15
package/mcp/tool-handlers.js +14 -124
package/mcp/trace-tool.js +81 -0
package/package.json +25 -10
package/scripts/hooks/intercept-read.mjs +55 -0
package/scripts/hooks/remind-tools.mjs +40 -0
package/scripts/init.js +698 -54
package/scripts/inject-agent-instructions.js +431 -0
package/scripts/install-prompt-reminders.js +188 -0
package/scripts/install-tool-enforcement.js +220 -0
package/scripts/smoke-test.js +12 -9
package/scripts/uninstall.js +276 -18
package/scripts/write-claude-rules.js +110 -0

package/core/indexing/indexer-ann.js CHANGED Viewed

@@ -7,12 +7,14 @@ import { existsSync, openSync, fsyncSync, closeSync, writeFileSync, readFileSync
 import path from 'path';
 import { DB_PATHS, HNSW_CONFIG, BINARY_HNSW_CONFIG } from '../infrastructure/config/index.js';
+import { chunkedIn } from '../infrastructure/db-utils.js';
 import { HNSWIndex } from '../vector-store/hnsw-index.js';
 import { LateInteractionIndex } from '../ranking/late-interaction-index.js';
 import { truncateForHNSW, getEmbeddings, getModelInfo, fisherYatesShuffle } from '../embedding/embedding-service.js';
 import { buildFromCodebaseDb as buildQuantizedArtifacts, shouldSkipArtifactRebuild, updateArtifactState, ARTIFACT_THRESHOLDS } from './artifact-builder.js';
 import { log, logProgress } from './indexer-utils.js';
 import { JAVA_FAMILY } from './ast-chunker.js';
+import { isIndexAcceleratorAvailable } from './model-pool.js';
 // =============================================================================
 // DURABLE WRITE HELPERS (Phase E — fsync ordering for checkpoint safety)
@@ -60,6 +62,28 @@ export function pickLiInput(chunk) {
   return chunk.li_greedy_text || chunk.embedding_text || chunk.li_text || chunk.text || chunk.content || '';
 }
+function chunkFilePath(chunk) {
+  return firstSafeRelativePath(
+    chunk?.metadata?.relative_path,
+    chunk?.metadata?.path,
+    chunk?.metadata?.file_path,
+    chunk?.file,
+    chunk?.metadata?.file,
+  ) || '';
+}
+function firstSafeRelativePath(...candidates) {
+  for (const candidate of candidates) {
+    if (typeof candidate !== 'string') continue;
+    const normalized = candidate.replace(/\\/g, '/').replace(/^\.\//, '').replace(/\/+/g, '/');
+    if (!normalized || normalized === '.' || normalized.startsWith('/')) continue;
+    if (/^[A-Za-z]:\//.test(normalized)) continue;
+    if (normalized === '..' || normalized.startsWith('../') || normalized.includes('/../')) continue;
+    return normalized;
+  }
+  return null;
+}
 function fsyncFile(filePath) {
   const fd = openSync(filePath, 'r');
   try { fsyncSync(fd); } finally { closeSync(fd); }
@@ -108,13 +132,38 @@ function cleanupCheckpoint(indexPath) {
 // list at the exemplar's rank position.
 const ALIAS_FILTER_SQL = "json_extract(metadata, '$.exemplarId') IS NULL";
+function hasVectorColumn(db, column) {
+  try {
+    return db.prepare('PRAGMA table_info(vectors)').all().some((col) => col.name === column);
+  } catch (_err) {
+    return false;
+  }
+}
+function aliasFilterSql(alias = '') {
+  if (!alias) return ALIAS_FILTER_SQL;
+  const prefix = alias ? `${alias}.` : '';
+  return `json_extract(${prefix}metadata, '$.exemplarId') IS NULL`;
+}
+function liveVectorSql(db, alias = '') {
+  if (!hasVectorColumn(db, 'epoch_retired')) return '1=1';
+  const prefix = alias ? `${alias}.` : '';
+  return `${prefix}epoch_retired IS NULL`;
+}
+function vectorIndexWhere(db, alias = '') {
+  return `${aliasFilterSql(alias)} AND ${liveVectorSql(db, alias)}`;
+}
 function* streamVectorsFromDb(db, _dim, order = 'sequential') {
+  const vectorWhere = vectorIndexWhere(db);
   if (order !== 'sequential') {
     db.exec('CREATE TEMP TABLE IF NOT EXISTS hnsw_order (pos INTEGER PRIMARY KEY, vector_rowid INTEGER)');
     db.exec('DELETE FROM hnsw_order');
     const rowidRows = db
-      .prepare(`SELECT rowid FROM vectors WHERE ${ALIAS_FILTER_SQL} ORDER BY rowid`)
+      .prepare(`SELECT rowid FROM vectors WHERE ${vectorWhere} ORDER BY rowid`)
       .all();
     let indices = rowidRows.map((r) => r.rowid);
@@ -122,7 +171,7 @@ function* streamVectorsFromDb(db, _dim, order = 'sequential') {
       fisherYatesShuffle(indices);
     } else if (order === 'diversity') {
       const pathRows = db
-        .prepare(`SELECT rowid, file_path FROM vectors WHERE ${ALIAS_FILTER_SQL} ORDER BY rowid`)
+        .prepare(`SELECT rowid, file_path FROM vectors WHERE ${vectorWhere} ORDER BY rowid`)
         .all();
       const filePaths = pathRows.map((r) => r.file_path);
       const permutationPositions = diversityFirstPermutationRowids(filePaths);
@@ -155,7 +204,7 @@ function* streamVectorsFromDb(db, _dim, order = 'sequential') {
     db.exec('DROP TABLE IF EXISTS temp.hnsw_order');
   } else {
     const stmt = db.prepare(
-      `SELECT rowid, id, file_path, embedding, metadata FROM vectors WHERE ${ALIAS_FILTER_SQL} ORDER BY rowid`,
+      `SELECT rowid, id, file_path, embedding, metadata FROM vectors WHERE ${vectorWhere} ORDER BY rowid`,
     );
     for (const row of stmt.iterate()) {
       yield {
@@ -192,12 +241,16 @@ function* streamVectorsFromDb(db, _dim, order = 'sequential') {
 export function decideHybridDispatcher({
   env = process.env,
   parallelLateInteraction = false,
+  acceleratorAvailable = true,
 } = {}) {
   const hybridEnv = (env.SWEET_SEARCH_LI_HYBRID ?? '').trim().toLowerCase();
   const hybridEnabled = hybridEnv === '1' || hybridEnv === 'true' || hybridEnv === 'on';
   if (!hybridEnabled) {
     return { armed: false, reason: 'not-enabled' };
   }
+  if (!acceleratorAvailable) {
+    return { armed: false, reason: 'no-accelerator' };
+  }
   // SWEET_SEARCH_LI_USE_CPU implies single-encoder CPU path — skip the
   // bidirectional cursor (which would still try to use the GPU encoder).
   if (env.SWEET_SEARCH_LI_USE_CPU === '1') {
@@ -395,14 +448,29 @@ export async function incrementalUpdateHNSW(dbPath, changedFiles, dryRun = false
   const Database = (await import('better-sqlite3')).default;
   const db = new Database(dbPath, { readonly: true });
-  const changedFileSet = new Set(changedFiles || []);
-  const placeholders = [...changedFileSet].map(() => '?').join(',');
-  const stmt = changedFileSet.size > 0
-    ? db.prepare(`SELECT rowid, id, file_path, embedding, metadata FROM vectors WHERE ${ALIAS_FILTER_SQL} AND file_path IN (${placeholders}) ORDER BY rowid`)
-    : db.prepare(`SELECT rowid, id, file_path, embedding, metadata FROM vectors WHERE ${ALIAS_FILTER_SQL} ORDER BY rowid`);
-  const rows = changedFileSet.size > 0 ? stmt.all(...changedFileSet) : [];
-  const totalNew = changedFileSet.size > 0 ? rows.length : 0;
+  const changedFileList = [...new Set(changedFiles || [])];
+  // Chunk the IN(?,?,...) clause to stay under SQLite's bound-parameter
+  // limit (default 32766, historic floor 999). Without chunking, a single
+  // indexing pass over >~32k changed files crashes with "too many SQL
+  // variables" — observed in production on CoSQA+ (51k docs) and BRIGHT
+  // (528k docs). See core/infrastructure/db-utils.js for the helper.
+  let rows = [];
+  if (changedFileList.length > 0) {
+    rows = chunkedIn(
+      db,
+      `SELECT rowid, id, file_path, embedding, metadata
+         FROM vectors
+        WHERE ${vectorIndexWhere(db)}
+          AND file_path IN (__IN_PLACEHOLDERS__)
+        ORDER BY rowid`,
+      changedFileList,
+    );
+    // Each batch is ORDER BY rowid internally, but batch boundaries break
+    // global monotonicity. The HNSW insertion loop below relies on rowid
+    // order for deterministic graph construction — re-sort explicitly.
+    rows.sort((a, b) => a.rowid - b.rowid);
+  }
+  const totalNew = rows.length;
   log(`Adding ${totalNew} new entries...`, 'yellow');
   let added = 0;
@@ -455,7 +523,7 @@ export async function buildHNSWIndex(dbPath, dryRun = false) {
   const db = new Database(dbPath, orderMode === 'sequential' ? { readonly: true } : {});
   const totalVectors = db
-    .prepare(`SELECT COUNT(*) as c FROM vectors WHERE ${ALIAS_FILTER_SQL}`)
+    .prepare(`SELECT COUNT(*) as c FROM vectors WHERE ${vectorIndexWhere(db)}`)
     .get().c;
   if (totalVectors === 0) {
     db.close();
@@ -499,7 +567,10 @@ export async function buildHNSWIndex(dbPath, dryRun = false) {
         // vectors already in the checkpoint. Without this, add() reuses keys
         // from 0 and the final .meta.json would be incomplete.
         const metaStmt = db.prepare(
-          'SELECT id, file_path, metadata FROM vectors WHERE rowid <= ? ORDER BY rowid'
+          `SELECT id, file_path, metadata
+             FROM vectors
+            WHERE rowid <= ? AND ${vectorIndexWhere(db)}
+            ORDER BY rowid`
         );
         let restoredKey = 0;
         for (const row of metaStmt.iterate(resumeFromRowId)) {
@@ -592,6 +663,7 @@ export async function buildHNSWIndex(dbPath, dryRun = false) {
     }
     await index.save();
+    await index.clearStaleBitmap();
     buildCompleted = true;
     // Clean up checkpoint files after successful completion
@@ -830,6 +902,7 @@ export async function buildLateInteractionIndex(chunks, dryRun = false, filesToR
     const hybridDecision = decideHybridDispatcher({
       env: process.env,
       parallelLateInteraction: EMBEDDING_CONFIG.parallelLateInteraction === true,
+      acceleratorAvailable: isIndexAcceleratorAvailable(),
     });
     if (!hybridDecision.armed && hybridDecision.reason === 'metal-contended-by-embed') {
       log(
@@ -837,6 +910,11 @@ export async function buildLateInteractionIndex(chunks, dryRun = false, filesToR
         + 'OR SWEET_SEARCH_EMBED_USE_CPU=1 (Metal queue is shared with parallel embed phase)',
         'yellow'
       );
+    } else if (!hybridDecision.armed && hybridDecision.reason === 'no-accelerator') {
+      log(
+        'LateInteraction hybrid: ignored — no inference accelerator detected; using ORT CPU',
+        'yellow'
+      );
     }
     const hybridDisabled = !hybridDecision.armed;
@@ -913,7 +991,7 @@ export async function buildLateInteractionIndex(chunks, dryRun = false, filesToR
         const tokens = tokenArrays[j];
         if (tokens && tokens.length > 0) {
           await liIndex.add(chunk.id, tokens, {
-            file: chunk.file,
+            file: chunkFilePath(chunk),
             name: chunk.metadata?.symbol,
             type: chunk.metadata?.chunk_type,
             startLine: chunk.metadata?.line_start || null,
@@ -1018,7 +1096,7 @@ export async function buildLateInteractionIndex(chunks, dryRun = false, filesToR
         continue;
       }
       liIndex.addAlias(alias.id, exemplarId, clusterId, {
-        file: alias.file,
+        file: chunkFilePath(alias),
         name: alias.metadata?.symbol,
         type: alias.metadata?.chunk_type,
         startLine: alias.metadata?.line_start || null,
@@ -1042,6 +1120,12 @@ export async function buildLateInteractionIndex(chunks, dryRun = false, filesToR
   return { ...liStats, added: totalAdded, removed, saveToPath };
 }
+export const __TEST__ = {
+  chunkFilePath,
+  vectorIndexWhere,
+  liveVectorSql,
+};
 // =============================================================================
 // PHASE 5: BINARY HNSW + INT8 QUANTIZED ARTIFACTS
 // =============================================================================

package/core/indexing/indexer-build.js CHANGED Viewed

@@ -11,8 +11,12 @@ import path from 'path';
 import { DB_PATHS, EMBEDDING_CONFIG, PROJECT_ROOT } from '../infrastructure/config/index.js';
 import { GraphExtractor, createGraphSchema, insertGraph } from '../graph/graph-extractor.js';
 import { resolveRelationshipTargets } from '../graph/relationship-resolver.js';
+import { populatePageRankColumn } from '../graph/structural-pagerank.js';
 import { getEmbeddings, getModelInfo } from '../embedding/embedding-service.js';
 import { configureJournalMode, checkpointWal, atomicSwapDatabase, log, logProgress } from './indexer-utils.js';
+import { assignStructuralIds } from '../incremental-indexing/domain/chunk-identity.mjs';
+import { chunkInputHashes } from '../incremental-indexing/domain/encoder-input.mjs';
+import { migrateVectorsSchema } from '../incremental-indexing/infrastructure/schema-migrations.mjs';
 // =============================================================================
 // CHUNK ENRICHMENT — scope chains + imports from code-graph.db
@@ -61,7 +65,7 @@ async function enrichChunksFromGraph(chunks, ASTChunker) {
     let enriched = 0;
     for (const chunk of chunks) {
-      const filePath = chunk.file || chunk.metadata?.path;
+      const filePath = chunkFilePath(chunk);
       if (!filePath) continue;
       // Only enrich chunks with a known symbol (skip generic 'unknown' text chunks)
@@ -187,6 +191,14 @@ export async function buildCodeGraph(files, dryRun = false) {
   log('Resolving relationship targets...', 'yellow');
   const resolutionStats = resolveRelationshipTargets(db);
+  log('Computing entity PageRank for structural ranking...', 'yellow');
+  try {
+    const prStats = populatePageRankColumn(db);
+    log(`✓ PageRank populated: ${prStats.written}/${prStats.entities} entities in ${prStats.ms}ms`, 'green');
+  } catch (err) {
+    log(`⚠ PageRank population failed (non-fatal): ${err.message}`, 'yellow');
+  }
   // Update query planner statistics before closing (SQLite 3.46+).
   // Best-effort only; failure should not strand the temp DB handle.
   closeWithOptimize(db, 'code graph build');
@@ -224,6 +236,7 @@ export function createVectorSchema(db) {
   `);
   db.exec('CREATE INDEX IF NOT EXISTS idx_vectors_session ON vectors(session_id)');
   db.exec('CREATE INDEX IF NOT EXISTS idx_vectors_file_path ON vectors(file_path)');
+  migrateVectorsSchema(db);
 }
 export function ensureVectorSchema(db) {
@@ -254,25 +267,30 @@ export function ensureVectorSchema(db) {
     db.exec('CREATE INDEX IF NOT EXISTS idx_vectors_file_path ON vectors(file_path)');
     log('  Schema migration complete', 'dim');
   }
+  migrateVectorsSchema(db);
 }
-export function buildInsertItems(chunks, embeddings, modelInfo) {
+export function buildInsertItems(chunks, embeddings, modelInfo, annotations = null, options = {}) {
   const items = [];
+  const chunkAnnotations = annotations || annotateChunksForVectorInsert(chunks);
+  const epochWritten = Number.isInteger(options.epochWritten) ? options.epochWritten : 0;
   for (let i = 0; i < chunks.length; i++) {
     const chunk = chunks[i];
     const embedding = embeddings[i];
     if (!embedding || embedding.length === 0) continue;
+    const ann = chunkAnnotations[i];
+    const filePath = chunkFilePath(chunk);
     items.push({
       id: chunk.id,
-      filePath: chunk.file,
+      filePath,
       embeddingBlob: embedding instanceof Float32Array
         ? Buffer.from(embedding.buffer, embedding.byteOffset, embedding.byteLength)
         : Buffer.from(new Float32Array(embedding).buffer),
       text: (chunk.text || chunk.content || '').slice(0, 2000),
       metadata: JSON.stringify({
-        file: chunk.file,
+        file: filePath,
         type: chunk.metadata?.chunk_type || 'code',
         name: chunk.metadata?.symbol || null,
         startLine: chunk.metadata?.line_start || null,
@@ -289,11 +307,117 @@ export function buildInsertItems(chunks, embeddings, modelInfo) {
       sessionId: `codebase-v22-${modelInfo.provider}`,
       tags: JSON.stringify(['codebase', chunk.metadata?.language || 'unknown']),
       createdAt: new Date().toISOString(),
+      chunkStructId: ann?.chunkStructId || '',
+      chunkTextHash: ann?.hashes?.chunk_text_hash || '',
+      embeddingInputHash: ann?.hashes?.embedding_input_hash || '',
+      liInputHash: ann?.hashes?.li_input_hash || '',
+      metadataFingerprint: ann?.hashes?.metadata_fingerprint || '',
+      logicalChunkId: ann?.chunkStructId || chunk.id,
+      epochWritten,
+      epochRetired: null,
     });
   }
   return items;
 }
+function chunkFilePath(chunk) {
+  return firstSafeRelativePath(
+    chunk?.metadata?.relative_path,
+    chunk?.metadata?.path,
+    chunk?.metadata?.file_path,
+    chunk?.file,
+    chunk?.metadata?.file,
+  ) || '';
+}
+function firstSafeRelativePath(...candidates) {
+  for (const candidate of candidates) {
+    if (typeof candidate !== 'string') continue;
+    const normalized = candidate.replace(/\\/g, '/').replace(/^\.\//, '').replace(/\/+/g, '/');
+    if (!normalized || normalized === '.' || normalized.startsWith('/')) continue;
+    if (/^[A-Za-z]:\//.test(normalized)) continue;
+    if (normalized === '..' || normalized.startsWith('../') || normalized.includes('/../')) continue;
+    return normalized;
+  }
+  return null;
+}
+function annotateChunksForVectorInsert(chunks) {
+  const annotations = new Array(chunks.length);
+  const byFile = new Map();
+  for (let i = 0; i < chunks.length; i++) {
+    const filePath = chunkFilePath(chunks[i]);
+    if (!byFile.has(filePath)) byFile.set(filePath, []);
+    byFile.get(filePath).push(i);
+  }
+  for (const [filePath, indices] of byFile.entries()) {
+    const fileChunks = indices.map((idx) => chunks[idx]);
+    const ids = assignStructuralIds(fileChunks, filePath);
+    for (let i = 0; i < indices.length; i++) {
+      const idx = indices[i];
+      annotations[idx] = {
+        ...ids[i],
+        hashes: chunkInputHashes(chunks[idx]),
+      };
+    }
+  }
+  return annotations;
+}
+function vectorInsertColumns(db) {
+  const columns = new Set(db.prepare('PRAGMA table_info(vectors)').all().map((c) => c.name));
+  return [
+    'id',
+    'file_path',
+    'embedding',
+    'text',
+    'metadata',
+    'session_id',
+    'tags',
+    'created_at',
+    'chunk_struct_id',
+    'chunk_text_hash',
+    'embedding_input_hash',
+    'li_input_hash',
+    'metadata_fingerprint',
+    'logical_chunk_id',
+    'epoch_written',
+    'epoch_retired',
+  ].filter((column) => columns.has(column));
+}
+function vectorInsertValue(item, column) {
+  switch (column) {
+    case 'id': return item.id;
+    case 'file_path': return item.filePath;
+    case 'embedding': return item.embeddingBlob;
+    case 'text': return item.text;
+    case 'metadata': return item.metadata;
+    case 'session_id': return item.sessionId;
+    case 'tags': return item.tags;
+    case 'created_at': return item.createdAt;
+    case 'chunk_struct_id': return item.chunkStructId ?? '';
+    case 'chunk_text_hash': return item.chunkTextHash ?? '';
+    case 'embedding_input_hash': return item.embeddingInputHash ?? '';
+    case 'li_input_hash': return item.liInputHash ?? '';
+    case 'metadata_fingerprint': return item.metadataFingerprint ?? '';
+    case 'logical_chunk_id': return item.logicalChunkId ?? item.chunkStructId ?? item.id;
+    case 'epoch_written': return item.epochWritten ?? 0;
+    case 'epoch_retired': return item.epochRetired ?? null;
+    default: return item[column];
+  }
+}
+function prepareVectorInsert(db) {
+  const columns = vectorInsertColumns(db);
+  const quoted = columns.map((column) => `"${column}"`).join(', ');
+  const placeholders = columns.map(() => '?').join(', ');
+  return {
+    columns,
+    stmt: db.prepare(`INSERT OR REPLACE INTO vectors (${quoted}) VALUES (${placeholders})`),
+  };
+}
 /**
  * Insert alias rows that reuse their exemplar's embedding instead of running
  * the embedding model. The exemplar must already be in the `vectors` table;
@@ -307,23 +431,11 @@ export function insertAliasVectors(db, aliases, modelInfo) {
     'SELECT embedding, metadata FROM vectors WHERE id = ?'
   );
-  const stmt = db.prepare(`
-    INSERT OR REPLACE INTO vectors (id, file_path, embedding, text, metadata, session_id, tags, created_at)
-    VALUES (?, ?, ?, ?, ?, ?, ?, ?)
-  `);
+  const { stmt, columns } = prepareVectorInsert(db);
   const insertBatch = db.transaction((items) => {
     for (const item of items) {
-      stmt.run(
-        item.id,
-        item.filePath,
-        item.embeddingBlob,
-        item.text,
-        item.metadata,
-        item.sessionId,
-        item.tags,
-        item.createdAt,
-      );
+      stmt.run(...columns.map((column) => vectorInsertValue(item, column)));
     }
   });
@@ -344,11 +456,13 @@ export function insertAliasVectors(db, aliases, modelInfo) {
   }
   const items = [];
+  const annotations = annotateChunksForVectorInsert(aliases);
   const nowIso = new Date().toISOString();
   let missing = 0;
   let dimension = null;
-  for (const alias of aliases) {
+  for (let i = 0; i < aliases.length; i++) {
+    const alias = aliases[i];
     const exemplarId = alias.metadata?.exemplarId;
     if (!exemplarId) continue;
     const row = fetchExemplar.get(exemplarId);
@@ -359,14 +473,16 @@ export function insertAliasVectors(db, aliases, modelInfo) {
     if (dimension === null) {
       dimension = Math.floor(row.embedding.length / 4);
     }
+    const ann = annotations[i];
+    const filePath = chunkFilePath(alias);
     items.push({
       id: alias.id,
-      filePath: alias.file,
+      filePath,
       embeddingBlob: row.embedding, // copy exemplar's Float32 BLOB verbatim
       text: (alias.text || alias.content || '').slice(0, 2000),
       metadata: JSON.stringify({
-        file: alias.file,
+        file: filePath,
         type: alias.metadata?.chunk_type || 'code',
         name: alias.metadata?.symbol || null,
         startLine: alias.metadata?.line_start || null,
@@ -382,6 +498,14 @@ export function insertAliasVectors(db, aliases, modelInfo) {
       sessionId: `codebase-v22-${modelInfo.provider}`,
       tags: JSON.stringify(['codebase', alias.metadata?.language || 'unknown']),
       createdAt: nowIso,
+      chunkStructId: ann?.chunkStructId || '',
+      chunkTextHash: ann?.hashes?.chunk_text_hash || '',
+      embeddingInputHash: ann?.hashes?.embedding_input_hash || '',
+      liInputHash: ann?.hashes?.li_input_hash || '',
+      metadataFingerprint: ann?.hashes?.metadata_fingerprint || '',
+      logicalChunkId: ann?.chunkStructId || alias.id,
+      epochWritten: 0,
+      epochRetired: null,
     });
   }
@@ -397,48 +521,36 @@ export function insertAliasVectors(db, aliases, modelInfo) {
   return items.length;
 }
-export function insertVectors(db, chunks, embeddings, modelInfo) {
+export function insertVectorItems(db, items) {
   const BATCH_INSERT_SIZE = 2000;
-  const stmt = db.prepare(`
-    INSERT OR REPLACE INTO vectors (id, file_path, embedding, text, metadata, session_id, tags, created_at)
-    VALUES (?, ?, ?, ?, ?, ?, ?, ?)
-  `);
+  const { stmt, columns } = prepareVectorInsert(db);
   const insertBatch = db.transaction((items) => {
     for (const item of items) {
-      stmt.run(
-        item.id,
-        item.filePath,
-        item.embeddingBlob,
-        item.text,
-        item.metadata,
-        item.sessionId,
-        item.tags,
-        item.createdAt
-      );
+      stmt.run(...columns.map((column) => vectorInsertValue(item, column)));
     }
   });
-  const items = buildInsertItems(chunks, embeddings, modelInfo);
   for (let i = 0; i < items.length; i += BATCH_INSERT_SIZE) {
     insertBatch(items.slice(i, i + BATCH_INSERT_SIZE));
   }
 }
+export function insertVectors(db, chunks, embeddings, modelInfo, annotations = null, options = {}) {
+  insertVectorItems(db, buildInsertItems(chunks, embeddings, modelInfo, annotations, options));
+}
 export async function pipelinedEmbedAndInsert(db, allChunks, texts, batchSize, modelInfo, logProgressFn, embeddingOptions = {}, logFn, writeFlushRows = 128) {
   let writeBuffer = [];
   let embeddingCount = 0;
+  const allAnnotations = annotateChunksForVectorInsert(allChunks);
-  const stmt = db.prepare(`
-    INSERT OR REPLACE INTO vectors (id, file_path, embedding, text, metadata, session_id, tags, created_at)
-    VALUES (?, ?, ?, ?, ?, ?, ?, ?)
-  `);
+  const { stmt, columns } = prepareVectorInsert(db);
   const insertBatch = db.transaction((items) => {
     for (const item of items) {
-      stmt.run(item.id, item.filePath, item.embeddingBlob, item.text, item.metadata, item.sessionId, item.tags, item.createdAt);
+      stmt.run(...columns.map((column) => vectorInsertValue(item, column)));
     }
   });
@@ -458,6 +570,7 @@ export async function pipelinedEmbedAndInsert(db, allChunks, texts, batchSize, m
   for (let i = 0; i < texts.length; i += batchSize) {
     const batch = texts.slice(i, i + batchSize);
     const batchChunks = allChunks.slice(i, i + batchSize);
+    const batchAnnotations = allAnnotations.slice(i, i + batchSize);
     // Overlap: flush accumulated writes while embedding is in-flight
     const batchResultsPromise = getEmbeddings(batch, progressOptions);
@@ -470,7 +583,7 @@ export async function pipelinedEmbedAndInsert(db, allChunks, texts, batchSize, m
     const batchEmbeddings = batchResults.map(r => r.embedding);
     embeddingCount += batchEmbeddings.length;
-    const batchItems = buildInsertItems(batchChunks, batchEmbeddings, modelInfo);
+    const batchItems = buildInsertItems(batchChunks, batchEmbeddings, modelInfo, batchAnnotations);
     writeBuffer.push(...batchItems);
     if (!useInternalProgress) {
@@ -549,7 +662,7 @@ export async function chunkFiles(files) {
     if (chunk.embedding_text) {
       return chunk.embedding_text.slice(0, _embCap);
     }
-    return `${chunk.file} ${chunk.metadata?.symbol || ''}\n${(chunk.text || chunk.content || '').slice(0, 1500)}`;
+    return `${chunkFilePath(chunk)} ${chunk.metadata?.symbol || ''}\n${(chunk.text || chunk.content || '').slice(0, 1500)}`;
   });
   return { allChunks, texts };

package/core/indexing/indexer-empty-baseline.js ADDED Viewed

@@ -0,0 +1,80 @@
+/**
+ * Establish a valid *empty* index baseline.
+ *
+ * A full or incremental index run over a repository with no indexable files
+ * used to early-exit without creating anything, leaving search to throw
+ * "No search indexes found" and giving the default-on reconcile maintainer no
+ * baseline to grow from. This helper instead writes a coherent zero-row
+ * baseline:
+ *
+ *   - codebase.db            vector schema, 0 rows
+ *   - code-graph.db          graph schema, 0 rows
+ *   - merkle-state.json      0 files  — so the maintainer's dirty-scan treats
+ *                                       the first created file as new
+ *   - reconcile-manifest.json         — so readers pin a real epoch
+ *
+ * With the baseline in place, search returns empty results cleanly (the
+ * graph+codebase existence check in SweetSearch.init passes; the tables are
+ * simply empty) and the reconcile maintainer can transition the repo from zero
+ * files to one file without a prior full index.
+ *
+ * The schema builders are the same ones the production reconciler uses when it
+ * lazily creates these DBs (createVectorSchema / createGraphSchema), so a
+ * baseline written here is byte-for-byte compatible with later incremental
+ * deltas (epoch columns, FTS5, indexes).
+ */
+import Database from 'better-sqlite3';
+import { existsSync, mkdirSync } from 'node:fs';
+import path from 'node:path';
+import { DB_PATHS } from '../infrastructure/config/index.js';
+import { createVectorSchema } from './indexer-build.js';
+import { createGraphSchema } from '../graph/graph-extractor.js';
+import { publishIndexerManifest } from './indexer-manifest.js';
+import { updateState } from './incremental-tracker.js';
+import { log } from './indexer-utils.js';
+/**
+ * Create `dbPath` with `createSchema` only when it does not already exist.
+ * Returns true when a fresh DB was created, false when one was already present.
+ *
+ * @param {string} dbPath - Filesystem path of the database file to create.
+ * @param {Function} createSchema - Called once with the freshly opened database
+ *   handle; its return value is ignored.
+ * @returns {boolean} true when the file was absent and the schema builder ran,
+ *   false when a file already existed at `dbPath`.
+ *
+ * NOTE(review): file existence is the only check performed, so a file left
+ * behind by a `createSchema` call that threw would be reported as "present" on
+ * the next run and its schema would not be retried — confirm this is acceptable.
+ */
+function ensureSchema(dbPath, createSchema) {
+  if (existsSync(dbPath)) return false; // an existing file is never opened or altered here
+  mkdirSync(path.dirname(dbPath), { recursive: true }); // make sure the parent directory exists before opening
+  const db = new Database(dbPath);
+  try {
+    createSchema(db);
+  } finally {
+    db.close(); // always release the handle, even when schema creation throws
+  }
+  return true;
+}
+/**
+ * Write the empty baseline for a genuinely un-indexed empty repo.
+ *
+ * No-op when `merkle-state.json` already exists: a prior index ran, so an empty
+ * working tree means the repo BECAME empty (every tracked file deleted). In that
+ * case the existing merkle must be preserved so the maintainer's deletion
+ * detection (dirty-scan: merkle-known vs on-disk) retires the now-stale rows —
+ * overwriting it with an empty file set here would erase that knowledge and
+ * strand the stale rows in codebase.db / code-graph.db forever.
+ *
+ * Otherwise each database file is created only if it is missing (see
+ * `ensureSchema`); a database that already exists is left as-is and logged as
+ * "present". The state update and manifest publish then run unconditionally.
+ *
+ * Return fields: `createdCodebase` / `createdGraph` tell whether this call
+ * created the respective DB file; `skipped: true` is set only on the no-op path.
+ *
+ * @returns {Promise<{createdCodebase:boolean, createdGraph:boolean, skipped?:boolean}>}
+ */
+export async function establishEmptyBaseline() {
+  if (existsSync(DB_PATHS.merkle)) {
+    return { createdCodebase: false, createdGraph: false, skipped: true };
+  }
+  const createdCodebase = ensureSchema(DB_PATHS.codebase, createVectorSchema);
+  const createdGraph = ensureSchema(DB_PATHS.codeGraph, createGraphSchema);
+  await updateState({}, { totalChunks: 0, entities: 0, relationships: 0 }); // empty file set with zeroed counters; NOTE(review): presumably persists the merkle state file — confirm in incremental-tracker.js
+  publishIndexerManifest({}); // NOTE(review): presumably writes the reconcile manifest — confirm in indexer-manifest.js
+  log(
+    `Established empty index baseline (0 files; codebase.db ${createdCodebase ? 'created' : 'present'}, `
+    + `code-graph.db ${createdGraph ? 'created' : 'present'})`,
+    'green',
+  );
+  return { createdCodebase, createdGraph };
+}