sweet-search 2.5.2 → 2.5.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (155) hide show
  1. package/core/cli.js +24 -3
  2. package/core/graph/graph-expansion.js +215 -36
  3. package/core/graph/graph-extractor.js +196 -11
  4. package/core/graph/graph-search.js +395 -92
  5. package/core/graph/hcgs-generator.js +2 -1
  6. package/core/graph/index.js +2 -0
  7. package/core/graph/repo-map.js +28 -6
  8. package/core/graph/structural-answer-cues.js +168 -0
  9. package/core/graph/structural-callsite-hints.js +40 -0
  10. package/core/graph/structural-context-format.js +40 -0
  11. package/core/graph/structural-context.js +450 -0
  12. package/core/graph/structural-forward-push.js +156 -0
  13. package/core/graph/structural-header-context.js +19 -0
  14. package/core/graph/structural-importance.js +148 -0
  15. package/core/graph/structural-pagerank.js +197 -0
  16. package/core/graph/summary-manager.js +13 -9
  17. package/core/incremental-indexing/application/dirty-scan.mjs +236 -0
  18. package/core/incremental-indexing/application/file-watcher.mjs +197 -0
  19. package/core/incremental-indexing/application/maintenance-handlers.mjs +519 -0
  20. package/core/incremental-indexing/application/maintenance-worker.mjs +380 -0
  21. package/core/incremental-indexing/application/operator-cli.mjs +554 -0
  22. package/core/incremental-indexing/application/production-li-delta.mjs +192 -0
  23. package/core/incremental-indexing/application/production-reconciler-helpers.mjs +107 -0
  24. package/core/incremental-indexing/application/production-reconciler.mjs +583 -0
  25. package/core/incremental-indexing/application/reconciler.mjs +477 -0
  26. package/core/incremental-indexing/application/tombstone-injector.mjs +148 -0
  27. package/core/incremental-indexing/domain/chunk-identity.mjs +260 -0
  28. package/core/incremental-indexing/domain/encoder-deps.mjs +193 -0
  29. package/core/incremental-indexing/domain/encoder-input.mjs +225 -0
  30. package/core/incremental-indexing/domain/interval-autotune.mjs +255 -0
  31. package/core/incremental-indexing/domain/reconcile-counters.mjs +149 -0
  32. package/core/incremental-indexing/domain/watermark-scheduler.mjs +239 -0
  33. package/core/incremental-indexing/infrastructure/artifact-temp-sweep.mjs +163 -0
  34. package/core/incremental-indexing/infrastructure/baseline-readiness.mjs +121 -0
  35. package/core/incremental-indexing/infrastructure/dirty-set.mjs +233 -0
  36. package/core/incremental-indexing/infrastructure/graph-gc.mjs +314 -0
  37. package/core/incremental-indexing/infrastructure/hashing.mjs +298 -0
  38. package/core/incremental-indexing/infrastructure/hcgs-invalidation.mjs +182 -0
  39. package/core/incremental-indexing/infrastructure/li-segment-merge.mjs +278 -0
  40. package/core/incremental-indexing/infrastructure/li-segment-state.mjs +173 -0
  41. package/core/incremental-indexing/infrastructure/lockfile.mjs +119 -0
  42. package/core/incremental-indexing/infrastructure/maintenance-state-reader.mjs +283 -0
  43. package/core/incremental-indexing/infrastructure/manifest.mjs +194 -0
  44. package/core/incremental-indexing/infrastructure/path-filter.mjs +190 -0
  45. package/core/incremental-indexing/infrastructure/reader-heartbeat.mjs +201 -0
  46. package/core/incremental-indexing/infrastructure/schema-migrations.mjs +257 -0
  47. package/core/incremental-indexing/infrastructure/sparse-gram-delta.mjs +335 -0
  48. package/core/incremental-indexing/infrastructure/sqlite-fts5.mjs +176 -0
  49. package/core/incremental-indexing/infrastructure/staleness-display.mjs +105 -0
  50. package/core/incremental-indexing/infrastructure/tombstone-bitmap.mjs +234 -0
  51. package/core/incremental-indexing/infrastructure/vector-delta-writer.mjs +359 -0
  52. package/core/incremental-indexing/infrastructure/vector-gc.mjs +133 -0
  53. package/core/incremental-indexing/infrastructure/worktree-stamp.mjs +155 -0
  54. package/core/incremental-indexing/infrastructure/wsl2-detect.mjs +115 -0
  55. package/core/indexing/admission-policy.js +139 -0
  56. package/core/indexing/artifact-builder.js +29 -12
  57. package/core/indexing/ast-chunker.js +107 -30
  58. package/core/indexing/dedup/exemplar-selector.js +19 -1
  59. package/core/indexing/gitignore-filter.js +223 -0
  60. package/core/indexing/incremental-tracker.js +99 -30
  61. package/core/indexing/index-codebase-v21.js +6 -5
  62. package/core/indexing/index-maintainer.mjs +698 -6
  63. package/core/indexing/indexer-ann.js +99 -15
  64. package/core/indexing/indexer-build.js +158 -45
  65. package/core/indexing/indexer-empty-baseline.js +80 -0
  66. package/core/indexing/indexer-manifest.js +66 -0
  67. package/core/indexing/indexer-phases.js +56 -23
  68. package/core/indexing/indexer-sparse-gram.js +54 -13
  69. package/core/indexing/indexer-utils.js +26 -208
  70. package/core/indexing/indexing-file-policy.js +32 -7
  71. package/core/indexing/maintainer-launcher.mjs +137 -0
  72. package/core/indexing/merkle-tracker.js +251 -244
  73. package/core/indexing/model-pool.js +46 -5
  74. package/core/infrastructure/code-graph-repository.js +758 -6
  75. package/core/infrastructure/code-graph-visibility.js +157 -0
  76. package/core/infrastructure/codebase-repository.js +100 -13
  77. package/core/infrastructure/config/search.js +1 -1
  78. package/core/infrastructure/db-utils.js +118 -0
  79. package/core/infrastructure/dedup-hashing.js +10 -13
  80. package/core/infrastructure/hardware-capability.js +17 -7
  81. package/core/infrastructure/index.js +8 -2
  82. package/core/infrastructure/language-patterns/maps.js +4 -1
  83. package/core/infrastructure/language-patterns/registry-core.js +56 -17
  84. package/core/infrastructure/language-patterns/registry-object-oriented.js +12 -5
  85. package/core/infrastructure/language-patterns.js +69 -0
  86. package/core/infrastructure/model-registry.js +20 -0
  87. package/core/infrastructure/native-inference.js +7 -12
  88. package/core/infrastructure/native-resolver.js +52 -37
  89. package/core/infrastructure/native-sparse-gram.js +261 -20
  90. package/core/infrastructure/native-tokenizer.js +6 -15
  91. package/core/infrastructure/simd-distance.js +10 -16
  92. package/core/infrastructure/sparse-gram-delta-reader.js +76 -0
  93. package/core/infrastructure/structural-alias-resolver.js +122 -0
  94. package/core/infrastructure/structural-candidate-ranker.js +34 -0
  95. package/core/infrastructure/structural-context-repository.js +472 -0
  96. package/core/infrastructure/structural-context-utils.js +51 -0
  97. package/core/infrastructure/structural-graph-signals.js +121 -0
  98. package/core/infrastructure/structural-qualified-resolution.js +15 -0
  99. package/core/infrastructure/structural-source-definitions.js +100 -0
  100. package/core/infrastructure/tombstone-bitmap-reader.js +139 -0
  101. package/core/infrastructure/tree-sitter-provider.js +811 -37
  102. package/core/prompt-optimization/data/p7-final/sweet-search-system-prompt.md +50 -0
  103. package/core/query/query-router.js +55 -5
  104. package/core/ranking/file-kind-ranking.js +2192 -15
  105. package/core/ranking/late-interaction-index.js +87 -12
  106. package/core/search/cli-decoration.js +290 -0
  107. package/core/search/context-expander.js +988 -78
  108. package/core/search/index.js +1 -0
  109. package/core/search/output-policy.js +275 -0
  110. package/core/search/search-anchor.js +499 -0
  111. package/core/search/search-boost.js +93 -1
  112. package/core/search/search-cli.js +61 -204
  113. package/core/search/search-hybrid.js +250 -10
  114. package/core/search/search-pattern-chunks.js +57 -8
  115. package/core/search/search-pattern-planner.js +68 -9
  116. package/core/search/search-pattern-prefilter.js +30 -10
  117. package/core/search/search-pattern-ripgrep.js +40 -4
  118. package/core/search/search-pattern-sparse-overlay.js +256 -0
  119. package/core/search/search-pattern.js +117 -29
  120. package/core/search/search-postprocess.js +479 -5
  121. package/core/search/search-read-semantic.js +260 -23
  122. package/core/search/search-read.js +82 -64
  123. package/core/search/search-reader-pin.js +71 -0
  124. package/core/search/search-rrf.js +279 -0
  125. package/core/search/search-semantic.js +110 -5
  126. package/core/search/search-server.js +130 -57
  127. package/core/search/search-trace.js +107 -0
  128. package/core/search/server-identity.js +93 -0
  129. package/core/search/session-daemon-prewarm.mjs +33 -10
  130. package/core/search/sweet-search.js +399 -7
  131. package/core/skills/sweet-index/SKILL.md +8 -6
  132. package/core/vector-store/binary-hnsw-index.js +194 -30
  133. package/core/vector-store/float-vector-store.js +96 -6
  134. package/core/vector-store/hnsw-index.js +220 -49
  135. package/eval/agent-read-workflows/bin/_ss-helpers.mjs +471 -0
  136. package/eval/agent-read-workflows/bin/ss-find +15 -0
  137. package/eval/agent-read-workflows/bin/ss-grep +12 -0
  138. package/eval/agent-read-workflows/bin/ss-read +14 -0
  139. package/eval/agent-read-workflows/bin/ss-search +18 -0
  140. package/eval/agent-read-workflows/bin/ss-semantic +12 -0
  141. package/eval/agent-read-workflows/bin/ss-trace +11 -0
  142. package/mcp/read-tool.js +109 -0
  143. package/mcp/server.js +55 -15
  144. package/mcp/tool-handlers.js +14 -124
  145. package/mcp/trace-tool.js +81 -0
  146. package/package.json +25 -10
  147. package/scripts/hooks/intercept-read.mjs +55 -0
  148. package/scripts/hooks/remind-tools.mjs +40 -0
  149. package/scripts/init.js +698 -54
  150. package/scripts/inject-agent-instructions.js +431 -0
  151. package/scripts/install-prompt-reminders.js +188 -0
  152. package/scripts/install-tool-enforcement.js +220 -0
  153. package/scripts/smoke-test.js +12 -9
  154. package/scripts/uninstall.js +276 -18
  155. package/scripts/write-claude-rules.js +110 -0
@@ -7,12 +7,14 @@ import { existsSync, openSync, fsyncSync, closeSync, writeFileSync, readFileSync
7
7
  import path from 'path';
8
8
 
9
9
  import { DB_PATHS, HNSW_CONFIG, BINARY_HNSW_CONFIG } from '../infrastructure/config/index.js';
10
+ import { chunkedIn } from '../infrastructure/db-utils.js';
10
11
  import { HNSWIndex } from '../vector-store/hnsw-index.js';
11
12
  import { LateInteractionIndex } from '../ranking/late-interaction-index.js';
12
13
  import { truncateForHNSW, getEmbeddings, getModelInfo, fisherYatesShuffle } from '../embedding/embedding-service.js';
13
14
  import { buildFromCodebaseDb as buildQuantizedArtifacts, shouldSkipArtifactRebuild, updateArtifactState, ARTIFACT_THRESHOLDS } from './artifact-builder.js';
14
15
  import { log, logProgress } from './indexer-utils.js';
15
16
  import { JAVA_FAMILY } from './ast-chunker.js';
17
+ import { isIndexAcceleratorAvailable } from './model-pool.js';
16
18
 
17
19
  // =============================================================================
18
20
  // DURABLE WRITE HELPERS (Phase E — fsync ordering for checkpoint safety)
@@ -60,6 +62,28 @@ export function pickLiInput(chunk) {
60
62
  return chunk.li_greedy_text || chunk.embedding_text || chunk.li_text || chunk.text || chunk.content || '';
61
63
  }
62
64
 
/**
 * Resolve the path to record for a chunk.
 *
 * Candidate fields are tried in priority order — `metadata.relative_path`,
 * `metadata.path`, `metadata.file_path`, `file`, `metadata.file` — and the
 * first one accepted by `firstSafeRelativePath` is used.
 *
 * @param {object} [chunk] - Chunk record; null/undefined is tolerated.
 * @returns {string} The accepted path, or '' when no candidate qualifies.
 */
function chunkFilePath(chunk) {
  const meta = chunk?.metadata;
  const resolved = firstSafeRelativePath(
    meta?.relative_path,
    meta?.path,
    meta?.file_path,
    chunk?.file,
    meta?.file,
  );
  return resolved || '';
}
74
+
/**
 * Return the first candidate that is a safe, root-relative path.
 *
 * Each string candidate is normalized (backslashes become '/', runs of '/'
 * collapse to one, leading './' prefixes are dropped) and then rejected when
 * it is empty or '.', absolute ('/...'), a Windows drive path ('C:/...'), or
 * contains a '..' segment anywhere. Non-string candidates are ignored.
 *
 * @param {...*} candidates - Values to try, in priority order.
 * @returns {string|null} The normalized path of the first accepted candidate,
 *   or null when none is accepted.
 */
function firstSafeRelativePath(...candidates) {
  for (const candidate of candidates) {
    if (typeof candidate !== 'string') continue;
    // Collapse duplicate slashes BEFORE stripping './' so that './/src/a.js'
    // is not turned into '/src/a.js' and misread as an absolute path; the
    // repeated group also removes stacked prefixes such as '././a.js'.
    const normalized = candidate
      .replace(/\\/g, '/')
      .replace(/\/+/g, '/')
      .replace(/^(?:\.\/)+/, '');
    if (!normalized || normalized === '.' || normalized.startsWith('/')) continue;
    if (/^[A-Za-z]:\//.test(normalized)) continue;
    // Segment-wise check: rejects '..' at the start, in the middle, and at the
    // end ('src/..'), without tripping on names that merely contain dots.
    if (normalized.split('/').includes('..')) continue;
    return normalized;
  }
  return null;
}
86
+
63
87
  function fsyncFile(filePath) {
64
88
  const fd = openSync(filePath, 'r');
65
89
  try { fsyncSync(fd); } finally { closeSync(fd); }
@@ -108,13 +132,38 @@ function cleanupCheckpoint(indexPath) {
108
132
  // list at the exemplar's rank position.
109
133
  const ALIAS_FILTER_SQL = "json_extract(metadata, '$.exemplarId') IS NULL";
110
134
 
/**
 * Report whether the `vectors` table exposes a column with the given name.
 *
 * The check runs `PRAGMA table_info(vectors)`; any error raised while
 * preparing, running, or scanning the result is treated as "column absent".
 *
 * @param {object} db - Database handle exposing `prepare(sql).all()`.
 * @param {string} column - Column name to look for.
 * @returns {boolean} true when a row with a matching `name` is found.
 */
function hasVectorColumn(db, column) {
  try {
    const tableInfo = db.prepare('PRAGMA table_info(vectors)').all();
    for (const info of tableInfo) {
      if (info.name === column) return true;
    }
    return false;
  } catch (_err) {
    // Best-effort probe: a failed lookup means the column cannot be relied on.
    return false;
  }
}
142
+
/**
 * Build the SQL predicate that keeps only rows whose metadata has no
 * `exemplarId`.
 *
 * @param {string} [alias=''] - Table alias used to qualify the `metadata`
 *   column. When empty, the shared unqualified ALIAS_FILTER_SQL is returned.
 *   The alias is interpolated verbatim, so it must be a trusted identifier.
 * @returns {string} A predicate suitable for a WHERE clause.
 */
function aliasFilterSql(alias = '') {
  if (!alias) return ALIAS_FILTER_SQL;
  // alias is known to be non-empty here, so the column is always qualified.
  return `json_extract(${alias}.metadata, '$.exemplarId') IS NULL`;
}
148
+
/**
 * Build the SQL predicate that keeps only rows whose `epoch_retired` is NULL.
 *
 * When the `vectors` table has no `epoch_retired` column the predicate
 * degrades to the always-true `1=1`.
 *
 * @param {object} db - Database handle passed through to `hasVectorColumn`.
 * @param {string} [alias=''] - Optional table alias used to qualify the column.
 * @returns {string} A predicate suitable for a WHERE clause.
 */
function liveVectorSql(db, alias = '') {
  const supportsRetirement = hasVectorColumn(db, 'epoch_retired');
  if (supportsRetirement) {
    const qualifier = alias ? `${alias}.` : '';
    return `${qualifier}epoch_retired IS NULL`;
  }
  return '1=1';
}
154
+
/**
 * Combine the exemplar-only filter and the not-retired filter into a single
 * WHERE predicate.
 *
 * @param {object} db - Database handle, forwarded to `liveVectorSql`.
 * @param {string} [alias=''] - Optional table alias applied to both filters.
 * @returns {string} `<aliasFilterSql> AND <liveVectorSql>`.
 */
function vectorIndexWhere(db, alias = '') {
  const exemplarOnly = aliasFilterSql(alias);
  const notRetired = liveVectorSql(db, alias);
  return [exemplarOnly, notRetired].join(' AND ');
}
158
+
111
159
  function* streamVectorsFromDb(db, _dim, order = 'sequential') {
160
+ const vectorWhere = vectorIndexWhere(db);
112
161
  if (order !== 'sequential') {
113
162
  db.exec('CREATE TEMP TABLE IF NOT EXISTS hnsw_order (pos INTEGER PRIMARY KEY, vector_rowid INTEGER)');
114
163
  db.exec('DELETE FROM hnsw_order');
115
164
 
116
165
  const rowidRows = db
117
- .prepare(`SELECT rowid FROM vectors WHERE ${ALIAS_FILTER_SQL} ORDER BY rowid`)
166
+ .prepare(`SELECT rowid FROM vectors WHERE ${vectorWhere} ORDER BY rowid`)
118
167
  .all();
119
168
  let indices = rowidRows.map((r) => r.rowid);
120
169
 
@@ -122,7 +171,7 @@ function* streamVectorsFromDb(db, _dim, order = 'sequential') {
122
171
  fisherYatesShuffle(indices);
123
172
  } else if (order === 'diversity') {
124
173
  const pathRows = db
125
- .prepare(`SELECT rowid, file_path FROM vectors WHERE ${ALIAS_FILTER_SQL} ORDER BY rowid`)
174
+ .prepare(`SELECT rowid, file_path FROM vectors WHERE ${vectorWhere} ORDER BY rowid`)
126
175
  .all();
127
176
  const filePaths = pathRows.map((r) => r.file_path);
128
177
  const permutationPositions = diversityFirstPermutationRowids(filePaths);
@@ -155,7 +204,7 @@ function* streamVectorsFromDb(db, _dim, order = 'sequential') {
155
204
  db.exec('DROP TABLE IF EXISTS temp.hnsw_order');
156
205
  } else {
157
206
  const stmt = db.prepare(
158
- `SELECT rowid, id, file_path, embedding, metadata FROM vectors WHERE ${ALIAS_FILTER_SQL} ORDER BY rowid`,
207
+ `SELECT rowid, id, file_path, embedding, metadata FROM vectors WHERE ${vectorWhere} ORDER BY rowid`,
159
208
  );
160
209
  for (const row of stmt.iterate()) {
161
210
  yield {
@@ -192,12 +241,16 @@ function* streamVectorsFromDb(db, _dim, order = 'sequential') {
192
241
  export function decideHybridDispatcher({
193
242
  env = process.env,
194
243
  parallelLateInteraction = false,
244
+ acceleratorAvailable = true,
195
245
  } = {}) {
196
246
  const hybridEnv = (env.SWEET_SEARCH_LI_HYBRID ?? '').trim().toLowerCase();
197
247
  const hybridEnabled = hybridEnv === '1' || hybridEnv === 'true' || hybridEnv === 'on';
198
248
  if (!hybridEnabled) {
199
249
  return { armed: false, reason: 'not-enabled' };
200
250
  }
251
+ if (!acceleratorAvailable) {
252
+ return { armed: false, reason: 'no-accelerator' };
253
+ }
201
254
  // SWEET_SEARCH_LI_USE_CPU implies single-encoder CPU path — skip the
202
255
  // bidirectional cursor (which would still try to use the GPU encoder).
203
256
  if (env.SWEET_SEARCH_LI_USE_CPU === '1') {
@@ -395,14 +448,29 @@ export async function incrementalUpdateHNSW(dbPath, changedFiles, dryRun = false
395
448
  const Database = (await import('better-sqlite3')).default;
396
449
  const db = new Database(dbPath, { readonly: true });
397
450
 
398
- const changedFileSet = new Set(changedFiles || []);
399
- const placeholders = [...changedFileSet].map(() => '?').join(',');
400
- const stmt = changedFileSet.size > 0
401
- ? db.prepare(`SELECT rowid, id, file_path, embedding, metadata FROM vectors WHERE ${ALIAS_FILTER_SQL} AND file_path IN (${placeholders}) ORDER BY rowid`)
402
- : db.prepare(`SELECT rowid, id, file_path, embedding, metadata FROM vectors WHERE ${ALIAS_FILTER_SQL} ORDER BY rowid`);
403
-
404
- const rows = changedFileSet.size > 0 ? stmt.all(...changedFileSet) : [];
405
- const totalNew = changedFileSet.size > 0 ? rows.length : 0;
451
+ const changedFileList = [...new Set(changedFiles || [])];
452
+ // Chunk the IN(?,?,...) clause to stay under SQLite's bound-parameter
453
+ // limit (default 32766, historic floor 999). Without chunking, a single
454
+ // indexing pass over >~32k changed files crashes with "too many SQL
455
+ // variables" observed in production on CoSQA+ (51k docs) and BRIGHT
456
+ // (528k docs). See core/infrastructure/db-utils.js for the helper.
457
+ let rows = [];
458
+ if (changedFileList.length > 0) {
459
+ rows = chunkedIn(
460
+ db,
461
+ `SELECT rowid, id, file_path, embedding, metadata
462
+ FROM vectors
463
+ WHERE ${vectorIndexWhere(db)}
464
+ AND file_path IN (__IN_PLACEHOLDERS__)
465
+ ORDER BY rowid`,
466
+ changedFileList,
467
+ );
468
+ // Each batch is ORDER BY rowid internally, but batch boundaries break
469
+ // global monotonicity. The HNSW insertion loop below relies on rowid
470
+ // order for deterministic graph construction — re-sort explicitly.
471
+ rows.sort((a, b) => a.rowid - b.rowid);
472
+ }
473
+ const totalNew = rows.length;
406
474
 
407
475
  log(`Adding ${totalNew} new entries...`, 'yellow');
408
476
  let added = 0;
@@ -455,7 +523,7 @@ export async function buildHNSWIndex(dbPath, dryRun = false) {
455
523
  const db = new Database(dbPath, orderMode === 'sequential' ? { readonly: true } : {});
456
524
 
457
525
  const totalVectors = db
458
- .prepare(`SELECT COUNT(*) as c FROM vectors WHERE ${ALIAS_FILTER_SQL}`)
526
+ .prepare(`SELECT COUNT(*) as c FROM vectors WHERE ${vectorIndexWhere(db)}`)
459
527
  .get().c;
460
528
  if (totalVectors === 0) {
461
529
  db.close();
@@ -499,7 +567,10 @@ export async function buildHNSWIndex(dbPath, dryRun = false) {
499
567
  // vectors already in the checkpoint. Without this, add() reuses keys
500
568
  // from 0 and the final .meta.json would be incomplete.
501
569
  const metaStmt = db.prepare(
502
- 'SELECT id, file_path, metadata FROM vectors WHERE rowid <= ? ORDER BY rowid'
570
+ `SELECT id, file_path, metadata
571
+ FROM vectors
572
+ WHERE rowid <= ? AND ${vectorIndexWhere(db)}
573
+ ORDER BY rowid`
503
574
  );
504
575
  let restoredKey = 0;
505
576
  for (const row of metaStmt.iterate(resumeFromRowId)) {
@@ -592,6 +663,7 @@ export async function buildHNSWIndex(dbPath, dryRun = false) {
592
663
  }
593
664
 
594
665
  await index.save();
666
+ await index.clearStaleBitmap();
595
667
  buildCompleted = true;
596
668
 
597
669
  // Clean up checkpoint files after successful completion
@@ -830,6 +902,7 @@ export async function buildLateInteractionIndex(chunks, dryRun = false, filesToR
830
902
  const hybridDecision = decideHybridDispatcher({
831
903
  env: process.env,
832
904
  parallelLateInteraction: EMBEDDING_CONFIG.parallelLateInteraction === true,
905
+ acceleratorAvailable: isIndexAcceleratorAvailable(),
833
906
  });
834
907
  if (!hybridDecision.armed && hybridDecision.reason === 'metal-contended-by-embed') {
835
908
  log(
@@ -837,6 +910,11 @@ export async function buildLateInteractionIndex(chunks, dryRun = false, filesToR
837
910
  + 'OR SWEET_SEARCH_EMBED_USE_CPU=1 (Metal queue is shared with parallel embed phase)',
838
911
  'yellow'
839
912
  );
913
+ } else if (!hybridDecision.armed && hybridDecision.reason === 'no-accelerator') {
914
+ log(
915
+ 'LateInteraction hybrid: ignored — no inference accelerator detected; using ORT CPU',
916
+ 'yellow'
917
+ );
840
918
  }
841
919
  const hybridDisabled = !hybridDecision.armed;
842
920
 
@@ -913,7 +991,7 @@ export async function buildLateInteractionIndex(chunks, dryRun = false, filesToR
913
991
  const tokens = tokenArrays[j];
914
992
  if (tokens && tokens.length > 0) {
915
993
  await liIndex.add(chunk.id, tokens, {
916
- file: chunk.file,
994
+ file: chunkFilePath(chunk),
917
995
  name: chunk.metadata?.symbol,
918
996
  type: chunk.metadata?.chunk_type,
919
997
  startLine: chunk.metadata?.line_start || null,
@@ -1018,7 +1096,7 @@ export async function buildLateInteractionIndex(chunks, dryRun = false, filesToR
1018
1096
  continue;
1019
1097
  }
1020
1098
  liIndex.addAlias(alias.id, exemplarId, clusterId, {
1021
- file: alias.file,
1099
+ file: chunkFilePath(alias),
1022
1100
  name: alias.metadata?.symbol,
1023
1101
  type: alias.metadata?.chunk_type,
1024
1102
  startLine: alias.metadata?.line_start || null,
@@ -1042,6 +1120,12 @@ export async function buildLateInteractionIndex(chunks, dryRun = false, filesToR
1042
1120
  return { ...liStats, added: totalAdded, removed, saveToPath };
1043
1121
  }
1044
1122
 
// Module-private helpers gathered under one exported binding
// (presumably so unit tests can reach them — confirm against the test suite).
export const __TEST__ = { chunkFilePath, vectorIndexWhere, liveVectorSql };
1128
+
1045
1129
  // =============================================================================
1046
1130
  // PHASE 5: BINARY HNSW + INT8 QUANTIZED ARTIFACTS
1047
1131
  // =============================================================================
@@ -11,8 +11,12 @@ import path from 'path';
11
11
  import { DB_PATHS, EMBEDDING_CONFIG, PROJECT_ROOT } from '../infrastructure/config/index.js';
12
12
  import { GraphExtractor, createGraphSchema, insertGraph } from '../graph/graph-extractor.js';
13
13
  import { resolveRelationshipTargets } from '../graph/relationship-resolver.js';
14
+ import { populatePageRankColumn } from '../graph/structural-pagerank.js';
14
15
  import { getEmbeddings, getModelInfo } from '../embedding/embedding-service.js';
15
16
  import { configureJournalMode, checkpointWal, atomicSwapDatabase, log, logProgress } from './indexer-utils.js';
17
+ import { assignStructuralIds } from '../incremental-indexing/domain/chunk-identity.mjs';
18
+ import { chunkInputHashes } from '../incremental-indexing/domain/encoder-input.mjs';
19
+ import { migrateVectorsSchema } from '../incremental-indexing/infrastructure/schema-migrations.mjs';
16
20
 
17
21
  // =============================================================================
18
22
  // CHUNK ENRICHMENT — scope chains + imports from code-graph.db
@@ -61,7 +65,7 @@ async function enrichChunksFromGraph(chunks, ASTChunker) {
61
65
  let enriched = 0;
62
66
 
63
67
  for (const chunk of chunks) {
64
- const filePath = chunk.file || chunk.metadata?.path;
68
+ const filePath = chunkFilePath(chunk);
65
69
  if (!filePath) continue;
66
70
 
67
71
  // Only enrich chunks with a known symbol (skip generic 'unknown' text chunks)
@@ -187,6 +191,14 @@ export async function buildCodeGraph(files, dryRun = false) {
187
191
  log('Resolving relationship targets...', 'yellow');
188
192
  const resolutionStats = resolveRelationshipTargets(db);
189
193
 
194
+ log('Computing entity PageRank for structural ranking...', 'yellow');
195
+ try {
196
+ const prStats = populatePageRankColumn(db);
197
+ log(`✓ PageRank populated: ${prStats.written}/${prStats.entities} entities in ${prStats.ms}ms`, 'green');
198
+ } catch (err) {
199
+ log(`⚠ PageRank population failed (non-fatal): ${err.message}`, 'yellow');
200
+ }
201
+
190
202
  // Update query planner statistics before closing (SQLite 3.46+).
191
203
  // Best-effort only; failure should not strand the temp DB handle.
192
204
  closeWithOptimize(db, 'code graph build');
@@ -224,6 +236,7 @@ export function createVectorSchema(db) {
224
236
  `);
225
237
  db.exec('CREATE INDEX IF NOT EXISTS idx_vectors_session ON vectors(session_id)');
226
238
  db.exec('CREATE INDEX IF NOT EXISTS idx_vectors_file_path ON vectors(file_path)');
239
+ migrateVectorsSchema(db);
227
240
  }
228
241
 
229
242
  export function ensureVectorSchema(db) {
@@ -254,25 +267,30 @@ export function ensureVectorSchema(db) {
254
267
  db.exec('CREATE INDEX IF NOT EXISTS idx_vectors_file_path ON vectors(file_path)');
255
268
  log(' Schema migration complete', 'dim');
256
269
  }
270
+ migrateVectorsSchema(db);
257
271
  }
258
272
 
259
- export function buildInsertItems(chunks, embeddings, modelInfo) {
273
+ export function buildInsertItems(chunks, embeddings, modelInfo, annotations = null, options = {}) {
260
274
  const items = [];
275
+ const chunkAnnotations = annotations || annotateChunksForVectorInsert(chunks);
276
+ const epochWritten = Number.isInteger(options.epochWritten) ? options.epochWritten : 0;
261
277
  for (let i = 0; i < chunks.length; i++) {
262
278
  const chunk = chunks[i];
263
279
  const embedding = embeddings[i];
264
280
 
265
281
  if (!embedding || embedding.length === 0) continue;
282
+ const ann = chunkAnnotations[i];
283
+ const filePath = chunkFilePath(chunk);
266
284
 
267
285
  items.push({
268
286
  id: chunk.id,
269
- filePath: chunk.file,
287
+ filePath,
270
288
  embeddingBlob: embedding instanceof Float32Array
271
289
  ? Buffer.from(embedding.buffer, embedding.byteOffset, embedding.byteLength)
272
290
  : Buffer.from(new Float32Array(embedding).buffer),
273
291
  text: (chunk.text || chunk.content || '').slice(0, 2000),
274
292
  metadata: JSON.stringify({
275
- file: chunk.file,
293
+ file: filePath,
276
294
  type: chunk.metadata?.chunk_type || 'code',
277
295
  name: chunk.metadata?.symbol || null,
278
296
  startLine: chunk.metadata?.line_start || null,
@@ -289,11 +307,117 @@ export function buildInsertItems(chunks, embeddings, modelInfo) {
289
307
  sessionId: `codebase-v22-${modelInfo.provider}`,
290
308
  tags: JSON.stringify(['codebase', chunk.metadata?.language || 'unknown']),
291
309
  createdAt: new Date().toISOString(),
310
+ chunkStructId: ann?.chunkStructId || '',
311
+ chunkTextHash: ann?.hashes?.chunk_text_hash || '',
312
+ embeddingInputHash: ann?.hashes?.embedding_input_hash || '',
313
+ liInputHash: ann?.hashes?.li_input_hash || '',
314
+ metadataFingerprint: ann?.hashes?.metadata_fingerprint || '',
315
+ logicalChunkId: ann?.chunkStructId || chunk.id,
316
+ epochWritten,
317
+ epochRetired: null,
292
318
  });
293
319
  }
294
320
  return items;
295
321
  }
296
322
 
/**
 * Resolve the path to record for a chunk.
 *
 * Candidate fields are tried in priority order — `metadata.relative_path`,
 * `metadata.path`, `metadata.file_path`, `file`, `metadata.file` — and the
 * first one accepted by `firstSafeRelativePath` is used.
 *
 * @param {object} [chunk] - Chunk record; null/undefined is tolerated.
 * @returns {string} The accepted path, or '' when no candidate qualifies.
 */
function chunkFilePath(chunk) {
  const meta = chunk?.metadata;
  const resolved = firstSafeRelativePath(
    meta?.relative_path,
    meta?.path,
    meta?.file_path,
    chunk?.file,
    meta?.file,
  );
  return resolved || '';
}
332
+
/**
 * Return the first candidate that is a safe, root-relative path.
 *
 * Each string candidate is normalized (backslashes become '/', runs of '/'
 * collapse to one, leading './' prefixes are dropped) and then rejected when
 * it is empty or '.', absolute ('/...'), a Windows drive path ('C:/...'), or
 * contains a '..' segment anywhere. Non-string candidates are ignored.
 *
 * @param {...*} candidates - Values to try, in priority order.
 * @returns {string|null} The normalized path of the first accepted candidate,
 *   or null when none is accepted.
 */
function firstSafeRelativePath(...candidates) {
  for (const candidate of candidates) {
    if (typeof candidate !== 'string') continue;
    // Collapse duplicate slashes BEFORE stripping './' so that './/src/a.js'
    // is not turned into '/src/a.js' and misread as an absolute path; the
    // repeated group also removes stacked prefixes such as '././a.js'.
    const normalized = candidate
      .replace(/\\/g, '/')
      .replace(/\/+/g, '/')
      .replace(/^(?:\.\/)+/, '');
    if (!normalized || normalized === '.' || normalized.startsWith('/')) continue;
    if (/^[A-Za-z]:\//.test(normalized)) continue;
    // Segment-wise check: rejects '..' at the start, in the middle, and at the
    // end ('src/..'), without tripping on names that merely contain dots.
    if (normalized.split('/').includes('..')) continue;
    return normalized;
  }
  return null;
}
344
+
/**
 * Compute per-chunk insert annotations, aligned index-for-index with `chunks`.
 *
 * Chunks are grouped by their resolved file path (`chunkFilePath`), each group
 * is passed to `assignStructuralIds` together with that path, and every
 * chunk's annotation is the corresponding structural-id record extended with
 * a `hashes` field from `chunkInputHashes`.
 *
 * @param {Array<object>} chunks - Chunks about to be written.
 * @returns {Array<object>} Annotation objects in the same order as `chunks`.
 */
function annotateChunksForVectorInsert(chunks) {
  const result = new Array(chunks.length);

  // Pass 1: bucket chunk positions by file so ids are assigned per file.
  const positionsByFile = new Map();
  for (const [position, chunk] of chunks.entries()) {
    const key = chunkFilePath(chunk);
    const bucket = positionsByFile.get(key);
    if (bucket) {
      bucket.push(position);
    } else {
      positionsByFile.set(key, [position]);
    }
  }

  // Pass 2: assign ids per file and scatter them back to original positions.
  for (const [filePath, positions] of positionsByFile) {
    const structuralIds = assignStructuralIds(
      positions.map((position) => chunks[position]),
      filePath,
    );
    for (const [k, position] of positions.entries()) {
      result[position] = {
        ...structuralIds[k],
        hashes: chunkInputHashes(chunks[position]),
      };
    }
  }

  return result;
}
366
+
/**
 * List the insertable `vectors` columns that actually exist in this database.
 *
 * The known column names are filtered against `PRAGMA table_info(vectors)`,
 * preserving the fixed order below.
 *
 * @param {object} db - Database handle exposing `prepare(sql).all()`.
 * @returns {string[]} Existing column names, in canonical insert order.
 */
function vectorInsertColumns(db) {
  const present = new Set();
  for (const info of db.prepare('PRAGMA table_info(vectors)').all()) {
    present.add(info.name);
  }

  const canonicalOrder = [
    'id',
    'file_path',
    'embedding',
    'text',
    'metadata',
    'session_id',
    'tags',
    'created_at',
    'chunk_struct_id',
    'chunk_text_hash',
    'embedding_input_hash',
    'li_input_hash',
    'metadata_fingerprint',
    'logical_chunk_id',
    'epoch_written',
    'epoch_retired',
  ];
  return canonicalOrder.filter((name) => present.has(name));
}
388
+
// Column name → reader that pulls the matching value off an insert item.
// The newer columns fall back to a default when the item field is nullish.
const VECTOR_COLUMN_READERS = new Map([
  ['id', (item) => item.id],
  ['file_path', (item) => item.filePath],
  ['embedding', (item) => item.embeddingBlob],
  ['text', (item) => item.text],
  ['metadata', (item) => item.metadata],
  ['session_id', (item) => item.sessionId],
  ['tags', (item) => item.tags],
  ['created_at', (item) => item.createdAt],
  ['chunk_struct_id', (item) => item.chunkStructId ?? ''],
  ['chunk_text_hash', (item) => item.chunkTextHash ?? ''],
  ['embedding_input_hash', (item) => item.embeddingInputHash ?? ''],
  ['li_input_hash', (item) => item.liInputHash ?? ''],
  ['metadata_fingerprint', (item) => item.metadataFingerprint ?? ''],
  ['logical_chunk_id', (item) => item.logicalChunkId ?? item.chunkStructId ?? item.id],
  ['epoch_written', (item) => item.epochWritten ?? 0],
  ['epoch_retired', (item) => item.epochRetired ?? null],
]);

/**
 * Look up the value an insert item supplies for a `vectors` column.
 *
 * @param {object} item - Insert item (camelCase fields).
 * @param {string} column - Column name (snake_case).
 * @returns {*} The mapped value; for a column with no mapping, `item[column]`.
 */
function vectorInsertValue(item, column) {
  const read = VECTOR_COLUMN_READERS.get(column);
  return read ? read(item) : item[column];
}
410
+
/**
 * Prepare an `INSERT OR REPLACE INTO vectors` statement covering exactly the
 * columns reported by `vectorInsertColumns`.
 *
 * @param {object} db - Database handle exposing `prepare(sql)`.
 * @returns {{columns: string[], stmt: object}} The column order to bind in,
 *   and the prepared statement with one `?` placeholder per column.
 */
function prepareVectorInsert(db) {
  const columns = vectorInsertColumns(db);

  const quotedNames = [];
  const slots = [];
  for (const column of columns) {
    quotedNames.push(`"${column}"`);
    slots.push('?');
  }

  const sql = `INSERT OR REPLACE INTO vectors (${quotedNames.join(', ')}) VALUES (${slots.join(', ')})`;
  return { columns, stmt: db.prepare(sql) };
}
420
+
297
421
  /**
298
422
  * Insert alias rows that reuse their exemplar's embedding instead of running
299
423
  * the embedding model. The exemplar must already be in the `vectors` table;
@@ -307,23 +431,11 @@ export function insertAliasVectors(db, aliases, modelInfo) {
307
431
  'SELECT embedding, metadata FROM vectors WHERE id = ?'
308
432
  );
309
433
 
310
- const stmt = db.prepare(`
311
- INSERT OR REPLACE INTO vectors (id, file_path, embedding, text, metadata, session_id, tags, created_at)
312
- VALUES (?, ?, ?, ?, ?, ?, ?, ?)
313
- `);
434
+ const { stmt, columns } = prepareVectorInsert(db);
314
435
 
315
436
  const insertBatch = db.transaction((items) => {
316
437
  for (const item of items) {
317
- stmt.run(
318
- item.id,
319
- item.filePath,
320
- item.embeddingBlob,
321
- item.text,
322
- item.metadata,
323
- item.sessionId,
324
- item.tags,
325
- item.createdAt,
326
- );
438
+ stmt.run(...columns.map((column) => vectorInsertValue(item, column)));
327
439
  }
328
440
  });
329
441
 
@@ -344,11 +456,13 @@ export function insertAliasVectors(db, aliases, modelInfo) {
344
456
  }
345
457
 
346
458
  const items = [];
459
+ const annotations = annotateChunksForVectorInsert(aliases);
347
460
  const nowIso = new Date().toISOString();
348
461
  let missing = 0;
349
462
  let dimension = null;
350
463
 
351
- for (const alias of aliases) {
464
+ for (let i = 0; i < aliases.length; i++) {
465
+ const alias = aliases[i];
352
466
  const exemplarId = alias.metadata?.exemplarId;
353
467
  if (!exemplarId) continue;
354
468
  const row = fetchExemplar.get(exemplarId);
@@ -359,14 +473,16 @@ export function insertAliasVectors(db, aliases, modelInfo) {
359
473
  if (dimension === null) {
360
474
  dimension = Math.floor(row.embedding.length / 4);
361
475
  }
476
+ const ann = annotations[i];
477
+ const filePath = chunkFilePath(alias);
362
478
 
363
479
  items.push({
364
480
  id: alias.id,
365
- filePath: alias.file,
481
+ filePath,
366
482
  embeddingBlob: row.embedding, // copy exemplar's Float32 BLOB verbatim
367
483
  text: (alias.text || alias.content || '').slice(0, 2000),
368
484
  metadata: JSON.stringify({
369
- file: alias.file,
485
+ file: filePath,
370
486
  type: alias.metadata?.chunk_type || 'code',
371
487
  name: alias.metadata?.symbol || null,
372
488
  startLine: alias.metadata?.line_start || null,
@@ -382,6 +498,14 @@ export function insertAliasVectors(db, aliases, modelInfo) {
382
498
  sessionId: `codebase-v22-${modelInfo.provider}`,
383
499
  tags: JSON.stringify(['codebase', alias.metadata?.language || 'unknown']),
384
500
  createdAt: nowIso,
501
+ chunkStructId: ann?.chunkStructId || '',
502
+ chunkTextHash: ann?.hashes?.chunk_text_hash || '',
503
+ embeddingInputHash: ann?.hashes?.embedding_input_hash || '',
504
+ liInputHash: ann?.hashes?.li_input_hash || '',
505
+ metadataFingerprint: ann?.hashes?.metadata_fingerprint || '',
506
+ logicalChunkId: ann?.chunkStructId || alias.id,
507
+ epochWritten: 0,
508
+ epochRetired: null,
385
509
  });
386
510
  }
387
511
 
@@ -397,48 +521,36 @@ export function insertAliasVectors(db, aliases, modelInfo) {
397
521
  return items.length;
398
522
  }
399
523
 
400
/**
 * Write already-built insert items into the `vectors` table.
 *
 * Items are written in slices of at most 2000; each slice is passed to a
 * function created with `db.transaction`, so one slice corresponds to one
 * call of that transactional wrapper.
 *
 * @param {object} db - Database handle exposing `prepare` and `transaction`.
 * @param {object[]} items - Insert items; each column value is resolved via
 *   `vectorInsertValue(item, column)`.
 * @returns {void}
 */
export function insertVectorItems(db, items) {
  const rowsPerTransaction = 2000;
  const prepared = prepareVectorInsert(db);

  const writeRows = db.transaction((rows) => {
    rows.forEach((row) => {
      // Bind values in the same order as the statement's column list.
      const values = prepared.columns.map((name) => vectorInsertValue(row, name));
      prepared.stmt.run(...values);
    });
  });

  let offset = 0;
  while (offset < items.length) {
    writeRows(items.slice(offset, offset + rowsPerTransaction));
    offset += rowsPerTransaction;
  }
}
429
539
 
540
/**
 * Build insert items from chunks and their embeddings, then write them to the
 * `vectors` table.
 *
 * All arguments after `db` are forwarded unchanged to `buildInsertItems`; the
 * resulting items are handed to `insertVectorItems`.
 *
 * @param {object} db - Database handle passed through to `insertVectorItems`.
 * @param {object[]} chunks - Chunks to insert.
 * @param {Array} embeddings - Embeddings forwarded alongside `chunks`.
 * @param {object} modelInfo - Embedding model descriptor.
 * @param {?Array} [annotations=null] - Optional per-chunk annotations.
 * @param {object} [options={}] - Extra options forwarded to `buildInsertItems`.
 * @returns {void}
 */
export function insertVectors(db, chunks, embeddings, modelInfo, annotations = null, options = {}) {
  const items = buildInsertItems(chunks, embeddings, modelInfo, annotations, options);
  insertVectorItems(db, items);
}
543
+
430
544
  export async function pipelinedEmbedAndInsert(db, allChunks, texts, batchSize, modelInfo, logProgressFn, embeddingOptions = {}, logFn, writeFlushRows = 128) {
431
545
  let writeBuffer = [];
432
546
  let embeddingCount = 0;
547
+ const allAnnotations = annotateChunksForVectorInsert(allChunks);
433
548
 
434
- const stmt = db.prepare(`
435
- INSERT OR REPLACE INTO vectors (id, file_path, embedding, text, metadata, session_id, tags, created_at)
436
- VALUES (?, ?, ?, ?, ?, ?, ?, ?)
437
- `);
549
+ const { stmt, columns } = prepareVectorInsert(db);
438
550
 
439
551
  const insertBatch = db.transaction((items) => {
440
552
  for (const item of items) {
441
- stmt.run(item.id, item.filePath, item.embeddingBlob, item.text, item.metadata, item.sessionId, item.tags, item.createdAt);
553
+ stmt.run(...columns.map((column) => vectorInsertValue(item, column)));
442
554
  }
443
555
  });
444
556
 
@@ -458,6 +570,7 @@ export async function pipelinedEmbedAndInsert(db, allChunks, texts, batchSize, m
458
570
  for (let i = 0; i < texts.length; i += batchSize) {
459
571
  const batch = texts.slice(i, i + batchSize);
460
572
  const batchChunks = allChunks.slice(i, i + batchSize);
573
+ const batchAnnotations = allAnnotations.slice(i, i + batchSize);
461
574
 
462
575
  // Overlap: flush accumulated writes while embedding is in-flight
463
576
  const batchResultsPromise = getEmbeddings(batch, progressOptions);
@@ -470,7 +583,7 @@ export async function pipelinedEmbedAndInsert(db, allChunks, texts, batchSize, m
470
583
  const batchEmbeddings = batchResults.map(r => r.embedding);
471
584
  embeddingCount += batchEmbeddings.length;
472
585
 
473
- const batchItems = buildInsertItems(batchChunks, batchEmbeddings, modelInfo);
586
+ const batchItems = buildInsertItems(batchChunks, batchEmbeddings, modelInfo, batchAnnotations);
474
587
  writeBuffer.push(...batchItems);
475
588
 
476
589
  if (!useInternalProgress) {
@@ -549,7 +662,7 @@ export async function chunkFiles(files) {
549
662
  if (chunk.embedding_text) {
550
663
  return chunk.embedding_text.slice(0, _embCap);
551
664
  }
552
- return `${chunk.file} ${chunk.metadata?.symbol || ''}\n${(chunk.text || chunk.content || '').slice(0, 1500)}`;
665
+ return `${chunkFilePath(chunk)} ${chunk.metadata?.symbol || ''}\n${(chunk.text || chunk.content || '').slice(0, 1500)}`;
553
666
  });
554
667
 
555
668
  return { allChunks, texts };
@@ -0,0 +1,80 @@
1
+ /**
2
+ * Establish a valid *empty* index baseline.
3
+ *
4
+ * A full or incremental index run over a repository with no indexable files
5
+ * used to early-exit without creating anything, leaving search to throw
6
+ * "No search indexes found" and giving the default-on reconcile maintainer no
7
+ * baseline to grow from. This helper instead writes a coherent zero-row
8
+ * baseline:
9
+ *
10
+ * - codebase.db vector schema, 0 rows
11
+ * - code-graph.db graph schema, 0 rows
12
+ * - merkle-state.json 0 files — so the maintainer's dirty-scan treats
13
+ * the first created file as new
14
+ * - reconcile-manifest.json — so readers pin a real epoch
15
+ *
16
+ * With the baseline in place, search returns empty results cleanly (the
17
+ * graph+codebase existence check in SweetSearch.init passes; the tables are
18
+ * simply empty) and the reconcile maintainer can transition the repo from zero
19
+ * files to one file without a prior full index.
20
+ *
21
+ * The schema builders are the same ones the production reconciler uses when it
22
+ * lazily creates these DBs (createVectorSchema / createGraphSchema), so a
23
+ * baseline written here is byte-for-byte compatible with later incremental
24
+ * deltas (epoch columns, FTS5, indexes).
25
+ */
26
+
27
import Database from 'better-sqlite3';
import { existsSync, mkdirSync, rmSync } from 'node:fs';
import path from 'node:path';

import { DB_PATHS } from '../infrastructure/config/index.js';
import { createVectorSchema } from './indexer-build.js';
import { createGraphSchema } from '../graph/graph-extractor.js';
import { publishIndexerManifest } from './indexer-manifest.js';
import { updateState } from './incremental-tracker.js';
import { log } from './indexer-utils.js';
37
+
38
/**
 * Create `dbPath` with `createSchema` only when it does not already exist.
 *
 * If `createSchema` throws, the partially created database file (and any
 * SQLite sidecar files next to it) is removed before the error propagates.
 * Without that cleanup the leftover file would make every later call take the
 * "already exists" early return, so the schema would never be retried.
 *
 * @param {string} dbPath - Filesystem path of the SQLite database.
 * @param {(db: object) => void} createSchema - Callback that creates the
 *   schema on the freshly opened database handle.
 * @returns {boolean} true when a fresh DB was created, false when a file was
 *   already present at `dbPath`.
 * @throws Rethrows whatever `createSchema` (or opening the database) throws.
 */
function ensureSchema(dbPath, createSchema) {
  if (existsSync(dbPath)) return false;
  mkdirSync(path.dirname(dbPath), { recursive: true });
  const db = new Database(dbPath);
  let schemaReady = false;
  try {
    createSchema(db);
    schemaReady = true;
  } finally {
    db.close();
    if (!schemaReady) {
      // Remove the half-initialised DB so a later run can retry from scratch.
      for (const suffix of ['', '-wal', '-shm', '-journal']) {
        try {
          rmSync(`${dbPath}${suffix}`, { force: true });
        } catch {
          // Best-effort cleanup: the schema error being propagated is the
          // failure callers need to see, so it must not be masked here.
        }
      }
    }
  }
  return true;
}
53
+
54
/**
 * Write the zero-row index baseline for a repository that has never been
 * indexed and currently has no indexable files.
 *
 * When `merkle-state.json` (`DB_PATHS.merkle`) already exists this is a no-op:
 * an earlier index ran, so an empty working tree means the repo became empty.
 * The existing merkle state is left untouched so deletion detection can still
 * compare the previously known files against what is on disk and retire the
 * stale rows; replacing it with an empty file set would lose that knowledge.
 *
 * Otherwise the codebase and graph databases are created if missing, the
 * tracker state is written with zero counts, the indexer manifest is
 * published, and a summary line is logged.
 *
 * @returns {Promise<{createdCodebase:boolean, createdGraph:boolean, skipped?:boolean}>}
 */
export async function establishEmptyBaseline() {
  const priorIndexExists = existsSync(DB_PATHS.merkle);
  if (priorIndexExists) {
    return { createdCodebase: false, createdGraph: false, skipped: true };
  }

  const outcome = {
    createdCodebase: ensureSchema(DB_PATHS.codebase, createVectorSchema),
    createdGraph: ensureSchema(DB_PATHS.codeGraph, createGraphSchema),
  };

  await updateState({}, { totalChunks: 0, entities: 0, relationships: 0 });
  publishIndexerManifest({});

  const describe = (created) => (created ? 'created' : 'present');
  const message = `Established empty index baseline (0 files; codebase.db ${describe(outcome.createdCodebase)}, code-graph.db ${describe(outcome.createdGraph)})`;
  log(message, 'green');

  return outcome;
}