sweet-search 2.5.2 → 2.5.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/core/cli.js +24 -3
- package/core/graph/graph-expansion.js +215 -36
- package/core/graph/graph-extractor.js +196 -11
- package/core/graph/graph-search.js +395 -92
- package/core/graph/hcgs-generator.js +2 -1
- package/core/graph/index.js +2 -0
- package/core/graph/repo-map.js +28 -6
- package/core/graph/structural-answer-cues.js +168 -0
- package/core/graph/structural-callsite-hints.js +40 -0
- package/core/graph/structural-context-format.js +40 -0
- package/core/graph/structural-context.js +450 -0
- package/core/graph/structural-forward-push.js +156 -0
- package/core/graph/structural-header-context.js +19 -0
- package/core/graph/structural-importance.js +148 -0
- package/core/graph/structural-pagerank.js +197 -0
- package/core/graph/summary-manager.js +13 -9
- package/core/incremental-indexing/application/dirty-scan.mjs +236 -0
- package/core/incremental-indexing/application/file-watcher.mjs +197 -0
- package/core/incremental-indexing/application/maintenance-handlers.mjs +519 -0
- package/core/incremental-indexing/application/maintenance-worker.mjs +380 -0
- package/core/incremental-indexing/application/operator-cli.mjs +554 -0
- package/core/incremental-indexing/application/production-li-delta.mjs +192 -0
- package/core/incremental-indexing/application/production-reconciler-helpers.mjs +107 -0
- package/core/incremental-indexing/application/production-reconciler.mjs +583 -0
- package/core/incremental-indexing/application/reconciler.mjs +477 -0
- package/core/incremental-indexing/application/tombstone-injector.mjs +148 -0
- package/core/incremental-indexing/domain/chunk-identity.mjs +260 -0
- package/core/incremental-indexing/domain/encoder-deps.mjs +193 -0
- package/core/incremental-indexing/domain/encoder-input.mjs +225 -0
- package/core/incremental-indexing/domain/interval-autotune.mjs +255 -0
- package/core/incremental-indexing/domain/reconcile-counters.mjs +149 -0
- package/core/incremental-indexing/domain/watermark-scheduler.mjs +239 -0
- package/core/incremental-indexing/infrastructure/artifact-temp-sweep.mjs +163 -0
- package/core/incremental-indexing/infrastructure/baseline-readiness.mjs +121 -0
- package/core/incremental-indexing/infrastructure/dirty-set.mjs +233 -0
- package/core/incremental-indexing/infrastructure/graph-gc.mjs +314 -0
- package/core/incremental-indexing/infrastructure/hashing.mjs +298 -0
- package/core/incremental-indexing/infrastructure/hcgs-invalidation.mjs +182 -0
- package/core/incremental-indexing/infrastructure/li-segment-merge.mjs +278 -0
- package/core/incremental-indexing/infrastructure/li-segment-state.mjs +173 -0
- package/core/incremental-indexing/infrastructure/lockfile.mjs +119 -0
- package/core/incremental-indexing/infrastructure/maintenance-state-reader.mjs +283 -0
- package/core/incremental-indexing/infrastructure/manifest.mjs +194 -0
- package/core/incremental-indexing/infrastructure/path-filter.mjs +190 -0
- package/core/incremental-indexing/infrastructure/reader-heartbeat.mjs +201 -0
- package/core/incremental-indexing/infrastructure/schema-migrations.mjs +257 -0
- package/core/incremental-indexing/infrastructure/sparse-gram-delta.mjs +335 -0
- package/core/incremental-indexing/infrastructure/sqlite-fts5.mjs +176 -0
- package/core/incremental-indexing/infrastructure/staleness-display.mjs +105 -0
- package/core/incremental-indexing/infrastructure/tombstone-bitmap.mjs +234 -0
- package/core/incremental-indexing/infrastructure/vector-delta-writer.mjs +359 -0
- package/core/incremental-indexing/infrastructure/vector-gc.mjs +133 -0
- package/core/incremental-indexing/infrastructure/worktree-stamp.mjs +155 -0
- package/core/incremental-indexing/infrastructure/wsl2-detect.mjs +115 -0
- package/core/indexing/admission-policy.js +139 -0
- package/core/indexing/artifact-builder.js +29 -12
- package/core/indexing/ast-chunker.js +107 -30
- package/core/indexing/dedup/exemplar-selector.js +19 -1
- package/core/indexing/gitignore-filter.js +223 -0
- package/core/indexing/incremental-tracker.js +99 -30
- package/core/indexing/index-codebase-v21.js +6 -5
- package/core/indexing/index-maintainer.mjs +698 -6
- package/core/indexing/indexer-ann.js +99 -15
- package/core/indexing/indexer-build.js +158 -45
- package/core/indexing/indexer-empty-baseline.js +80 -0
- package/core/indexing/indexer-manifest.js +66 -0
- package/core/indexing/indexer-phases.js +56 -23
- package/core/indexing/indexer-sparse-gram.js +54 -13
- package/core/indexing/indexer-utils.js +26 -208
- package/core/indexing/indexing-file-policy.js +32 -7
- package/core/indexing/maintainer-launcher.mjs +137 -0
- package/core/indexing/merkle-tracker.js +251 -244
- package/core/indexing/model-pool.js +46 -5
- package/core/infrastructure/code-graph-repository.js +758 -6
- package/core/infrastructure/code-graph-visibility.js +157 -0
- package/core/infrastructure/codebase-repository.js +100 -13
- package/core/infrastructure/config/search.js +1 -1
- package/core/infrastructure/db-utils.js +118 -0
- package/core/infrastructure/dedup-hashing.js +10 -13
- package/core/infrastructure/hardware-capability.js +17 -7
- package/core/infrastructure/index.js +8 -2
- package/core/infrastructure/language-patterns/maps.js +4 -1
- package/core/infrastructure/language-patterns/registry-core.js +56 -17
- package/core/infrastructure/language-patterns/registry-object-oriented.js +12 -5
- package/core/infrastructure/language-patterns.js +69 -0
- package/core/infrastructure/model-registry.js +20 -0
- package/core/infrastructure/native-inference.js +7 -12
- package/core/infrastructure/native-resolver.js +52 -37
- package/core/infrastructure/native-sparse-gram.js +261 -20
- package/core/infrastructure/native-tokenizer.js +6 -15
- package/core/infrastructure/simd-distance.js +10 -16
- package/core/infrastructure/sparse-gram-delta-reader.js +76 -0
- package/core/infrastructure/structural-alias-resolver.js +122 -0
- package/core/infrastructure/structural-candidate-ranker.js +34 -0
- package/core/infrastructure/structural-context-repository.js +472 -0
- package/core/infrastructure/structural-context-utils.js +51 -0
- package/core/infrastructure/structural-graph-signals.js +121 -0
- package/core/infrastructure/structural-qualified-resolution.js +15 -0
- package/core/infrastructure/structural-source-definitions.js +100 -0
- package/core/infrastructure/tombstone-bitmap-reader.js +139 -0
- package/core/infrastructure/tree-sitter-provider.js +811 -37
- package/core/prompt-optimization/data/p7-final/sweet-search-system-prompt.md +50 -0
- package/core/query/query-router.js +55 -5
- package/core/ranking/file-kind-ranking.js +2192 -15
- package/core/ranking/late-interaction-index.js +87 -12
- package/core/search/cli-decoration.js +290 -0
- package/core/search/context-expander.js +988 -78
- package/core/search/index.js +1 -0
- package/core/search/output-policy.js +275 -0
- package/core/search/search-anchor.js +499 -0
- package/core/search/search-boost.js +93 -1
- package/core/search/search-cli.js +61 -204
- package/core/search/search-hybrid.js +250 -10
- package/core/search/search-pattern-chunks.js +57 -8
- package/core/search/search-pattern-planner.js +68 -9
- package/core/search/search-pattern-prefilter.js +30 -10
- package/core/search/search-pattern-ripgrep.js +40 -4
- package/core/search/search-pattern-sparse-overlay.js +256 -0
- package/core/search/search-pattern.js +117 -29
- package/core/search/search-postprocess.js +479 -5
- package/core/search/search-read-semantic.js +260 -23
- package/core/search/search-read.js +82 -64
- package/core/search/search-reader-pin.js +71 -0
- package/core/search/search-rrf.js +279 -0
- package/core/search/search-semantic.js +110 -5
- package/core/search/search-server.js +130 -57
- package/core/search/search-trace.js +107 -0
- package/core/search/server-identity.js +93 -0
- package/core/search/session-daemon-prewarm.mjs +33 -10
- package/core/search/sweet-search.js +399 -7
- package/core/skills/sweet-index/SKILL.md +8 -6
- package/core/vector-store/binary-hnsw-index.js +194 -30
- package/core/vector-store/float-vector-store.js +96 -6
- package/core/vector-store/hnsw-index.js +220 -49
- package/eval/agent-read-workflows/bin/_ss-helpers.mjs +471 -0
- package/eval/agent-read-workflows/bin/ss-find +15 -0
- package/eval/agent-read-workflows/bin/ss-grep +12 -0
- package/eval/agent-read-workflows/bin/ss-read +14 -0
- package/eval/agent-read-workflows/bin/ss-search +18 -0
- package/eval/agent-read-workflows/bin/ss-semantic +12 -0
- package/eval/agent-read-workflows/bin/ss-trace +11 -0
- package/mcp/read-tool.js +109 -0
- package/mcp/server.js +55 -15
- package/mcp/tool-handlers.js +14 -124
- package/mcp/trace-tool.js +81 -0
- package/package.json +25 -10
- package/scripts/hooks/intercept-read.mjs +55 -0
- package/scripts/hooks/remind-tools.mjs +40 -0
- package/scripts/init.js +698 -54
- package/scripts/inject-agent-instructions.js +431 -0
- package/scripts/install-prompt-reminders.js +188 -0
- package/scripts/install-tool-enforcement.js +220 -0
- package/scripts/smoke-test.js +12 -9
- package/scripts/uninstall.js +276 -18
- package/scripts/write-claude-rules.js +110 -0
|
@@ -7,12 +7,14 @@ import { existsSync, openSync, fsyncSync, closeSync, writeFileSync, readFileSync
|
|
|
7
7
|
import path from 'path';
|
|
8
8
|
|
|
9
9
|
import { DB_PATHS, HNSW_CONFIG, BINARY_HNSW_CONFIG } from '../infrastructure/config/index.js';
|
|
10
|
+
import { chunkedIn } from '../infrastructure/db-utils.js';
|
|
10
11
|
import { HNSWIndex } from '../vector-store/hnsw-index.js';
|
|
11
12
|
import { LateInteractionIndex } from '../ranking/late-interaction-index.js';
|
|
12
13
|
import { truncateForHNSW, getEmbeddings, getModelInfo, fisherYatesShuffle } from '../embedding/embedding-service.js';
|
|
13
14
|
import { buildFromCodebaseDb as buildQuantizedArtifacts, shouldSkipArtifactRebuild, updateArtifactState, ARTIFACT_THRESHOLDS } from './artifact-builder.js';
|
|
14
15
|
import { log, logProgress } from './indexer-utils.js';
|
|
15
16
|
import { JAVA_FAMILY } from './ast-chunker.js';
|
|
17
|
+
import { isIndexAcceleratorAvailable } from './model-pool.js';
|
|
16
18
|
|
|
17
19
|
// =============================================================================
|
|
18
20
|
// DURABLE WRITE HELPERS (Phase E — fsync ordering for checkpoint safety)
|
|
@@ -60,6 +62,28 @@ export function pickLiInput(chunk) {
|
|
|
60
62
|
return chunk.li_greedy_text || chunk.embedding_text || chunk.li_text || chunk.text || chunk.content || '';
|
|
61
63
|
}
|
|
62
64
|
|
|
65
|
+
/**
 * Resolve the path to record for a chunk.
 *
 * Candidate fields are tried in a fixed priority order and the first one that
 * `firstSafeRelativePath` accepts wins.
 *
 * @param {object} [chunk] - Chunk (or alias) object; may be null/undefined.
 * @returns {string} The accepted path, or '' when no candidate is accepted.
 */
function chunkFilePath(chunk) {
  const meta = chunk?.metadata;
  // Priority order: metadata.relative_path, metadata.path, metadata.file_path,
  // chunk.file, metadata.file.
  const candidates = [
    meta?.relative_path,
    meta?.path,
    meta?.file_path,
    chunk?.file,
    meta?.file,
  ];
  const chosen = firstSafeRelativePath(...candidates);
  return chosen || '';
}
|
|
74
|
+
|
|
75
|
+
/**
 * Return the first candidate that is a safe, relative, forward-slash path.
 *
 * Each string candidate is normalized (backslashes become '/', runs of '/'
 * collapse to one, every leading './' is removed) and then rejected when it is:
 *   - empty or '.',
 *   - absolute ('/...' or a drive-letter path such as 'C:/...'),
 *   - or contains a '..' segment anywhere (leading, middle, or trailing).
 * Non-string candidates are ignored.
 *
 * @param {...*} candidates - Values to try, in priority order.
 * @returns {string|null} The normalized path of the first accepted candidate,
 *   or null when none is accepted.
 */
function firstSafeRelativePath(...candidates) {
  for (const candidate of candidates) {
    if (typeof candidate !== 'string') continue;
    // Collapse separators BEFORE stripping './' so that './/src/a.js' is not
    // mistaken for the absolute path '/src/a.js', and strip every leading './'
    // so '././src/a.js' is fully normalized.
    const normalized = candidate
      .replace(/\\/g, '/')
      .replace(/\/+/g, '/')
      .replace(/^(?:\.\/)+/, '');
    if (!normalized || normalized === '.' || normalized.startsWith('/')) continue;
    if (/^[A-Za-z]:\//.test(normalized)) continue;
    // Segment-wise check also catches a trailing '..' (e.g. 'src/..').
    if (normalized.split('/').includes('..')) continue;
    return normalized;
  }
  return null;
}
|
|
86
|
+
|
|
63
87
|
function fsyncFile(filePath) {
|
|
64
88
|
const fd = openSync(filePath, 'r');
|
|
65
89
|
try { fsyncSync(fd); } finally { closeSync(fd); }
|
|
@@ -108,13 +132,38 @@ function cleanupCheckpoint(indexPath) {
|
|
|
108
132
|
// list at the exemplar's rank position.
|
|
109
133
|
const ALIAS_FILTER_SQL = "json_extract(metadata, '$.exemplarId') IS NULL";
|
|
110
134
|
|
|
135
|
+
/**
 * Report whether the `vectors` table lists a column with the given name.
 *
 * Best-effort probe: any error thrown while preparing or running the PRAGMA
 * is reported as "column not present".
 *
 * @param {object} db - Database handle exposing `prepare(sql).all()`.
 * @param {string} column - Column name to look for.
 * @returns {boolean} True when `PRAGMA table_info(vectors)` returns a row
 *   whose `name` equals `column`.
 */
function hasVectorColumn(db, column) {
  let tableInfo;
  try {
    tableInfo = db.prepare('PRAGMA table_info(vectors)').all();
    return tableInfo.some((info) => info.name === column);
  } catch {
    // A failed probe is treated the same as a missing column.
    return false;
  }
}
|
|
142
|
+
|
|
143
|
+
/**
 * Build the SQL predicate that keeps only rows whose metadata has no
 * `$.exemplarId`.
 *
 * With no table alias the shared `ALIAS_FILTER_SQL` constant is returned
 * unchanged; otherwise the `metadata` column is qualified with `<alias>.`.
 *
 * NOTE: `alias` is interpolated verbatim into SQL text, so it must be a
 * trusted identifier chosen by the caller, never user input.
 *
 * @param {string} [alias=''] - Table alias used in the surrounding query.
 * @returns {string} SQL boolean expression suitable for a WHERE clause.
 */
function aliasFilterSql(alias = '') {
  if (!alias) return ALIAS_FILTER_SQL;
  // alias is known to be non-empty here, so no fallback prefix is needed.
  return `json_extract(${alias}.metadata, '$.exemplarId') IS NULL`;
}
|
|
148
|
+
|
|
149
|
+
/**
 * Build the SQL predicate that keeps only rows whose `epoch_retired` is NULL.
 *
 * When the `vectors` table has no `epoch_retired` column (as reported by
 * `hasVectorColumn`), the always-true predicate `1=1` is returned instead.
 *
 * @param {object} db - Database handle passed through to `hasVectorColumn`.
 * @param {string} [alias=''] - Optional table alias to qualify the column.
 * @returns {string} SQL boolean expression suitable for a WHERE clause.
 */
function liveVectorSql(db, alias = '') {
  if (hasVectorColumn(db, 'epoch_retired')) {
    const qualifier = alias ? `${alias}.` : '';
    return `${qualifier}epoch_retired IS NULL`;
  }
  return '1=1';
}
|
|
154
|
+
|
|
155
|
+
/**
 * Combine the exemplar-only filter and the live-row filter into one WHERE
 * expression.
 *
 * @param {object} db - Database handle passed through to `liveVectorSql`.
 * @param {string} [alias=''] - Optional table alias applied to both filters.
 * @returns {string} `<aliasFilterSql> AND <liveVectorSql>`.
 */
function vectorIndexWhere(db, alias = '') {
  const exemplarOnly = aliasFilterSql(alias);
  const liveOnly = liveVectorSql(db, alias);
  return [exemplarOnly, liveOnly].join(' AND ');
}
|
|
158
|
+
|
|
111
159
|
function* streamVectorsFromDb(db, _dim, order = 'sequential') {
|
|
160
|
+
const vectorWhere = vectorIndexWhere(db);
|
|
112
161
|
if (order !== 'sequential') {
|
|
113
162
|
db.exec('CREATE TEMP TABLE IF NOT EXISTS hnsw_order (pos INTEGER PRIMARY KEY, vector_rowid INTEGER)');
|
|
114
163
|
db.exec('DELETE FROM hnsw_order');
|
|
115
164
|
|
|
116
165
|
const rowidRows = db
|
|
117
|
-
.prepare(`SELECT rowid FROM vectors WHERE ${
|
|
166
|
+
.prepare(`SELECT rowid FROM vectors WHERE ${vectorWhere} ORDER BY rowid`)
|
|
118
167
|
.all();
|
|
119
168
|
let indices = rowidRows.map((r) => r.rowid);
|
|
120
169
|
|
|
@@ -122,7 +171,7 @@ function* streamVectorsFromDb(db, _dim, order = 'sequential') {
|
|
|
122
171
|
fisherYatesShuffle(indices);
|
|
123
172
|
} else if (order === 'diversity') {
|
|
124
173
|
const pathRows = db
|
|
125
|
-
.prepare(`SELECT rowid, file_path FROM vectors WHERE ${
|
|
174
|
+
.prepare(`SELECT rowid, file_path FROM vectors WHERE ${vectorWhere} ORDER BY rowid`)
|
|
126
175
|
.all();
|
|
127
176
|
const filePaths = pathRows.map((r) => r.file_path);
|
|
128
177
|
const permutationPositions = diversityFirstPermutationRowids(filePaths);
|
|
@@ -155,7 +204,7 @@ function* streamVectorsFromDb(db, _dim, order = 'sequential') {
|
|
|
155
204
|
db.exec('DROP TABLE IF EXISTS temp.hnsw_order');
|
|
156
205
|
} else {
|
|
157
206
|
const stmt = db.prepare(
|
|
158
|
-
`SELECT rowid, id, file_path, embedding, metadata FROM vectors WHERE ${
|
|
207
|
+
`SELECT rowid, id, file_path, embedding, metadata FROM vectors WHERE ${vectorWhere} ORDER BY rowid`,
|
|
159
208
|
);
|
|
160
209
|
for (const row of stmt.iterate()) {
|
|
161
210
|
yield {
|
|
@@ -192,12 +241,16 @@ function* streamVectorsFromDb(db, _dim, order = 'sequential') {
|
|
|
192
241
|
export function decideHybridDispatcher({
|
|
193
242
|
env = process.env,
|
|
194
243
|
parallelLateInteraction = false,
|
|
244
|
+
acceleratorAvailable = true,
|
|
195
245
|
} = {}) {
|
|
196
246
|
const hybridEnv = (env.SWEET_SEARCH_LI_HYBRID ?? '').trim().toLowerCase();
|
|
197
247
|
const hybridEnabled = hybridEnv === '1' || hybridEnv === 'true' || hybridEnv === 'on';
|
|
198
248
|
if (!hybridEnabled) {
|
|
199
249
|
return { armed: false, reason: 'not-enabled' };
|
|
200
250
|
}
|
|
251
|
+
if (!acceleratorAvailable) {
|
|
252
|
+
return { armed: false, reason: 'no-accelerator' };
|
|
253
|
+
}
|
|
201
254
|
// SWEET_SEARCH_LI_USE_CPU implies single-encoder CPU path — skip the
|
|
202
255
|
// bidirectional cursor (which would still try to use the GPU encoder).
|
|
203
256
|
if (env.SWEET_SEARCH_LI_USE_CPU === '1') {
|
|
@@ -395,14 +448,29 @@ export async function incrementalUpdateHNSW(dbPath, changedFiles, dryRun = false
|
|
|
395
448
|
const Database = (await import('better-sqlite3')).default;
|
|
396
449
|
const db = new Database(dbPath, { readonly: true });
|
|
397
450
|
|
|
398
|
-
const
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
451
|
+
const changedFileList = [...new Set(changedFiles || [])];
|
|
452
|
+
// Chunk the IN(?,?,...) clause to stay under SQLite's bound-parameter
|
|
453
|
+
// limit (default 32766, historic floor 999). Without chunking, a single
|
|
454
|
+
// indexing pass over >~32k changed files crashes with "too many SQL
|
|
455
|
+
// variables" — observed in production on CoSQA+ (51k docs) and BRIGHT
|
|
456
|
+
// (528k docs). See core/infrastructure/db-utils.js for the helper.
|
|
457
|
+
let rows = [];
|
|
458
|
+
if (changedFileList.length > 0) {
|
|
459
|
+
rows = chunkedIn(
|
|
460
|
+
db,
|
|
461
|
+
`SELECT rowid, id, file_path, embedding, metadata
|
|
462
|
+
FROM vectors
|
|
463
|
+
WHERE ${vectorIndexWhere(db)}
|
|
464
|
+
AND file_path IN (__IN_PLACEHOLDERS__)
|
|
465
|
+
ORDER BY rowid`,
|
|
466
|
+
changedFileList,
|
|
467
|
+
);
|
|
468
|
+
// Each batch is ORDER BY rowid internally, but batch boundaries break
|
|
469
|
+
// global monotonicity. The HNSW insertion loop below relies on rowid
|
|
470
|
+
// order for deterministic graph construction — re-sort explicitly.
|
|
471
|
+
rows.sort((a, b) => a.rowid - b.rowid);
|
|
472
|
+
}
|
|
473
|
+
const totalNew = rows.length;
|
|
406
474
|
|
|
407
475
|
log(`Adding ${totalNew} new entries...`, 'yellow');
|
|
408
476
|
let added = 0;
|
|
@@ -455,7 +523,7 @@ export async function buildHNSWIndex(dbPath, dryRun = false) {
|
|
|
455
523
|
const db = new Database(dbPath, orderMode === 'sequential' ? { readonly: true } : {});
|
|
456
524
|
|
|
457
525
|
const totalVectors = db
|
|
458
|
-
.prepare(`SELECT COUNT(*) as c FROM vectors WHERE ${
|
|
526
|
+
.prepare(`SELECT COUNT(*) as c FROM vectors WHERE ${vectorIndexWhere(db)}`)
|
|
459
527
|
.get().c;
|
|
460
528
|
if (totalVectors === 0) {
|
|
461
529
|
db.close();
|
|
@@ -499,7 +567,10 @@ export async function buildHNSWIndex(dbPath, dryRun = false) {
|
|
|
499
567
|
// vectors already in the checkpoint. Without this, add() reuses keys
|
|
500
568
|
// from 0 and the final .meta.json would be incomplete.
|
|
501
569
|
const metaStmt = db.prepare(
|
|
502
|
-
|
|
570
|
+
`SELECT id, file_path, metadata
|
|
571
|
+
FROM vectors
|
|
572
|
+
WHERE rowid <= ? AND ${vectorIndexWhere(db)}
|
|
573
|
+
ORDER BY rowid`
|
|
503
574
|
);
|
|
504
575
|
let restoredKey = 0;
|
|
505
576
|
for (const row of metaStmt.iterate(resumeFromRowId)) {
|
|
@@ -592,6 +663,7 @@ export async function buildHNSWIndex(dbPath, dryRun = false) {
|
|
|
592
663
|
}
|
|
593
664
|
|
|
594
665
|
await index.save();
|
|
666
|
+
await index.clearStaleBitmap();
|
|
595
667
|
buildCompleted = true;
|
|
596
668
|
|
|
597
669
|
// Clean up checkpoint files after successful completion
|
|
@@ -830,6 +902,7 @@ export async function buildLateInteractionIndex(chunks, dryRun = false, filesToR
|
|
|
830
902
|
const hybridDecision = decideHybridDispatcher({
|
|
831
903
|
env: process.env,
|
|
832
904
|
parallelLateInteraction: EMBEDDING_CONFIG.parallelLateInteraction === true,
|
|
905
|
+
acceleratorAvailable: isIndexAcceleratorAvailable(),
|
|
833
906
|
});
|
|
834
907
|
if (!hybridDecision.armed && hybridDecision.reason === 'metal-contended-by-embed') {
|
|
835
908
|
log(
|
|
@@ -837,6 +910,11 @@ export async function buildLateInteractionIndex(chunks, dryRun = false, filesToR
|
|
|
837
910
|
+ 'OR SWEET_SEARCH_EMBED_USE_CPU=1 (Metal queue is shared with parallel embed phase)',
|
|
838
911
|
'yellow'
|
|
839
912
|
);
|
|
913
|
+
} else if (!hybridDecision.armed && hybridDecision.reason === 'no-accelerator') {
|
|
914
|
+
log(
|
|
915
|
+
'LateInteraction hybrid: ignored — no inference accelerator detected; using ORT CPU',
|
|
916
|
+
'yellow'
|
|
917
|
+
);
|
|
840
918
|
}
|
|
841
919
|
const hybridDisabled = !hybridDecision.armed;
|
|
842
920
|
|
|
@@ -913,7 +991,7 @@ export async function buildLateInteractionIndex(chunks, dryRun = false, filesToR
|
|
|
913
991
|
const tokens = tokenArrays[j];
|
|
914
992
|
if (tokens && tokens.length > 0) {
|
|
915
993
|
await liIndex.add(chunk.id, tokens, {
|
|
916
|
-
file: chunk
|
|
994
|
+
file: chunkFilePath(chunk),
|
|
917
995
|
name: chunk.metadata?.symbol,
|
|
918
996
|
type: chunk.metadata?.chunk_type,
|
|
919
997
|
startLine: chunk.metadata?.line_start || null,
|
|
@@ -1018,7 +1096,7 @@ export async function buildLateInteractionIndex(chunks, dryRun = false, filesToR
|
|
|
1018
1096
|
continue;
|
|
1019
1097
|
}
|
|
1020
1098
|
liIndex.addAlias(alias.id, exemplarId, clusterId, {
|
|
1021
|
-
file: alias
|
|
1099
|
+
file: chunkFilePath(alias),
|
|
1022
1100
|
name: alias.metadata?.symbol,
|
|
1023
1101
|
type: alias.metadata?.chunk_type,
|
|
1024
1102
|
startLine: alias.metadata?.line_start || null,
|
|
@@ -1042,6 +1120,12 @@ export async function buildLateInteractionIndex(chunks, dryRun = false, filesToR
|
|
|
1042
1120
|
return { ...liStats, added: totalAdded, removed, saveToPath };
|
|
1043
1121
|
}
|
|
1044
1122
|
|
|
1123
|
+
// Module-private helpers re-exported under one object (presumably so unit
// tests can reach them without widening the public API — confirm with callers).
export const __TEST__ = { chunkFilePath, vectorIndexWhere, liveVectorSql };
|
|
1128
|
+
|
|
1045
1129
|
// =============================================================================
|
|
1046
1130
|
// PHASE 5: BINARY HNSW + INT8 QUANTIZED ARTIFACTS
|
|
1047
1131
|
// =============================================================================
|
|
@@ -11,8 +11,12 @@ import path from 'path';
|
|
|
11
11
|
import { DB_PATHS, EMBEDDING_CONFIG, PROJECT_ROOT } from '../infrastructure/config/index.js';
|
|
12
12
|
import { GraphExtractor, createGraphSchema, insertGraph } from '../graph/graph-extractor.js';
|
|
13
13
|
import { resolveRelationshipTargets } from '../graph/relationship-resolver.js';
|
|
14
|
+
import { populatePageRankColumn } from '../graph/structural-pagerank.js';
|
|
14
15
|
import { getEmbeddings, getModelInfo } from '../embedding/embedding-service.js';
|
|
15
16
|
import { configureJournalMode, checkpointWal, atomicSwapDatabase, log, logProgress } from './indexer-utils.js';
|
|
17
|
+
import { assignStructuralIds } from '../incremental-indexing/domain/chunk-identity.mjs';
|
|
18
|
+
import { chunkInputHashes } from '../incremental-indexing/domain/encoder-input.mjs';
|
|
19
|
+
import { migrateVectorsSchema } from '../incremental-indexing/infrastructure/schema-migrations.mjs';
|
|
16
20
|
|
|
17
21
|
// =============================================================================
|
|
18
22
|
// CHUNK ENRICHMENT — scope chains + imports from code-graph.db
|
|
@@ -61,7 +65,7 @@ async function enrichChunksFromGraph(chunks, ASTChunker) {
|
|
|
61
65
|
let enriched = 0;
|
|
62
66
|
|
|
63
67
|
for (const chunk of chunks) {
|
|
64
|
-
const filePath = chunk
|
|
68
|
+
const filePath = chunkFilePath(chunk);
|
|
65
69
|
if (!filePath) continue;
|
|
66
70
|
|
|
67
71
|
// Only enrich chunks with a known symbol (skip generic 'unknown' text chunks)
|
|
@@ -187,6 +191,14 @@ export async function buildCodeGraph(files, dryRun = false) {
|
|
|
187
191
|
log('Resolving relationship targets...', 'yellow');
|
|
188
192
|
const resolutionStats = resolveRelationshipTargets(db);
|
|
189
193
|
|
|
194
|
+
log('Computing entity PageRank for structural ranking...', 'yellow');
|
|
195
|
+
try {
|
|
196
|
+
const prStats = populatePageRankColumn(db);
|
|
197
|
+
log(`✓ PageRank populated: ${prStats.written}/${prStats.entities} entities in ${prStats.ms}ms`, 'green');
|
|
198
|
+
} catch (err) {
|
|
199
|
+
log(`⚠ PageRank population failed (non-fatal): ${err.message}`, 'yellow');
|
|
200
|
+
}
|
|
201
|
+
|
|
190
202
|
// Update query planner statistics before closing (SQLite 3.46+).
|
|
191
203
|
// Best-effort only; failure should not strand the temp DB handle.
|
|
192
204
|
closeWithOptimize(db, 'code graph build');
|
|
@@ -224,6 +236,7 @@ export function createVectorSchema(db) {
|
|
|
224
236
|
`);
|
|
225
237
|
db.exec('CREATE INDEX IF NOT EXISTS idx_vectors_session ON vectors(session_id)');
|
|
226
238
|
db.exec('CREATE INDEX IF NOT EXISTS idx_vectors_file_path ON vectors(file_path)');
|
|
239
|
+
migrateVectorsSchema(db);
|
|
227
240
|
}
|
|
228
241
|
|
|
229
242
|
export function ensureVectorSchema(db) {
|
|
@@ -254,25 +267,30 @@ export function ensureVectorSchema(db) {
|
|
|
254
267
|
db.exec('CREATE INDEX IF NOT EXISTS idx_vectors_file_path ON vectors(file_path)');
|
|
255
268
|
log(' Schema migration complete', 'dim');
|
|
256
269
|
}
|
|
270
|
+
migrateVectorsSchema(db);
|
|
257
271
|
}
|
|
258
272
|
|
|
259
|
-
export function buildInsertItems(chunks, embeddings, modelInfo) {
|
|
273
|
+
export function buildInsertItems(chunks, embeddings, modelInfo, annotations = null, options = {}) {
|
|
260
274
|
const items = [];
|
|
275
|
+
const chunkAnnotations = annotations || annotateChunksForVectorInsert(chunks);
|
|
276
|
+
const epochWritten = Number.isInteger(options.epochWritten) ? options.epochWritten : 0;
|
|
261
277
|
for (let i = 0; i < chunks.length; i++) {
|
|
262
278
|
const chunk = chunks[i];
|
|
263
279
|
const embedding = embeddings[i];
|
|
264
280
|
|
|
265
281
|
if (!embedding || embedding.length === 0) continue;
|
|
282
|
+
const ann = chunkAnnotations[i];
|
|
283
|
+
const filePath = chunkFilePath(chunk);
|
|
266
284
|
|
|
267
285
|
items.push({
|
|
268
286
|
id: chunk.id,
|
|
269
|
-
filePath
|
|
287
|
+
filePath,
|
|
270
288
|
embeddingBlob: embedding instanceof Float32Array
|
|
271
289
|
? Buffer.from(embedding.buffer, embedding.byteOffset, embedding.byteLength)
|
|
272
290
|
: Buffer.from(new Float32Array(embedding).buffer),
|
|
273
291
|
text: (chunk.text || chunk.content || '').slice(0, 2000),
|
|
274
292
|
metadata: JSON.stringify({
|
|
275
|
-
file:
|
|
293
|
+
file: filePath,
|
|
276
294
|
type: chunk.metadata?.chunk_type || 'code',
|
|
277
295
|
name: chunk.metadata?.symbol || null,
|
|
278
296
|
startLine: chunk.metadata?.line_start || null,
|
|
@@ -289,11 +307,117 @@ export function buildInsertItems(chunks, embeddings, modelInfo) {
|
|
|
289
307
|
sessionId: `codebase-v22-${modelInfo.provider}`,
|
|
290
308
|
tags: JSON.stringify(['codebase', chunk.metadata?.language || 'unknown']),
|
|
291
309
|
createdAt: new Date().toISOString(),
|
|
310
|
+
chunkStructId: ann?.chunkStructId || '',
|
|
311
|
+
chunkTextHash: ann?.hashes?.chunk_text_hash || '',
|
|
312
|
+
embeddingInputHash: ann?.hashes?.embedding_input_hash || '',
|
|
313
|
+
liInputHash: ann?.hashes?.li_input_hash || '',
|
|
314
|
+
metadataFingerprint: ann?.hashes?.metadata_fingerprint || '',
|
|
315
|
+
logicalChunkId: ann?.chunkStructId || chunk.id,
|
|
316
|
+
epochWritten,
|
|
317
|
+
epochRetired: null,
|
|
292
318
|
});
|
|
293
319
|
}
|
|
294
320
|
return items;
|
|
295
321
|
}
|
|
296
322
|
|
|
323
|
+
/**
 * Resolve the path to record for a chunk.
 *
 * Candidate fields are tried in a fixed priority order and the first one that
 * `firstSafeRelativePath` accepts wins.
 *
 * @param {object} [chunk] - Chunk (or alias) object; may be null/undefined.
 * @returns {string} The accepted path, or '' when no candidate is accepted.
 */
function chunkFilePath(chunk) {
  const meta = chunk?.metadata;
  // Priority order: metadata.relative_path, metadata.path, metadata.file_path,
  // chunk.file, metadata.file.
  const candidates = [
    meta?.relative_path,
    meta?.path,
    meta?.file_path,
    chunk?.file,
    meta?.file,
  ];
  const chosen = firstSafeRelativePath(...candidates);
  return chosen || '';
}
|
|
332
|
+
|
|
333
|
+
/**
 * Return the first candidate that is a safe, relative, forward-slash path.
 *
 * Each string candidate is normalized (backslashes become '/', runs of '/'
 * collapse to one, every leading './' is removed) and then rejected when it is:
 *   - empty or '.',
 *   - absolute ('/...' or a drive-letter path such as 'C:/...'),
 *   - or contains a '..' segment anywhere (leading, middle, or trailing).
 * Non-string candidates are ignored.
 *
 * @param {...*} candidates - Values to try, in priority order.
 * @returns {string|null} The normalized path of the first accepted candidate,
 *   or null when none is accepted.
 */
function firstSafeRelativePath(...candidates) {
  for (const candidate of candidates) {
    if (typeof candidate !== 'string') continue;
    // Collapse separators BEFORE stripping './' so that './/src/a.js' is not
    // mistaken for the absolute path '/src/a.js', and strip every leading './'
    // so '././src/a.js' is fully normalized.
    const normalized = candidate
      .replace(/\\/g, '/')
      .replace(/\/+/g, '/')
      .replace(/^(?:\.\/)+/, '');
    if (!normalized || normalized === '.' || normalized.startsWith('/')) continue;
    if (/^[A-Za-z]:\//.test(normalized)) continue;
    // Segment-wise check also catches a trailing '..' (e.g. 'src/..').
    if (normalized.split('/').includes('..')) continue;
    return normalized;
  }
  return null;
}
|
|
344
|
+
|
|
345
|
+
/**
 * Compute, for every chunk, the annotation attached to its vector row.
 *
 * Chunks are grouped by `chunkFilePath`; each group is passed to
 * `assignStructuralIds` together with its file path, and the per-chunk result
 * is merged with the chunk's `chunkInputHashes` under the `hashes` key.
 *
 * @param {Array<object>} chunks - Chunks in insertion order.
 * @returns {Array<object>} Array parallel to `chunks`; entry i is
 *   `{ ...structuralIdResult, hashes }` for `chunks[i]`.
 */
function annotateChunksForVectorInsert(chunks) {
  const annotations = new Array(chunks.length);

  // Bucket chunk positions per file, keeping input order inside each bucket.
  const positionsByFile = new Map();
  for (let position = 0; position < chunks.length; position += 1) {
    const filePath = chunkFilePath(chunks[position]);
    const bucket = positionsByFile.get(filePath);
    if (bucket) {
      bucket.push(position);
    } else {
      positionsByFile.set(filePath, [position]);
    }
  }

  for (const [filePath, positions] of positionsByFile) {
    const groupChunks = positions.map((position) => chunks[position]);
    const structuralIds = assignStructuralIds(groupChunks, filePath);
    // structuralIds is indexed by position within the group, not within `chunks`.
    positions.forEach((chunkPosition, groupPosition) => {
      annotations[chunkPosition] = {
        ...structuralIds[groupPosition],
        hashes: chunkInputHashes(chunks[chunkPosition]),
      };
    });
  }

  return annotations;
}
|
|
366
|
+
|
|
367
|
+
/**
 * List the insertable `vectors` columns that exist in this database.
 *
 * The result follows a fixed canonical order; any column missing from
 * `PRAGMA table_info(vectors)` is dropped.
 *
 * @param {object} db - Database handle exposing `prepare(sql).all()`.
 * @returns {string[]} Existing column names in canonical insert order.
 */
function vectorInsertColumns(db) {
  const present = new Set();
  for (const info of db.prepare('PRAGMA table_info(vectors)').all()) {
    present.add(info.name);
  }

  const canonicalOrder = [
    // Base columns.
    'id',
    'file_path',
    'embedding',
    'text',
    'metadata',
    'session_id',
    'tags',
    'created_at',
    // Identity / hash / epoch columns.
    'chunk_struct_id',
    'chunk_text_hash',
    'embedding_input_hash',
    'li_input_hash',
    'metadata_fingerprint',
    'logical_chunk_id',
    'epoch_written',
    'epoch_retired',
  ];
  return canonicalOrder.filter((name) => present.has(name));
}
|
|
388
|
+
|
|
389
|
+
/**
 * Map a `vectors` column name to the value to bind for an insert item.
 *
 * Hash and identity columns default to '' when the item lacks them,
 * `epoch_written` defaults to 0 and `epoch_retired` to null. Unknown column
 * names fall back to `item[column]`.
 *
 * @param {object} item - Insert item (camelCase fields such as `filePath`).
 * @param {string} column - snake_case column name.
 * @returns {*} The value to bind for that column.
 */
function vectorInsertValue(item, column) {
  switch (column) {
    case 'id': return item.id;
    case 'file_path': return item.filePath;
    case 'embedding': return item.embeddingBlob;
    case 'text': return item.text;
    case 'metadata': return item.metadata;
    case 'session_id': return item.sessionId;
    case 'tags': return item.tags;
    case 'created_at': return item.createdAt;
    case 'chunk_struct_id': return item.chunkStructId ?? '';
    case 'chunk_text_hash': return item.chunkTextHash ?? '';
    case 'embedding_input_hash': return item.embeddingInputHash ?? '';
    case 'li_input_hash': return item.liInputHash ?? '';
    case 'metadata_fingerprint': return item.metadataFingerprint ?? '';
    // Use || rather than ??: an empty-string structural id means "none" and
    // must fall through to the row id instead of being stored as ''.
    case 'logical_chunk_id': return item.logicalChunkId || item.chunkStructId || item.id;
    case 'epoch_written': return item.epochWritten ?? 0;
    case 'epoch_retired': return item.epochRetired ?? null;
    default: return item[column];
  }
}
|
|
410
|
+
|
|
411
|
+
/**
 * Prepare an `INSERT OR REPLACE INTO vectors` statement covering exactly the
 * columns reported by `vectorInsertColumns`.
 *
 * @param {object} db - Database handle exposing `prepare(sql)`.
 * @returns {{columns: string[], stmt: object}} The column order to bind
 *   values in, and the prepared statement.
 */
function prepareVectorInsert(db) {
  const columns = vectorInsertColumns(db);
  // Column names are double-quoted; one positional placeholder per column.
  const columnList = columns.map((name) => `"${name}"`).join(', ');
  const valueList = new Array(columns.length).fill('?').join(', ');
  const stmt = db.prepare(`INSERT OR REPLACE INTO vectors (${columnList}) VALUES (${valueList})`);
  return { columns, stmt };
}
|
|
420
|
+
|
|
297
421
|
/**
|
|
298
422
|
* Insert alias rows that reuse their exemplar's embedding instead of running
|
|
299
423
|
* the embedding model. The exemplar must already be in the `vectors` table;
|
|
@@ -307,23 +431,11 @@ export function insertAliasVectors(db, aliases, modelInfo) {
|
|
|
307
431
|
'SELECT embedding, metadata FROM vectors WHERE id = ?'
|
|
308
432
|
);
|
|
309
433
|
|
|
310
|
-
const stmt = db
|
|
311
|
-
INSERT OR REPLACE INTO vectors (id, file_path, embedding, text, metadata, session_id, tags, created_at)
|
|
312
|
-
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
|
|
313
|
-
`);
|
|
434
|
+
const { stmt, columns } = prepareVectorInsert(db);
|
|
314
435
|
|
|
315
436
|
const insertBatch = db.transaction((items) => {
|
|
316
437
|
for (const item of items) {
|
|
317
|
-
stmt.run(
|
|
318
|
-
item.id,
|
|
319
|
-
item.filePath,
|
|
320
|
-
item.embeddingBlob,
|
|
321
|
-
item.text,
|
|
322
|
-
item.metadata,
|
|
323
|
-
item.sessionId,
|
|
324
|
-
item.tags,
|
|
325
|
-
item.createdAt,
|
|
326
|
-
);
|
|
438
|
+
stmt.run(...columns.map((column) => vectorInsertValue(item, column)));
|
|
327
439
|
}
|
|
328
440
|
});
|
|
329
441
|
|
|
@@ -344,11 +456,13 @@ export function insertAliasVectors(db, aliases, modelInfo) {
|
|
|
344
456
|
}
|
|
345
457
|
|
|
346
458
|
const items = [];
|
|
459
|
+
const annotations = annotateChunksForVectorInsert(aliases);
|
|
347
460
|
const nowIso = new Date().toISOString();
|
|
348
461
|
let missing = 0;
|
|
349
462
|
let dimension = null;
|
|
350
463
|
|
|
351
|
-
for (
|
|
464
|
+
for (let i = 0; i < aliases.length; i++) {
|
|
465
|
+
const alias = aliases[i];
|
|
352
466
|
const exemplarId = alias.metadata?.exemplarId;
|
|
353
467
|
if (!exemplarId) continue;
|
|
354
468
|
const row = fetchExemplar.get(exemplarId);
|
|
@@ -359,14 +473,16 @@ export function insertAliasVectors(db, aliases, modelInfo) {
|
|
|
359
473
|
if (dimension === null) {
|
|
360
474
|
dimension = Math.floor(row.embedding.length / 4);
|
|
361
475
|
}
|
|
476
|
+
const ann = annotations[i];
|
|
477
|
+
const filePath = chunkFilePath(alias);
|
|
362
478
|
|
|
363
479
|
items.push({
|
|
364
480
|
id: alias.id,
|
|
365
|
-
filePath
|
|
481
|
+
filePath,
|
|
366
482
|
embeddingBlob: row.embedding, // copy exemplar's Float32 BLOB verbatim
|
|
367
483
|
text: (alias.text || alias.content || '').slice(0, 2000),
|
|
368
484
|
metadata: JSON.stringify({
|
|
369
|
-
file:
|
|
485
|
+
file: filePath,
|
|
370
486
|
type: alias.metadata?.chunk_type || 'code',
|
|
371
487
|
name: alias.metadata?.symbol || null,
|
|
372
488
|
startLine: alias.metadata?.line_start || null,
|
|
@@ -382,6 +498,14 @@ export function insertAliasVectors(db, aliases, modelInfo) {
|
|
|
382
498
|
sessionId: `codebase-v22-${modelInfo.provider}`,
|
|
383
499
|
tags: JSON.stringify(['codebase', alias.metadata?.language || 'unknown']),
|
|
384
500
|
createdAt: nowIso,
|
|
501
|
+
chunkStructId: ann?.chunkStructId || '',
|
|
502
|
+
chunkTextHash: ann?.hashes?.chunk_text_hash || '',
|
|
503
|
+
embeddingInputHash: ann?.hashes?.embedding_input_hash || '',
|
|
504
|
+
liInputHash: ann?.hashes?.li_input_hash || '',
|
|
505
|
+
metadataFingerprint: ann?.hashes?.metadata_fingerprint || '',
|
|
506
|
+
logicalChunkId: ann?.chunkStructId || alias.id,
|
|
507
|
+
epochWritten: 0,
|
|
508
|
+
epochRetired: null,
|
|
385
509
|
});
|
|
386
510
|
}
|
|
387
511
|
|
|
@@ -397,48 +521,36 @@ export function insertAliasVectors(db, aliases, modelInfo) {
|
|
|
397
521
|
return items.length;
|
|
398
522
|
}
|
|
399
523
|
|
|
400
|
-
export function
|
|
524
|
+
/**
 * Upsert already-built vector rows into the `vectors` table.
 *
 * Rows are written in slices of at most 2000, each slice inside its own
 * `db.transaction(...)` call. Values are bound in the column order reported
 * by `prepareVectorInsert`, with each value resolved by `vectorInsertValue`.
 *
 * @param {object} db - Database handle exposing `prepare` and `transaction`.
 * @param {object[]} items - Insert items to write.
 */
export function insertVectorItems(db, items) {
  const BATCH_INSERT_SIZE = 2000;
  const { stmt, columns } = prepareVectorInsert(db);

  const writeBatch = db.transaction((batch) => {
    for (const row of batch) {
      const params = columns.map((column) => vectorInsertValue(row, column));
      stmt.run(...params);
    }
  });

  let offset = 0;
  while (offset < items.length) {
    writeBatch(items.slice(offset, offset + BATCH_INSERT_SIZE));
    offset += BATCH_INSERT_SIZE;
  }
}
|
|
429
539
|
|
|
540
|
+
/**
 * Build insert items from chunks and their embeddings, then write them.
 *
 * All arguments after `db` are forwarded unchanged to `buildInsertItems`;
 * the resulting items are handed to `insertVectorItems`.
 *
 * @param {object} db - Database handle.
 * @param {object[]} chunks - Chunks to store.
 * @param {Array} embeddings - Embeddings passed alongside `chunks`.
 * @param {object} modelInfo - Embedding model descriptor.
 * @param {Array|null} [annotations=null] - Optional per-chunk annotations.
 * @param {object} [options={}] - Extra options for `buildInsertItems`.
 */
export function insertVectors(db, chunks, embeddings, modelInfo, annotations = null, options = {}) {
  const rows = buildInsertItems(chunks, embeddings, modelInfo, annotations, options);
  insertVectorItems(db, rows);
}
|
|
543
|
+
|
|
430
544
|
export async function pipelinedEmbedAndInsert(db, allChunks, texts, batchSize, modelInfo, logProgressFn, embeddingOptions = {}, logFn, writeFlushRows = 128) {
|
|
431
545
|
let writeBuffer = [];
|
|
432
546
|
let embeddingCount = 0;
|
|
547
|
+
const allAnnotations = annotateChunksForVectorInsert(allChunks);
|
|
433
548
|
|
|
434
|
-
const stmt = db
|
|
435
|
-
INSERT OR REPLACE INTO vectors (id, file_path, embedding, text, metadata, session_id, tags, created_at)
|
|
436
|
-
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
|
|
437
|
-
`);
|
|
549
|
+
const { stmt, columns } = prepareVectorInsert(db);
|
|
438
550
|
|
|
439
551
|
const insertBatch = db.transaction((items) => {
|
|
440
552
|
for (const item of items) {
|
|
441
|
-
stmt.run(
|
|
553
|
+
stmt.run(...columns.map((column) => vectorInsertValue(item, column)));
|
|
442
554
|
}
|
|
443
555
|
});
|
|
444
556
|
|
|
@@ -458,6 +570,7 @@ export async function pipelinedEmbedAndInsert(db, allChunks, texts, batchSize, m
|
|
|
458
570
|
for (let i = 0; i < texts.length; i += batchSize) {
|
|
459
571
|
const batch = texts.slice(i, i + batchSize);
|
|
460
572
|
const batchChunks = allChunks.slice(i, i + batchSize);
|
|
573
|
+
const batchAnnotations = allAnnotations.slice(i, i + batchSize);
|
|
461
574
|
|
|
462
575
|
// Overlap: flush accumulated writes while embedding is in-flight
|
|
463
576
|
const batchResultsPromise = getEmbeddings(batch, progressOptions);
|
|
@@ -470,7 +583,7 @@ export async function pipelinedEmbedAndInsert(db, allChunks, texts, batchSize, m
|
|
|
470
583
|
const batchEmbeddings = batchResults.map(r => r.embedding);
|
|
471
584
|
embeddingCount += batchEmbeddings.length;
|
|
472
585
|
|
|
473
|
-
const batchItems = buildInsertItems(batchChunks, batchEmbeddings, modelInfo);
|
|
586
|
+
const batchItems = buildInsertItems(batchChunks, batchEmbeddings, modelInfo, batchAnnotations);
|
|
474
587
|
writeBuffer.push(...batchItems);
|
|
475
588
|
|
|
476
589
|
if (!useInternalProgress) {
|
|
@@ -549,7 +662,7 @@ export async function chunkFiles(files) {
|
|
|
549
662
|
if (chunk.embedding_text) {
|
|
550
663
|
return chunk.embedding_text.slice(0, _embCap);
|
|
551
664
|
}
|
|
552
|
-
return `${chunk
|
|
665
|
+
return `${chunkFilePath(chunk)} ${chunk.metadata?.symbol || ''}\n${(chunk.text || chunk.content || '').slice(0, 1500)}`;
|
|
553
666
|
});
|
|
554
667
|
|
|
555
668
|
return { allChunks, texts };
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Establish a valid *empty* index baseline.
|
|
3
|
+
*
|
|
4
|
+
* A full or incremental index run over a repository with no indexable files
|
|
5
|
+
* used to early-exit without creating anything, leaving search to throw
|
|
6
|
+
* "No search indexes found" and giving the default-on reconcile maintainer no
|
|
7
|
+
* baseline to grow from. This helper instead writes a coherent zero-row
|
|
8
|
+
* baseline:
|
|
9
|
+
*
|
|
10
|
+
* - codebase.db vector schema, 0 rows
|
|
11
|
+
* - code-graph.db graph schema, 0 rows
|
|
12
|
+
* - merkle-state.json 0 files — so the maintainer's dirty-scan treats
|
|
13
|
+
* the first created file as new
|
|
14
|
+
* - reconcile-manifest.json — so readers pin a real epoch
|
|
15
|
+
*
|
|
16
|
+
* With the baseline in place, search returns empty results cleanly (the
|
|
17
|
+
* graph+codebase existence check in SweetSearch.init passes; the tables are
|
|
18
|
+
* simply empty) and the reconcile maintainer can transition the repo from zero
|
|
19
|
+
* files to one file without a prior full index.
|
|
20
|
+
*
|
|
21
|
+
* The schema builders are the same ones the production reconciler uses when it
|
|
22
|
+
* lazily creates these DBs (createVectorSchema / createGraphSchema), so a
|
|
23
|
+
* baseline written here is byte-for-byte compatible with later incremental
|
|
24
|
+
* deltas (epoch columns, FTS5, indexes).
|
|
25
|
+
*/
|
|
26
|
+
|
|
27
|
+
import Database from 'better-sqlite3';
|
|
28
|
+
import { existsSync, mkdirSync, rmSync } from 'node:fs';
|
|
29
|
+
import path from 'node:path';
|
|
30
|
+
|
|
31
|
+
import { DB_PATHS } from '../infrastructure/config/index.js';
|
|
32
|
+
import { createVectorSchema } from './indexer-build.js';
|
|
33
|
+
import { createGraphSchema } from '../graph/graph-extractor.js';
|
|
34
|
+
import { publishIndexerManifest } from './indexer-manifest.js';
|
|
35
|
+
import { updateState } from './incremental-tracker.js';
|
|
36
|
+
import { log } from './indexer-utils.js';
|
|
37
|
+
|
|
38
|
+
/**
 * Create `dbPath` with `createSchema` only when it does not already exist.
 *
 * If `createSchema` throws, the freshly created file is removed before the
 * error propagates. Without that cleanup a schema-less file would stay on
 * disk and every later call would see it via `existsSync` and report the
 * database as already present, so the schema would never be created.
 *
 * @param {string} dbPath - Filesystem path of the SQLite database.
 * @param {(db: object) => void} createSchema - Callback that creates the
 *   schema on the open database handle.
 * @returns {boolean} true when a fresh DB was created, false when one was
 *   already present.
 * @throws Whatever `createSchema` (or opening the database) throws.
 */
function ensureSchema(dbPath, createSchema) {
  if (existsSync(dbPath)) return false;

  mkdirSync(path.dirname(dbPath), { recursive: true });
  const db = new Database(dbPath);
  let schemaCreated = false;
  try {
    createSchema(db);
    schemaCreated = true;
  } finally {
    db.close();
    if (!schemaCreated) {
      // Do not leave a partial database behind: the existence check above
      // would otherwise treat it as a valid, already-initialised DB.
      rmSync(dbPath, { force: true });
    }
  }
  return true;
}
|
|
53
|
+
|
|
54
|
+
/**
 * Write the zero-file index baseline for a repository that has never been
 * indexed.
 *
 * Does nothing when `merkle-state.json` is already on disk. That file means
 * an earlier index ran, so an empty working tree indicates every tracked file
 * was deleted rather than a fresh repo. The existing merkle state has to
 * survive so the maintainer's deletion detection (dirty-scan: merkle-known vs
 * on-disk) can retire the stale rows; replacing it with an empty file set
 * would discard that knowledge and leave those rows in codebase.db /
 * code-graph.db indefinitely.
 *
 * @returns {Promise<{createdCodebase:boolean, createdGraph:boolean, skipped?:boolean}>}
 */
export async function establishEmptyBaseline() {
  const hasPriorIndex = existsSync(DB_PATHS.merkle);
  if (hasPriorIndex) {
    return { createdCodebase: false, createdGraph: false, skipped: true };
  }

  const result = {
    createdCodebase: ensureSchema(DB_PATHS.codebase, createVectorSchema),
    createdGraph: ensureSchema(DB_PATHS.codeGraph, createGraphSchema),
  };

  const zeroStats = { totalChunks: 0, entities: 0, relationships: 0 };
  await updateState({}, zeroStats);
  publishIndexerManifest({});

  const statusOf = (created) => (created ? 'created' : 'present');
  const codebaseStatus = statusOf(result.createdCodebase);
  const graphStatus = statusOf(result.createdGraph);
  log(
    `Established empty index baseline (0 files; codebase.db ${codebaseStatus}, code-graph.db ${graphStatus})`,
    'green',
  );

  return result;
}
|