sweet-search 2.5.2 → 2.5.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/core/cli.js +24 -3
- package/core/graph/graph-expansion.js +215 -36
- package/core/graph/graph-extractor.js +196 -11
- package/core/graph/graph-search.js +395 -92
- package/core/graph/hcgs-generator.js +2 -1
- package/core/graph/index.js +2 -0
- package/core/graph/repo-map.js +28 -6
- package/core/graph/structural-answer-cues.js +168 -0
- package/core/graph/structural-callsite-hints.js +40 -0
- package/core/graph/structural-context-format.js +40 -0
- package/core/graph/structural-context.js +450 -0
- package/core/graph/structural-forward-push.js +156 -0
- package/core/graph/structural-header-context.js +19 -0
- package/core/graph/structural-importance.js +148 -0
- package/core/graph/structural-pagerank.js +197 -0
- package/core/graph/summary-manager.js +13 -9
- package/core/incremental-indexing/application/dirty-scan.mjs +236 -0
- package/core/incremental-indexing/application/file-watcher.mjs +197 -0
- package/core/incremental-indexing/application/maintenance-handlers.mjs +519 -0
- package/core/incremental-indexing/application/maintenance-worker.mjs +380 -0
- package/core/incremental-indexing/application/operator-cli.mjs +554 -0
- package/core/incremental-indexing/application/production-li-delta.mjs +192 -0
- package/core/incremental-indexing/application/production-reconciler-helpers.mjs +107 -0
- package/core/incremental-indexing/application/production-reconciler.mjs +583 -0
- package/core/incremental-indexing/application/reconciler.mjs +477 -0
- package/core/incremental-indexing/application/tombstone-injector.mjs +148 -0
- package/core/incremental-indexing/domain/chunk-identity.mjs +260 -0
- package/core/incremental-indexing/domain/encoder-deps.mjs +193 -0
- package/core/incremental-indexing/domain/encoder-input.mjs +225 -0
- package/core/incremental-indexing/domain/interval-autotune.mjs +255 -0
- package/core/incremental-indexing/domain/reconcile-counters.mjs +149 -0
- package/core/incremental-indexing/domain/watermark-scheduler.mjs +239 -0
- package/core/incremental-indexing/infrastructure/artifact-temp-sweep.mjs +163 -0
- package/core/incremental-indexing/infrastructure/baseline-readiness.mjs +121 -0
- package/core/incremental-indexing/infrastructure/dirty-set.mjs +233 -0
- package/core/incremental-indexing/infrastructure/graph-gc.mjs +314 -0
- package/core/incremental-indexing/infrastructure/hashing.mjs +298 -0
- package/core/incremental-indexing/infrastructure/hcgs-invalidation.mjs +182 -0
- package/core/incremental-indexing/infrastructure/li-segment-merge.mjs +278 -0
- package/core/incremental-indexing/infrastructure/li-segment-state.mjs +173 -0
- package/core/incremental-indexing/infrastructure/lockfile.mjs +119 -0
- package/core/incremental-indexing/infrastructure/maintenance-state-reader.mjs +283 -0
- package/core/incremental-indexing/infrastructure/manifest.mjs +194 -0
- package/core/incremental-indexing/infrastructure/path-filter.mjs +190 -0
- package/core/incremental-indexing/infrastructure/reader-heartbeat.mjs +201 -0
- package/core/incremental-indexing/infrastructure/schema-migrations.mjs +257 -0
- package/core/incremental-indexing/infrastructure/sparse-gram-delta.mjs +335 -0
- package/core/incremental-indexing/infrastructure/sqlite-fts5.mjs +176 -0
- package/core/incremental-indexing/infrastructure/staleness-display.mjs +105 -0
- package/core/incremental-indexing/infrastructure/tombstone-bitmap.mjs +234 -0
- package/core/incremental-indexing/infrastructure/vector-delta-writer.mjs +359 -0
- package/core/incremental-indexing/infrastructure/vector-gc.mjs +133 -0
- package/core/incremental-indexing/infrastructure/worktree-stamp.mjs +155 -0
- package/core/incremental-indexing/infrastructure/wsl2-detect.mjs +115 -0
- package/core/indexing/admission-policy.js +139 -0
- package/core/indexing/artifact-builder.js +29 -12
- package/core/indexing/ast-chunker.js +107 -30
- package/core/indexing/dedup/exemplar-selector.js +19 -1
- package/core/indexing/gitignore-filter.js +223 -0
- package/core/indexing/incremental-tracker.js +99 -30
- package/core/indexing/index-codebase-v21.js +6 -5
- package/core/indexing/index-maintainer.mjs +698 -6
- package/core/indexing/indexer-ann.js +99 -15
- package/core/indexing/indexer-build.js +158 -45
- package/core/indexing/indexer-empty-baseline.js +80 -0
- package/core/indexing/indexer-manifest.js +66 -0
- package/core/indexing/indexer-phases.js +56 -23
- package/core/indexing/indexer-sparse-gram.js +54 -13
- package/core/indexing/indexer-utils.js +26 -208
- package/core/indexing/indexing-file-policy.js +32 -7
- package/core/indexing/maintainer-launcher.mjs +137 -0
- package/core/indexing/merkle-tracker.js +251 -244
- package/core/indexing/model-pool.js +46 -5
- package/core/infrastructure/code-graph-repository.js +758 -6
- package/core/infrastructure/code-graph-visibility.js +157 -0
- package/core/infrastructure/codebase-repository.js +100 -13
- package/core/infrastructure/config/search.js +1 -1
- package/core/infrastructure/db-utils.js +118 -0
- package/core/infrastructure/dedup-hashing.js +10 -13
- package/core/infrastructure/hardware-capability.js +17 -7
- package/core/infrastructure/index.js +8 -2
- package/core/infrastructure/language-patterns/maps.js +4 -1
- package/core/infrastructure/language-patterns/registry-core.js +56 -17
- package/core/infrastructure/language-patterns/registry-object-oriented.js +12 -5
- package/core/infrastructure/language-patterns.js +69 -0
- package/core/infrastructure/model-registry.js +20 -0
- package/core/infrastructure/native-inference.js +7 -12
- package/core/infrastructure/native-resolver.js +52 -37
- package/core/infrastructure/native-sparse-gram.js +261 -20
- package/core/infrastructure/native-tokenizer.js +6 -15
- package/core/infrastructure/simd-distance.js +10 -16
- package/core/infrastructure/sparse-gram-delta-reader.js +76 -0
- package/core/infrastructure/structural-alias-resolver.js +122 -0
- package/core/infrastructure/structural-candidate-ranker.js +34 -0
- package/core/infrastructure/structural-context-repository.js +472 -0
- package/core/infrastructure/structural-context-utils.js +51 -0
- package/core/infrastructure/structural-graph-signals.js +121 -0
- package/core/infrastructure/structural-qualified-resolution.js +15 -0
- package/core/infrastructure/structural-source-definitions.js +100 -0
- package/core/infrastructure/tombstone-bitmap-reader.js +139 -0
- package/core/infrastructure/tree-sitter-provider.js +811 -37
- package/core/prompt-optimization/data/p7-final/sweet-search-system-prompt.md +50 -0
- package/core/query/query-router.js +55 -5
- package/core/ranking/file-kind-ranking.js +2192 -15
- package/core/ranking/late-interaction-index.js +87 -12
- package/core/search/cli-decoration.js +290 -0
- package/core/search/context-expander.js +988 -78
- package/core/search/index.js +1 -0
- package/core/search/output-policy.js +275 -0
- package/core/search/search-anchor.js +499 -0
- package/core/search/search-boost.js +93 -1
- package/core/search/search-cli.js +61 -204
- package/core/search/search-hybrid.js +250 -10
- package/core/search/search-pattern-chunks.js +57 -8
- package/core/search/search-pattern-planner.js +68 -9
- package/core/search/search-pattern-prefilter.js +30 -10
- package/core/search/search-pattern-ripgrep.js +40 -4
- package/core/search/search-pattern-sparse-overlay.js +256 -0
- package/core/search/search-pattern.js +117 -29
- package/core/search/search-postprocess.js +479 -5
- package/core/search/search-read-semantic.js +260 -23
- package/core/search/search-read.js +82 -64
- package/core/search/search-reader-pin.js +71 -0
- package/core/search/search-rrf.js +279 -0
- package/core/search/search-semantic.js +110 -5
- package/core/search/search-server.js +130 -57
- package/core/search/search-trace.js +107 -0
- package/core/search/server-identity.js +93 -0
- package/core/search/session-daemon-prewarm.mjs +33 -10
- package/core/search/sweet-search.js +399 -7
- package/core/skills/sweet-index/SKILL.md +8 -6
- package/core/vector-store/binary-hnsw-index.js +194 -30
- package/core/vector-store/float-vector-store.js +96 -6
- package/core/vector-store/hnsw-index.js +220 -49
- package/eval/agent-read-workflows/bin/_ss-helpers.mjs +471 -0
- package/eval/agent-read-workflows/bin/ss-find +15 -0
- package/eval/agent-read-workflows/bin/ss-grep +12 -0
- package/eval/agent-read-workflows/bin/ss-read +14 -0
- package/eval/agent-read-workflows/bin/ss-search +18 -0
- package/eval/agent-read-workflows/bin/ss-semantic +12 -0
- package/eval/agent-read-workflows/bin/ss-trace +11 -0
- package/mcp/read-tool.js +109 -0
- package/mcp/server.js +55 -15
- package/mcp/tool-handlers.js +14 -124
- package/mcp/trace-tool.js +81 -0
- package/package.json +25 -10
- package/scripts/hooks/intercept-read.mjs +55 -0
- package/scripts/hooks/remind-tools.mjs +40 -0
- package/scripts/init.js +698 -54
- package/scripts/inject-agent-instructions.js +431 -0
- package/scripts/install-prompt-reminders.js +188 -0
- package/scripts/install-tool-enforcement.js +220 -0
- package/scripts/smoke-test.js +12 -9
- package/scripts/uninstall.js +276 -18
- package/scripts/write-claude-rules.js +110 -0
|
@@ -0,0 +1,583 @@
|
|
|
1
|
+
import fs from 'node:fs';
|
|
2
|
+
import path from 'node:path';
|
|
3
|
+
import { createHash } from 'node:crypto';
|
|
4
|
+
import Database from 'better-sqlite3';
|
|
5
|
+
|
|
6
|
+
import { Reconciler } from './reconciler.mjs';
|
|
7
|
+
import { enqueueMaintenanceJob } from './maintenance-worker.mjs';
|
|
8
|
+
import { createAdmissionPolicy } from '../../indexing/admission-policy.js';
|
|
9
|
+
import { applyIndexingChunkPolicy } from '../../indexing/indexing-file-policy.js';
|
|
10
|
+
import { contentHashSync } from '../infrastructure/hashing.mjs';
|
|
11
|
+
import { readManifest, writeManifest } from '../infrastructure/manifest.mjs';
|
|
12
|
+
import { annotateChunksForDelta, snapshotFileRows, diffChunks, applyDiff } from '../infrastructure/vector-delta-writer.mjs';
|
|
13
|
+
import { appendDeltaRecord, FALLBACK_WEIGHTS_ID, fileIdFor, listDeltaSegments } from '../infrastructure/sparse-gram-delta.mjs';
|
|
14
|
+
import { fts5Merge } from '../infrastructure/sqlite-fts5.mjs';
|
|
15
|
+
import { insertEntity, insertRelationships, markBinaryStale, maintainFloatStore } from './production-reconciler-helpers.mjs';
|
|
16
|
+
import { createGraphSchema, GraphExtractor } from '../../graph/graph-extractor.js';
|
|
17
|
+
import { createVectorSchema, ensureVectorSchema, buildInsertItems, insertVectorItems } from '../../indexing/indexer-build.js';
|
|
18
|
+
import { ASTChunker, JAVA_FAMILY } from '../../indexing/ast-chunker.js';
|
|
19
|
+
import { getEmbeddings, getModelInfo } from '../../embedding/embedding-service.js';
|
|
20
|
+
import { HNSWIndex } from '../../vector-store/hnsw-index.js';
|
|
21
|
+
import { BinaryHNSWIndex } from '../../vector-store/binary-hnsw-index.js';
|
|
22
|
+
import { floatToBinary, normalizedFloatToInt8, truncateForHNSW } from '../../infrastructure/quantization.js';
|
|
23
|
+
import { extractSparseGramDeltaRecord } from '../../infrastructure/native-sparse-gram.js';
|
|
24
|
+
import { migrateEntitiesSchema, migrateRelationshipsSchema } from '../infrastructure/schema-migrations.mjs';
|
|
25
|
+
import { readMaintenanceState as readMaintenanceStateFromArtifacts } from '../infrastructure/maintenance-state-reader.mjs';
|
|
26
|
+
|
|
27
|
+
// File names of on-disk artifacts; the visible code resolves the first three against the state directory.
const DIRTY_QUEUE = 'index-maintainer-queue.jsonl'; // pending dirty-file entries, parsed line by line as JSON
|
|
28
|
+
const PROCESSING_QUEUE = 'index-maintainer-queue.processing.jsonl'; // DIRTY_QUEUE is renamed to this before it is read
|
|
29
|
+
const MERKLE_STATE = 'merkle-state.json'; // JSON read for its `files` map, keyed by project-relative path
|
|
30
|
+
const METRICS_FILE = 'reconcile-metrics.jsonl'; // NOTE(review): not referenced in the visible part of the file; presumably a JSONL metrics log, confirm
|
|
31
|
+
|
|
32
|
+
/**
 * Convert a path into a forward-slash path relative to the project root.
 *
 * @param {string} projectRoot - Directory that paths are resolved against.
 * @param {string} filePath - Absolute path, or a path relative to `projectRoot`.
 * @returns {string|null} The relative path, or `null` when `filePath` is the
 *   root itself or resolves to a location outside the root.
 */
function relPath(projectRoot, filePath) {
  const abs = path.isAbsolute(filePath) ? filePath : path.join(projectRoot, filePath);
  const rel = path.relative(projectRoot, abs).replace(/\\/g, '/');
  // Empty means "the root itself"; an absolute result means path.relative could
  // not express the target relative to the root (e.g. another Windows drive).
  if (!rel || path.isAbsolute(rel)) return null;
  // A bare `..` is the root's parent directory. Checking only the `../` prefix
  // let it through; names such as `..hidden` are still legitimate in-root files.
  if (rel === '..' || rel.startsWith('../')) return null;
  return rel;
}
|
|
37
|
+
|
|
38
|
+
/**
 * Read and parse a JSON file, returning `fallback` on any failure.
 *
 * @param {string} filePath - File to read as UTF-8.
 * @param {*} [fallback=null] - Value returned when the file cannot be read or parsed.
 * @returns {*} The parsed JSON value, or `fallback`.
 */
function readJson(filePath, fallback = null) {
  let parsed;
  try {
    const raw = fs.readFileSync(filePath, 'utf8');
    parsed = JSON.parse(raw);
  } catch {
    // Missing file, unreadable file and malformed JSON are all treated alike.
    return fallback;
  }
  return parsed;
}
|
|
41
|
+
|
|
42
|
+
/**
 * Read a JSON-lines file into an array of parsed values.
 *
 * Blank lines and lines that fail to parse are skipped. A missing file yields
 * an empty array.
 *
 * @param {string} filePath - File to read as UTF-8.
 * @returns {Array<*>} Parsed values in file order.
 */
function readJsonl(filePath) {
  if (!fs.existsSync(filePath)) return [];
  const records = [];
  for (const line of fs.readFileSync(filePath, 'utf8').split('\n')) {
    if (!line.trim()) continue;
    try {
      records.push(JSON.parse(line));
    } catch {
      // Unparseable line: drop it and keep reading.
    }
  }
  return records;
}
|
|
49
|
+
|
|
50
|
+
/**
 * Capture a file's size, modification time and inode as decimal strings.
 *
 * The stat is taken in bigint mode, so the nanosecond mtime and the inode are
 * not rounded through a double.
 *
 * @param {string} absPath - Path of the file to stat.
 * @returns {{size: string, mtime_ns: string, inode: string}}
 */
function statTuple(absPath) {
  const { size, mtimeNs, ino } = fs.statSync(absPath, { bigint: true });
  return {
    size: String(size),
    mtime_ns: String(mtimeNs),
    inode: String(ino),
  };
}
|
|
54
|
+
|
|
55
|
+
/**
 * Write `payload` as pretty-printed JSON by writing a sibling temp file and
 * renaming it over `filePath`.
 *
 * Parent directories are created as needed. If the write or the rename fails,
 * the temp file is removed before the error is rethrown, so a failed write
 * does not leave a `*.tmp.<pid>` file behind.
 *
 * @param {string} filePath - Destination file.
 * @param {*} payload - Value serialized with `JSON.stringify(payload, null, 2)`.
 * @throws Rethrows any error from serializing, writing or renaming.
 */
function safeWriteJson(filePath, payload) {
  fs.mkdirSync(path.dirname(filePath), { recursive: true });
  const tmp = `${filePath}.tmp.${process.pid}`;
  try {
    fs.writeFileSync(tmp, JSON.stringify(payload, null, 2));
    fs.renameSync(tmp, filePath);
  } catch (err) {
    // Best-effort cleanup; the original failure is the one worth reporting.
    try {
      fs.rmSync(tmp, { force: true });
    } catch {
      // Ignore: cleanup must not mask the original error.
    }
    throw err;
  }
}
|
|
61
|
+
|
|
62
|
+
/**
 * Express an artifact path relative to the state directory, using `/` as the
 * separator.
 *
 * @param {string} stateDir - Base directory.
 * @param {string} filePath - Artifact path.
 * @returns {string} `filePath` relative to `stateDir` with backslashes replaced by `/`.
 */
function relativeArtifact(stateDir, filePath) {
  const fromStateDir = path.relative(stateDir, filePath);
  return fromStateDir.split('\\').join('/');
}
|
|
65
|
+
|
|
66
|
+
/**
 * Find an id that no row of `table` uses yet.
 *
 * `id` is tried first, then `id.1`, `id.2`, ... until the lookup finds no row.
 *
 * @param {object} db - Database handle exposing `prepare(sql)`, whose statement exposes `get(value)`.
 * @param {string} table - Table name, interpolated directly into the SQL text.
 * @param {string} id - Preferred id.
 * @returns {string} `id`, or the first free `id.<n>` variant.
 */
function uniquePhysicalId(db, table, id) {
  const lookup = db.prepare(`SELECT 1 FROM ${table} WHERE id = ?`);
  if (!lookup.get(id)) return id;
  for (let n = 1; ; n += 1) {
    const candidate = `${id}.${n}`;
    if (!lookup.get(candidate)) return candidate;
  }
}
|
|
73
|
+
|
|
74
|
+
/**
 * Build a sparse-gram delta record for one file's content.
 *
 * @param {string} basePath - Passed to the extractor as `indexPath`.
 * @param {string} content - Text handed to the extractor.
 * @returns {{weightsId: *, grams: Array<[*, number]>}} The extractor's weights
 *   id (or `FALLBACK_WEIGHTS_ID` when it reports none) and the distinct grams
 *   in default sort order, each paired with the count 1.
 */
function sparseGramRecord(basePath, content) {
  const extracted = extractSparseGramDeltaRecord({ indexPath: basePath, content });
  // A falsy extractor result is treated as "no grams, fallback weights".
  const weightsId = (extracted && extracted.weightsId) || FALLBACK_WEIGHTS_ID;
  const distinct = Array.from(new Set((extracted && extracted.grams) || []));
  distinct.sort();
  return {
    weightsId,
    grams: distinct.map((gram) => [gram, 1]),
  };
}
|
|
81
|
+
|
|
82
|
+
/**
 * Ask the sparse-gram extractor which weights id it reports for `basePath`,
 * by running it on empty content.
 *
 * @param {string} basePath - Passed to the extractor as `indexPath`.
 * @returns {*} The reported `weightsId`, or `null` when there is none.
 */
function resolveLatestSparseWeightsId(basePath) {
  const probe = extractSparseGramDeltaRecord({ indexPath: basePath, content: '' });
  if (probe === null || probe === undefined) return null;
  return probe.weightsId || null;
}
|
|
85
|
+
|
|
86
|
+
/**
 * Derive a graph entity's logical id from its file, type and name.
 *
 * @param {string} filePath - File the entity belongs to.
 * @param {string} type - Entity type (the visible code uses `'file'`).
 * @param {string} name - Entity name.
 * @returns {string} First 16 hex characters of SHA-256 over `filePath:type:name`.
 */
function graphEntityLogicalId(filePath, type, name) {
  const key = `${filePath}:${type}:${name}`;
  const hasher = createHash('sha256');
  hasher.update(key);
  return hasher.digest('hex').slice(0, 16);
}
|
|
89
|
+
|
|
90
|
+
/**
 * Copy the bytes of a Buffer (or other typed-array view) into a new Float32Array.
 *
 * Only the bytes the view covers are copied, so the result does not share
 * memory with `buffer`.
 *
 * @param {Buffer|Uint8Array} buffer - View whose byte length is a multiple of 4.
 * @returns {Float32Array} Independent copy of the viewed bytes as 32-bit floats.
 */
function float32FromBuffer(buffer) {
  const { buffer: backing, byteOffset, byteLength } = buffer;
  const copied = backing.slice(byteOffset, byteOffset + byteLength);
  return new Float32Array(copied);
}
|
|
94
|
+
|
|
95
|
+
/**
 * Choose the text a chunk contributes to late-interaction (LI) encoding.
 *
 * Python and `JAVA_FAMILY` languages prefer `li_text`; every other language
 * prefers `li_greedy_text`. The first non-empty candidate wins.
 *
 * @param {object} chunk - Chunk with optional `metadata.language` and text fields.
 * @returns {string} The selected text, or `''` when every candidate is empty.
 */
function pickLiInput(chunk) {
  const language = chunk?.metadata?.language;
  const prefersLiText = language === 'python' || JAVA_FAMILY.has(language);
  const candidates = prefersLiText
    ? [chunk.li_text, chunk.embedding_text, chunk.text, chunk.content]
    : [chunk.li_greedy_text, chunk.embedding_text, chunk.li_text, chunk.text, chunk.content];
  return candidates.find(Boolean) || '';
}
|
|
102
|
+
|
|
103
|
+
/**
 * Enrich chunk embedding text with enclosing-scope names and file imports
 * looked up in the code graph database (`code-graph.db` under `stateDir`).
 *
 * Chunks are passed to `ASTChunker.enrichEmbeddingText`; the input array is
 * returned in every case. When the database file is missing or `chunks` is
 * empty nothing is opened. Any error raised while querying is swallowed: the
 * chunks are returned with whatever enrichment was applied before the failure.
 *
 * The entity list and import list depend only on the file, so they are looked
 * up once per distinct file rather than once per chunk.
 *
 * @param {Array<object>} chunks - Chunks carrying `file` (or
 *   `metadata.relative_path`) and `metadata.symbol` / `line_start` / `line_end`.
 * @param {string} stateDir - Directory containing `code-graph.db`.
 * @returns {Promise<Array<object>>} The same `chunks` array.
 */
async function enrichChunksFromGraph(chunks, stateDir) {
  const dbPath = path.join(stateDir, 'code-graph.db');
  if (!fs.existsSync(dbPath) || chunks.length === 0) return chunks;
  const db = new Database(dbPath, { readonly: true });
  try {
    const entityStmt = db.prepare('SELECT type, name, start_line, end_line FROM entities WHERE file_path = ? AND epoch_retired IS NULL ORDER BY start_line ASC');
    const fileEntityStmt = db.prepare('SELECT id FROM entities WHERE file_path = ? AND logical_entity_id = ? AND epoch_retired IS NULL ORDER BY epoch_written DESC LIMIT 1');
    const importStmt = db.prepare("SELECT DISTINCT target_name FROM relationships WHERE source_id = ? AND type IN ('imports', 'plainImport') AND epoch_retired IS NULL ORDER BY target_name");

    // Per-file results are identical for every chunk of that file.
    const perFile = new Map();
    const lookupFile = (file) => {
      let cached = perFile.get(file);
      if (!cached) {
        const entities = entityStmt.all(file);
        const fileEntity = entities.find((e) => e.type === 'file')?.name || path.basename(file);
        const fileLogicalId = graphEntityLogicalId(file, 'file', fileEntity);
        // Prefer the live physical row id; fall back to the logical id.
        const filePhysicalId = fileEntityStmt.get(file, fileLogicalId)?.id || fileLogicalId;
        const imports = importStmt.all(filePhysicalId).map((r) => r.target_name);
        cached = { entities, imports };
        perFile.set(file, cached);
      }
      return cached;
    };

    for (const chunk of chunks) {
      const file = chunk.file || chunk.metadata?.relative_path;
      const symbol = chunk.metadata?.symbol;
      if (!file || !symbol || symbol === 'unknown') continue;
      const { entities, imports } = lookupFile(file);
      const start = chunk.metadata?.line_start || 0;
      const end = chunk.metadata?.line_end || start;
      // Entities whose line range fully contains the chunk form its scope.
      const scope = entities.filter((e) => e.start_line <= start && e.end_line >= end).map((e) => e.name);
      if (scope.length > 0 || imports.length > 0) {
        // Hand over a copy so the cached list cannot be mutated by the callee.
        ASTChunker.enrichEmbeddingText(chunk, scope, [...imports]);
      }
    }
  } catch {
    // Best-effort enrichment: a graph lookup failure must not block indexing.
    return chunks;
  } finally {
    db.close();
  }
  return chunks;
}
|
|
132
|
+
|
|
133
|
+
/**
 * Build a `Reconciler` wired to the production adapters.
 *
 * The project root comes from `options.projectRoot`, then
 * `SWEET_SEARCH_PROJECT_ROOT`, then the current working directory. The state
 * directory comes from `options.stateDir`, then `SWEET_SEARCH_STATE_DIR`, then
 * `<projectRoot>/.sweet-search`. Both are resolved to absolute paths.
 *
 * `filesPerTick` (default 50) and `cpuBudgetMs` (default 2000) can be overridden
 * through `SWEET_SEARCH_RECONCILE_FILES_PER_TICK` and
 * `SWEET_SEARCH_RECONCILE_CPU_BUDGET_MS`. A value that does not parse as an
 * integer falls back to the default instead of yielding `NaN`. Keys in
 * `options.config` take precedence over both.
 *
 * @param {object} [options] - Also forwarded to the adapter.
 * @param {string} [options.projectRoot]
 * @param {string} [options.stateDir]
 * @param {Function} [options.onProgress]
 * @param {object} [options.config] - Overrides merged over the env-derived config.
 * @param {object} [options.logger=console]
 * @returns {Reconciler}
 */
export function createProductionReconciler(options = {}) {
  const projectRoot = path.resolve(options.projectRoot || process.env.SWEET_SEARCH_PROJECT_ROOT || process.cwd());
  const stateDir = path.resolve(options.stateDir || process.env.SWEET_SEARCH_STATE_DIR || path.join(projectRoot, '.sweet-search'));
  const adapter = new ProductionReconcileAdapter({ ...options, projectRoot, stateDir });

  // Unset, empty or non-numeric env values use the default.
  const envInt = (name, fallback) => {
    const parsed = Number.parseInt(process.env[name] || '', 10);
    return Number.isNaN(parsed) ? fallback : parsed;
  };

  return new Reconciler({
    projectRoot,
    stateDir,
    adapters: adapter.adapters(),
    onProgress: options.onProgress,
    config: {
      filesPerTick: envInt('SWEET_SEARCH_RECONCILE_FILES_PER_TICK', 50),
      cpuBudgetMs: envInt('SWEET_SEARCH_RECONCILE_CPU_BUDGET_MS', 2000),
      ...(options.config || {}),
    },
    logger: options.logger || console,
  });
}
|
|
150
|
+
|
|
151
|
+
/**
 * Execute one reconcile tick against the production adapters.
 *
 * This primitive does not check for an existing index: handed a dirty set, it
 * will build tier artifacts from scratch when none are present. Callers on the
 * default-on path MUST therefore confirm a complete baseline first, via
 * `infrastructure/baseline-readiness.mjs::hasCompleteBaseIndex`. The
 * reconciler must never be the first index builder for a non-empty repo; the
 * maintainer daemon and the operator `reconcile tick` command apply that gate.
 *
 * @param {object} [options] - Forwarded to `createProductionReconciler`.
 * @returns {Promise<*>} Whatever the reconciler's `tick()` resolves to.
 * @throws {Error} When startup verification does not report `ok`.
 */
export async function runProductionReconcileTick(options = {}) {
  const reconciler = createProductionReconciler(options);
  const verdict = reconciler.verifyStartup();
  if (verdict.ok) return reconciler.tick();
  throw new Error(verdict.reason || 'reconciler startup verification failed');
}
|
|
165
|
+
|
|
166
|
+
class ProductionReconcileAdapter {
|
|
167
|
+
constructor(options) {
|
|
168
|
+
this.projectRoot = options.projectRoot;
|
|
169
|
+
this.stateDir = options.stateDir;
|
|
170
|
+
this.onProgress = typeof options.onProgress === 'function' ? options.onProgress : null;
|
|
171
|
+
this.vectorEncoder = options.vectorEncoder || ((texts, progressOptions = {}) => getEmbeddings(texts, {
|
|
172
|
+
useCache: false,
|
|
173
|
+
onProgress: progressOptions.onProgress,
|
|
174
|
+
}));
|
|
175
|
+
this.liEncoder = options.liEncoder || null;
|
|
176
|
+
this.modelInfo = options.modelInfo || getModelInfo();
|
|
177
|
+
// Shared admission policy — the SAME include/exclude/.sweet-search-ignore/
|
|
178
|
+
// .gitignore/size gates full indexing applies. Used as the second safety
|
|
179
|
+
// gate: queued files full indexing would skip are never reconciled, and a
|
|
180
|
+
// previously-indexed file that is now inadmissible is retired (see
|
|
181
|
+
// readDirtySet → _retireSet → hashFile).
|
|
182
|
+
this.admission = options.admissionPolicy || createAdmissionPolicy({ projectRoot: this.projectRoot });
|
|
183
|
+
this._retireSet = new Set();
|
|
184
|
+
this._liSkipFiles = new Set();
|
|
185
|
+
this.hashes = new Map();
|
|
186
|
+
this.touched = new Map();
|
|
187
|
+
}
|
|
188
|
+
|
|
189
|
+
progress(phase) {
|
|
190
|
+
this.onProgress?.(phase);
|
|
191
|
+
}
|
|
192
|
+
|
|
193
|
+
adapters() {
|
|
194
|
+
return {
|
|
195
|
+
readDirtySet: () => this.readDirtySet(),
|
|
196
|
+
requeueDirtyFiles: (files) => this.requeueDirtyFiles(files),
|
|
197
|
+
hashFile: (file) => this.hashFile(file),
|
|
198
|
+
loadCurrentManifest: () => readManifest(this.stateDir),
|
|
199
|
+
persistManifest: (manifest) => this.persistManifest(manifest),
|
|
200
|
+
applyGraphDelta: (file, hashes, epoch) => this.applyGraphDelta(file, hashes, epoch),
|
|
201
|
+
applyVectorDelta: (file, chunks, hashes, epoch) => this.applyVectorDelta(file, chunks, hashes, epoch),
|
|
202
|
+
applyHNSWDelta: (file, ops, epoch) => this.applyHNSWDelta(file, ops, epoch),
|
|
203
|
+
applyBinaryHNSWDelta: (file, ops, epoch) => this.applyBinaryHNSWDelta(file, ops, epoch),
|
|
204
|
+
applyLIDelta: (file, ops, epoch) => this.applyLIDelta(file, ops, epoch),
|
|
205
|
+
applySparseGramDelta: (file, ops, epoch) => this.applySparseGramDelta(file, ops, epoch),
|
|
206
|
+
readMaintenanceState: () => this.readMaintenanceState(),
|
|
207
|
+
scheduleMaintenance: (job) => enqueueMaintenanceJob(this.stateDir, job),
|
|
208
|
+
};
|
|
209
|
+
}
|
|
210
|
+
|
|
211
|
+
async readDirtySet() {
|
|
212
|
+
fs.mkdirSync(this.stateDir, { recursive: true });
|
|
213
|
+
const processing = path.join(this.stateDir, PROCESSING_QUEUE);
|
|
214
|
+
const queue = path.join(this.stateDir, DIRTY_QUEUE);
|
|
215
|
+
if (!fs.existsSync(processing) && fs.existsSync(queue)) {
|
|
216
|
+
fs.renameSync(queue, processing);
|
|
217
|
+
}
|
|
218
|
+
const rels = [];
|
|
219
|
+
const seen = new Set();
|
|
220
|
+
for (const entry of readJsonl(processing)) {
|
|
221
|
+
const rel = relPath(this.projectRoot, entry.file_path || entry.path || entry.filePath || '');
|
|
222
|
+
if (!rel || seen.has(rel)) continue;
|
|
223
|
+
seen.add(rel);
|
|
224
|
+
rels.push(rel);
|
|
225
|
+
}
|
|
226
|
+
|
|
227
|
+
// Second admission gate. A queued file that full indexing would skip is
|
|
228
|
+
// dropped if it was never indexed, and retired if it was (so the index
|
|
229
|
+
// converges to a fresh full rebuild). Existence + shape + size are sync;
|
|
230
|
+
// gitignore is ONE batched check over the admissible candidates.
|
|
231
|
+
const merkle = readJson(path.join(this.stateDir, MERKLE_STATE), { files: {} }).files || {};
|
|
232
|
+
const info = rels.map((rel) => {
|
|
233
|
+
const abs = path.join(this.projectRoot, rel);
|
|
234
|
+
const exists = fs.existsSync(abs);
|
|
235
|
+
const shapeOk = this.admission.admitsShape(rel);
|
|
236
|
+
const sizeOk = exists && shapeOk ? !this.admission.isOversizedAbs(abs) : false;
|
|
237
|
+
return { rel, exists, shapeOk, sizeOk };
|
|
238
|
+
});
|
|
239
|
+
const gitignored = await this.admission.gitignoredSet(
|
|
240
|
+
info.filter((i) => i.exists && i.shapeOk && i.sizeOk).map((i) => i.rel),
|
|
241
|
+
);
|
|
242
|
+
this.progress('production:dirty-gitignore');
|
|
243
|
+
|
|
244
|
+
const files = [];
|
|
245
|
+
const retire = new Set();
|
|
246
|
+
for (const i of info) {
|
|
247
|
+
const admitted = i.exists && i.shapeOk && i.sizeOk && !gitignored.has(i.rel);
|
|
248
|
+
if (admitted) {
|
|
249
|
+
files.push(i.rel);
|
|
250
|
+
} else if (merkle[i.rel]) {
|
|
251
|
+
files.push(i.rel); // previously indexed → keep so hashFile retires it
|
|
252
|
+
retire.add(i.rel);
|
|
253
|
+
}
|
|
254
|
+
// else: never indexed and inadmissible → drop
|
|
255
|
+
}
|
|
256
|
+
this._retireSet = retire;
|
|
257
|
+
return files;
|
|
258
|
+
}
|
|
259
|
+
|
|
260
|
+
requeueDirtyFiles(files) {
|
|
261
|
+
if (!Array.isArray(files) || files.length === 0) return;
|
|
262
|
+
fs.mkdirSync(this.stateDir, { recursive: true });
|
|
263
|
+
const queue = path.join(this.stateDir, DIRTY_QUEUE);
|
|
264
|
+
for (const file of files) fs.appendFileSync(queue, JSON.stringify({ file_path: file.path || file, timestamp: Date.now(), source: 'requeue' }) + '\n');
|
|
265
|
+
}
|
|
266
|
+
|
|
267
|
+
async hashFile(file) {
|
|
268
|
+
const rel = typeof file === 'string' ? file : file.path;
|
|
269
|
+
const abs = path.join(this.projectRoot, rel);
|
|
270
|
+
const merkle = readJson(path.join(this.stateDir, MERKLE_STATE), { files: {} });
|
|
271
|
+
// Deleted on disk, or flagged inadmissible by readDirtySet (a previously
|
|
272
|
+
// indexed file that became excluded/oversized/gitignored): retire it so all
|
|
273
|
+
// tiers tombstone and the merkle entry is dropped.
|
|
274
|
+
if (!fs.existsSync(abs) || this._retireSet.has(rel)) {
|
|
275
|
+
const h = { file: rel, deleted: true, contentHash: '', chunks: [] };
|
|
276
|
+
this.hashes.set(rel, h);
|
|
277
|
+
return h;
|
|
278
|
+
}
|
|
279
|
+
const content = fs.readFileSync(abs);
|
|
280
|
+
const contentHash = contentHashSync(content);
|
|
281
|
+
const stat = statTuple(abs);
|
|
282
|
+
const previous = merkle.files?.[rel];
|
|
283
|
+
const h = {
|
|
284
|
+
file: rel,
|
|
285
|
+
absPath: abs,
|
|
286
|
+
content: content.toString('utf8'),
|
|
287
|
+
contentHash,
|
|
288
|
+
stat,
|
|
289
|
+
contentUnchanged: previous?.hash === contentHash,
|
|
290
|
+
};
|
|
291
|
+
this.hashes.set(rel, h);
|
|
292
|
+
return h;
|
|
293
|
+
}
|
|
294
|
+
|
|
295
|
+
async applyGraphDelta(file, hashes, epoch) {
|
|
296
|
+
const rel = typeof file === 'string' ? file : file.path;
|
|
297
|
+
const dbPath = path.join(this.stateDir, 'code-graph.db');
|
|
298
|
+
fs.mkdirSync(this.stateDir, { recursive: true });
|
|
299
|
+
const db = new Database(dbPath);
|
|
300
|
+
db.pragma('journal_mode = WAL');
|
|
301
|
+
db.pragma('synchronous = NORMAL');
|
|
302
|
+
const hasFts = createGraphSchema(db);
|
|
303
|
+
migrateEntitiesSchema(db);
|
|
304
|
+
migrateRelationshipsSchema(db);
|
|
305
|
+
try {
|
|
306
|
+
const oldRows = db.prepare('SELECT rowid, id, logical_entity_id, signature_hash FROM entities WHERE file_path = ? AND epoch_retired IS NULL').all(rel);
|
|
307
|
+
const oldByLogical = new Map(oldRows.map((r) => [r.logical_entity_id || r.id, r]));
|
|
308
|
+
const oldIds = oldRows.map((r) => r.id);
|
|
309
|
+
const extractor = new GraphExtractor();
|
|
310
|
+
const parsed = hashes.deleted
|
|
311
|
+
? { entities: [], relationships: [] }
|
|
312
|
+
: await extractor.extractFromFile(rel, hashes.content);
|
|
313
|
+
this.progress('production:graph-extracted');
|
|
314
|
+
const entities = [...(parsed.entities || [])];
|
|
315
|
+
const relationships = parsed.relationships || [];
|
|
316
|
+
const fileLogicalId = graphEntityLogicalId(rel, 'file', path.basename(rel));
|
|
317
|
+
if (relationships.some((r) => r.source_id === fileLogicalId) && !entities.some((e) => e.id === fileLogicalId)) {
|
|
318
|
+
entities.unshift({
|
|
319
|
+
id: fileLogicalId,
|
|
320
|
+
file_path: rel,
|
|
321
|
+
type: 'file',
|
|
322
|
+
name: path.basename(rel),
|
|
323
|
+
signature: `file ${rel}`,
|
|
324
|
+
signature_hash: contentHashSync(`file:${rel}`),
|
|
325
|
+
start_line: 1,
|
|
326
|
+
end_line: Math.max(1, hashes.content?.split('\n').length || 1),
|
|
327
|
+
});
|
|
328
|
+
}
|
|
329
|
+
let upsert = 0;
|
|
330
|
+
let tombstone = 0;
|
|
331
|
+
const liveIdFor = new Map();
|
|
332
|
+
const tx = db.transaction(() => {
|
|
333
|
+
const retireEntity = db.prepare('UPDATE entities SET epoch_retired = ?, stale_since = COALESCE(stale_since, ?) WHERE id = ? AND epoch_retired IS NULL');
|
|
334
|
+
const retireRel = oldIds.length > 0
|
|
335
|
+
? db.prepare(`UPDATE relationships SET epoch_retired = ? WHERE source_id IN (${oldIds.map(() => '?').join(',')}) AND epoch_retired IS NULL`)
|
|
336
|
+
: null;
|
|
337
|
+
if (retireRel) retireRel.run(epoch, ...oldIds);
|
|
338
|
+
const nextLogical = new Set(entities.map((e) => e.id));
|
|
339
|
+
for (const row of oldRows) {
|
|
340
|
+
if (!nextLogical.has(row.logical_entity_id || row.id)) {
|
|
341
|
+
retireEntity.run(epoch, epoch, row.id);
|
|
342
|
+
tombstone += 1;
|
|
343
|
+
}
|
|
344
|
+
}
|
|
345
|
+
for (const e of entities) {
|
|
346
|
+
const old = oldByLogical.get(e.id);
|
|
347
|
+
if (old && old.signature_hash === (e.signature_hash || null)) {
|
|
348
|
+
liveIdFor.set(e.id, old.id);
|
|
349
|
+
continue;
|
|
350
|
+
}
|
|
351
|
+
if (old) {
|
|
352
|
+
retireEntity.run(epoch, epoch, old.id);
|
|
353
|
+
tombstone += 1;
|
|
354
|
+
}
|
|
355
|
+
const physical = uniquePhysicalId(db, 'entities', `${e.id}@e${epoch}`);
|
|
356
|
+
liveIdFor.set(e.id, physical);
|
|
357
|
+
insertEntity(db, e, physical, epoch, hasFts);
|
|
358
|
+
upsert += 1;
|
|
359
|
+
}
|
|
360
|
+
insertRelationships(db, relationships, liveIdFor, epoch);
|
|
361
|
+
});
|
|
362
|
+
tx();
|
|
363
|
+
this.progress('production:graph-written');
|
|
364
|
+
if (hasFts) for (const table of ['entities_fts', 'entities_trigram']) try { fts5Merge(db, table, 16); } catch {}
|
|
365
|
+
this.touched.set(rel, { ...(this.touched.get(rel) || {}), graphEntities: entities.length });
|
|
366
|
+
return { ops: { graph_upsert: upsert, graph_tombstone: tombstone }, manifest: { path: 'code-graph.db' } };
|
|
367
|
+
} finally {
|
|
368
|
+
db.close();
|
|
369
|
+
}
|
|
370
|
+
}
|
|
371
|
+
|
|
372
|
+
async applyVectorDelta(file, _chunks, hashes, epoch) {
|
|
373
|
+
const rel = typeof file === 'string' ? file : file.path;
|
|
374
|
+
const dbPath = path.join(this.stateDir, 'codebase.db');
|
|
375
|
+
const existed = fs.existsSync(dbPath);
|
|
376
|
+
const db = new Database(dbPath);
|
|
377
|
+
db.pragma('journal_mode = WAL');
|
|
378
|
+
db.pragma('synchronous = NORMAL');
|
|
379
|
+
existed ? ensureVectorSchema(db) : createVectorSchema(db);
|
|
380
|
+
const vectorOps = [];
|
|
381
|
+
let chunks = [];
|
|
382
|
+
try {
|
|
383
|
+
if (hashes.deleted) {
|
|
384
|
+
const snap = snapshotFileRows(db, rel);
|
|
385
|
+
const retire = db.transaction(() => applyDiff(db, rel, { toReuse: [], toEncode: [], toRetire: [...snap.values()].map((r) => ({ rowId: r.id, chunkStructId: r.chunk_struct_id })) }, epoch));
|
|
386
|
+
const summary = retire();
|
|
387
|
+
const retired = summary.retiredRows.map((r) => ({ retireId: r.oldId, file: rel }));
|
|
388
|
+
this.touched.set(rel, { ...(this.touched.get(rel) || {}), hash: hashes, chunkIds: [] });
|
|
389
|
+
return { ops: { vectors_delete: summary.retiredRows.length }, vectorOps: retired, tokenOps: retired, gramOps: [{ file: rel, deleted: true }] };
|
|
390
|
+
}
|
|
391
|
+
const parsed = await new ASTChunker({ projectRoot: this.projectRoot }).parseFile(rel, hashes.content);
|
|
392
|
+
this.progress('production:vector-parsed');
|
|
393
|
+
chunks = await enrichChunksFromGraph(parsed.map((chunk, i) => ({ ...chunk, file: rel, id: `${rel}:${chunk.metadata?.line_start || 0}-${chunk.metadata?.line_end || chunk.metadata?.line_start || 0}:${i}` })), this.stateDir);
|
|
394
|
+
this.progress('production:vector-enriched');
|
|
395
|
+
// LI generated-content parity: decide ONCE, from the file's full chunk set
|
|
396
|
+
// (exactly like full indexing's per-file applyIndexingChunkPolicy), whether
|
|
397
|
+
// late interaction skips this file. Embeddings/graph/sparse still index it.
|
|
398
|
+
// Stored per-file so applyLIDelta drops adds even when only a non-first
|
|
399
|
+
// chunk changed and the @generated first chunk was hash-reused.
|
|
400
|
+
const liKept = applyIndexingChunkPolicy(chunks, { projectRoot: this.projectRoot }).kept;
|
|
401
|
+
if (chunks.length > 0 && liKept.length === 0) this._liSkipFiles.add(rel);
|
|
402
|
+
else this._liSkipFiles.delete(rel);
|
|
403
|
+
const annotations = annotateChunksForDelta(chunks, rel);
|
|
404
|
+
const snap = snapshotFileRows(db, rel);
|
|
405
|
+
const delta = diffChunks(chunks, annotations, snap);
|
|
406
|
+
const texts = delta.toEncode.map(({ chunk }) => chunk.embedding_text || `${rel}\n${chunk.text || chunk.content || ''}`);
|
|
407
|
+
const embeddings = texts.length > 0
|
|
408
|
+
? (await this.vectorEncoder(texts, { onProgress: () => this.progress('production:vector-embedding') })).map((r) => r.embedding || r)
|
|
409
|
+
: [];
|
|
410
|
+
this.progress('production:vector-embedded');
|
|
411
|
+
const encodedChunks = delta.toEncode.map(({ chunk }, i) => ({ ...chunk, id: `${chunk.id}@e${epoch}.${i}` }));
|
|
412
|
+
const encodedAnnotations = delta.toEncode.map((x) => x.ann);
|
|
413
|
+
const tx = db.transaction(() => {
|
|
414
|
+
const summary = applyDiff(db, rel, delta, epoch);
|
|
415
|
+
if (encodedChunks.length > 0) {
|
|
416
|
+
const items = buildInsertItems(encodedChunks, embeddings, this.modelInfo, encodedAnnotations, { epochWritten: epoch });
|
|
417
|
+
insertVectorItems(db, items);
|
|
418
|
+
}
|
|
419
|
+
return summary;
|
|
420
|
+
});
|
|
421
|
+
const summary = tx();
|
|
422
|
+
this.progress('production:vector-written');
|
|
423
|
+
const retiredRows = [...summary.replacedRows, ...summary.retiredRows, ...summary.versionedRows];
|
|
424
|
+
for (const row of retiredRows) {
|
|
425
|
+
vectorOps.push({ retireId: row.oldId });
|
|
426
|
+
}
|
|
427
|
+
const newIds = [...encodedChunks.map((c) => c.id), ...summary.versionedRows.map((r) => r.newId)];
|
|
428
|
+
if (newIds.length > 0) {
|
|
429
|
+
const rows = db.prepare(`SELECT id, embedding, metadata FROM vectors WHERE id IN (${newIds.map(() => '?').join(',')})`).all(...newIds);
|
|
430
|
+
for (const row of rows) {
|
|
431
|
+
vectorOps.push({ addId: row.id, embedding: float32FromBuffer(row.embedding), metadata: JSON.parse(row.metadata || '{}') });
|
|
432
|
+
}
|
|
433
|
+
}
|
|
434
|
+
const tokenOps = retiredRows.map((row) => ({ retireId: row.oldId, file: rel }));
|
|
435
|
+
for (const chunk of encodedChunks) tokenOps.push({ addId: chunk.id, chunk });
|
|
436
|
+
for (const row of summary.versionedRows) {
|
|
437
|
+
const reused = delta.toReuse.find((item) => item.ann?.chunkStructId === row.chunkStructId);
|
|
438
|
+
if (reused?.chunk) tokenOps.push({ addId: row.newId, chunk: reused.chunk });
|
|
439
|
+
}
|
|
440
|
+
this.touched.set(rel, { ...(this.touched.get(rel) || {}), hash: hashes, chunkIds: newIds, content: hashes.content });
|
|
441
|
+
return {
|
|
442
|
+
ops: { vectors_upsert: newIds.length, vectors_delete: vectorOps.filter((o) => o.retireId).length },
|
|
443
|
+
chunksTotal: chunks.length,
|
|
444
|
+
chunksEncoded: encodedChunks.length,
|
|
445
|
+
chunksReused: delta.toReuse.length,
|
|
446
|
+
chunksMetadataDirty: delta.counters.metadata_dirty,
|
|
447
|
+
vectorOps,
|
|
448
|
+
tokenOps,
|
|
449
|
+
gramOps: [{ file: rel, deleted: false, content: hashes.content, contentHash: hashes.contentHash }],
|
|
450
|
+
manifest: { path: 'codebase.db' },
|
|
451
|
+
};
|
|
452
|
+
} finally {
|
|
453
|
+
db.close();
|
|
454
|
+
}
|
|
455
|
+
}
|
|
456
|
+
|
|
457
|
+
async applyHNSWDelta(_file, ops) {
|
|
458
|
+
if (!Array.isArray(ops) || ops.length === 0) return { ops: { hnsw_add: 0, hnsw_tombstone: 0 } };
|
|
459
|
+
const indexPath = path.join(this.stateDir, 'codebase-hnsw.idx');
|
|
460
|
+
const index = new HNSWIndex({ indexPath, stalePath: `${indexPath}.stale.bin`, dimension: this.modelInfo.hnswDimension });
|
|
461
|
+
try { await index.load(indexPath); } catch { await index.init(); }
|
|
462
|
+
this.progress('production:hnsw-loaded');
|
|
463
|
+
let add = 0; let tombstone = 0;
|
|
464
|
+
for (const op of ops) {
|
|
465
|
+
if (op.retireId && await index.remove(op.retireId)) tombstone += 1;
|
|
466
|
+
if (op.addId && op.embedding) {
|
|
467
|
+
await index.add(op.addId, truncateForHNSW(op.embedding, this.modelInfo.hnswDimension), { file: op.metadata?.file, name: op.metadata?.name, type: op.metadata?.type });
|
|
468
|
+
add += 1;
|
|
469
|
+
}
|
|
470
|
+
if ((add + tombstone) > 0 && (add + tombstone) % 100 === 0) this.progress('production:hnsw-loop');
|
|
471
|
+
}
|
|
472
|
+
await index.save(indexPath);
|
|
473
|
+
this.progress('production:hnsw-saved');
|
|
474
|
+
return { ops: { hnsw_add: add, hnsw_tombstone: tombstone }, manifest: { path: 'codebase-hnsw.idx', stale: 'codebase-hnsw.idx.stale.bin' } };
|
|
475
|
+
}
|
|
476
|
+
|
|
477
|
+
async applyBinaryHNSWDelta(_file, ops) {
|
|
478
|
+
if (!Array.isArray(ops) || ops.length === 0) return { ops: { binary_hnsw_append: 0, binary_hnsw_tombstone: 0 } };
|
|
479
|
+
const indexPath = path.join(this.stateDir, 'codebase-binary-hnsw.idx');
|
|
480
|
+
const index = new BinaryHNSWIndex({ indexPath, stalePath: `${indexPath}.stale.bin`, floatDimension: this.modelInfo.hnswDimension });
|
|
481
|
+
try { await index.load(indexPath); } catch { await index.init(); }
|
|
482
|
+
this.progress('production:binary-hnsw-loaded');
|
|
483
|
+
const binaryVectorsBefore = index.idToIndex?.size ?? 0;
|
|
484
|
+
let append = 0; let tombstone = 0;
|
|
485
|
+
const floatUpserts = [];
|
|
486
|
+
const floatRemoveIds = [];
|
|
487
|
+
for (const op of ops) {
|
|
488
|
+
if (op.retireId) {
|
|
489
|
+
if (markBinaryStale(index, op.retireId)) tombstone += 1;
|
|
490
|
+
floatRemoveIds.push(op.retireId);
|
|
491
|
+
}
|
|
492
|
+
if (op.addId && op.embedding) {
|
|
493
|
+
const truncated = truncateForHNSW(op.embedding, this.modelInfo.hnswDimension);
|
|
494
|
+
await index.add(op.addId, floatToBinary(truncated), op.metadata || {}, normalizedFloatToInt8(truncated));
|
|
495
|
+
floatUpserts.push({ id: op.addId, vector: truncated });
|
|
496
|
+
append += 1;
|
|
497
|
+
}
|
|
498
|
+
if ((append + tombstone) > 0 && (append + tombstone) % 100 === 0) this.progress('production:binary-hnsw-loop');
|
|
499
|
+
}
|
|
500
|
+
await index.save(indexPath);
|
|
501
|
+
this.progress('production:binary-hnsw-saved');
|
|
502
|
+
await maintainFloatStore(indexPath, { upserts: floatUpserts, removeIds: floatRemoveIds, binaryVectorsBefore, dimension: this.modelInfo.hnswDimension });
|
|
503
|
+
this.progress('production:float-store-maintained');
|
|
504
|
+
return { ops: { binary_hnsw_append: append, binary_hnsw_tombstone: tombstone }, manifest: { path: 'codebase-binary-hnsw.idx' } };
|
|
505
|
+
}
|
|
506
|
+
|
|
507
|
+
async applyLIDelta(file, ops) {
|
|
508
|
+
if (!Array.isArray(ops) || ops.length === 0) return { ops: { li_segment_append: 0, li_tombstone: 0 } };
|
|
509
|
+
// LI generated-content parity: full indexing's buildLateInteractionIndex runs
|
|
510
|
+
// applyIndexingChunkPolicy so @generated / config-excluded files never reach
|
|
511
|
+
// the LI encoder, while embeddings/graph/sparse still index them. The skip is
|
|
512
|
+
// a per-file decision computed in applyVectorDelta. Drop this file's LI adds
|
|
513
|
+
// when flagged; retire ops always flow through (a file that becomes generated
|
|
514
|
+
// tombstones its old LI docs and adds none — net retire from LI only).
|
|
515
|
+
const rel = typeof file === 'string' ? file : file?.path;
|
|
516
|
+
const filteredOps = (rel && this._liSkipFiles.has(rel))
|
|
517
|
+
? ops.filter((op) => !(op.addId && op.chunk))
|
|
518
|
+
: ops;
|
|
519
|
+
const { applyLateInteractionDelta } = await import('./production-li-delta.mjs');
|
|
520
|
+
const { appended, tombstone } = await applyLateInteractionDelta({
|
|
521
|
+
indexPath: path.join(this.stateDir, 'codebase-late-interaction.db'),
|
|
522
|
+
ops: filteredOps,
|
|
523
|
+
liEncoder: this.liEncoder,
|
|
524
|
+
pickLiInput,
|
|
525
|
+
onProgress: () => this.progress('production:li-delta'),
|
|
526
|
+
});
|
|
527
|
+
return { ops: { li_segment_append: appended, li_tombstone: tombstone }, manifest: { path: 'codebase-late-interaction.db', segments: 'codebase-late-interaction.db.segments/manifest.json' } };
|
|
528
|
+
}
|
|
529
|
+
|
|
530
|
+
applySparseGramDelta(_file, ops, epoch) {
|
|
531
|
+
if (!Array.isArray(ops) || ops.length === 0) return { ops: { sparse_gram_delta_upsert: 0 } };
|
|
532
|
+
const base = path.join(this.stateDir, 'codebase-sparse-grams.idx');
|
|
533
|
+
let count = 0;
|
|
534
|
+
for (const op of ops) {
|
|
535
|
+
const record = op.deleted ? { weightsId: this.activeSparseWeightsId(base), grams: [] } : sparseGramRecord(base, op.content);
|
|
536
|
+
appendDeltaRecord(base, epoch, {
|
|
537
|
+
fileId: fileIdFor(op.file),
|
|
538
|
+
filePath: op.file,
|
|
539
|
+
contentHash: op.contentHash || '',
|
|
540
|
+
deleted: !!op.deleted,
|
|
541
|
+
symbolMask: 0,
|
|
542
|
+
weightsId: record.weightsId,
|
|
543
|
+
grams: record.grams,
|
|
544
|
+
});
|
|
545
|
+
count += 1;
|
|
546
|
+
if (count % 100 === 0) this.progress('production:sparse-loop');
|
|
547
|
+
}
|
|
548
|
+
this.progress('production:sparse-done');
|
|
549
|
+
return { ops: { sparse_gram_delta_upsert: count }, manifest: { base: 'codebase-sparse-grams.idx', deltas: listDeltaSegments(base, { maxEpoch: epoch }).map((s) => relativeArtifact(this.stateDir, s.path)), weightsId: this.activeSparseWeightsId(base) } };
|
|
550
|
+
}
|
|
551
|
+
|
|
552
|
+
activeSparseWeightsId(base) {
|
|
553
|
+
const latest = resolveLatestSparseWeightsId(base);
|
|
554
|
+
return latest || readManifest(this.stateDir)?.sparseGram?.weightsId || FALLBACK_WEIGHTS_ID;
|
|
555
|
+
}
|
|
556
|
+
|
|
557
|
+
readMaintenanceState() {
|
|
558
|
+
return readMaintenanceStateFromArtifacts(this.stateDir);
|
|
559
|
+
}
|
|
560
|
+
|
|
561
|
+
persistManifest(manifest) {
|
|
562
|
+
writeManifest(this.stateDir, manifest);
|
|
563
|
+
const merklePath = path.join(this.stateDir, MERKLE_STATE);
|
|
564
|
+
const merkle = readJson(merklePath, { version: '2.4', files: {}, stats: {} });
|
|
565
|
+
merkle.files ||= {};
|
|
566
|
+
for (const [file, data] of this.touched.entries()) {
|
|
567
|
+
if (data.hash?.deleted) delete merkle.files[file];
|
|
568
|
+
else merkle.files[file] = { hash: data.hash.contentHash, ...data.hash.stat, epoch: manifest.epoch, chunkIds: data.chunkIds || [] };
|
|
569
|
+
}
|
|
570
|
+
merkle.lastIndex = new Date().toISOString();
|
|
571
|
+
merkle.epoch = manifest.epoch;
|
|
572
|
+
merkle.stats = { ...(merkle.stats || {}), totalFiles: Object.keys(merkle.files).length };
|
|
573
|
+
safeWriteJson(merklePath, merkle);
|
|
574
|
+
try { fs.unlinkSync(path.join(this.stateDir, PROCESSING_QUEUE)); } catch {}
|
|
575
|
+
fs.appendFileSync(path.join(this.stateDir, METRICS_FILE), JSON.stringify({ ...manifest, ts: Date.now() / 1000, epoch: manifest.epoch }) + '\n');
|
|
576
|
+
}
|
|
577
|
+
}
|
|
578
|
+
|
|
579
|
+
// Bundle of internals re-exported under `__testing` (presumably for unit
// tests only — confirm before depending on it elsewhere).
export const __testing = { ProductionReconcileAdapter, sparseGramRecord, markBinaryStale };
|