opencode-semantic-search 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +165 -0
- package/README.md +138 -0
- package/SETUP.md +541 -0
- package/bin/opencode-semantic-search.mjs +70 -0
- package/bun.lock +61 -0
- package/index.ts +138 -0
- package/install.sh +260 -0
- package/package.json +67 -0
- package/src/chunker/fallback.ts +77 -0
- package/src/chunker/index.ts +16 -0
- package/src/chunker/treesitter.ts +119 -0
- package/src/config.ts +157 -0
- package/src/diagnostics/bundle.ts +63 -0
- package/src/diagnostics/routing.ts +37 -0
- package/src/embedder/interface.ts +62 -0
- package/src/embedder/ollama.ts +60 -0
- package/src/embedder/openai.ts +71 -0
- package/src/indexer/delta.ts +165 -0
- package/src/indexer/gc.ts +10 -0
- package/src/indexer/incremental.ts +105 -0
- package/src/indexer/pipeline.test.ts +126 -0
- package/src/indexer/pipeline.ts +394 -0
- package/src/indexer/pool.ts +25 -0
- package/src/indexer/resume.ts +14 -0
- package/src/logger.ts +121 -0
- package/src/runtime.ts +111 -0
- package/src/search/context.ts +17 -0
- package/src/search/hybrid.ts +65 -0
- package/src/store/schema.sql +31 -0
- package/src/store/sqlite.ts +269 -0
- package/src/tools/diagnostic_bundle.ts +34 -0
- package/src/tools/index_status.ts +73 -0
- package/src/tools/reindex.ts +71 -0
- package/src/tools/semantic_search.ts +91 -0
- package/src/tools/smart_grep.ts +198 -0
- package/src/tui_toast.ts +191 -0
- package/src/types.d.ts +1 -0
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
import crypto from "node:crypto";
|
|
2
|
+
import type { PluginConfig } from "../config";
|
|
3
|
+
import { chunkFile } from "../chunker";
|
|
4
|
+
import { withQueryExpansion, type Embedder } from "../embedder/interface";
|
|
5
|
+
import type { Logger } from "../logger";
|
|
6
|
+
import type { SemanticStore } from "../store/sqlite";
|
|
7
|
+
|
|
8
|
+
function sha256(input: string): string {
|
|
9
|
+
return crypto.createHash("sha256").update(input).digest("hex");
|
|
10
|
+
}
|
|
11
|
+
|
|
12
|
+
export async function indexSingleFile(
|
|
13
|
+
filePath: string,
|
|
14
|
+
store: SemanticStore,
|
|
15
|
+
embedder: Embedder,
|
|
16
|
+
config: PluginConfig,
|
|
17
|
+
logger?: Logger
|
|
18
|
+
): Promise<void> {
|
|
19
|
+
const startedAt = Date.now();
|
|
20
|
+
const file = Bun.file(filePath);
|
|
21
|
+
if (!(await file.exists())) {
|
|
22
|
+
await logger?.debug("indexer.incremental", {
|
|
23
|
+
message: "Skipped missing file",
|
|
24
|
+
extra: { filePath }
|
|
25
|
+
});
|
|
26
|
+
return;
|
|
27
|
+
}
|
|
28
|
+
const stat = await file.stat();
|
|
29
|
+
if (stat.size / 1024 > config.indexing.max_file_size_kb) {
|
|
30
|
+
await logger?.debug("indexer.incremental", {
|
|
31
|
+
message: "Skipped oversized file",
|
|
32
|
+
extra: { filePath, sizeBytes: stat.size, maxFileSizeKb: config.indexing.max_file_size_kb }
|
|
33
|
+
});
|
|
34
|
+
return;
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
const text = await file.text();
|
|
38
|
+
const fileHash = sha256(text);
|
|
39
|
+
|
|
40
|
+
const chunks = await chunkFile(filePath, text, config);
|
|
41
|
+
await logger?.debug("indexer.incremental", {
|
|
42
|
+
message: "Chunked file for indexing",
|
|
43
|
+
extra: { filePath, chunkCount: chunks.length }
|
|
44
|
+
});
|
|
45
|
+
|
|
46
|
+
const toEmbed: string[] = [];
|
|
47
|
+
const meta = chunks.map((chunk) => {
|
|
48
|
+
const chunkHash = sha256(chunk.text);
|
|
49
|
+
const cached = store.getChunkByHash(chunkHash);
|
|
50
|
+
if (!cached) {
|
|
51
|
+
toEmbed.push(withQueryExpansion(chunk.text));
|
|
52
|
+
}
|
|
53
|
+
return { chunk, chunkHash, cached };
|
|
54
|
+
});
|
|
55
|
+
|
|
56
|
+
const embeddings = toEmbed.length > 0 ? await embedder.embed(toEmbed) : [];
|
|
57
|
+
let cursor = 0;
|
|
58
|
+
const writes: Array<{
|
|
59
|
+
startLine: number;
|
|
60
|
+
endLine: number;
|
|
61
|
+
text: string;
|
|
62
|
+
chunkHash: string;
|
|
63
|
+
embedding: number[];
|
|
64
|
+
}> = [];
|
|
65
|
+
for (const m of meta) {
|
|
66
|
+
if (m.cached) {
|
|
67
|
+
writes.push({
|
|
68
|
+
startLine: m.chunk.startLine,
|
|
69
|
+
endLine: m.chunk.endLine,
|
|
70
|
+
text: m.chunk.text,
|
|
71
|
+
chunkHash: m.chunkHash,
|
|
72
|
+
embedding: m.cached.embedding,
|
|
73
|
+
});
|
|
74
|
+
continue;
|
|
75
|
+
}
|
|
76
|
+
const embedding = embeddings[cursor] ?? [];
|
|
77
|
+
cursor += 1;
|
|
78
|
+
writes.push({
|
|
79
|
+
startLine: m.chunk.startLine,
|
|
80
|
+
endLine: m.chunk.endLine,
|
|
81
|
+
text: m.chunk.text,
|
|
82
|
+
chunkHash: m.chunkHash,
|
|
83
|
+
embedding,
|
|
84
|
+
});
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
store.writeFileAndChunks(filePath, fileHash, writes);
|
|
88
|
+
|
|
89
|
+
if (toEmbed.length === 0) {
|
|
90
|
+
await logger?.debug("indexer.incremental", {
|
|
91
|
+
message: "No new embeddings needed",
|
|
92
|
+
extra: { filePath, chunkCount: chunks.length, elapsedMs: Date.now() - startedAt }
|
|
93
|
+
});
|
|
94
|
+
return;
|
|
95
|
+
}
|
|
96
|
+
await logger?.info("indexer.incremental", {
|
|
97
|
+
message: "Indexed file",
|
|
98
|
+
extra: {
|
|
99
|
+
filePath,
|
|
100
|
+
chunkCount: chunks.length,
|
|
101
|
+
embeddedChunks: toEmbed.length,
|
|
102
|
+
elapsedMs: Date.now() - startedAt
|
|
103
|
+
}
|
|
104
|
+
});
|
|
105
|
+
}
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
import fs from "node:fs";
|
|
2
|
+
import path from "node:path";
|
|
3
|
+
import { test, expect } from "bun:test";
|
|
4
|
+
import { defaultConfig } from "../config";
|
|
5
|
+
import type { Embedder } from "../embedder/interface";
|
|
6
|
+
import type { DeltaProgressPayload } from "../runtime";
|
|
7
|
+
import { SemanticStore } from "../store/sqlite";
|
|
8
|
+
import { chunked } from "../embedder/interface";
|
|
9
|
+
import { runIndexingPipeline } from "./pipeline";
|
|
10
|
+
|
|
11
|
+
// Embedding dimensionality shared by every mock embedder in this file,
// taken from the default plugin config so vectors match the store schema.
const dim = defaultConfig.embedding.dimensions;

// Deterministic embedder stub: returns a constant vector per input text so
// tests never contact a real embedding provider.
const mockEmbedder: Embedder = {
  async embed(texts: string[]) {
    return texts.map(() => new Array<number>(dim).fill(0.01));
  },
  async healthcheck() {
    return true;
  },
};

// Absolute path to the SQLite schema file the SemanticStore applies on open.
function schemaPath(): string {
  return path.join(import.meta.dir, "../store/schema.sql");
}
|
|
25
|
+
|
|
26
|
+
test("runIndexingPipeline writes one file and chunks", async () => {
|
|
27
|
+
const dir = fs.mkdtempSync(path.join(fs.realpathSync("/tmp"), "sem-pipe-"));
|
|
28
|
+
const dbPath = path.join(dir, "idx.sqlite");
|
|
29
|
+
const store = new SemanticStore(dbPath, schemaPath());
|
|
30
|
+
const filePath = path.join(dir, "hello.ts");
|
|
31
|
+
fs.writeFileSync(filePath, "export const x = 1;\n");
|
|
32
|
+
|
|
33
|
+
const result = await runIndexingPipeline([filePath], store, mockEmbedder, defaultConfig, {});
|
|
34
|
+
expect(result.indexed).toBe(1);
|
|
35
|
+
expect(result.failed.length).toBe(0);
|
|
36
|
+
|
|
37
|
+
const stats = store.stats();
|
|
38
|
+
expect(stats.files).toBe(1);
|
|
39
|
+
expect(stats.chunks).toBeGreaterThan(0);
|
|
40
|
+
store.close();
|
|
41
|
+
});
|
|
42
|
+
|
|
43
|
+
test("runIndexingPipeline handles multiple files with cross-file embedding batches", async () => {
|
|
44
|
+
const dir = fs.mkdtempSync(path.join(fs.realpathSync("/tmp"), "sem-pipe-"));
|
|
45
|
+
const dbPath = path.join(dir, "idx2.sqlite");
|
|
46
|
+
const store = new SemanticStore(dbPath, schemaPath());
|
|
47
|
+
const a = path.join(dir, "a.ts");
|
|
48
|
+
const b = path.join(dir, "b.ts");
|
|
49
|
+
fs.writeFileSync(a, "export const a = 1;\n");
|
|
50
|
+
fs.writeFileSync(b, "export const b = 2;\n");
|
|
51
|
+
|
|
52
|
+
const cfg = {
|
|
53
|
+
...defaultConfig,
|
|
54
|
+
embedding: { ...defaultConfig.embedding, batch_size: 2 },
|
|
55
|
+
indexing: { ...defaultConfig.indexing, concurrency: 2, embed_concurrency: 2 },
|
|
56
|
+
};
|
|
57
|
+
|
|
58
|
+
const result = await runIndexingPipeline([a, b], store, mockEmbedder, cfg, {});
|
|
59
|
+
expect(result.indexed).toBe(2);
|
|
60
|
+
expect(result.failed.length).toBe(0);
|
|
61
|
+
expect(store.stats().files).toBe(2);
|
|
62
|
+
store.close();
|
|
63
|
+
});
|
|
64
|
+
|
|
65
|
+
test("chunked batch size 0 does not hang (regression)", () => {
|
|
66
|
+
const out = chunked([1, 2, 3, 4], 0);
|
|
67
|
+
expect(out.length).toBeGreaterThan(0);
|
|
68
|
+
expect(out.flat().length).toBe(4);
|
|
69
|
+
});
|
|
70
|
+
|
|
71
|
+
// Stress the cache-heavy path: with 10 single-chunk files where the first 8
// chunk lookups hit the fake cache and the rest miss, every file must be
// written exactly once and progress updates must never go backwards.
test("runIndexingPipeline writes each file once and progress is monotonic with cache-heavy mix", async () => {
  const dir = fs.mkdtempSync(path.join(fs.realpathSync("/tmp"), "sem-pipe-"));
  const files: string[] = [];
  for (let i = 0; i < 10; i += 1) {
    const fp = path.join(dir, `f-${i}.ts`);
    fs.writeFileSync(fp, `export const v${i} = ${i};\n`);
    files.push(fp);
  }

  const writeCalls: string[] = [];
  let chunkLookupCalls = 0;
  // Fake store: the first 8 getChunkByHash calls report a cache hit, the rest
  // miss, forcing a mix of cached and freshly embedded chunks. writeFileAndChunks
  // only records the path so we can assert write count and uniqueness.
  // NOTE(review): call-count-based cache behavior assumes the pipeline performs
  // one lookup per chunk in file order (concurrency: 1 below keeps that stable).
  const fakeStore = {
    getChunkByHash(_chunkHash: string): { embedding: number[] } | null {
      chunkLookupCalls += 1;
      if (chunkLookupCalls <= 8) {
        return { embedding: new Array<number>(dim).fill(0.25) };
      }
      return null;
    },
    writeFileAndChunks(filePath: string): number {
      writeCalls.push(filePath);
      return writeCalls.length;
    },
  } as unknown as SemanticStore;

  // Embedder with artificial latency so embed batches interleave with flushes.
  const variableEmbedder: Embedder = {
    async embed(texts: string[]) {
      await Bun.sleep(5);
      return texts.map(() => new Array<number>(dim).fill(0.02));
    },
    async healthcheck() {
      return true;
    },
  };

  const progress: DeltaProgressPayload[] = [];
  // batch_size 1 + embed_concurrency 2 maximizes interleaving of embedding
  // batches; concurrency 1 keeps chunk-lookup order deterministic.
  const cfg = {
    ...defaultConfig,
    embedding: { ...defaultConfig.embedding, batch_size: 1 },
    indexing: { ...defaultConfig.indexing, concurrency: 1, embed_concurrency: 2 },
  };
  const result = await runIndexingPipeline(files, fakeStore, variableEmbedder, cfg, {
    onProgress(update) {
      progress.push(update);
    },
  });

  expect(result.failed.length).toBe(0);
  expect(result.indexed).toBe(10);
  // Each file written exactly once — no duplicates, no omissions.
  expect(writeCalls.length).toBe(10);
  expect(new Set(writeCalls).size).toBe(10);
  // Progress `current` must be monotonically non-decreasing.
  const currentValues = progress.map((p) => p.current);
  for (let i = 1; i < currentValues.length; i += 1) {
    expect(currentValues[i]).toBeGreaterThanOrEqual(currentValues[i - 1] ?? 0);
  }
});
|
|
@@ -0,0 +1,394 @@
|
|
|
1
|
+
import crypto from "node:crypto";
|
|
2
|
+
import type { PluginConfig } from "../config";
|
|
3
|
+
import { chunkFile } from "../chunker";
|
|
4
|
+
import { chunked, withQueryExpansion, type Embedder } from "../embedder/interface";
|
|
5
|
+
import type { Logger } from "../logger";
|
|
6
|
+
import type { DeltaProgressPayload } from "../runtime";
|
|
7
|
+
import type { SemanticStore } from "../store/sqlite";
|
|
8
|
+
import { mapPool } from "./pool";
|
|
9
|
+
|
|
10
|
+
function sha256(input: string): string {
|
|
11
|
+
return crypto.createHash("sha256").update(input).digest("hex");
|
|
12
|
+
}
|
|
13
|
+
|
|
14
|
+
async function withIndexingRetry<T>(
|
|
15
|
+
fn: () => Promise<T>,
|
|
16
|
+
maxAttempts: number,
|
|
17
|
+
baseMs: number
|
|
18
|
+
): Promise<T> {
|
|
19
|
+
let attempt = 0;
|
|
20
|
+
let waitMs = Math.max(1, baseMs);
|
|
21
|
+
let lastErr: unknown;
|
|
22
|
+
while (attempt < maxAttempts) {
|
|
23
|
+
try {
|
|
24
|
+
return await fn();
|
|
25
|
+
} catch (error) {
|
|
26
|
+
lastErr = error;
|
|
27
|
+
attempt += 1;
|
|
28
|
+
if (attempt >= maxAttempts) break;
|
|
29
|
+
await Bun.sleep(waitMs);
|
|
30
|
+
waitMs *= 2;
|
|
31
|
+
}
|
|
32
|
+
}
|
|
33
|
+
throw lastErr;
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
/** One chunk of a file awaiting (or already holding) an embedding vector. */
interface PendingRow {
  // Line span of the chunk within its source file.
  startLine: number;
  endLine: number;
  // Raw chunk text as stored in the index.
  text: string;
  // sha256 of `text`; used as the embedding cache key in the store.
  chunkHash: string;
  // Present once served from cache or filled in by the embedding stage.
  embedding?: number[];
  // True when no cached embedding was found and the chunk must be embedded.
  needsEmbed: boolean;
  // Query-expanded text actually sent to the embedder (set only when needsEmbed).
  embedText?: string;
}

/** Prepare-stage success: file was read, hashed, and chunked. */
interface PreparedOk {
  kind: "ok";
  filePath: string;
  // sha256 of the entire file contents.
  fileHash: string;
  rows: PendingRow[];
}

/** Prepare-stage skip: file missing or over the configured size limit. */
interface PreparedSkip {
  kind: "skip";
}

/** Prepare-stage failure after all retries were exhausted. */
interface PreparedFail {
  kind: "fail";
  filePath: string;
  error: unknown;
}

/** Discriminated union of prepare-stage outcomes (tag: `kind`). */
type Prepared = PreparedOk | PreparedSkip | PreparedFail;

/** A file that could not be indexed, with the error that stopped it. */
export interface IndexingPipelineFailure {
  filePath: string;
  error: unknown;
}

/** Aggregate result of one pipeline run. */
export interface IndexingPipelineResult {
  /** Files successfully written to the store. */
  indexed: number;
  /** Per-file failures after retries or DB errors. */
  failed: IndexingPipelineFailure[];
}

/** Optional collaborators for a pipeline run. */
export interface IndexingPipelineOptions {
  logger?: Logger;
  // Invoked with monotonically non-decreasing `current` progress updates.
  onProgress?: (update: DeltaProgressPayload) => void;
}
|
|
81
|
+
|
|
82
|
+
/**
 * Full indexing pipeline over a set of files, in three stages:
 *
 * 1. Prepare (concurrent, `indexing.concurrency`): read, hash, and chunk each
 *    file, resolving cached embeddings by chunk hash.
 * 2. Embed (concurrent, `indexing.embed_concurrency`): batch-embed all
 *    uncached chunk texts across files; as each file's chunks become complete
 *    it is flushed to the store immediately.
 * 3. Final sweep: write any files not yet flushed.
 *
 * Progress callbacks are monotonic in `current`; per-file failures are
 * collected rather than aborting the run.
 *
 * @returns count of files written plus the list of per-file failures
 */
export async function runIndexingPipeline(
  filePaths: readonly string[],
  store: SemanticStore,
  embedder: Embedder,
  config: PluginConfig,
  options?: IndexingPipelineOptions
): Promise<IndexingPipelineResult> {
  const logger = options?.logger;
  const onProgress = options?.onProgress;
  // Clamp all tuning knobs to sane minimums so a bad config can't stall the run.
  const chunkConcurrency = Math.max(1, config.indexing.concurrency);
  const embedConcurrency = Math.max(1, config.indexing.embed_concurrency);
  const retryAttempts = Math.max(1, config.indexing.retry_attempts);
  const retryBaseMs = Math.max(1, config.indexing.retry_base_ms);

  if (filePaths.length === 0) {
    return { indexed: 0, failed: [] };
  }

  await logger?.info("indexer.pipeline", {
    message: "Chunking stage started",
    extra: {
      totalFiles: filePaths.length,
      chunkConcurrency,
      retryAttempts,
      embedConcurrency,
      batchSize: config.embedding.batch_size,
    },
  });

  onProgress?.({
    phase: "indexing",
    current: 0,
    total: filePaths.length,
    label: "chunking files…",
  });

  // Stage 1 worker: read + chunk one file, with retry. Skips (missing or
  // oversized files) are not errors; failures are captured per file.
  async function prepareOne(filePath: string): Promise<Prepared> {
    try {
      return await withIndexingRetry(async (): Promise<PreparedOk | PreparedSkip> => {
        const file = Bun.file(filePath);
        if (!(await file.exists())) {
          await logger?.debug("indexer.pipeline", {
            message: "Skipped missing file",
            extra: { filePath }
          });
          return { kind: "skip" };
        }
        const stat = await file.stat();
        if (stat.size / 1024 > config.indexing.max_file_size_kb) {
          await logger?.debug("indexer.pipeline", {
            message: "Skipped oversized file",
            extra: { filePath, sizeBytes: stat.size, maxFileSizeKb: config.indexing.max_file_size_kb }
          });
          return { kind: "skip" };
        }
        const text = await file.text();
        const fileHash = sha256(text);
        const chunks = await chunkFile(filePath, text, config);
        await logger?.debug("indexer.pipeline", {
          message: "Chunked file for indexing",
          extra: { filePath, chunkCount: chunks.length }
        });
        const rows: PendingRow[] = [];
        for (const chunk of chunks) {
          const chunkHash = sha256(chunk.text);
          // Cache hit: reuse stored embedding; miss: mark for the embed stage.
          const cached = store.getChunkByHash(chunkHash);
          if (cached) {
            rows.push({
              startLine: chunk.startLine,
              endLine: chunk.endLine,
              text: chunk.text,
              chunkHash,
              embedding: cached.embedding,
              needsEmbed: false,
            });
          } else {
            rows.push({
              startLine: chunk.startLine,
              endLine: chunk.endLine,
              text: chunk.text,
              chunkHash,
              needsEmbed: true,
              embedText: withQueryExpansion(chunk.text),
            });
          }
        }
        return { kind: "ok", filePath, fileHash, rows };
      }, retryAttempts, retryBaseMs);
    } catch (error) {
      return { kind: "fail", filePath, error };
    }
  }

  const preparedList = await mapPool([...filePaths], chunkConcurrency, (fp, _i) => prepareOne(fp));

  // Partition stage-1 outcomes by their discriminant tag.
  const okFiles: PreparedOk[] = [];
  const failed: IndexingPipelineFailure[] = [];
  let skippedStage1 = 0;
  for (const p of preparedList) {
    if (p.kind === "ok") {
      okFiles.push(p);
    } else if (p.kind === "fail") {
      failed.push({ filePath: p.filePath, error: p.error });
    } else {
      skippedStage1 += 1;
    }
  }

  const failedStage1 = failed.length;
  // `done` counts files fully resolved (skipped/failed/written) for progress.
  let done = skippedStage1 + failedStage1;
  const totalFiles = filePaths.length;
  // High-water mark so reported progress never moves backwards.
  let reportedCurrent = done;

  const totalChunksPrepared = okFiles.reduce((n, f) => n + f.rows.length, 0);
  await logger?.info("indexer.pipeline", {
    message: "Chunking stage finished",
    extra: {
      totalFiles,
      preparedOk: okFiles.length,
      skipped: skippedStage1,
      failedPrepare: failedStage1,
      totalChunks: totalChunksPrepared,
    },
  });

  for (const f of failed) {
    const errMsg = f.error instanceof Error ? f.error.message : String(f.error);
    await logger?.warn("indexer.pipeline", {
      message: "Prepare stage failed for file (after retries)",
      extra: { filePath: f.filePath, errorMessage: errMsg },
    });
  }

  // A single text to embed, addressed back into okFiles by (file, row) index.
  interface EmbedJob {
    okIndex: number;
    rowIdx: number;
    text: string;
  }

  const embedJobs: EmbedJob[] = [];
  for (let fi = 0; fi < okFiles.length; fi += 1) {
    const f = okFiles[fi];
    if (!f) continue;
    for (let ri = 0; ri < f.rows.length; ri += 1) {
      const row = f.rows[ri];
      if (!row) continue;
      if (row.needsEmbed && row.embedText !== undefined) {
        embedJobs.push({ okIndex: fi, rowIdx: ri, text: row.embedText });
      }
    }
  }

  const postStage1Done = skippedStage1 + failedStage1;
  // Emit a progress update, clamped to totalFiles and forced monotonic.
  function emitProgress(current: number, label: string): void {
    const capped = Math.min(totalFiles, current);
    reportedCurrent = Math.max(reportedCurrent, capped);
    onProgress?.({
      phase: "indexing",
      current: reportedCurrent,
      total: totalFiles,
      label,
    });
  }
  emitProgress(done, embedJobs.length > 0 ? "embedding…" : "saving index…");

  const writtenFileIndices = new Set<number>();
  const writeFailedIndices = new Set<number>();
  let indexed = 0;

  // Write every fully-embedded, not-yet-written file to the store. A write
  // failure marks the file failed (it will not be retried) without stopping
  // the sweep over the remaining files.
  async function tryFlushReadyFiles(embedBatchLabel?: string): Promise<void> {
    for (let fi = 0; fi < okFiles.length; fi += 1) {
      if (writtenFileIndices.has(fi) || writeFailedIndices.has(fi)) continue;
      const f = okFiles[fi];
      if (!f) continue;
      // Only flush once every row has an embedding (cached or computed).
      if (!f.rows.every((r) => r.embedding !== undefined)) continue;
      const writes: Array<{
        startLine: number;
        endLine: number;
        text: string;
        chunkHash: string;
        embedding: number[];
      }> = [];
      for (const row of f.rows) {
        const emb = row.embedding;
        if (emb === undefined) continue;
        writes.push({
          startLine: row.startLine,
          endLine: row.endLine,
          text: row.text,
          chunkHash: row.chunkHash,
          embedding: emb,
        });
      }
      try {
        // Mark before writing so a concurrent sweep can't double-write;
        // rolled back in the catch below on failure.
        writtenFileIndices.add(fi);
        store.writeFileAndChunks(f.filePath, f.fileHash, writes);
        indexed += 1;
        await logger?.info("indexer.pipeline", {
          message: "Indexed file",
          extra: {
            filePath: f.filePath,
            chunkCount: f.rows.length,
            embeddedChunks: f.rows.filter((r) => r.needsEmbed).length,
          },
        });
        done = postStage1Done + indexed + writeFailedIndices.size;
        const tail = embedBatchLabel ? ` · ${embedBatchLabel}` : "";
        emitProgress(done, `saved ${writtenFileIndices.size}/${okFiles.length}${tail}`);
      } catch (error) {
        writtenFileIndices.delete(fi);
        writeFailedIndices.add(fi);
        failed.push({ filePath: f.filePath, error });
        await logger?.error("indexer.pipeline", {
          message: "Failed to write file to index",
          extra: { filePath: f.filePath },
          error,
        });
      }
    }
  }

  if (embedJobs.length > 0) {
    const embedStartedAt = Date.now();
    const batches = chunked(embedJobs, config.embedding.batch_size);
    await logger?.info("indexer.pipeline", {
      message: "Embedding stage started (writes flush as each file becomes ready)",
      extra: {
        textsToEmbed: embedJobs.length,
        batchCount: batches.length,
        batchSize: config.embedding.batch_size,
        embedConcurrency,
      },
    });
    // Serialize flushes through a promise chain so concurrent embed workers
    // never run two store-write sweeps at once.
    let flushQueue = Promise.resolve();
    const flushReadyFiles = (label?: string): Promise<void> => {
      const run = async () => {
        await tryFlushReadyFiles(label);
      };
      flushQueue = flushQueue.then(run, run);
      return flushQueue;
    };
    // Flush anything already complete from cache before embedding begins.
    await flushReadyFiles();
    let embedBatchesDone = 0;
    const batchCount = batches.length;
    await mapPool(batches, embedConcurrency, async (batch) => {
      if (batch.length === 0) return;
      const texts = batch.map((j) => j.text);
      const vectors = await withIndexingRetry(
        () => embedder.embed(texts),
        retryAttempts,
        retryBaseMs
      );
      embedBatchesDone += 1;
      const batchLabel = `embedding batch ${embedBatchesDone}/${batchCount}`;
      if (vectors.length !== batch.length) {
        // Misbehaving provider: zero-fill so the files can still be written
        // rather than stalling the whole batch forever.
        await logger?.warn("indexer.pipeline", {
          message: "Embedding provider returned wrong vector count — filling zeros for batch",
          extra: { expected: batch.length, received: vectors.length },
        });
        for (const job of batch) {
          if (job) {
            const target = okFiles[job.okIndex]?.rows[job.rowIdx];
            if (target) {
              target.embedding = new Array<number>(config.embedding.dimensions).fill(0);
            }
          }
        }
      } else {
        // Route each vector back to its (file, row) slot.
        for (let vi = 0; vi < batch.length; vi += 1) {
          const job = batch[vi];
          const vec = vectors[vi];
          if (!job) continue;
          const target = okFiles[job.okIndex]?.rows[job.rowIdx];
          if (target) {
            target.embedding = vec ?? [];
          }
        }
      }
      // Estimate file-level progress from the fraction of batches completed.
      done =
        postStage1Done +
        (batchCount > 0 ? Math.floor((embedBatchesDone / batchCount) * okFiles.length) : okFiles.length);
      emitProgress(done, batchLabel);
      await flushReadyFiles(batchLabel);
    });
    await logger?.info("indexer.pipeline", {
      message: "Embedding stage finished",
      extra: { elapsedMs: Date.now() - embedStartedAt, textsEmbedded: embedJobs.length },
    });
  } else {
    await logger?.info("indexer.pipeline", {
      message: "Embedding stage skipped (all chunks served from cache)",
      extra: { files: okFiles.length },
    });
    await tryFlushReadyFiles();
  }

  // Safety net: write anything not flushed during the embedding stage.
  await logger?.info("indexer.pipeline", {
    message: "Write sweep for any remaining files",
    extra: { pending: okFiles.length - writtenFileIndices.size - writeFailedIndices.size },
  });
  await tryFlushReadyFiles();

  await logger?.info("indexer.pipeline", {
    message: "Indexing pipeline finished",
    extra: {
      indexed,
      failedCount: failed.length,
      failedPaths: failed.map((x) => x.filePath),
    },
  });

  return { indexed, failed };
}
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Run up to `concurrency` tasks in parallel; results match `items` order.
|
|
3
|
+
*/
|
|
4
|
+
export async function mapPool<T, R>(
|
|
5
|
+
items: readonly T[],
|
|
6
|
+
concurrency: number,
|
|
7
|
+
mapper: (item: T, index: number) => Promise<R>
|
|
8
|
+
): Promise<R[]> {
|
|
9
|
+
if (items.length === 0) return [];
|
|
10
|
+
const limit = Math.max(1, Math.min(concurrency, items.length));
|
|
11
|
+
const results = new Array<R>(items.length);
|
|
12
|
+
let nextIndex = 0;
|
|
13
|
+
const worker = async (): Promise<void> => {
|
|
14
|
+
while (true) {
|
|
15
|
+
const i = nextIndex;
|
|
16
|
+
nextIndex += 1;
|
|
17
|
+
if (i >= items.length) return;
|
|
18
|
+
const item = items[i];
|
|
19
|
+
if (item === undefined) return;
|
|
20
|
+
results[i] = await mapper(item, i);
|
|
21
|
+
}
|
|
22
|
+
};
|
|
23
|
+
await Promise.all(Array.from({ length: limit }, () => worker()));
|
|
24
|
+
return results;
|
|
25
|
+
}
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import type { SemanticStore } from "../store/sqlite";
|
|
2
|
+
|
|
3
|
+
/**
 * Record in the store's sync-state table that a sync run has begun.
 * Cleared by markSyncCompleted; read back by isSyncInProgress.
 */
export function markSyncStarted(store: SemanticStore): void {
  store.setSyncState("sync_in_progress", "true");
}
|
|
6
|
+
|
|
7
|
+
/**
 * Clear the in-progress flag and stamp the completion time (ISO 8601, UTC)
 * in the store's sync-state table.
 */
export function markSyncCompleted(store: SemanticStore): void {
  store.setSyncState("sync_in_progress", "false");
  store.setSyncState("last_sync", new Date().toISOString());
}
|
|
11
|
+
|
|
12
|
+
/**
 * True if a sync was started (markSyncStarted) and has not yet been marked
 * completed — e.g. a previous run crashed mid-sync.
 */
export function isSyncInProgress(store: SemanticStore): boolean {
  return store.getSyncState("sync_in_progress") === "true";
}
|