opencode-semantic-search 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +165 -0
- package/README.md +138 -0
- package/SETUP.md +541 -0
- package/bin/opencode-semantic-search.mjs +70 -0
- package/bun.lock +61 -0
- package/index.ts +138 -0
- package/install.sh +260 -0
- package/package.json +67 -0
- package/src/chunker/fallback.ts +77 -0
- package/src/chunker/index.ts +16 -0
- package/src/chunker/treesitter.ts +119 -0
- package/src/config.ts +157 -0
- package/src/diagnostics/bundle.ts +63 -0
- package/src/diagnostics/routing.ts +37 -0
- package/src/embedder/interface.ts +62 -0
- package/src/embedder/ollama.ts +60 -0
- package/src/embedder/openai.ts +71 -0
- package/src/indexer/delta.ts +165 -0
- package/src/indexer/gc.ts +10 -0
- package/src/indexer/incremental.ts +105 -0
- package/src/indexer/pipeline.test.ts +126 -0
- package/src/indexer/pipeline.ts +394 -0
- package/src/indexer/pool.ts +25 -0
- package/src/indexer/resume.ts +14 -0
- package/src/logger.ts +121 -0
- package/src/runtime.ts +111 -0
- package/src/search/context.ts +17 -0
- package/src/search/hybrid.ts +65 -0
- package/src/store/schema.sql +31 -0
- package/src/store/sqlite.ts +269 -0
- package/src/tools/diagnostic_bundle.ts +34 -0
- package/src/tools/index_status.ts +73 -0
- package/src/tools/reindex.ts +71 -0
- package/src/tools/semantic_search.ts +91 -0
- package/src/tools/smart_grep.ts +198 -0
- package/src/tui_toast.ts +191 -0
- package/src/types.d.ts +1 -0
package/src/config.ts
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
import { homedir } from "node:os";
|
|
2
|
+
import path from "node:path";
|
|
3
|
+
|
|
4
|
+
/** Supported embedding backends. */
export type EmbeddingProvider = "ollama" | "openai";

/**
 * Full plugin configuration shape. Defaults live in `defaultConfig`; user
 * overrides are deep-merged on top from global and per-project JSON files
 * (see `loadConfig`).
 */
export interface PluginConfig {
  logging: {
    enabled: boolean;
    level: "debug" | "info" | "warn" | "error";
    // Logger scopes that should emit verbose output; empty by default.
    verbose_paths?: string[];
    /** Absolute path to a local log file for testing. Entries are appended as newline-delimited JSON. */
    log_file?: string;
  };
  embedding: {
    provider: EmbeddingProvider;
    // Base URL of the embedding HTTP API (OpenAI-compatible `/v1` style).
    api_base: string;
    model: string;
    // Name of the environment variable holding the API key (used by the OpenAI provider).
    api_key_env?: string;
    dimensions: number;
    // Number of inputs sent per embeddings request.
    batch_size: number;
    // Prefix prepended to search queries (e.g. "search_query: " for nomic models).
    query_prefix: string;
    /** Hard ceiling for a single embedding input (e.g. nomic-embed-text 8192). Splits oversized chunks. */
    max_context_tokens: number;
  };
  chunking: {
    max_tokens: number;
    overlap_tokens: number;
  };
  indexing: {
    // Glob patterns selecting files to index; exclude patterns win over include.
    include: string[];
    exclude: string[];
    respect_gitignore: boolean;
    max_file_size_kb: number;
    // Concurrent file reads/hashes during the delta scan.
    concurrency: number;
    /** Concurrent embedding HTTP batch calls (cross-file batching in the indexing pipeline). */
    embed_concurrency: number;
    /** Per-file chunk/read/hash attempts before marking the file failed. */
    retry_attempts: number;
    /** Initial backoff in ms between per-file retries; doubles each attempt. */
    retry_base_ms: number;
  };
  search: {
    top_k: number;
    similarity_threshold: number;
    // Lines of surrounding code returned with each hit.
    context_lines: number;
    hybrid: {
      weight_vector: number;
      weight_bm25: number;
      weight_rrf: number;
      identifier_boost: number;
      /** ANN + BM25 candidate pool size = ceil(topK * candidate_multiplier). */
      candidate_multiplier: number;
    };
  };
  smart_grep: {
    enabled: boolean;
    // Queries with fewer words than this are routed straight to grep.
    min_words_for_semantic: number;
    fallback_to_grep_on_empty: boolean;
  };
  storage: {
    // Root directory for per-project index databases and logs.
    global_cache_dir: string;
  };
}
|
|
64
|
+
|
|
65
|
+
/**
 * Built-in defaults. `loadConfig` deep-merges user overrides on top of this
 * object, so every field here must be a sensible out-of-the-box value.
 */
export const defaultConfig: PluginConfig = {
  logging: {
    enabled: true,
    level: "info",
    verbose_paths: [],
    log_file: path.join(homedir(), ".cache", "opencode", "semantic-search", "plugin.log"),
  },
  embedding: {
    provider: "ollama",
    // Local Ollama server via its OpenAI-compatible endpoint.
    api_base: "http://localhost:11434/v1",
    model: "nomic-embed-text",
    dimensions: 768,
    batch_size: 10,
    // nomic-embed-text task prefix for query-side embeddings.
    query_prefix: "search_query: ",
    max_context_tokens: 8192,
  },
  chunking: {
    max_tokens: 512,
    overlap_tokens: 50,
  },
  indexing: {
    include: ["**/*.ts", "**/*.tsx", "**/*.js", "**/*.jsx", "**/*.py", "**/*.go", "**/*.rs", "**/*.java"],
    exclude: ["node_modules/**", "dist/**", ".git/**", "**/*.min.js", "**/tree/**", "**/.worktrees/**", "**/migration-*/**"],
    respect_gitignore: true,
    max_file_size_kb: 500,
    concurrency: 4,
    embed_concurrency: 2,
    retry_attempts: 3,
    retry_base_ms: 500,
  },
  search: {
    top_k: 10,
    similarity_threshold: 0.30,
    context_lines: 3,
    hybrid: {
      weight_vector: 0.4,
      weight_bm25: 0.3,
      weight_rrf: 0.3,
      identifier_boost: 1.5,
      candidate_multiplier: 1.2,
    },
  },
  smart_grep: {
    enabled: true,
    min_words_for_semantic: 2,
    fallback_to_grep_on_empty: true,
  },
  storage: {
    global_cache_dir: path.join(homedir(), ".cache", "opencode", "semantic-search"),
  },
};
|
|
116
|
+
|
|
117
|
+
function isObject(value: unknown): value is Record<string, unknown> {
|
|
118
|
+
return Boolean(value) && typeof value === "object" && !Array.isArray(value);
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
function mergeDeep<T>(target: T, source: unknown): T {
|
|
122
|
+
if (!isObject(target) || !isObject(source)) return target;
|
|
123
|
+
const result = { ...target } as Record<string, unknown>;
|
|
124
|
+
for (const [key, value] of Object.entries(source)) {
|
|
125
|
+
const existing = result[key];
|
|
126
|
+
if (isObject(existing) && isObject(value)) {
|
|
127
|
+
result[key] = mergeDeep(existing, value);
|
|
128
|
+
continue;
|
|
129
|
+
}
|
|
130
|
+
result[key] = value;
|
|
131
|
+
}
|
|
132
|
+
return result as T;
|
|
133
|
+
}
|
|
134
|
+
|
|
135
|
+
const globalConfigPath = (): string =>
|
|
136
|
+
path.join(homedir(), ".config", "opencode", "semantic-search.json");
|
|
137
|
+
|
|
138
|
+
async function readJsonConfig(filePath: string): Promise<unknown> {
|
|
139
|
+
try {
|
|
140
|
+
const file = Bun.file(filePath);
|
|
141
|
+
if (await file.exists()) {
|
|
142
|
+
return JSON.parse(await file.text());
|
|
143
|
+
}
|
|
144
|
+
} catch {
|
|
145
|
+
// ignore missing or invalid file
|
|
146
|
+
}
|
|
147
|
+
return {};
|
|
148
|
+
}
|
|
149
|
+
|
|
150
|
+
export async function loadConfig(worktree: string): Promise<PluginConfig> {
|
|
151
|
+
const globalParsed = await readJsonConfig(globalConfigPath());
|
|
152
|
+
const projectPath = path.join(worktree, ".opencode", "semantic-search.json");
|
|
153
|
+
const projectParsed = await readJsonConfig(projectPath);
|
|
154
|
+
let merged = mergeDeep(defaultConfig, globalParsed);
|
|
155
|
+
merged = mergeDeep(merged, projectParsed);
|
|
156
|
+
return merged;
|
|
157
|
+
}
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
import type { RuntimeContext } from "../runtime";
|
|
2
|
+
import { getRecentRoutingOutcomes } from "./routing";
|
|
3
|
+
|
|
4
|
+
export async function buildDiagnosticBundle(runtime: RuntimeContext): Promise<{
|
|
5
|
+
ok: boolean;
|
|
6
|
+
worktree: string;
|
|
7
|
+
generated_at: string;
|
|
8
|
+
elapsed_ms: number;
|
|
9
|
+
provider: {
|
|
10
|
+
name: string;
|
|
11
|
+
api_base: string;
|
|
12
|
+
model: string;
|
|
13
|
+
dimensions: number;
|
|
14
|
+
healthy: boolean;
|
|
15
|
+
};
|
|
16
|
+
index: {
|
|
17
|
+
db_path: string;
|
|
18
|
+
files_indexed: number;
|
|
19
|
+
chunks_indexed: number;
|
|
20
|
+
last_sync: string | null;
|
|
21
|
+
};
|
|
22
|
+
routing: {
|
|
23
|
+
recent_count: number;
|
|
24
|
+
recent_outcomes: ReturnType<typeof getRecentRoutingOutcomes>;
|
|
25
|
+
};
|
|
26
|
+
hints: {
|
|
27
|
+
reindex_tool: string;
|
|
28
|
+
debug_logs: string;
|
|
29
|
+
};
|
|
30
|
+
}> {
|
|
31
|
+
const startedAt = Date.now();
|
|
32
|
+
const stats = runtime.store.stats();
|
|
33
|
+
const providerHealthy = await runtime.embedder.healthcheck();
|
|
34
|
+
const recentRouting = getRecentRoutingOutcomes(runtime.store);
|
|
35
|
+
|
|
36
|
+
return {
|
|
37
|
+
ok: true,
|
|
38
|
+
worktree: runtime.worktree,
|
|
39
|
+
generated_at: new Date().toISOString(),
|
|
40
|
+
elapsed_ms: Date.now() - startedAt,
|
|
41
|
+
provider: {
|
|
42
|
+
name: runtime.config.embedding.provider,
|
|
43
|
+
api_base: runtime.config.embedding.api_base,
|
|
44
|
+
model: runtime.config.embedding.model,
|
|
45
|
+
dimensions: runtime.config.embedding.dimensions,
|
|
46
|
+
healthy: providerHealthy,
|
|
47
|
+
},
|
|
48
|
+
index: {
|
|
49
|
+
db_path: runtime.dbPath,
|
|
50
|
+
files_indexed: stats.files,
|
|
51
|
+
chunks_indexed: stats.chunks,
|
|
52
|
+
last_sync: stats.lastSync ?? null,
|
|
53
|
+
},
|
|
54
|
+
routing: {
|
|
55
|
+
recent_count: recentRouting.length,
|
|
56
|
+
recent_outcomes: recentRouting,
|
|
57
|
+
},
|
|
58
|
+
hints: {
|
|
59
|
+
reindex_tool: "Run reindex() in OpenCode if index is stale or empty.",
|
|
60
|
+
debug_logs: "Set logging.level to debug in .opencode/semantic-search.json.",
|
|
61
|
+
},
|
|
62
|
+
};
|
|
63
|
+
}
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import type { SemanticStore } from "../store/sqlite";
|
|
2
|
+
|
|
3
|
+
// sync_state key under which the rolling routing history is persisted.
const ROUTING_STATE_KEY = "routing_recent";
// Cap on persisted events; older outcomes are dropped on append.
const MAX_ROUTING_EVENTS = 25;

/** One smart-grep routing decision; stored newest-first. */
export interface RoutingOutcome {
  // ISO timestamp of the decision.
  at: string;
  // Which backend handled the query.
  route: "semantic" | "ripgrep";
  // Human-readable explanation of why this route was chosen.
  reason: string;
  // Short preview of the search pattern (presumably truncated by the writer — confirm against caller).
  patternPreview: string;
  results?: number;
  providerHealthy?: boolean;
  indexedChunks?: number;
}
|
|
15
|
+
|
|
16
|
+
function safeParse(value: string | null): RoutingOutcome[] {
|
|
17
|
+
if (!value) return [];
|
|
18
|
+
try {
|
|
19
|
+
const parsed = JSON.parse(value) as unknown;
|
|
20
|
+
if (!Array.isArray(parsed)) return [];
|
|
21
|
+
return parsed.filter((item) => {
|
|
22
|
+
return Boolean(item) && typeof item === "object";
|
|
23
|
+
}) as RoutingOutcome[];
|
|
24
|
+
} catch {
|
|
25
|
+
return [];
|
|
26
|
+
}
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
export function getRecentRoutingOutcomes(store: SemanticStore): RoutingOutcome[] {
|
|
30
|
+
return safeParse(store.getSyncState(ROUTING_STATE_KEY));
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
export function appendRoutingOutcome(store: SemanticStore, outcome: RoutingOutcome): void {
|
|
34
|
+
const current = safeParse(store.getSyncState(ROUTING_STATE_KEY));
|
|
35
|
+
const next = [outcome, ...current].slice(0, MAX_ROUTING_EVENTS);
|
|
36
|
+
store.setSyncState(ROUTING_STATE_KEY, JSON.stringify(next));
|
|
37
|
+
}
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
import type { PluginConfig } from "../config";
|
|
2
|
+
import type { Logger } from "../logger";
|
|
3
|
+
|
|
4
|
+
/** Minimal contract every embedding backend implements. */
export interface Embedder {
  /** Embed each input text; returns one vector per input, in input order. */
  embed(texts: string[]): Promise<number[][]>;
  /** Cheap reachability probe of the backing service; never throws. */
  healthcheck(): Promise<boolean>;
}
|
|
8
|
+
|
|
9
|
+
export function withQueryExpansion(input: string): string {
|
|
10
|
+
const lower = input.toLowerCase();
|
|
11
|
+
if (lower.includes("auth")) return `${input} authentication authorize login session token`;
|
|
12
|
+
if (lower.includes("db") || lower.includes("database")) return `${input} database connection pool query migration`;
|
|
13
|
+
if (lower.includes("payment")) return `${input} payment checkout invoice charge transaction`;
|
|
14
|
+
return input;
|
|
15
|
+
}
|
|
16
|
+
|
|
17
|
+
export async function withRetry<T>(
|
|
18
|
+
fn: () => Promise<T>,
|
|
19
|
+
maxAttempts = 3,
|
|
20
|
+
onRetry?: (attempt: number, maxAttempts: number, error: unknown) => Promise<void> | void
|
|
21
|
+
): Promise<T> {
|
|
22
|
+
let attempt = 0;
|
|
23
|
+
let waitMs = 1000;
|
|
24
|
+
let lastErr: unknown;
|
|
25
|
+
while (attempt < maxAttempts) {
|
|
26
|
+
try {
|
|
27
|
+
return await fn();
|
|
28
|
+
} catch (error) {
|
|
29
|
+
lastErr = error;
|
|
30
|
+
attempt += 1;
|
|
31
|
+
if (onRetry) await onRetry(attempt, maxAttempts, error);
|
|
32
|
+
if (attempt >= maxAttempts) break;
|
|
33
|
+
await Bun.sleep(waitMs);
|
|
34
|
+
waitMs *= 4;
|
|
35
|
+
}
|
|
36
|
+
}
|
|
37
|
+
throw lastErr;
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
export function chunked<T>(items: T[], size: number): T[][] {
|
|
41
|
+
const chunkSize = Math.max(1, size);
|
|
42
|
+
const out: T[][] = [];
|
|
43
|
+
for (let i = 0; i < items.length; i += chunkSize) out.push(items.slice(i, i + chunkSize));
|
|
44
|
+
return out;
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
export async function createEmbedder(config: PluginConfig, logger: Logger): Promise<Embedder> {
|
|
48
|
+
if (config.embedding.provider === "openai") {
|
|
49
|
+
const { OpenAIEmbedder } = await import("./openai");
|
|
50
|
+
await logger.info("embedder", {
|
|
51
|
+
message: "Using OpenAI embedder",
|
|
52
|
+
extra: { provider: config.embedding.provider, model: config.embedding.model }
|
|
53
|
+
});
|
|
54
|
+
return new OpenAIEmbedder(config, logger);
|
|
55
|
+
}
|
|
56
|
+
const { OllamaEmbedder } = await import("./ollama");
|
|
57
|
+
await logger.info("embedder", {
|
|
58
|
+
message: "Using Ollama embedder",
|
|
59
|
+
extra: { provider: config.embedding.provider, model: config.embedding.model }
|
|
60
|
+
});
|
|
61
|
+
return new OllamaEmbedder(config, logger);
|
|
62
|
+
}
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
import type { PluginConfig } from "../config";
|
|
2
|
+
import type { Logger } from "../logger";
|
|
3
|
+
import { chunked, withRetry, type Embedder } from "./interface";
|
|
4
|
+
|
|
5
|
+
interface EmbeddingResponse {
|
|
6
|
+
data: Array<{ embedding: number[] }>;
|
|
7
|
+
}
|
|
8
|
+
|
|
9
|
+
export class OllamaEmbedder implements Embedder {
|
|
10
|
+
constructor(
|
|
11
|
+
private readonly config: PluginConfig,
|
|
12
|
+
private readonly logger: Logger
|
|
13
|
+
) {}
|
|
14
|
+
|
|
15
|
+
async healthcheck(): Promise<boolean> {
|
|
16
|
+
try {
|
|
17
|
+
const endpoint = this.config.embedding.api_base.replace(/\/v1$/, "/api/tags");
|
|
18
|
+
const res = await fetch(endpoint);
|
|
19
|
+
await this.logger.debug("embedder.ollama", {
|
|
20
|
+
message: "Healthcheck completed",
|
|
21
|
+
extra: { endpoint, status: res.status, ok: res.ok }
|
|
22
|
+
});
|
|
23
|
+
return res.ok;
|
|
24
|
+
} catch (error) {
|
|
25
|
+
await this.logger.warn("embedder.ollama", { message: "Healthcheck failed" });
|
|
26
|
+
await this.logger.error("embedder.ollama", { message: "Healthcheck error", error });
|
|
27
|
+
return false;
|
|
28
|
+
}
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
async embed(texts: string[]): Promise<number[][]> {
|
|
32
|
+
const results: number[][] = [];
|
|
33
|
+
for (const batch of chunked(texts, this.config.embedding.batch_size)) {
|
|
34
|
+
const response = await withRetry(async () => {
|
|
35
|
+
const res = await fetch(`${this.config.embedding.api_base}/embeddings`, {
|
|
36
|
+
method: "POST",
|
|
37
|
+
headers: { "content-type": "application/json" },
|
|
38
|
+
body: JSON.stringify({
|
|
39
|
+
model: this.config.embedding.model,
|
|
40
|
+
input: batch,
|
|
41
|
+
}),
|
|
42
|
+
});
|
|
43
|
+
if (!res.ok) throw new Error(`Embedding request failed: ${res.status}`);
|
|
44
|
+
return (await res.json()) as EmbeddingResponse;
|
|
45
|
+
}, 3, async (attempt, maxAttempts, error) => {
|
|
46
|
+
await this.logger.warn("embedder.ollama", {
|
|
47
|
+
message: "Embedding request retry",
|
|
48
|
+
extra: {
|
|
49
|
+
attempt,
|
|
50
|
+
maxAttempts,
|
|
51
|
+
endpoint: `${this.config.embedding.api_base}/embeddings`
|
|
52
|
+
}
|
|
53
|
+
});
|
|
54
|
+
await this.logger.error("embedder.ollama", { message: "Embedding request attempt failed", error });
|
|
55
|
+
});
|
|
56
|
+
for (const item of response.data) results.push(item.embedding);
|
|
57
|
+
}
|
|
58
|
+
return results;
|
|
59
|
+
}
|
|
60
|
+
}
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
import type { PluginConfig } from "../config";
|
|
2
|
+
import type { Logger } from "../logger";
|
|
3
|
+
import { chunked, withRetry, type Embedder } from "./interface";
|
|
4
|
+
|
|
5
|
+
// Shape of the /v1/embeddings response we rely on.
interface EmbeddingResponse {
  data: Array<{ embedding: number[] }>;
}

/**
 * Embedder for OpenAI-compatible `/v1` embedding APIs (OpenAI itself or any
 * compatible gateway).
 */
export class OpenAIEmbedder implements Embedder {
  constructor(
    private readonly config: PluginConfig,
    private readonly logger: Logger
  ) {}

  // Request headers. The bearer token is read from the environment variable
  // named by embedding.api_key_env; when absent, no authorization header is sent.
  private get headers(): Record<string, string> {
    const token = this.config.embedding.api_key_env ? process.env[this.config.embedding.api_key_env] : undefined;
    return {
      "content-type": "application/json",
      ...(token ? { authorization: `Bearer ${token}` } : {}),
    };
  }

  // Probe the /models endpoint. A 401 still proves the service is reachable
  // (the key may simply be missing or wrong), so it counts as healthy.
  async healthcheck(): Promise<boolean> {
    try {
      const res = await fetch(`${this.config.embedding.api_base}/models`, { headers: this.headers });
      await this.logger.debug("embedder.openai", {
        message: "Healthcheck completed",
        extra: { status: res.status, ok: res.ok }
      });
      return res.ok || res.status === 401;
    } catch (error) {
      await this.logger.warn("embedder.openai", {
        message: "Healthcheck failed",
        extra: { endpoint: `${this.config.embedding.api_base}/models` }
      });
      await this.logger.error("embedder.openai", { message: "Healthcheck error", error });
      return false;
    }
  }

  // Embed texts in config-sized batches; each HTTP call is retried up to 3
  // times with logging on every failed attempt. Vectors come back in input order.
  async embed(texts: string[]): Promise<number[][]> {
    const out: number[][] = [];
    for (const batch of chunked(texts, this.config.embedding.batch_size)) {
      const body = {
        model: this.config.embedding.model,
        input: batch,
      };
      const data = await withRetry(async () => {
        const res = await fetch(`${this.config.embedding.api_base}/embeddings`, {
          method: "POST",
          headers: this.headers,
          body: JSON.stringify(body),
        });
        if (!res.ok) throw new Error(`Embedding request failed: ${res.status}`);
        return (await res.json()) as EmbeddingResponse;
      }, 3, async (attempt, maxAttempts, error) => {
        await this.logger.warn("embedder.openai", {
          message: "Embedding request retry",
          extra: {
            attempt,
            maxAttempts,
            endpoint: `${this.config.embedding.api_base}/embeddings`
          }
        });
        await this.logger.error("embedder.openai", { message: "Embedding request attempt failed", error });
      });
      for (const row of data.data) out.push(row.embedding);
    }
    return out;
  }
}
|
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
import crypto from "node:crypto";
|
|
2
|
+
import fs from "node:fs";
|
|
3
|
+
import path from "node:path";
|
|
4
|
+
import ignore from "ignore";
|
|
5
|
+
import picomatch from "picomatch";
|
|
6
|
+
import type { PluginConfig } from "../config";
|
|
7
|
+
import type { Embedder } from "../embedder/interface";
|
|
8
|
+
import type { Logger } from "../logger";
|
|
9
|
+
import type { DeltaProgressPayload } from "../runtime";
|
|
10
|
+
import type { SemanticStore } from "../store/sqlite";
|
|
11
|
+
import { garbageCollectDeletedFiles } from "./gc";
|
|
12
|
+
import { mapPool } from "./pool";
|
|
13
|
+
import { runIndexingPipeline, type IndexingPipelineFailure } from "./pipeline";
|
|
14
|
+
import { markSyncCompleted, markSyncStarted } from "./resume";
|
|
15
|
+
|
|
16
|
+
function sha256(input: string): string {
|
|
17
|
+
return crypto.createHash("sha256").update(input).digest("hex");
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
/** Paths from Bun.Glob use the platform separator; gitignore expects `/`. */
|
|
21
|
+
function toPosixRelative(relativePath: string): string {
|
|
22
|
+
return path.sep === "\\" ? relativePath.replaceAll("\\", "/") : relativePath;
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
function loadRootGitignore(worktree: string): ReturnType<typeof ignore> | null {
|
|
26
|
+
try {
|
|
27
|
+
const gitignorePath = path.join(worktree, ".gitignore");
|
|
28
|
+
const content = fs.readFileSync(gitignorePath, "utf8");
|
|
29
|
+
return ignore().add(content);
|
|
30
|
+
} catch {
|
|
31
|
+
return null;
|
|
32
|
+
}
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
async function listFiles(worktree: string, config: PluginConfig): Promise<string[]> {
|
|
36
|
+
const includeMatchers = config.indexing.include.map((p) => picomatch(p, { dot: true }));
|
|
37
|
+
const excludeMatchers = config.indexing.exclude.map((p) => picomatch(p, { dot: true }));
|
|
38
|
+
const gitignoreFilter =
|
|
39
|
+
config.indexing.respect_gitignore ? loadRootGitignore(worktree) : null;
|
|
40
|
+
const files: string[] = [];
|
|
41
|
+
const glob = new Bun.Glob("**/*");
|
|
42
|
+
for await (const relativePath of glob.scan({ cwd: worktree, dot: true, onlyFiles: true })) {
|
|
43
|
+
if (gitignoreFilter !== null && gitignoreFilter.ignores(toPosixRelative(relativePath))) {
|
|
44
|
+
continue;
|
|
45
|
+
}
|
|
46
|
+
const included = includeMatchers.some((m) => m(relativePath));
|
|
47
|
+
if (!included) continue;
|
|
48
|
+
const excluded = excludeMatchers.some((m) => m(relativePath));
|
|
49
|
+
if (excluded) continue;
|
|
50
|
+
files.push(path.join(worktree, relativePath));
|
|
51
|
+
}
|
|
52
|
+
return files;
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
/** Optional hooks for a delta sync run. */
export interface DeltaSyncOptions {
  logger?: Logger;
  // Called repeatedly with phase/progress updates for TUI display.
  onProgress?: (update: DeltaProgressPayload) => void;
}

/**
 * Incrementally synchronize the index with the worktree:
 *  1. list eligible files,
 *  2. hash each file concurrently and keep only those whose content hash
 *     differs from the stored one ("dirty"),
 *  3. run the indexing pipeline over the dirty files,
 *  4. garbage-collect entries for files that no longer exist.
 * Sync start/completion markers and a final "idle" progress event are
 * emitted even when the run throws.
 */
export async function deltaSync(
  worktree: string,
  store: SemanticStore,
  embedder: Embedder,
  config: PluginConfig,
  options?: DeltaSyncOptions
): Promise<void> {
  const logger = options?.logger;
  const onProgress = options?.onProgress;
  const startedAt = Date.now();
  let indexedCount = 0;
  let lastFailedFiles: string[] | undefined;
  markSyncStarted(store);
  try {
    onProgress?.({ phase: "scanning", current: 0, total: 0, label: "listing files" });
    const files = await listFiles(worktree, config);
    await logger?.info("indexer.delta", {
      message: "Listed files for index",
      extra: { count: files.length },
    });
    const previousStats = store.stats();
    const initialChunks = previousStats.chunks;
    // Snapshot of currently-present paths, used later to prune deleted files.
    const existing = new Set<string>(files);
    const total = files.length;
    const concurrency = Math.max(1, config.indexing.concurrency);
    onProgress?.({
      phase: "indexing",
      current: 0,
      total,
      label: total === 0 ? "no files matched" : `${total} files to scan`,
    });
    let scanned = 0;
    // Concurrent hash scan: returns the path for dirty files, null for unchanged.
    // NOTE(review): the whole file is read here regardless of max_file_size_kb —
    // presumably size filtering happens in the pipeline; confirm.
    const dirtyFlags = await mapPool(files, concurrency, async (filePath) => {
      const text = await Bun.file(filePath).text();
      const contentHash = sha256(text);
      const row = store.db.query("SELECT content_hash FROM files WHERE path = ?").get(filePath) as { content_hash: string } | null;
      scanned += 1;
      onProgress?.({
        phase: "indexing",
        current: scanned,
        total,
        label: `scan: ${path.basename(filePath)}`,
      });
      return row?.content_hash !== contentHash ? filePath : null;
    });
    const toIndex = dirtyFlags.filter((p): p is string => p !== null);
    const unchanged = files.length - toIndex.length;
    await logger?.info("indexer.delta", {
      message: "Hash scan finished",
      extra: {
        totalFiles: files.length,
        dirtyFiles: toIndex.length,
        unchanged,
      },
    });
    const pipelineResult = await runIndexingPipeline(toIndex, store, embedder, config, {
      logger,
      onProgress,
    });
    indexedCount = pipelineResult.indexed;
    // Remember failures so the final "idle" progress event can report them.
    lastFailedFiles =
      pipelineResult.failed.length > 0
        ? pipelineResult.failed.map((f: IndexingPipelineFailure) => f.filePath)
        : undefined;
    onProgress?.({
      phase: "gc",
      current: total,
      total,
      label: "removing stale entries",
    });
    await logger?.info("indexer.delta", {
      message: "Garbage collection started",
      extra: { trackedFiles: existing.size },
    });
    // Two-pass cleanup: drop rows not in this scan, then rows whose file vanished.
    store.removeMissingFiles(existing);
    garbageCollectDeletedFiles(store);
    const updatedStats = store.stats();
    const chunkDelta = updatedStats.chunks - initialChunks;
    if (logger) {
      const failedCount = pipelineResult.failed.length;
      if (failedCount > 0) {
        await logger.warn("indexer.delta", {
          message: "Some files failed during indexing pipeline",
          extra: {
            failedCount,
            failedPaths: pipelineResult.failed.map((f) => f.filePath),
          },
        });
      }
      await logger.info("indexer.delta", {
        message: "Delta sync completed",
        extra: {
          scannedFiles: files.length,
          dirtyFiles: toIndex.length,
          filesWritten: indexedCount,
          chunkDelta,
          elapsedMs: Date.now() - startedAt,
          pipelineFailed: failedCount,
        },
      });
    }
  } finally {
    // Always reset the progress display and persist the completion marker,
    // even when listing/hashing/pipeline threw.
    onProgress?.({ phase: "idle", current: 0, total: 0, failedFiles: lastFailedFiles });
    markSyncCompleted(store);
  }
}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
import fs from "node:fs";
|
|
2
|
+
import type { SemanticStore } from "../store/sqlite";
|
|
3
|
+
|
|
4
|
+
export function garbageCollectDeletedFiles(store: SemanticStore): void {
|
|
5
|
+
const rows = store.db.query("SELECT path FROM files").all() as Array<{ path: string }>;
|
|
6
|
+
for (const row of rows) {
|
|
7
|
+
if (fs.existsSync(row.path)) continue;
|
|
8
|
+
store.db.query("DELETE FROM files WHERE path = ?").run(row.path);
|
|
9
|
+
}
|
|
10
|
+
}
|