@pi-unipi/cocoindex 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +93 -0
- package/bridge.ts +774 -0
- package/commands.ts +175 -0
- package/index.ts +55 -0
- package/installer.ts +397 -0
- package/package.json +42 -0
- package/skills/cocoindex/SKILL.md +88 -0
- package/tools.ts +131 -0
package/bridge.ts
ADDED
|
@@ -0,0 +1,774 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* bridge.ts — CocoIndex CLI interaction layer
|
|
3
|
+
*
|
|
4
|
+
* Spawns cocoindex commands and queries LanceDB directly for search.
|
|
5
|
+
* The bridge handles:
|
|
6
|
+
* - CLI detection (is cocoindex installed?)
|
|
7
|
+
* - Pipeline initialization (scaffold main.py)
|
|
8
|
+
* - Project indexing (cocoindex update)
|
|
9
|
+
* - Status reporting (last run, doc count)
|
|
10
|
+
* - Search (query LanceDB directly via Node.js SDK)
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import { execFileSync, spawn } from "node:child_process";
|
|
14
|
+
import { existsSync, mkdirSync, readFileSync, writeFileSync, readdirSync, statSync } from "node:fs";
|
|
15
|
+
import { homedir } from "node:os";
|
|
16
|
+
import { join, dirname } from "node:path";
|
|
17
|
+
import { COCOINDEX_MIN_VERSION } from "@pi-unipi/core";
|
|
18
|
+
|
|
19
|
+
// ─────────────────────────────────────────────────────────
|
|
20
|
+
// Types
|
|
21
|
+
// ─────────────────────────────────────────────────────────
|
|
22
|
+
|
|
23
|
+
/** Outcome of one `cocoindex update` run, as produced by `indexProject`. */
export interface IndexResult {
  /** True when the update process exited with code 0. */
  success: boolean;
  /** Count parsed from the CLI's stdout; 0 when no count could be parsed. */
  chunksProcessed: number;
  /** Elapsed wall-clock time in milliseconds (0 when the run was never started). */
  durationMs: number;
  /** Failure description; only populated on unsuccessful results. */
  error?: string;
}
|
|
29
|
+
|
|
30
|
+
/** Snapshot of the index state for a project, as produced by `status`. */
export interface StatusInfo {
  /** True when `docCount` is greater than zero. */
  indexed: boolean;
  /** ISO timestamp of the `.lancedb` directory's mtime, or null when it is missing or unreadable. */
  lastRun: string | null;
  /** Number of `.lance` files under the `.lancedb` directory — a rough estimate, not an exact document count. */
  docCount: number;
  /** True when `main.py` exists in the pipeline directory. */
  pipelineConfigured: boolean;
  /** Result of the CLI availability probe. */
  cliAvailable: boolean;
  /** Store detected from the text of `main.py`; "unknown" when it cannot be determined. */
  targetStore: "lancedb" | "postgres" | "qdrant" | "sqlite" | "unknown";
}
|
|
38
|
+
|
|
39
|
+
/** Optional paging controls accepted by `search`. */
export interface SearchOptions {
  /** Maximum number of results to return; `search` defaults this to 10. */
  limit?: number;
  /** Number of leading results to skip; `search` defaults this to 0. */
  offset?: number;
  /** NOTE(review): declared but not read by `search` in this file — confirm whether callers rely on it. */
  minScore?: number;
}
|
|
44
|
+
|
|
45
|
+
/** One search hit, normalised from a LanceDB row. */
export interface SearchResult {
  /** Row title when present, otherwise the row's path/source. */
  title: string;
  /** Chunk text taken from the row's `content` or `text` column. */
  content: string;
  /** Row `source`, falling back to its `path`. */
  source: string;
  /**
   * Ordering value. Vector/FTS hits use the row's `_distance` (or `1 - score`);
   * the lexical fallback overwrites it with its own match score, so the scale
   * depends on which search path produced the hit.
   */
  rank: number;
  /** "code" when the row says so or its path has a known source-file extension, else "prose". */
  contentType: "code" | "prose";
  /** Search path that produced the hit. */
  matchLayer: "vector" | "fulltext" | "hybrid";
}
|
|
53
|
+
|
|
54
|
+
/**
 * Project/pipeline locations plus an initialisation flag.
 * NOTE(review): not referenced elsewhere in this file; field meanings below are
 * inferred from their names — confirm against the callers that consume it.
 */
export interface CocoindexDeps {
  /** Presumably the project root directory. */
  projectDir: string;
  /** Presumably the directory holding the generated `main.py`. */
  pipelineDir: string;
  /** Presumably whether the pipeline has been scaffolded. */
  initialized: boolean;
}
|
|
59
|
+
|
|
60
|
+
// ─────────────────────────────────────────────────────────
|
|
61
|
+
// Constants
|
|
62
|
+
// ─────────────────────────────────────────────────────────
|
|
63
|
+
|
|
64
|
+
// NOTE(review): not referenced anywhere else in this file — confirm it is still needed.
const COCOINDEX_STATE_DIR = ".cocoindex";
|
|
65
|
+
// Pipeline directory, relative to the project root (joined in getPipelineDir).
const DEFAULT_PIPELINE_DIR = ".unipi/cocoindex";
|
|
66
|
+
// NOTE(review): unused in this file — status() and search() build the same path by hand from the pipeline dir.
const DEFAULT_LANCEDB_PATH = ".unipi/cocoindex/.lancedb";
|
|
67
|
+
// Upper bound on the rows the lexical fallback search pulls from LanceDB for in-memory scoring.
const DEFAULT_LEXICAL_SCAN_LIMIT = 50_000;
|
|
68
|
+
|
|
69
|
+
// ─────────────────────────────────────────────────────────
|
|
70
|
+
// CLI Detection
|
|
71
|
+
// ─────────────────────────────────────────────────────────
|
|
72
|
+
|
|
73
|
+
// Memoized result of the CLI probe; null means "not probed yet" (cleared by resetAvailabilityCache).
let cachedAvailable: boolean | null = null;
|
|
74
|
+
|
|
75
|
+
/** Options for `isAvailable`. */
export interface AvailabilityOptions {
  /**
   * Whether to read and update the cached availability result (default: true).
   * Pass `false` to bypass the cache and force a fresh probe.
   */
  useCache?: boolean;
}
|
|
79
|
+
|
|
80
|
+
/**
 * Pull the first `MAJOR.MINOR` or `MAJOR.MINOR.PATCH` number group out of
 * arbitrary text such as `cocoindex --version` output.
 *
 * @param versionStr Raw text that may contain a version number.
 * @returns The matched version string, or null when none is present.
 */
export function parseVersion(versionStr: string): string | null {
  // The version must not be glued to other digits on either side.
  const versionPattern = /(?:^|[^0-9])(\d+\.\d+(?:\.\d+)?)(?:[^0-9]|$)/;
  const found = versionPattern.exec(versionStr);
  if (found === null) {
    return null;
  }
  return found[1] ?? null;
}
|
|
85
|
+
|
|
86
|
+
/**
 * Decide whether `version` is greater than or equal to `minimum`, comparing
 * numeric components left to right (missing components count as 0).
 *
 * @param version Version text to test; null, undefined or unparsable input is rejected.
 * @param minimum Lowest acceptable version (defaults to `COCOINDEX_MIN_VERSION`).
 * @returns True only when both versions parse and `version` is not lower than `minimum`.
 */
export function isVersionAtLeast(version: string | null | undefined, minimum = COCOINDEX_MIN_VERSION): boolean {
  const actualText = version ? parseVersion(version) : null;
  const minimumText = parseVersion(minimum);
  if (!actualText || !minimumText) {
    return false;
  }

  const toNumbers = (text: string): number[] => text.split(".").map((piece) => Number.parseInt(piece, 10));
  const have = toNumbers(actualText);
  const need = toNumbers(minimumText);
  // Always compare at least major/minor/patch.
  const width = Math.max(have.length, need.length, 3);

  let position = 0;
  while (position < width) {
    const haveN = have[position] ?? 0;
    const needN = need[position] ?? 0;
    if (!(Number.isFinite(haveN) && Number.isFinite(needN))) {
      return false;
    }
    if (haveN !== needN) {
      // The first differing component decides the comparison.
      return haveN > needN;
    }
    position += 1;
  }
  return true;
}
|
|
105
|
+
|
|
106
|
+
/**
 * Drop the memoized CLI-availability result so the next cached `isAvailable()`
 * call probes the binary again. Intended for use after installer mutations.
 */
export function resetAvailabilityCache(): void {
  cachedAvailable = null;
}
|
|
110
|
+
|
|
111
|
+
/**
 * Report whether the cocoindex CLI can be executed.
 *
 * Runs `<bin> --version` (5 s timeout) and treats any non-empty output as
 * "available". With `useCache` enabled (the default) a previous answer is
 * returned without probing and a fresh answer is stored; with `useCache: false`
 * the cache is neither read nor written.
 *
 * @param options Cache behaviour; see `AvailabilityOptions`.
 * @returns True when the probe produced output, false on any failure.
 */
export async function isAvailable(options: AvailabilityOptions = {}): Promise<boolean> {
  const useCache = options.useCache ?? true;
  if (useCache && cachedAvailable !== null) {
    return cachedAvailable;
  }

  const probe = (): boolean => {
    try {
      const output = execFileSync(getCocoindexBinPath(), ["--version"], {
        encoding: "utf-8",
        timeout: 5000,
        stdio: ["pipe", "pipe", "pipe"],
      });
      return output.trim().length > 0;
    } catch {
      // Missing binary, non-zero exit or timeout all mean "not available".
      return false;
    }
  };

  const available = probe();
  if (useCache) {
    cachedAvailable = available;
  }
  return available;
}
|
|
131
|
+
|
|
132
|
+
/**
 * Run `<bin> --version` (5 s timeout) and return its trimmed output.
 *
 * @returns The raw, trimmed version output, or null when the command fails.
 */
export async function getVersion(): Promise<string | null> {
  let rawOutput: string;
  try {
    rawOutput = execFileSync(getCocoindexBinPath(), ["--version"], {
      encoding: "utf-8",
      timeout: 5000,
      stdio: ["pipe", "pipe", "pipe"],
    });
  } catch {
    return null;
  }
  return rawOutput.trim();
}
|
|
145
|
+
|
|
146
|
+
/**
 * Public accessor for the cocoindex executable location.
 *
 * @returns Whatever `resolveCocoindexBin` yields: an absolute path when one is
 * found, otherwise the bare command name `cocoindex`.
 */
export function getCocoindexBinPath(): string {
  const binPath = resolveCocoindexBin();
  return binPath;
}
|
|
150
|
+
|
|
151
|
+
/**
 * Locate the cocoindex executable.
 *
 * Lookup order:
 *  1. `command -v cocoindex` via `sh` (i.e. the current PATH),
 *  2. `~/.local/bin/cocoindex` (default `uv tool install` location),
 *  3. `~/.local/share/mise/installs/python/<version>/bin/cocoindex`, newest
 *     Python version first,
 *  4. the bare name `cocoindex`, leaving resolution to the OS at spawn time.
 *
 * @returns A path (or bare command name) suitable for `execFileSync`/`spawn`.
 */
function resolveCocoindexBin(): string {
  // Try PATH first.
  try {
    const resolved = execFileSync("sh", ["-c", "command -v cocoindex"], {
      encoding: "utf-8",
      timeout: 3000,
      stdio: ["pipe", "pipe", "pipe"],
    }).trim();
    if (resolved) return resolved;
  } catch {
    // Not on PATH (or no POSIX shell available).
  }

  // uv tool install exposes binaries here by default.
  const uvToolBin = join(homedir(), ".local", "bin", "cocoindex");
  if (existsSync(uvToolBin)) return uvToolBin;

  // Try mise python installations, newest version first.
  const miseRoot = join(homedir(), ".local", "share", "mise", "installs", "python");
  try {
    // Numeric-aware ordering: a plain lexicographic sort would rank "3.9" above "3.12".
    const versions = readdirSync(miseRoot).sort((a, b) =>
      b.localeCompare(a, undefined, { numeric: true }),
    );
    for (const ver of versions) {
      const binPath = join(miseRoot, ver, "bin", "cocoindex");
      if (existsSync(binPath)) return binPath;
    }
  } catch {
    // mise not installed or no python versions
  }

  return "cocoindex"; // Fall back to PATH resolution
}
|
|
182
|
+
|
|
183
|
+
// ─────────────────────────────────────────────────────────
|
|
184
|
+
// Pipeline Management
|
|
185
|
+
// ─────────────────────────────────────────────────────────
|
|
186
|
+
|
|
187
|
+
/**
 * Compute where a project's cocoindex pipeline lives.
 *
 * @param projectDir Project root directory.
 * @returns `projectDir` joined with the default pipeline sub-directory (`.unipi/cocoindex`).
 */
export function getPipelineDir(projectDir: string): string {
  const pipelineDir = join(projectDir, DEFAULT_PIPELINE_DIR);
  return pipelineDir;
}
|
|
191
|
+
|
|
192
|
+
/**
 * Tell whether a pipeline has been scaffolded in `pipelineDir`.
 *
 * @param pipelineDir Directory expected to contain the generated pipeline.
 * @returns True when `main.py` exists inside `pipelineDir`.
 */
export async function isPipelineInitialized(pipelineDir: string): Promise<boolean> {
  const entryPoint = join(pipelineDir, "main.py");
  return existsSync(entryPoint);
}
|
|
196
|
+
|
|
197
|
+
/**
 * Guess which target store a pipeline writes to by scanning the text of its
 * `main.py` for well-known store names.
 *
 * Candidates are checked in a fixed order (LanceDB, Postgres, Qdrant, SQLite)
 * and the first one with a matching marker wins.
 *
 * @param pipelineDir Directory containing `main.py`.
 * @returns The detected store, or "unknown" when the file is missing,
 * unreadable, or mentions none of the markers.
 */
export function detectTargetStore(pipelineDir: string): StatusInfo["targetStore"] {
  const scriptPath = join(pipelineDir, "main.py");
  if (!existsSync(scriptPath)) {
    return "unknown";
  }

  let source: string;
  try {
    source = readFileSync(scriptPath, "utf-8");
  } catch {
    return "unknown";
  }

  // Order matters: earlier entries take precedence when several markers appear.
  const markers: Array<[StatusInfo["targetStore"], string[]]> = [
    ["lancedb", ["LanceDB", "lancedb"]],
    ["postgres", ["Postgres", "postgresql"]],
    ["qdrant", ["Qdrant", "qdrant"]],
    ["sqlite", ["SQLite", "sqlite"]],
  ];
  for (const [store, needles] of markers) {
    if (needles.some((needle) => source.includes(needle))) {
      return store;
    }
  }
  return "unknown";
}
|
|
213
|
+
|
|
214
|
+
/**
 * Scaffold a cocoindex pipeline (default LanceDB target) for a project.
 *
 * Creates the pipeline directory if needed and writes a generated `main.py`.
 * An existing `main.py` is never overwritten; that case is reported as success.
 *
 * @param projectDir Project root directory.
 * @returns `{ success: true }` when the pipeline exists afterwards, otherwise
 * `{ success: false, error }` describing the filesystem failure. The function
 * does not throw for I/O errors.
 */
export async function initPipeline(projectDir: string): Promise<{ success: boolean; error?: string }> {
  const pipelineDir = getPipelineDir(projectDir);
  const mainPyPath = join(pipelineDir, "main.py");

  try {
    // `recursive: true` makes this a no-op when the directory already exists.
    mkdirSync(pipelineDir, { recursive: true });

    // Don't overwrite existing pipeline
    if (existsSync(mainPyPath)) {
      return { success: true };
    }

    // Read embedding config from memory settings
    const embeddingConfig = loadEmbeddingConfig();

    const template = generatePipelineTemplate(projectDir, embeddingConfig);
    writeFileSync(mainPyPath, template, "utf-8");

    return { success: true };
  } catch (err: unknown) {
    // Surface permission/disk errors through the declared result shape
    // instead of rejecting the promise.
    return { success: false, error: err instanceof Error ? err.message : String(err) };
  }
}
|
|
236
|
+
|
|
237
|
+
// ─────────────────────────────────────────────────────────
|
|
238
|
+
// Indexing
|
|
239
|
+
// ─────────────────────────────────────────────────────────
|
|
240
|
+
|
|
241
|
+
/**
 * Index a project by running `cocoindex update main.py` inside its pipeline
 * directory.
 *
 * The run is limited to five minutes; the child is killed when the limit is
 * exceeded. The returned promise always resolves — failures are described in
 * the result rather than thrown.
 *
 * @param projectDir Project root directory.
 * @returns Success flag, the count parsed from the CLI output, the elapsed
 * time, and an error message when the CLI is missing, the pipeline is not
 * initialised, the process could not be spawned, exited non-zero, or was
 * terminated by a signal.
 */
export async function indexProject(projectDir: string): Promise<IndexResult> {
  const available = await isAvailable();
  if (!available) {
    return {
      success: false,
      chunksProcessed: 0,
      durationMs: 0,
      error: "CocoIndex CLI not found. Run /unipi:cocoindex-init to install cocoindex[lancedb]>=1.0.",
    };
  }

  const pipelineDir = getPipelineDir(projectDir);
  if (!existsSync(join(pipelineDir, "main.py"))) {
    return {
      success: false,
      chunksProcessed: 0,
      durationMs: 0,
      error: "Pipeline not initialized. Run /unipi:cocoindex-init first.",
    };
  }

  const start = Date.now();
  const timeoutMs = 300000; // 5 min timeout

  const cocoindexBin = resolveCocoindexBin();

  return new Promise<IndexResult>((resolve) => {
    const proc = spawn(cocoindexBin, ["update", "main.py"], {
      cwd: pipelineDir,
      stdio: ["pipe", "pipe", "pipe"],
      timeout: timeoutMs,
    });

    let stdout = "";
    let stderr = "";

    proc.stdout.on("data", (data: Buffer) => {
      stdout += data.toString();
    });

    proc.stderr.on("data", (data: Buffer) => {
      stderr += data.toString();
    });

    proc.on("close", (code: number | null, signal: string | null) => {
      const durationMs = Date.now() - start;
      const chunksProcessed = parseChunksProcessed(stdout);

      if (code === 0) {
        resolve({ success: true, chunksProcessed, durationMs });
        return;
      }

      const stderrText = stderr.trim();
      let error: string;
      if (signal) {
        // A killed child has `code === null`; this is also how the spawn
        // `timeout` manifests, so mention the limit instead of "code null".
        const reason = `Process terminated by signal ${signal} (runs are limited to ${timeoutMs / 1000}s)`;
        error = stderrText ? `${reason}: ${stderrText}` : reason;
      } else {
        error = stderrText || `Process exited with code ${code}`;
      }

      resolve({ success: false, chunksProcessed, durationMs, error });
    });

    // Emitted when the process could not be spawned (e.g. binary missing).
    proc.on("error", (err: Error) => {
      resolve({
        success: false,
        chunksProcessed: 0,
        durationMs: Date.now() - start,
        error: err.message,
      });
    });
  });
}
|
|
311
|
+
|
|
312
|
+
/**
 * Extract the processed-item count from `cocoindex update` output.
 *
 * Preferred source is the last `process_file:` summary line that mentions
 * "added" or "reprocessed" (v1.0+ format, e.g.
 * "✅ process_file: 604 total | 604 added"); the first number followed by one of
 * those words is used. Otherwise older phrasings such as "Processed 42 chunks"
 * are tried.
 *
 * @param output Captured stdout of the CLI.
 * @returns The parsed count, or 0 when nothing matches.
 */
function parseChunksProcessed(output: string): number {
  // Keep only the most recent matching summary line.
  const summaryLine = output.split("\n").reduce<string | undefined>((latest, line) => {
    const isSummary =
      line.includes("process_file:") && (line.includes("added") || line.includes("reprocessed"));
    return isSummary ? line : latest;
  }, undefined);

  const counted = summaryLine?.match(/(\d+)\s+(?:added|reprocessed)/);
  if (counted) {
    return parseInt(counted[1], 10);
  }

  // Legacy output formats, tried in order against the whole output.
  const legacyPatterns = [
    /processed\s+(\d+)\s+chunks?/i,
    /(\d+)\s+chunks?\s+processed/i,
    /indexed\s+(\d+)/i,
  ];
  for (const pattern of legacyPatterns) {
    const hit = output.match(pattern);
    if (hit) {
      return parseInt(hit[1], 10);
    }
  }
  return 0;
}
|
|
335
|
+
|
|
336
|
+
// ─────────────────────────────────────────────────────────
|
|
337
|
+
// Status
|
|
338
|
+
// ─────────────────────────────────────────────────────────
|
|
339
|
+
|
|
340
|
+
/**
 * Collect a status snapshot for a project's index.
 *
 * Combines the CLI availability probe, the presence of `main.py`, the store
 * detected from it, and a look at the `.lancedb` directory inside the pipeline
 * directory (its mtime and the number of `.lance` files it contains).
 *
 * @param projectDir Project root directory.
 * @returns A populated `StatusInfo`; filesystem errors while inspecting the
 * LanceDB directory are ignored and leave the affected fields at their defaults.
 */
export async function status(projectDir: string): Promise<StatusInfo> {
  const pipelineDir = getPipelineDir(projectDir);
  const cliAvailable = await isAvailable();
  const pipelineConfigured = existsSync(join(pipelineDir, "main.py"));
  const targetStore = detectTargetStore(pipelineDir);

  const lanceDir = join(pipelineDir, ".lancedb");
  let lastRun: string | null = null;
  let docCount = 0;

  if (existsSync(lanceDir)) {
    try {
      // Directory mtime stands in for "last indexing run".
      lastRun = statSync(lanceDir).mtime.toISOString();
      // Number of .lance files is only a rough size estimate.
      const entries = readdirSync(lanceDir, { recursive: true }) as string[];
      docCount = entries.filter((entry) => entry.endsWith(".lance")).length;
    } catch {
      // Non-fatal: report whatever was gathered so far.
    }
  }

  const indexed = docCount > 0;
  return { indexed, lastRun, docCount, pipelineConfigured, cliAvailable, targetStore };
}
|
|
373
|
+
|
|
374
|
+
// ─────────────────────────────────────────────────────────
|
|
375
|
+
// Search
|
|
376
|
+
// ─────────────────────────────────────────────────────────
|
|
377
|
+
|
|
378
|
+
/**
 * Search indexed content by querying the project's LanceDB store directly.
 *
 * Strategy, first non-empty result set wins:
 *  1. vector search, when a query embedding can be generated,
 *  2. LanceDB native full-text search,
 *  3. in-memory lexical scoring over the table rows.
 *
 * Only the first table reported by the database is searched.
 *
 * @param projectDir Project root directory.
 * @param query Free-text query.
 * @param options Paging options (`limit` defaults to 10, `offset` to 0).
 * @returns Matching results; an empty array when no index exists; or a single
 * synthetic "Search Unavailable" / "Search Error" entry when the LanceDB SDK
 * is missing or the search fails. This function does not throw.
 */
export async function search(
  projectDir: string,
  query: string,
  options?: SearchOptions,
): Promise<SearchResult[]> {
  const limit = options?.limit ?? 10;
  const offset = options?.offset ?? 0;

  try {
    const lancedbPath = join(getPipelineDir(projectDir), ".lancedb");
    if (!existsSync(lancedbPath)) {
      return [];
    }

    // Dynamic import — LanceDB SDK may not be installed.
    // @ts-ignore — optional dependency, may not be installed
    const lancedb = await import("@lancedb/lancedb");
    const db = await lancedb.connect(lancedbPath);

    const tableNames = await db.tableNames();
    if (tableNames.length === 0) return [];

    const table = await db.openTable(tableNames[0]);

    // Prefer semantic vector search when the pipeline/table provides a vector
    // column. Older generated pipelines only contain path/chunk_index/content;
    // LanceDB throws for those tables, so continue to FTS/lexical fallback.
    const queryVector = await generateQueryEmbedding(query);
    if (queryVector) {
      const vectorResults = await vectorSearch(table, queryVector, limit, offset);
      if (vectorResults.length > 0) return vectorResults;
    }

    // Prefer LanceDB's native FTS when an inverted index exists on content.
    const ftsResults = await fullTextSearch(table, query, limit, offset);
    if (ftsResults.length > 0) return ftsResults;

    // Last-resort compatibility path for existing text-only LanceDB tables.
    // This keeps indexed projects searchable immediately instead of returning
    // a misleading "run cocoindex-update" message when no vector/FTS index is
    // available yet.
    return lexicalSearch(table, query, limit, offset);
  } catch (err: unknown) {
    const details = (err ?? {}) as { code?: unknown; message?: unknown };
    const message = typeof details.message === "string" ? details.message : String(err);

    // CommonJS resolution reports MODULE_NOT_FOUND / "Cannot find module";
    // ESM dynamic import reports ERR_MODULE_NOT_FOUND / "Cannot find package".
    const sdkMissing =
      details.code === "MODULE_NOT_FOUND" ||
      details.code === "ERR_MODULE_NOT_FOUND" ||
      message.includes("Cannot find module") ||
      message.includes("Cannot find package");

    if (sdkMissing) {
      return [{
        title: "Search Unavailable",
        content: "LanceDB SDK not installed. Install with: npm install @lancedb/lancedb",
        source: "",
        rank: 0,
        contentType: "prose",
        matchLayer: "fulltext",
      }];
    }
    return [{
      title: "Search Error",
      content: `CocoIndex LanceDB search failed: ${message}`,
      source: "",
      rank: 0,
      contentType: "prose",
      matchLayer: "fulltext",
    }];
  }
}
|
|
442
|
+
|
|
443
|
+
/**
 * Nearest-neighbour search against a LanceDB table.
 *
 * Fetches `limit + offset` rows and drops the first `offset` of them.
 *
 * @param table Opened LanceDB table handle.
 * @param queryVector Embedding of the query text.
 * @param limit Maximum number of results to return.
 * @param offset Number of leading hits to skip.
 * @returns Normalised hits tagged "vector"; an empty array when the query
 * fails for any reason (e.g. the table has no vector column).
 */
async function vectorSearch(table: any, queryVector: number[], limit: number, offset: number): Promise<SearchResult[]> {
  try {
    const fetchCount = limit + offset;
    const hits = await table.search(queryVector).limit(fetchCount).toArray();
    const page = hits.slice(offset);
    return page.map((row: any, position: number) => rowToSearchResult(row, position, "vector"));
  } catch {
    // Deliberate best-effort: the caller falls back to other search layers.
    return [];
  }
}
|
|
454
|
+
|
|
455
|
+
/**
 * LanceDB native full-text search, used when vector search is unavailable or
 * returns nothing.
 *
 * Fetches `limit + offset` rows and drops the first `offset` of them.
 *
 * @param table Opened LanceDB table handle.
 * @param query Free-text query.
 * @param limit Maximum number of results to return.
 * @param offset Number of leading hits to skip.
 * @returns Normalised hits tagged "fulltext"; an empty array when the query
 * fails for any reason.
 */
async function fullTextSearch(table: any, query: string, limit: number, offset: number): Promise<SearchResult[]> {
  try {
    const fetchCount = limit + offset;
    const hits = await table.search(query, "fts").limit(fetchCount).toArray();
    const page = hits.slice(offset);
    return page.map((row: any, position: number) => rowToSearchResult(row, position, "fulltext"));
  } catch {
    // Deliberate best-effort: the caller falls back to lexical scoring.
    return [];
  }
}
|
|
467
|
+
|
|
468
|
+
/**
 * In-memory keyword search for LanceDB tables that contain text chunks but
 * offer neither a vector column nor a full-text index.
 *
 * Loads up to `DEFAULT_LEXICAL_SCAN_LIMIT` rows and scores each one: every
 * query term contributes its occurrence count in the content plus three times
 * its occurrence count in the path, and a row containing the whole query
 * phrase gets a bonus of four points per term. Rows scoring zero are dropped.
 *
 * @param table Opened LanceDB table handle.
 * @param query Free-text query.
 * @param limit Maximum number of results to return.
 * @param offset Number of best-scoring rows to skip.
 * @returns Hits tagged "fulltext", best score first, with `rank` set to the
 * score; an empty array when the query has no usable terms or anything fails.
 */
async function lexicalSearch(table: any, query: string, limit: number, offset: number): Promise<SearchResult[]> {
  try {
    const terms = tokenize(query);
    if (terms.length === 0) {
      return [];
    }

    const rows = await table.query().limit(DEFAULT_LEXICAL_SCAN_LIMIT).toArray();
    const phrase = query.trim().toLowerCase();

    const ranked: Array<{ row: any; score: number }> = [];
    for (const row of rows) {
      const content = String(row.content ?? row.text ?? "");
      const path = String(row.path ?? row.source ?? "");
      // Lower-case once per row rather than once per term.
      const lowerContent = content.toLowerCase();
      const lowerPath = path.toLowerCase();

      let score = 0;
      for (const term of terms) {
        // Path matches are weighted three times as heavily as content matches.
        score += countOccurrences(lowerContent, term) + countOccurrences(lowerPath, term) * 3;
      }

      // Bonus when the full query phrase appears verbatim.
      if (phrase && `${lowerPath}\n${lowerContent}`.includes(phrase)) {
        score += terms.length * 4;
      }

      if (score > 0) {
        ranked.push({ row, score });
      }
    }

    ranked.sort((left, right) => right.score - left.score);
    const page = ranked.slice(offset, offset + limit);

    return page.map((entry, position) => ({
      ...rowToSearchResult(entry.row, position, "fulltext"),
      rank: entry.score,
    }));
  } catch {
    return [];
  }
}
|
|
510
|
+
|
|
511
|
+
/**
 * Normalise a raw LanceDB row into a `SearchResult`.
 *
 * @param r Raw row; `title`, `path`, `source`, `content`, `text`,
 * `content_type`, `_distance` and `score` are consulted when present.
 * @param i Zero-based position of the row in the result page, used for the
 * placeholder title `Result N` when the row has neither a title nor a path.
 * @param matchLayer Search layer that produced the row.
 * @returns The normalised result. `rank` is the row's `_distance` when
 * present, otherwise `1 - score` (with a missing score treated as 0).
 */
function rowToSearchResult(r: any, i: number, matchLayer: SearchResult["matchLayer"]): SearchResult {
  const path = r.path ?? r.source ?? "";
  // `path` is never nullish here, so an empty path must be detected with `||`
  // for the placeholder title to be reachable.
  const fallbackTitle = path || `Result ${i + 1}`;
  // Coerce before matching so a non-string path cannot throw.
  const looksLikeCode = /\.(ts|tsx|js|jsx|py|rs|go|sh|bash)$/.test(String(path));

  return {
    title: r.title ?? fallbackTitle,
    content: r.content ?? r.text ?? String(r),
    source: r.source ?? path,
    rank: r._distance ?? (1 - (r.score ?? 0)),
    contentType: (r.content_type === "code" || looksLikeCode) ? "code" : "prose",
    matchLayer,
  };
}
|
|
522
|
+
|
|
523
|
+
/**
 * Split a free-text query into distinct, lower-cased search terms.
 *
 * Terms may contain letters, digits and `_ + # . -` so that identifiers and
 * file names such as `bridge.ts`, `c++` or `c#` survive intact. Trailing dots
 * (sentence punctuation) are removed so that "bug." matches "bug". One-letter
 * terms and common English stopwords are dropped.
 *
 * @param query Raw user query.
 * @returns Unique terms in order of first appearance; possibly empty.
 */
function tokenize(query: string): string[] {
  const stopwords = new Set(["a", "an", "and", "are", "as", "at", "for", "from", "how", "in", "is", "of", "on", "or", "the", "to", "with"]);
  const terms = query
    .toLowerCase()
    .split(/[^a-z0-9_+#.-]+/i)
    // Strip sentence-ending dots; leading dots (".env") are kept on purpose.
    .map((term) => term.replace(/\.+$/, ""))
    .filter((term) => term.length > 1 && !stopwords.has(term));

  // A Set preserves insertion order, so this de-duplicates without reordering.
  return [...new Set(terms)];
}
|
|
538
|
+
|
|
539
|
+
/**
 * Count non-overlapping occurrences of `needle` in `value`.
 *
 * @param value Text to scan.
 * @param needle Substring to look for; an empty needle counts as zero matches.
 * @returns Number of matches found scanning left to right.
 */
function countOccurrences(value: string, needle: string): number {
  if (needle.length === 0) {
    return 0;
  }
  let total = 0;
  // Resume after each match so overlapping occurrences are not double-counted.
  for (let at = value.indexOf(needle); at !== -1; at = value.indexOf(needle, at + needle.length)) {
    total += 1;
  }
  return total;
}
|
|
549
|
+
|
|
550
|
+
// ─────────────────────────────────────────────────────────
|
|
551
|
+
// Embedding
|
|
552
|
+
// ─────────────────────────────────────────────────────────
|
|
553
|
+
|
|
554
|
+
/** Settings for the embeddings HTTP API used to embed search queries. */
interface EmbeddingConfig {
  /** Bearer token for the API; when null, query embedding (and thus vector search) is skipped. */
  apiKey: string | null;
  /** Embedding model identifier sent in the request body. */
  model: string;
  /** API base URL; `/embeddings` is appended to it for requests. */
  baseUrl: string;
}
|
|
559
|
+
|
|
560
|
+
/**
 * Load the embedding configuration.
 *
 * Priority for the API key: the `OPENROUTER_API_KEY` environment variable
 * (ignored when empty or whitespace-only), then `openrouterApiKey` / `apiKey`
 * from `~/.unipi/memory/config.json`, then null. Model and base URL come from
 * the config file when present, otherwise from built-in defaults.
 *
 * @returns The resolved configuration. A missing, unreadable or malformed
 * config file is not an error; defaults are used instead.
 */
function loadEmbeddingConfig(): EmbeddingConfig {
  // Env var takes top priority. An empty value is treated as "unset" so that
  // it cannot mask a key stored in the config file.
  const envKey = process.env.OPENROUTER_API_KEY?.trim() || null;

  const defaultModel = "qwen/qwen3-embedding-8b";
  const defaultBaseUrl = "https://openrouter.ai/api/v1";

  const configPath = join(homedir(), ".unipi", "memory", "config.json");
  try {
    if (existsSync(configPath)) {
      const raw = readFileSync(configPath, "utf-8");
      const config = JSON.parse(raw);
      return {
        apiKey: envKey ?? config.openrouterApiKey ?? config.apiKey ?? null,
        model: config.embeddingModel ?? defaultModel,
        baseUrl: config.openrouterBaseUrl ?? defaultBaseUrl,
      };
    }
  } catch {
    // Fall through to defaults
  }
  return {
    apiKey: envKey,
    model: defaultModel,
    baseUrl: defaultBaseUrl,
  };
}
|
|
585
|
+
|
|
586
|
+
/**
 * Embed a search query through the configured OpenRouter-compatible
 * `/embeddings` endpoint.
 *
 * The request is aborted after 30 seconds so a stalled API cannot hang a
 * search; callers fall back to non-vector search when null is returned.
 *
 * @param query Text to embed.
 * @returns The embedding vector, or null when no API key is configured, the
 * request fails or times out, the response is not OK, or the payload contains
 * no embedding array.
 */
async function generateQueryEmbedding(query: string): Promise<number[] | null> {
  const config = loadEmbeddingConfig();
  if (!config.apiKey) return null;

  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), 30_000);

  try {
    // Tolerate a trailing slash in the configured base URL.
    const endpoint = `${config.baseUrl.replace(/\/+$/, "")}/embeddings`;
    const response = await fetch(endpoint, {
      method: "POST",
      headers: {
        "Authorization": `Bearer ${config.apiKey}`,
        "Content-Type": "application/json",
      },
      body: JSON.stringify({
        model: config.model,
        input: query,
      }),
      signal: controller.signal,
    });

    if (!response.ok) return null;

    const data = await response.json() as any;
    const embedding: unknown = data?.data?.[0]?.embedding;
    // Only hand a real, non-empty vector to the vector search.
    return Array.isArray(embedding) && embedding.length > 0 ? (embedding as number[]) : null;
  } catch {
    // Network error, abort/timeout, or invalid JSON: vector search is skipped.
    return null;
  } finally {
    clearTimeout(timer);
  }
}
|
|
612
|
+
|
|
613
|
+
// ─────────────────────────────────────────────────────────
|
|
614
|
+
// Pipeline Template
|
|
615
|
+
// ─────────────────────────────────────────────────────────
|
|
616
|
+
|
|
617
|
+
/**
 * Generate the source of a cocoindex pipeline `main.py` (v1.0+ API) that walks
 * the project, splits text files into overlapping chunks and stores them in a
 * LanceDB table next to the script.
 *
 * The project directory's last path segment names the LanceDB context key, the
 * table (`<name>_index`) and the app (`local_<name>`). Trailing path separators
 * are ignored, characters outside `[A-Za-z0-9_.-]` are replaced with `_`, and
 * "project" is used when nothing usable remains. The project path itself is
 * emitted as an escaped string literal so that backslashes and quotes cannot
 * break the generated Python.
 *
 * @param projectDir Project root; becomes the default for `PROJECT_ROOT`.
 * @param embeddingConfig Currently unused by the template (the generated
 * pipeline stores text chunks only); kept for interface stability.
 * @returns The complete Python source text.
 */
function generatePipelineTemplate(projectDir: string, embeddingConfig: EmbeddingConfig): string {
  // Last path segment, tolerating trailing separators and Windows-style paths.
  const rawBasename = projectDir.replace(/[\\/]+$/, "").split(/[\\/]/).pop() ?? "";
  const projectBasename = rawBasename.replace(/[^A-Za-z0-9_.-]/g, "_") || "project";
  // JSON string syntax is also a valid Python string literal.
  const projectRootLiteral = JSON.stringify(projectDir);
  return `"""
CocoIndex pipeline for ${projectBasename}
Auto-generated by @pi-unipi/cocoindex — customize as needed.
Requires cocoindex >= 1.0.
"""
import pathlib
from dataclasses import dataclass
from typing import AsyncIterator

import cocoindex as coco
from cocoindex.connectors import localfs, lancedb
from cocoindex.resources.file import PatternFilePathMatcher

import os

# ── Configuration ────────────────────────────────────
PROJECT_ROOT = os.environ.get("PROJECT_ROOT", ${projectRootLiteral})

# ── LanceDB context key ──────────────────────────────
db_key = coco.ContextKey("lancedb/${projectBasename}")


# ── Environment setup (async lifespan) ───────────────
@coco.lifespan
async def coco_lifespan(builder: coco.EnvironmentBuilder) -> AsyncIterator[None]:
    """Configure environment: DB path + LanceDB connection."""
    builder.settings.db_path = pathlib.Path(__file__).parent / "cocoindex.db"

    db_path = pathlib.Path(__file__).parent / ".lancedb"
    conn = await lancedb.connect_async(str(db_path))
    builder.provide(db_key, conn)

    yield


# ── Row type for LanceDB ─────────────────────────────
@dataclass
class IndexRow:
    """A single indexed chunk stored in LanceDB."""
    path: str
    chunk_index: int
    content: str


# ── Chunking function (memoized) ─────────────────────
@coco.fn
async def chunk_text(
    content: str,
    *,
    chunk_size: int = 1500,
    chunk_overlap: int = 200,
) -> list[tuple[int, str]]:
    """Split text into overlapping chunks."""
    if not content.strip():
        return []

    # Always advance by at least one character so the loop terminates even
    # when chunk_overlap >= chunk_size.
    step = max(1, chunk_size - chunk_overlap)

    chunks: list[tuple[int, str]] = []
    start = 0
    idx = 0
    while start < len(content):
        end = min(start + chunk_size, len(content))
        chunk = content[start:end].strip()
        if chunk:
            chunks.append((idx, chunk))
            idx += 1
        start += step

    return chunks


# ── Process a single file ────────────────────────────
@coco.fn
async def process_file(
    file: localfs.File,
    table: lancedb.TableTarget,
) -> None:
    """Read a file, chunk it, and declare rows in LanceDB."""
    try:
        content = await file.read_text()
    except Exception:
        return

    if not content.strip():
        return

    relative = file.file_path.path.as_posix()
    chunks = await chunk_text(content)

    for chunk_idx, text in chunks:
        table.declare_row(row=IndexRow(
            path=relative,
            chunk_index=chunk_idx,
            content=text,
        ))


# ── Main app function ────────────────────────────────
@coco.fn
async def app_main() -> None:
    """Walk project files -> chunk -> store in LanceDB."""
    project_root = pathlib.Path(PROJECT_ROOT)

    # 1) Declare LanceDB table target
    table_schema = await lancedb.TableSchema.from_class(
        IndexRow,
        primary_key=["path", "chunk_index"],
    )

    target = await coco.mount_target(
        lancedb.table_target(
            db_key,
            "${projectBasename}_index",
            table_schema,
        ),
    )
    table = lancedb.TableTarget(target, table_schema)

    # 2) Walk project files
    walker = localfs.walk_dir(
        project_root,
        recursive=True,
        path_matcher=PatternFilePathMatcher(
            included_patterns=[
                "**/*.ts", "**/*.tsx", "**/*.js", "**/*.jsx",
                "**/*.py", "**/*.rs", "**/*.go",
                "**/*.md", "**/*.txt", "**/*.json", "**/*.yaml", "**/*.yml",
                "**/*.sh", "**/*.bash",
            ],
            excluded_patterns=[
                "**/node_modules/**", "**/.git/**", "**/dist/**",
                "**/build/**", "**/.next/**", "**/__pycache__/**",
                "**/.unipi/cocoindex/**",
            ],
        ),
    )

    # 3) Process each file
    async for file in walker:
        await coco.mount(
            coco.component_subpath("process", file.file_path.path.as_posix()),
            process_file,
            file,
            table,
        )


# ── App instance (required by CLI) ───────────────────
app = coco.App(
    coco.AppConfig(name="local_${projectBasename}"),
    app_main,
)
`;
}
|