@pi-unipi/cocoindex 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/bridge.ts ADDED
@@ -0,0 +1,774 @@
1
+ /**
2
+ * bridge.ts — CocoIndex CLI interaction layer
3
+ *
4
+ * Spawns cocoindex commands and queries LanceDB directly for search.
5
+ * The bridge handles:
6
+ * - CLI detection (is cocoindex installed?)
7
+ * - Pipeline initialization (scaffold main.py)
8
+ * - Project indexing (cocoindex update)
9
+ * - Status reporting (last run, doc count)
10
+ * - Search (query LanceDB directly via Node.js SDK)
11
+ */
12
+
13
+ import { execFileSync, spawn } from "node:child_process";
14
+ import { existsSync, mkdirSync, readFileSync, writeFileSync, readdirSync, statSync } from "node:fs";
15
+ import { homedir } from "node:os";
16
+ import { join, dirname } from "node:path";
17
+ import { COCOINDEX_MIN_VERSION } from "@pi-unipi/core";
18
+
19
+ // ─────────────────────────────────────────────────────────
20
+ // Types
21
+ // ─────────────────────────────────────────────────────────
22
+
23
/** Outcome of one indexing attempt as reported by `indexProject`. */
export interface IndexResult {
  /** `true` only when the CLI process exited with code 0. */
  success: boolean;
  /** Count parsed from the CLI's stdout; 0 when nothing could be parsed. */
  chunksProcessed: number;
  /** Elapsed wall-clock time of the run, in milliseconds. */
  durationMs: number;
  /** Human-readable failure reason; omitted on success. */
  error?: string;
}
29
+
30
/** Snapshot of a project's indexing state as assembled by `status`. */
export interface StatusInfo {
  /** `true` when `docCount` is greater than zero. */
  indexed: boolean;
  /** ISO-8601 modification time of the `.lancedb` directory, or `null` when it does not exist. */
  lastRun: string | null;
  /** Number of `.lance` files found under the `.lancedb` directory (a rough estimate). */
  docCount: number;
  /** Whether `main.py` exists in the pipeline directory. */
  pipelineConfigured: boolean;
  /** Whether the `cocoindex` CLI responded to a `--version` probe. */
  cliAvailable: boolean;
  /** Storage backend guessed from the contents of `main.py`. */
  targetStore: "lancedb" | "postgres" | "qdrant" | "sqlite" | "unknown";
}
38
+
39
/** Optional paging/filter settings for `search`. */
export interface SearchOptions {
  /** Maximum number of results to return (`search` defaults this to 10). */
  limit?: number;
  /** Number of leading results to skip (`search` defaults this to 0). */
  offset?: number;
  /** NOTE(review): declared but not read by any code in this file — confirm intended use. */
  minScore?: number;
}
44
+
45
/** One hit returned by `search`. */
export interface SearchResult {
  /** Display title for the hit. */
  title: string;
  /** Text of the matched chunk (or a diagnostic message for synthetic results). */
  content: string;
  /** Origin of the chunk, typically the indexed file path. */
  source: string;
  /**
   * Ordering value. Its meaning depends on the search path that produced the
   * hit: a distance-style value for vector/FTS rows, a summed match score for
   * the lexical fallback.
   */
  rank: number;
  /** Coarse classification of the chunk. */
  contentType: "code" | "prose";
  /** Which search strategy produced the hit. */
  matchLayer: "vector" | "fulltext" | "hybrid";
}
53
+
54
/**
 * Shared project context for CocoIndex consumers.
 * NOTE(review): not referenced elsewhere in this file; field meanings below
 * are inferred from their names — confirm against callers.
 */
export interface CocoindexDeps {
  /** Presumably the root directory of the project being indexed. */
  projectDir: string;
  /** Presumably the directory holding the generated pipeline (`main.py`). */
  pipelineDir: string;
  /** Presumably whether the pipeline has been scaffolded. */
  initialized: boolean;
}
59
+
60
+ // ─────────────────────────────────────────────────────────
61
+ // Constants
62
+ // ─────────────────────────────────────────────────────────
63
+
64
/** Directory name for CocoIndex state (not referenced elsewhere in this file). */
const COCOINDEX_STATE_DIR = ".cocoindex";

/** Pipeline directory, relative to the project root. */
const DEFAULT_PIPELINE_DIR = ".unipi/cocoindex";

/** LanceDB data directory, relative to the project root. */
const DEFAULT_LANCEDB_PATH = `${DEFAULT_PIPELINE_DIR}/.lancedb`;

/** Upper bound on rows pulled from LanceDB for the in-memory lexical fallback. */
const DEFAULT_LEXICAL_SCAN_LIMIT = 50000;
68
+
69
+ // ─────────────────────────────────────────────────────────
70
+ // CLI Detection
71
+ // ─────────────────────────────────────────────────────────
72
+
73
/**
 * Memoized result of the most recent cached CLI probe.
 * `null` means no cached probe has run yet (or the cache was reset).
 */
let cachedAvailable: boolean | null = null;

/** Options accepted by `isAvailable`. */
export interface AvailabilityOptions {
  /**
   * Whether to read and write the memoized probe result. Defaults to `true`;
   * pass `false` to force a fresh probe that leaves the cache untouched.
   */
  useCache?: boolean;
}
79
+
80
/**
 * Pull the first `major.minor` or `major.minor.patch` number out of free-form
 * text such as the output of `cocoindex --version`.
 *
 * @param versionStr Text that may contain a version number.
 * @returns The matched version digits (e.g. `"1.2.3"`), or `null` when none is found.
 */
export function parseVersion(versionStr: string): string | null {
  // The version must not be glued to other digits on either side.
  const versionPattern = /(?:^|[^0-9])(\d+\.\d+(?:\.\d+)?)(?:[^0-9]|$)/;
  const found = versionPattern.exec(versionStr);
  if (found === null) return null;
  return found[1] ?? null;
}
85
+
86
/**
 * Decide whether `version` meets or exceeds `minimum`, comparing numeric
 * components left to right (missing components count as 0).
 *
 * @param version Version text to test; `null`, `undefined`, empty, or unparseable input yields `false`.
 * @param minimum Required version text; defaults to `COCOINDEX_MIN_VERSION`. Unparseable input yields `false`.
 * @returns `true` when `version` is greater than or equal to `minimum`.
 */
export function isVersionAtLeast(version: string | null | undefined, minimum = COCOINDEX_MIN_VERSION): boolean {
  const actualText = version ? parseVersion(version) : null;
  const requiredText = parseVersion(minimum);
  if (actualText === null || requiredText === null) return false;

  const toNumbers = (text: string): number[] => text.split(".").map((piece) => Number.parseInt(piece, 10));
  const actual = toNumbers(actualText);
  const required = toNumbers(requiredText);

  // Always compare at least major/minor/patch.
  const width = Math.max(3, actual.length, required.length);
  for (let pos = 0; pos < width; pos += 1) {
    const have = actual[pos] ?? 0;
    const need = required[pos] ?? 0;
    // Absurdly long digit runs parse to Infinity; treat those as invalid.
    if (!(Number.isFinite(have) && Number.isFinite(need))) return false;
    if (have !== need) return have > need;
  }
  return true;
}
105
+
106
/**
 * Forget the memoized CLI probe so the next cached `isAvailable` call probes
 * again. Intended for use after an install/uninstall changes the environment.
 */
export function resetAvailabilityCache(): void {
  cachedAvailable = null;
}
110
+
111
/**
 * Report whether the `cocoindex` CLI can be executed.
 *
 * Runs `<bin> --version` synchronously (5 s timeout) and treats any non-empty
 * stdout as success; any spawn failure, non-zero exit, or timeout counts as
 * unavailable.
 *
 * @param options `useCache` (default `true`) controls whether the memoized
 *   result is consulted and updated.
 * @returns `true` when the probe produced output.
 */
export async function isAvailable(options: AvailabilityOptions = {}): Promise<boolean> {
  const useCache = options.useCache ?? true;
  if (useCache && cachedAvailable !== null) {
    return cachedAvailable;
  }

  const probe = (): boolean => {
    try {
      const output = execFileSync(getCocoindexBinPath(), ["--version"], {
        encoding: "utf-8",
        timeout: 5000,
        stdio: ["pipe", "pipe", "pipe"],
      });
      return output.trim().length > 0;
    } catch {
      return false;
    }
  };

  const available = probe();
  if (useCache) {
    cachedAvailable = available;
  }
  return available;
}
131
+
132
/**
 * Run `<bin> --version` synchronously (5 s timeout) and return its trimmed
 * stdout.
 *
 * @returns The raw version text, or `null` when the command cannot be run,
 *   exits non-zero, or times out.
 */
export async function getVersion(): Promise<string | null> {
  let output: string;
  try {
    output = execFileSync(getCocoindexBinPath(), ["--version"], {
      encoding: "utf-8",
      timeout: 5000,
      stdio: ["pipe", "pipe", "pipe"],
    });
  } catch {
    return null;
  }
  return output.trim();
}
145
+
146
/**
 * Public entry point for locating the `cocoindex` executable.
 *
 * @returns Whatever `resolveCocoindexBin` returns: an absolute path when one
 *   is found, otherwise the bare command name `"cocoindex"`.
 */
export function getCocoindexBinPath(): string {
  const binPath = resolveCocoindexBin();
  return binPath;
}
150
+
151
/**
 * Locate the `cocoindex` executable.
 *
 * Lookup order:
 *   1. `command -v cocoindex` via `sh` (i.e. the current PATH);
 *   2. `~/.local/bin/cocoindex` (default location for `uv tool install`);
 *   3. `~/.local/share/mise/installs/python/<version>/bin/cocoindex`, trying
 *      the highest version directory first;
 *   4. the bare name `"cocoindex"`, leaving resolution to the spawner.
 *
 * @returns An absolute path when one of steps 1-3 succeeds, else `"cocoindex"`.
 */
function resolveCocoindexBin(): string {
  // Try PATH first. Any failure (no `sh`, not found, timeout) falls through.
  try {
    const resolved = execFileSync("sh", ["-c", "command -v cocoindex"], {
      encoding: "utf-8",
      timeout: 3000,
      stdio: ["pipe", "pipe", "pipe"],
    }).trim();
    if (resolved) return resolved;
  } catch {
    // Not on PATH
  }

  // uv tool install exposes binaries here by default.
  const uvToolBin = join(homedir(), ".local", "bin", "cocoindex");
  if (existsSync(uvToolBin)) return uvToolBin;

  // Try mise python installations, newest version first.
  const miseRoot = join(homedir(), ".local", "share", "mise", "installs", "python");
  try {
    // Numeric-aware ordering: a plain string sort would rank "3.9" above "3.12".
    const versions = readdirSync(miseRoot).sort((a, b) =>
      b.localeCompare(a, undefined, { numeric: true }),
    );
    for (const ver of versions) {
      const binPath = join(miseRoot, ver, "bin", "cocoindex");
      if (existsSync(binPath)) return binPath;
    }
  } catch {
    // mise not installed or no python versions
  }

  return "cocoindex"; // Fall back to PATH resolution
}
182
+
183
+ // ─────────────────────────────────────────────────────────
184
+ // Pipeline Management
185
+ // ─────────────────────────────────────────────────────────
186
+
187
/**
 * Compute where a project's CocoIndex pipeline lives.
 *
 * @param projectDir Root directory of the project.
 * @returns `projectDir` joined with the default pipeline sub-directory.
 */
export function getPipelineDir(projectDir: string): string {
  const pipelineDir = join(projectDir, DEFAULT_PIPELINE_DIR);
  return pipelineDir;
}
191
+
192
/**
 * Tell whether a pipeline has been scaffolded in `pipelineDir`.
 *
 * @param pipelineDir Directory expected to contain the pipeline.
 * @returns `true` when `main.py` exists inside `pipelineDir`.
 */
export async function isPipelineInitialized(pipelineDir: string): Promise<boolean> {
  const entryPoint = join(pipelineDir, "main.py");
  return existsSync(entryPoint);
}
196
+
197
/**
 * Guess the pipeline's storage backend by scanning `main.py` for backend names.
 *
 * Matching is a case-insensitive substring test, checked in the order
 * lancedb → postgres → qdrant → sqlite; the first hit wins. `"postgres"` also
 * covers `"postgresql"` and `"Postgres"`.
 *
 * @param pipelineDir Directory containing `main.py`.
 * @returns The detected store, or `"unknown"` when `main.py` is missing,
 *   unreadable, or mentions none of the known backends.
 */
export function detectTargetStore(pipelineDir: string): StatusInfo["targetStore"] {
  const mainPyPath = join(pipelineDir, "main.py");
  if (!existsSync(mainPyPath)) return "unknown";

  let content: string;
  try {
    content = readFileSync(mainPyPath, "utf-8").toLowerCase();
  } catch {
    // Unreadable file: report unknown rather than failing status reporting.
    return "unknown";
  }

  if (content.includes("lancedb")) return "lancedb";
  if (content.includes("postgres")) return "postgres";
  if (content.includes("qdrant")) return "qdrant";
  if (content.includes("sqlite")) return "sqlite";
  return "unknown";
}
213
+
214
/**
 * Scaffold a CocoIndex pipeline (`main.py`) for a project.
 *
 * Creates the pipeline directory if needed and writes a generated `main.py`.
 * An existing `main.py` is never overwritten; that case is reported as success.
 *
 * @param projectDir Root directory of the project.
 * @returns `{ success: true }` when `main.py` exists afterwards; otherwise
 *   `{ success: false, error }` carrying the filesystem error message.
 */
export async function initPipeline(projectDir: string): Promise<{ success: boolean; error?: string }> {
  const pipelineDir = getPipelineDir(projectDir);
  const mainPyPath = join(pipelineDir, "main.py");

  try {
    // `recursive: true` makes this a no-op when the directory already exists.
    mkdirSync(pipelineDir, { recursive: true });

    // Don't overwrite existing pipeline
    if (existsSync(mainPyPath)) {
      return { success: true };
    }

    // Read embedding config from memory settings
    const embeddingConfig = loadEmbeddingConfig();
    const template = generatePipelineTemplate(projectDir, embeddingConfig);
    writeFileSync(mainPyPath, template, "utf-8");

    return { success: true };
  } catch (err: unknown) {
    // Surface permission/path problems through the result so callers that
    // only inspect `success`/`error` see them instead of a rejected promise.
    return { success: false, error: err instanceof Error ? err.message : String(err) };
  }
}
236
+
237
+ // ─────────────────────────────────────────────────────────
238
+ // Indexing
239
+ // ─────────────────────────────────────────────────────────
240
+
241
/**
 * Index a project by running `cocoindex update main.py` in its pipeline directory.
 *
 * The returned promise never rejects: every failure mode (CLI missing,
 * pipeline not scaffolded, spawn error, non-zero exit, termination by signal)
 * is reported through the `IndexResult`.
 *
 * @param projectDir Root directory of the project.
 * @returns Result with success flag, parsed count, duration, and an error
 *   message on failure.
 */
export async function indexProject(projectDir: string): Promise<IndexResult> {
  const available = await isAvailable();
  if (!available) {
    return {
      success: false,
      chunksProcessed: 0,
      durationMs: 0,
      error: "CocoIndex CLI not found. Run /unipi:cocoindex-init to install cocoindex[lancedb]>=1.0.",
    };
  }

  const pipelineDir = getPipelineDir(projectDir);
  if (!existsSync(join(pipelineDir, "main.py"))) {
    return {
      success: false,
      chunksProcessed: 0,
      durationMs: 0,
      error: "Pipeline not initialized. Run /unipi:cocoindex-init first.",
    };
  }

  const start = Date.now();
  const cocoindexBin = resolveCocoindexBin();
  const timeoutMs = 300000; // 5 min timeout

  return new Promise<IndexResult>((resolve) => {
    const proc = spawn(cocoindexBin, ["update", "main.py"], {
      cwd: pipelineDir,
      stdio: ["pipe", "pipe", "pipe"],
      timeout: timeoutMs,
    });

    let stdout = "";
    let stderr = "";

    proc.stdout.on("data", (data: Buffer) => {
      stdout += data.toString();
    });

    proc.stderr.on("data", (data: Buffer) => {
      stderr += data.toString();
    });

    proc.on("close", (code: number | null, signal: string | null) => {
      const durationMs = Date.now() - start;
      const chunksProcessed = parseChunksProcessed(stdout);

      if (code === 0) {
        resolve({ success: true, chunksProcessed, durationMs });
        return;
      }

      const stderrText = stderr.trim();
      let error: string;
      if (signal) {
        // A killed process has `code === null`; name the signal (and the
        // likely timeout) instead of reporting "exited with code null".
        const timedOut = durationMs >= timeoutMs;
        const reason = timedOut
          ? `Process terminated by ${signal} after exceeding the ${timeoutMs / 1000}s timeout`
          : `Process terminated by signal ${signal}`;
        error = stderrText ? `${reason}\n${stderrText}` : reason;
      } else {
        error = stderrText || `Process exited with code ${code}`;
      }

      resolve({ success: false, chunksProcessed, durationMs, error });
    });

    // Spawn failures (e.g. binary not found). If "close" also fires, the
    // promise keeps whichever result was resolved first.
    proc.on("error", (err: Error) => {
      resolve({
        success: false,
        chunksProcessed: 0,
        durationMs: Date.now() - start,
        error: err.message,
      });
    });
  });
}
311
+
312
/**
 * Extract a processed-item count from `cocoindex update` output.
 *
 * Preferred source is the last line mentioning `process_file:` together with
 * "added" or "reprocessed" (v1.0+ summary, e.g.
 * "✅ process_file: 604 total | 604 added"); the first number directly before
 * either word is used. Failing that, older phrasings are tried in order:
 * "processed N chunks", "N chunks processed", "indexed N".
 *
 * @param output Captured stdout of the CLI.
 * @returns The parsed count, or 0 when no pattern matches.
 */
function parseChunksProcessed(output: string): number {
  const summaryLine = output
    .split("\n")
    .filter((line) => line.includes("process_file:") && (line.includes("added") || line.includes("reprocessed")))
    .pop();

  if (summaryLine) {
    const counted = /(\d+)\s+(?:added|reprocessed)/.exec(summaryLine);
    if (counted) return parseInt(counted[1], 10);
  }

  // Legacy output formats, most specific first.
  const legacyPatterns = [
    /processed\s+(\d+)\s+chunks?/i,
    /(\d+)\s+chunks?\s+processed/i,
    /indexed\s+(\d+)/i,
  ];
  for (const pattern of legacyPatterns) {
    const hit = pattern.exec(output);
    if (hit) return parseInt(hit[1], 10);
  }
  return 0;
}
335
+
336
+ // ─────────────────────────────────────────────────────────
337
+ // Status
338
+ // ─────────────────────────────────────────────────────────
339
+
340
/**
 * Gather indexing status for a project.
 *
 * `lastRun` is the modification time of the pipeline's `.lancedb` directory
 * and `docCount` is the number of `.lance` files beneath it — a rough proxy,
 * not an exact document count. Filesystem errors while inspecting that
 * directory are ignored and leave whatever was gathered so far.
 *
 * @param projectDir Root directory of the project.
 * @returns A populated `StatusInfo`.
 */
export async function status(projectDir: string): Promise<StatusInfo> {
  const pipelineDir = getPipelineDir(projectDir);
  const cliAvailable = await isAvailable();
  const pipelineConfigured = existsSync(join(pipelineDir, "main.py"));
  const targetStore = detectTargetStore(pipelineDir);

  let lastRun: string | null = null;
  let docCount = 0;

  const lancedbPath = join(pipelineDir, ".lancedb");
  if (existsSync(lancedbPath)) {
    try {
      lastRun = statSync(lancedbPath).mtime.toISOString();
      const entries = readdirSync(lancedbPath, { recursive: true }) as string[];
      docCount = entries.reduce((total, entry) => (entry.endsWith(".lance") ? total + 1 : total), 0);
    } catch {
      // Non-fatal: report what we have.
    }
  }

  return { indexed: docCount > 0, lastRun, docCount, pipelineConfigured, cliAvailable, targetStore };
}
373
+
374
+ // ─────────────────────────────────────────────────────────
375
+ // Search
376
+ // ─────────────────────────────────────────────────────────
377
+
378
/**
 * True when `err` looks like a failed module resolution, under either
 * CommonJS (`MODULE_NOT_FOUND`, "Cannot find module") or ES modules
 * (`ERR_MODULE_NOT_FOUND`, "Cannot find package").
 */
function isMissingModuleError(err: unknown): boolean {
  if (typeof err !== "object" || err === null) return false;
  const { code, message } = err as { code?: unknown; message?: unknown };
  if (code === "MODULE_NOT_FOUND" || code === "ERR_MODULE_NOT_FOUND") return true;
  return (
    typeof message === "string" &&
    (message.includes("Cannot find module") || message.includes("Cannot find package"))
  );
}

/**
 * Search indexed content by querying the project's LanceDB directory directly.
 *
 * Strategies are tried in order and the first one that yields results wins:
 *   1. vector search, when a query embedding can be generated;
 *   2. LanceDB full-text search;
 *   3. an in-memory lexical scan of the table.
 * Only the first table reported by the database is searched.
 *
 * @param projectDir Root directory of the project.
 * @param query Free-text query.
 * @param options Optional `limit` (default 10) and `offset` (default 0).
 *   `minScore` is not consulted here.
 * @returns Matching results; an empty array when the `.lancedb` directory or
 *   its tables are missing. Failures never throw: they come back as a single
 *   synthetic result titled "Search Unavailable" (SDK not installed) or
 *   "Search Error".
 */
export async function search(
  projectDir: string,
  query: string,
  options?: SearchOptions,
): Promise<SearchResult[]> {
  const limit = options?.limit ?? 10;
  const offset = options?.offset ?? 0;

  try {
    const lancedbPath = join(getPipelineDir(projectDir), ".lancedb");
    if (!existsSync(lancedbPath)) {
      return [];
    }

    // Dynamic import — LanceDB SDK may not be installed.
    // @ts-ignore — optional dependency, may not be installed
    const lancedb = await import("@lancedb/lancedb");
    const db = await lancedb.connect(lancedbPath);

    const tableNames = await db.tableNames();
    if (tableNames.length === 0) return [];

    const table = await db.openTable(tableNames[0]);

    // Prefer semantic vector search when the pipeline/table provides a vector
    // column. Older generated pipelines only contain path/chunk_index/content;
    // vectorSearch swallows the resulting error and returns [], so we continue
    // to the FTS/lexical fallbacks.
    const queryVector = await generateQueryEmbedding(query);
    if (queryVector) {
      const vectorResults = await vectorSearch(table, queryVector, limit, offset);
      if (vectorResults.length > 0) return vectorResults;
    }

    // Prefer LanceDB's native FTS when an inverted index exists on content.
    const ftsResults = await fullTextSearch(table, query, limit, offset);
    if (ftsResults.length > 0) return ftsResults;

    // Last-resort compatibility path for existing text-only LanceDB tables.
    // This keeps indexed projects searchable immediately instead of returning
    // a misleading "run cocoindex-update" message when no vector/FTS index is
    // available yet.
    return lexicalSearch(table, query, limit, offset);
  } catch (err: unknown) {
    if (isMissingModuleError(err)) {
      return [{
        title: "Search Unavailable",
        content: "LanceDB SDK not installed. Install with: npm install @lancedb/lancedb",
        source: "",
        rank: 0,
        contentType: "prose",
        matchLayer: "fulltext",
      }];
    }
    const detail = err instanceof Error ? err.message : String(err);
    return [{
      title: "Search Error",
      content: `CocoIndex LanceDB search failed: ${detail}`,
      source: "",
      rank: 0,
      contentType: "prose",
      matchLayer: "fulltext",
    }];
  }
}
442
+
443
/**
 * Nearest-neighbour search against `table` using a precomputed query vector.
 *
 * Fetches `limit + offset` rows and drops the first `offset` to emulate paging.
 *
 * @param table LanceDB table handle.
 * @param queryVector Embedding of the query.
 * @param limit Maximum number of results to return.
 * @param offset Number of leading results to skip.
 * @returns Results tagged `"vector"`, or `[]` when the search throws.
 */
async function vectorSearch(table: any, queryVector: number[], limit: number, offset: number): Promise<SearchResult[]> {
  try {
    const fetched = await table.search(queryVector).limit(limit + offset).toArray();
    const page = fetched.slice(offset);
    return page.map((row: any, position: number) => rowToSearchResult(row, position, "vector"));
  } catch {
    // Any failure means "no vector results"; the caller falls back.
    return [];
  }
}
454
+
455
/**
 * Full-text search via LanceDB's `"fts"` query type.
 *
 * Fetches `limit + offset` rows and drops the first `offset` to emulate paging.
 *
 * @param table LanceDB table handle.
 * @param query Free-text query.
 * @param limit Maximum number of results to return.
 * @param offset Number of leading results to skip.
 * @returns Results tagged `"fulltext"`, or `[]` when the search throws.
 */
async function fullTextSearch(table: any, query: string, limit: number, offset: number): Promise<SearchResult[]> {
  let fetched: any[];
  try {
    fetched = await table.search(query, "fts").limit(limit + offset).toArray();
    const hits: SearchResult[] = fetched
      .slice(offset)
      .map((row: any, position: number) => rowToSearchResult(row, position, "fulltext"));
    return hits;
  } catch {
    // Any failure means "no FTS results"; the caller falls back.
    return [];
  }
}
467
+
468
/**
 * In-memory keyword search for tables that have text chunks but no usable
 * vector column or full-text index.
 *
 * Loads up to `DEFAULT_LEXICAL_SCAN_LIMIT` rows and scores each one: every
 * occurrence of a query term in the content counts 1, every occurrence in the
 * path counts 3, and a row containing the whole trimmed query as a phrase
 * gets a bonus of `4 × termCount`. Rows scoring 0 are dropped; the rest are
 * sorted by descending score and paged.
 *
 * @param table LanceDB table handle.
 * @param query Free-text query.
 * @param limit Maximum number of results to return.
 * @param offset Number of leading results to skip.
 * @returns Results tagged `"fulltext"` whose `rank` is the computed score,
 *   or `[]` when the query has no usable terms or anything throws.
 */
async function lexicalSearch(table: any, query: string, limit: number, offset: number): Promise<SearchResult[]> {
  try {
    const terms = tokenize(query);
    if (terms.length === 0) return [];

    const rows = await table.query()
      .limit(DEFAULT_LEXICAL_SCAN_LIMIT)
      .toArray();

    const phrase = query.trim().toLowerCase();
    const phraseBonus = terms.length * 4;

    const candidates: Array<{ row: any; score: number }> = [];
    for (const row of rows) {
      const lowerContent = String(row.content ?? row.text ?? "").toLowerCase();
      const lowerPath = String(row.path ?? row.source ?? "").toLowerCase();

      let score = 0;
      for (const term of terms) {
        score += countOccurrences(lowerContent, term) + countOccurrences(lowerPath, term) * 3;
      }
      if (phrase && `${lowerPath}\n${lowerContent}`.includes(phrase)) {
        score += phraseBonus;
      }

      if (score > 0) candidates.push({ row, score });
    }

    candidates.sort((left, right) => right.score - left.score);

    return candidates.slice(offset, offset + limit).map((entry, position) => ({
      ...rowToSearchResult(entry.row, position, "fulltext"),
      rank: entry.score,
    }));
  } catch {
    return [];
  }
}
510
+
511
/**
 * Convert a raw LanceDB row into a `SearchResult`.
 *
 * @param r Row object; reads `title`, `path`, `source`, `content`, `text`,
 *   `content_type`, `_distance`, and `score` when present.
 * @param i Zero-based position of the row in its result page; used only for
 *   the `Result N` title fallback.
 * @param matchLayer Which search strategy produced the row.
 * @returns The normalized result. `rank` is `_distance` when present,
 *   otherwise `1 - score` (with a missing score treated as 0).
 */
function rowToSearchResult(r: any, i: number, matchLayer: SearchResult["matchLayer"]): SearchResult {
  // Coerce so the extension test below cannot throw on a non-string path.
  const path = String(r.path ?? r.source ?? "");
  const hasCodeExtension = /\.(ts|tsx|js|jsx|py|rs|go|sh|bash)$/.test(path);

  return {
    // `||` rather than `??`: an empty title/path must fall through to "Result N".
    title: r.title || path || `Result ${i + 1}`,
    // Rows without text yield empty content instead of "[object Object]".
    content: String(r.content ?? r.text ?? ""),
    source: r.source ?? path,
    rank: r._distance ?? (1 - (r.score ?? 0)),
    contentType: r.content_type === "code" || hasCodeExtension ? "code" : "prose",
    matchLayer,
  };
}
522
+
523
/**
 * Break a query into distinct lowercase search terms.
 *
 * Terms are runs of letters, digits, and `_ + # . -`; everything else is a
 * separator. Single-character terms and common English stopwords are dropped,
 * and duplicates are removed while keeping first-seen order.
 *
 * @param query Free-text query.
 * @returns The unique terms in order of first appearance.
 */
function tokenize(query: string): string[] {
  const stopwords = new Set(["a", "an", "and", "are", "as", "at", "for", "from", "how", "in", "is", "of", "on", "or", "the", "to", "with"]);
  const unique = new Set<string>();

  for (const piece of query.toLowerCase().split(/[^a-z0-9_+#.-]+/i)) {
    const term = piece.trim();
    if (term.length > 1 && !stopwords.has(term)) {
      unique.add(term); // a Set keeps insertion order and ignores repeats
    }
  }

  return Array.from(unique);
}
538
+
539
/**
 * Count non-overlapping occurrences of `needle` in `value`, scanning left to
 * right.
 *
 * @param value Text to scan.
 * @param needle Substring to look for; an empty needle counts as 0 matches.
 * @returns The number of matches.
 */
function countOccurrences(value: string, needle: string): number {
  if (needle.length === 0) return 0;

  let hits = 0;
  // Jump past each match so overlapping matches are not double-counted.
  for (let at = value.indexOf(needle); at !== -1; at = value.indexOf(needle, at + needle.length)) {
    hits += 1;
  }
  return hits;
}
549
+
550
+ // ─────────────────────────────────────────────────────────
551
+ // Embedding
552
+ // ─────────────────────────────────────────────────────────
553
+
554
/** Settings used to request query embeddings. */
interface EmbeddingConfig {
  /** API key sent as a Bearer token; `null` when none is configured. */
  apiKey: string | null;
  /** Embedding model identifier sent in the request body. */
  model: string;
  /** Base URL of the API; `/embeddings` is appended to it. */
  baseUrl: string;
}
559
+
560
/**
 * Load embedding settings.
 *
 * Precedence for the API key: `OPENROUTER_API_KEY` env var, then
 * `openrouterApiKey`, then `apiKey` from `~/.unipi/memory/config.json`.
 * Model and base URL come from the config file (`embeddingModel`,
 * `openrouterBaseUrl`) with built-in defaults. Values that are not non-blank
 * strings are ignored, so an empty env var does not mask a key stored in the
 * config file. A missing, unreadable, or malformed config file is treated as
 * empty.
 *
 * @returns The resolved configuration; `apiKey` is `null` when no key is set.
 */
function loadEmbeddingConfig(): EmbeddingConfig {
  const defaultModel = "qwen/qwen3-embedding-8b";
  const defaultBaseUrl = "https://openrouter.ai/api/v1";

  /** First candidate that is a non-blank string, else null. */
  const firstNonBlank = (...candidates: unknown[]): string | null => {
    for (const candidate of candidates) {
      if (typeof candidate === "string" && candidate.trim() !== "") return candidate;
    }
    return null;
  };

  let fileConfig: Record<string, unknown> = {};
  const configPath = join(homedir(), ".unipi", "memory", "config.json");
  try {
    if (existsSync(configPath)) {
      const parsed: unknown = JSON.parse(readFileSync(configPath, "utf-8"));
      // Only a plain JSON object is a usable config.
      if (typeof parsed === "object" && parsed !== null && !Array.isArray(parsed)) {
        fileConfig = parsed as Record<string, unknown>;
      }
    }
  } catch {
    // Fall through to defaults
  }

  return {
    // Env var takes top priority
    apiKey: firstNonBlank(process.env.OPENROUTER_API_KEY, fileConfig.openrouterApiKey, fileConfig.apiKey),
    model: firstNonBlank(fileConfig.embeddingModel) ?? defaultModel,
    baseUrl: firstNonBlank(fileConfig.openrouterBaseUrl) ?? defaultBaseUrl,
  };
}
585
+
586
/**
 * Request an embedding vector for `query` from the configured
 * OpenAI-compatible `/embeddings` endpoint.
 *
 * This is best-effort: a missing API key, network failure, timeout, non-2xx
 * response, or malformed payload all yield `null` so the caller can fall
 * back to non-vector search.
 *
 * @param query Text to embed.
 * @returns The embedding as an array of finite numbers, or `null`.
 */
async function generateQueryEmbedding(query: string): Promise<number[] | null> {
  const config = loadEmbeddingConfig();
  if (!config.apiKey) return null;

  // Tolerate a configured base URL that ends in "/" (avoids ".../v1//embeddings").
  const endpoint = `${config.baseUrl.replace(/\/+$/, "")}/embeddings`;

  try {
    const response = await fetch(endpoint, {
      method: "POST",
      headers: {
        "Authorization": `Bearer ${config.apiKey}`,
        "Content-Type": "application/json",
      },
      body: JSON.stringify({
        model: config.model,
        input: query,
      }),
      // Bound the wait so a stalled endpoint cannot hang search indefinitely.
      signal: AbortSignal.timeout(30_000),
    });

    if (!response.ok) return null;

    const payload = (await response.json()) as { data?: Array<{ embedding?: unknown }> } | null;
    const embedding = payload?.data?.[0]?.embedding;
    if (!Array.isArray(embedding) || embedding.length === 0) return null;

    const allNumbers = embedding.every((value) => typeof value === "number" && Number.isFinite(value));
    return allNumbers ? (embedding as number[]) : null;
  } catch {
    return null;
  }
}
612
+
613
+ // ─────────────────────────────────────────────────────────
614
+ // Pipeline Template
615
+ // ─────────────────────────────────────────────────────────
616
+
617
/**
 * Generate the contents of a CocoIndex pipeline `main.py` (v1.0+ API) that
 * walks the project, chunks text files, and stores the chunks in a LanceDB
 * table located next to the script.
 *
 * The project name used for the LanceDB context key, table name, and app
 * name is the last non-empty path segment of `projectDir` (either `/` or `\`
 * separators), with characters outside `[A-Za-z0-9_.-]` replaced by `_`;
 * it falls back to `"project"` when no segment is usable. `projectDir` itself
 * is embedded as an escaped string literal.
 *
 * @param projectDir Root directory of the project to index.
 * @param embeddingConfig Accepted for signature compatibility; the generated
 *   template does not reference it.
 * @returns Python source text for `main.py`.
 */
function generatePipelineTemplate(projectDir: string, embeddingConfig: EmbeddingConfig): string {
  // Last non-empty segment, so trailing separators and Windows paths work.
  const rawName = projectDir.split(/[\\/]+/).filter((segment) => segment.length > 0).pop() ?? "";
  // Keep only characters that are safe inside Python string literals/docstrings
  // and in table/app identifiers.
  const projectBasename = rawName.replace(/[^A-Za-z0-9_.-]/g, "_") || "project";
  // JSON string escaping (\\, \", \n, \uXXXX) is also valid in a Python str literal.
  const projectRootLiteral = JSON.stringify(projectDir);

  return `"""
CocoIndex pipeline for ${projectBasename}
Auto-generated by @pi-unipi/cocoindex — customize as needed.
Requires cocoindex >= 1.0.
"""
import pathlib
from dataclasses import dataclass
from typing import AsyncIterator

import cocoindex as coco
from cocoindex.connectors import localfs, lancedb
from cocoindex.resources.file import PatternFilePathMatcher

import os

# ── Configuration ────────────────────────────────────
PROJECT_ROOT = os.environ.get("PROJECT_ROOT", ${projectRootLiteral})

# ── LanceDB context key ──────────────────────────────
db_key = coco.ContextKey("lancedb/${projectBasename}")


# ── Environment setup (async lifespan) ───────────────
@coco.lifespan
async def coco_lifespan(builder: coco.EnvironmentBuilder) -> AsyncIterator[None]:
    """Configure environment: DB path + LanceDB connection."""
    builder.settings.db_path = pathlib.Path(__file__).parent / "cocoindex.db"

    db_path = pathlib.Path(__file__).parent / ".lancedb"
    conn = await lancedb.connect_async(str(db_path))
    builder.provide(db_key, conn)

    yield


# ── Row type for LanceDB ─────────────────────────────
@dataclass
class IndexRow:
    """A single indexed chunk stored in LanceDB."""
    path: str
    chunk_index: int
    content: str


# ── Chunking function (memoized) ─────────────────────
@coco.fn
async def chunk_text(
    content: str,
    *,
    chunk_size: int = 1500,
    chunk_overlap: int = 200,
) -> list[tuple[int, str]]:
    """Split text into overlapping chunks."""
    if not content.strip():
        return []

    # Always advance by at least one character, even if overlap >= size.
    step = max(1, chunk_size - chunk_overlap)

    chunks: list[tuple[int, str]] = []
    start = 0
    idx = 0
    while start < len(content):
        end = min(start + chunk_size, len(content))
        chunk = content[start:end].strip()
        if chunk:
            chunks.append((idx, chunk))
            idx += 1
        start += step

    return chunks


# ── Process a single file ────────────────────────────
@coco.fn
async def process_file(
    file: localfs.File,
    table: lancedb.TableTarget,
) -> None:
    """Read a file, chunk it, and declare rows in LanceDB."""
    try:
        content = await file.read_text()
    except Exception:
        return

    if not content.strip():
        return

    relative = file.file_path.path.as_posix()
    chunks = await chunk_text(content)

    for chunk_idx, text in chunks:
        table.declare_row(row=IndexRow(
            path=relative,
            chunk_index=chunk_idx,
            content=text,
        ))


# ── Main app function ────────────────────────────────
@coco.fn
async def app_main() -> None:
    """Walk project files -> chunk -> store in LanceDB."""
    project_root = pathlib.Path(PROJECT_ROOT)

    # 1) Declare LanceDB table target
    table_schema = await lancedb.TableSchema.from_class(
        IndexRow,
        primary_key=["path", "chunk_index"],
    )

    target = await coco.mount_target(
        lancedb.table_target(
            db_key,
            "${projectBasename}_index",
            table_schema,
        ),
    )
    table = lancedb.TableTarget(target, table_schema)

    # 2) Walk project files
    walker = localfs.walk_dir(
        project_root,
        recursive=True,
        path_matcher=PatternFilePathMatcher(
            included_patterns=[
                "**/*.ts", "**/*.tsx", "**/*.js", "**/*.jsx",
                "**/*.py", "**/*.rs", "**/*.go",
                "**/*.md", "**/*.txt", "**/*.json", "**/*.yaml", "**/*.yml",
                "**/*.sh", "**/*.bash",
            ],
            excluded_patterns=[
                "**/node_modules/**", "**/.git/**", "**/dist/**",
                "**/build/**", "**/.next/**", "**/__pycache__/**",
                "**/.unipi/cocoindex/**",
            ],
        ),
    )

    # 3) Process each file
    async for file in walker:
        await coco.mount(
            coco.component_subpath("process", file.file_path.path.as_posix()),
            process_file,
            file,
            table,
        )


# ── App instance (required by CLI) ───────────────────
app = coco.App(
    coco.AppConfig(name="local_${projectBasename}"),
    app_main,
)
`;
}