@pi-unipi/cocoindex 2.0.0 → 2.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2):
  1. package/bridge.ts +75 -7
  2. package/package.json +1 -1
package/bridge.ts CHANGED
@@ -64,6 +64,7 @@ export interface CocoindexDeps {
64
64
  const COCOINDEX_STATE_DIR = ".cocoindex";
65
65
  const DEFAULT_PIPELINE_DIR = ".unipi/cocoindex";
66
66
  const DEFAULT_LANCEDB_PATH = ".unipi/cocoindex/.lancedb";
67
+ const DEFAULT_UPDATE_TIMEOUT_MS = 15 * 60 * 1000;
67
68
  const DEFAULT_LEXICAL_SCAN_LIMIT = 50_000;
68
69
 
69
70
  // ─────────────────────────────────────────────────────────
@@ -268,11 +269,22 @@ export async function indexProject(projectDir: string): Promise<IndexResult> {
268
269
  const proc = spawn(cocoindexBin, ["update", "main.py"], {
269
270
  cwd: pipelineDir,
270
271
  stdio: ["pipe", "pipe", "pipe"],
271
- timeout: 300000, // 5 min timeout
272
272
  });
273
273
 
274
274
  let stdout = "";
275
275
  let stderr = "";
276
+ let timedOut = false;
277
+ let settled = false;
278
+ const timeoutMs = getUpdateTimeoutMs();
279
+
280
+ const timer = setTimeout(() => {
281
+ timedOut = true;
282
+ proc.kill("SIGTERM");
283
+ setTimeout(() => {
284
+ if (!settled) proc.kill("SIGKILL");
285
+ }, 5000).unref();
286
+ }, timeoutMs);
287
+ timer.unref();
276
288
 
277
289
  proc.stdout.on("data", (data: Buffer) => {
278
290
  stdout += data.toString();
@@ -282,7 +294,9 @@ export async function indexProject(projectDir: string): Promise<IndexResult> {
282
294
  stderr += data.toString();
283
295
  });
284
296
 
285
- proc.on("close", (code: number | null) => {
297
+ proc.on("close", (code: number | null, signal: NodeJS.Signals | null) => {
298
+ settled = true;
299
+ clearTimeout(timer);
286
300
  const durationMs = Date.now() - start;
287
301
  const chunksProcessed = parseChunksProcessed(stdout);
288
302
 
@@ -293,12 +307,14 @@ export async function indexProject(projectDir: string): Promise<IndexResult> {
293
307
  success: false,
294
308
  chunksProcessed,
295
309
  durationMs,
296
- error: stderr.trim() || `Process exited with code ${code}`,
310
+ error: formatIndexFailure({ code, signal, timedOut, timeoutMs, stdout, stderr }),
297
311
  });
298
312
  }
299
313
  });
300
314
 
301
315
  proc.on("error", (err: Error) => {
316
+ settled = true;
317
+ clearTimeout(timer);
302
318
  resolve({
303
319
  success: false,
304
320
  chunksProcessed: 0,
@@ -310,6 +326,46 @@ export async function indexProject(projectDir: string): Promise<IndexResult> {
310
326
  }
311
327
 
312
328
  /** Parse the number of files processed from cocoindex v1.0+ output. */
329
/**
 * Resolve the timeout (in milliseconds) applied to the `cocoindex update` run.
 *
 * Reads the `COCOINDEX_UPDATE_TIMEOUT_MS` environment variable. When it is
 * unset, empty, non-numeric, non-finite, zero, or negative, the module default
 * `DEFAULT_UPDATE_TIMEOUT_MS` is used instead.
 *
 * @returns A positive, finite delay that is safe to hand to `setTimeout`.
 */
function getUpdateTimeoutMs(): number {
  // Node timers store the delay as a signed 32-bit integer. Anything larger is
  // silently replaced by 1 ms, which would kill the process almost immediately
  // instead of "practically never" — so cap oversized values at the maximum.
  const maxTimerDelayMs = 2_147_483_647;

  const raw = process.env.COCOINDEX_UPDATE_TIMEOUT_MS;
  if (!raw) return DEFAULT_UPDATE_TIMEOUT_MS;

  const parsed = Number(raw);
  if (!Number.isFinite(parsed) || parsed <= 0) return DEFAULT_UPDATE_TIMEOUT_MS;

  return Math.min(parsed, maxTimerDelayMs);
}
337
+
338
/**
 * Build the error text reported when the `cocoindex update` process fails.
 *
 * The message starts with a one-line cause, chosen in this order: timeout,
 * terminating signal, exit code. The tail of the captured stderr (up to 4000
 * characters) and stdout (up to 2000 characters) follow, each under its own
 * label; empty streams are omitted. Sections are separated by a blank line.
 *
 * @param args.code      Exit code, or `null` when none was reported.
 * @param args.signal    Signal that terminated the process, if any.
 * @param args.timedOut  Whether the caller's timeout fired before the process closed.
 * @param args.timeoutMs The timeout that was in effect, in milliseconds.
 * @param args.stdout    Everything captured from the process's stdout.
 * @param args.stderr    Everything captured from the process's stderr.
 * @returns The formatted multi-section failure message.
 */
function formatIndexFailure(args: {
  code: number | null;
  signal: NodeJS.Signals | null;
  timedOut: boolean;
  timeoutMs: number;
  stdout: string;
  stderr: string;
}): string {
  const parts: string[] = [];

  if (args.timedOut) {
    // Whole-second timeouts print exactly as before ("900s"). Anything else is
    // shown in milliseconds so a 400 ms timeout is not reported as "0s" and a
    // 1500 ms timeout is not rounded up to "2s".
    const duration =
      args.timeoutMs % 1000 === 0 ? `${args.timeoutMs / 1000}s` : `${args.timeoutMs}ms`;
    parts.push(`Timed out after ${duration}`);
  } else if (args.signal) {
    parts.push(`Process terminated by ${args.signal}`);
  } else {
    parts.push(`Process exited with code ${args.code ?? "unknown"}`);
  }

  // Keep only the end of each stream: that is where the failure details are,
  // and it bounds the size of the message.
  const stderr = tailText(args.stderr.trim(), 4000);
  const stdout = tailText(args.stdout.trim(), 2000);
  if (stderr) parts.push(`stderr:\n${stderr}`);
  if (stdout) parts.push(`stdout:\n${stdout}`);

  return parts.join("\n\n");
}
363
+
364
/**
 * Return the end of `text`, limited to `maxChars` UTF-16 code units.
 *
 * When the text already fits it is returned unchanged. Otherwise the kept tail
 * is prefixed with "…" to show that leading content was dropped.
 *
 * @param text     The text to shorten.
 * @param maxChars Maximum number of trailing code units to keep. Negative,
 *                 fractional, and NaN values are normalised to a whole number >= 0.
 * @returns The original text, or "…" followed by its tail.
 */
function tailText(text: string, maxChars: number): string {
  const limit = Math.max(0, Math.floor(maxChars) || 0);
  if (text.length <= limit) return text;

  // Use an explicit start index: `slice(-0)` is the same as `slice(0)` and
  // would return the whole string for a zero limit.
  let start = text.length - limit;

  // If the cut lands between the two halves of a surrogate pair, skip the
  // orphaned low surrogate so the result stays well-formed.
  if (start > 0 && start < text.length) {
    const unit = text.charCodeAt(start);
    const previous = text.charCodeAt(start - 1);
    const isLowSurrogate = unit >= 0xdc00 && unit <= 0xdfff;
    const followsHighSurrogate = previous >= 0xd800 && previous <= 0xdbff;
    if (isLowSurrogate && followsHighSurrogate) start += 1;
  }

  return `…${text.slice(start)}`;
}
368
+
313
369
  function parseChunksProcessed(output: string): number {
314
370
  // v1.0+ format: "✅ process_file: 604 total | 604 added"
315
371
  // Capture the last "added" or "reprocessed" count for process_file
@@ -321,9 +377,14 @@ function parseChunksProcessed(output: string): number {
321
377
  }
322
378
  }
323
379
  if (lastProcessLine) {
324
- // Match the number before "added" or "reprocessed"
325
- const match = lastProcessLine.match(/(\d+)\s+(?:added|reprocessed)/);
326
- if (match) return parseInt(match[1], 10);
380
+ // Prefer completed work counts. Lines can contain multiple counters, e.g.
381
+ // "process_file: 615 total | 8 added, 606 reprocessed".
382
+ const matches = [...lastProcessLine.matchAll(/(\d+)\s+(?:added|reprocessed|skipped|deleted)/g)];
383
+ const completed = matches.reduce((sum, match) => sum + parseInt(match[1], 10), 0);
384
+ if (completed > 0) return completed;
385
+
386
+ const total = lastProcessLine.match(/process_file:\s*(\d+)\s+total/);
387
+ if (total) return parseInt(total[1], 10);
327
388
  }
328
389
 
329
390
  // Fallback: old format "Processed 42 chunks"
@@ -634,6 +695,8 @@ import os
634
695
 
635
696
  # ── Configuration ────────────────────────────────────
636
697
  PROJECT_ROOT = os.environ.get("PROJECT_ROOT", "${projectDir}")
698
+ # Safety limit for huge generated/lock files. Set COCO_MAX_FILE_CHARS=0 to disable.
699
+ MAX_FILE_CHARS = int(os.environ.get("COCO_MAX_FILE_CHARS", "200000"))
637
700
 
638
701
  # ── LanceDB context key ──────────────────────────────
639
702
  db_key = coco.ContextKey("lancedb/${projectBasename}")
@@ -703,6 +766,8 @@ async def process_file(
703
766
 
704
767
  if not content.strip():
705
768
  return
769
+ if MAX_FILE_CHARS > 0 and len(content) > MAX_FILE_CHARS:
770
+ return
706
771
 
707
772
  relative = file.file_path.path.as_posix()
708
773
  chunks = await chunk_text(content)
@@ -750,7 +815,10 @@ async def app_main() -> None:
750
815
  excluded_patterns=[
751
816
  "**/node_modules/**", "**/.git/**", "**/dist/**",
752
817
  "**/build/**", "**/.next/**", "**/__pycache__/**",
753
- "**/.unipi/cocoindex/**",
818
+ "**/coverage/**", "**/.turbo/**", "**/.cache/**",
819
+ "**/.unipi/**",
820
+ "**/*.min.js", "**/bundled.js", "**/bundle.js", "**/*bundle*.js",
821
+ "**/package-lock.json", "**/pnpm-lock.yaml", "**/yarn.lock",
754
822
  ],
755
823
  ),
756
824
  )
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@pi-unipi/cocoindex",
3
- "version": "2.0.0",
3
+ "version": "2.0.2",
4
4
  "description": "CocoIndex integration for Pi — AST-aware content indexing, semantic vector search, and incremental pipeline management",
5
5
  "type": "module",
6
6
  "main": "index.ts",