vexor 0.19.0a1__py3-none-any.whl → 0.21.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vexor/cache.py CHANGED
@@ -14,8 +14,9 @@ import numpy as np
 
 from .utils import collect_files
 
-CACHE_DIR = Path(os.path.expanduser("~")) / ".vexor"
-CACHE_VERSION = 5
+DEFAULT_CACHE_DIR = Path(os.path.expanduser("~")) / ".vexor"
+CACHE_DIR = DEFAULT_CACHE_DIR
+CACHE_VERSION = 6
 DB_FILENAME = "index.db"
 EMBED_CACHE_TTL_DAYS = 30
 EMBED_CACHE_MAX_ENTRIES = 50_000
@@ -109,7 +110,7 @@ def _deserialize_exclude_patterns(value: str | None) -> tuple[str, ...]:
     return tuple(parts)
 
 
-def _chunk_values(values: Sequence[str], size: int) -> Iterable[Sequence[str]]:
+def _chunk_values(values: Sequence[object], size: int) -> Iterable[Sequence[object]]:
    for idx in range(0, len(values), size):
        yield values[idx : idx + size]
 
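The 900-element batches fed to _chunk_values throughout the new code keep each statement under SQLite's bound-parameter limit (999 by default in older SQLite builds). A standalone sketch of the pattern, not taken from the package, with indexed_file used only as an illustrative table:

import sqlite3
from typing import Sequence

def fetch_ids_by_rel_path(conn: sqlite3.Connection, rel_paths: Sequence[str]) -> dict[str, int]:
    """Resolve rel_path -> id in batches so the placeholder count stays
    below SQLite's default SQLITE_MAX_VARIABLE_NUMBER."""
    found: dict[str, int] = {}
    for start in range(0, len(rel_paths), 900):
        batch = list(rel_paths[start : start + 900])
        placeholders = ", ".join("?" for _ in batch)
        rows = conn.execute(
            f"SELECT id, rel_path FROM indexed_file WHERE rel_path IN ({placeholders})",
            tuple(batch),
        ).fetchall()
        for row in rows:
            found[row[1]] = int(row[0])
    return found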
@@ -119,6 +120,17 @@ def ensure_cache_dir() -> Path:
     return CACHE_DIR
 
 
+def set_cache_dir(path: Path | str | None) -> None:
+    global CACHE_DIR
+    if path is None:
+        CACHE_DIR = DEFAULT_CACHE_DIR
+        return
+    dir_path = Path(path).expanduser().resolve()
+    if dir_path.exists() and not dir_path.is_dir():
+        raise NotADirectoryError(f"Path is not a directory: {dir_path}")
+    CACHE_DIR = dir_path
+
+
 def cache_db_path() -> Path:
     """Return the absolute path to the shared SQLite cache database."""
 
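A minimal usage sketch for the new set_cache_dir hook, assuming the module is importable as vexor.cache and that /tmp/vexor-cache stands in for any writable directory:

from pathlib import Path

from vexor import cache  # assumed import path for vexor/cache.py

# Point the cache at a custom root for the current process...
cache.set_cache_dir(Path("/tmp/vexor-cache"))
print(cache.cache_db_path())  # the shared index.db now resolves under the custom root

# ...and fall back to the default ~/.vexor location.
cache.set_cache_dir(None)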
@@ -146,7 +158,42 @@ def _connect(db_path: Path) -> sqlite3.Connection:
     return conn
 
 
+def _table_exists(conn: sqlite3.Connection, table: str) -> bool:
+    row = conn.execute(
+        "SELECT name FROM sqlite_master WHERE type = 'table' AND name = ?",
+        (table,),
+    ).fetchone()
+    return row is not None
+
+
+def _schema_needs_reset(conn: sqlite3.Connection) -> bool:
+    if _table_exists(conn, "indexed_chunk"):
+        return False
+    return any(
+        _table_exists(conn, table)
+        for table in ("index_metadata", "indexed_file", "file_embedding", "query_cache")
+    )
+
+
+def _reset_index_schema(conn: sqlite3.Connection) -> None:
+    conn.execute("PRAGMA foreign_keys = OFF;")
+    conn.executescript(
+        """
+        DROP TABLE IF EXISTS query_cache;
+        DROP TABLE IF EXISTS file_embedding;
+        DROP TABLE IF EXISTS chunk_embedding;
+        DROP TABLE IF EXISTS chunk_meta;
+        DROP TABLE IF EXISTS indexed_chunk;
+        DROP TABLE IF EXISTS indexed_file;
+        DROP TABLE IF EXISTS index_metadata;
+        """
+    )
+    conn.execute("PRAGMA foreign_keys = ON;")
+
+
 def _ensure_schema(conn: sqlite3.Connection) -> None:
+    if _schema_needs_reset(conn):
+        _reset_index_schema(conn)
     conn.executescript(
         """
         CREATE TABLE IF NOT EXISTS index_metadata (
@@ -173,20 +220,31 @@ def _ensure_schema(conn: sqlite3.Connection) -> None:
             abs_path TEXT NOT NULL,
             size_bytes INTEGER NOT NULL,
             mtime REAL NOT NULL,
-            position INTEGER NOT NULL,
-            preview TEXT DEFAULT '',
-            label_hash TEXT DEFAULT '',
+            UNIQUE(index_id, rel_path)
+        );
+
+        CREATE TABLE IF NOT EXISTS indexed_chunk (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            index_id INTEGER NOT NULL REFERENCES index_metadata(id) ON DELETE CASCADE,
+            file_id INTEGER NOT NULL REFERENCES indexed_file(id) ON DELETE CASCADE,
             chunk_index INTEGER NOT NULL DEFAULT 0,
-            start_line INTEGER,
-            end_line INTEGER,
-            UNIQUE(index_id, rel_path, chunk_index)
+            position INTEGER NOT NULL,
+            UNIQUE(index_id, file_id, chunk_index)
         );
 
-        CREATE TABLE IF NOT EXISTS file_embedding (
-            file_id INTEGER PRIMARY KEY REFERENCES indexed_file(id) ON DELETE CASCADE,
+        CREATE TABLE IF NOT EXISTS chunk_embedding (
+            chunk_id INTEGER PRIMARY KEY REFERENCES indexed_chunk(id) ON DELETE CASCADE,
             vector_blob BLOB NOT NULL
         );
 
+        CREATE TABLE IF NOT EXISTS chunk_meta (
+            chunk_id INTEGER PRIMARY KEY REFERENCES indexed_chunk(id) ON DELETE CASCADE,
+            preview TEXT DEFAULT '',
+            label_hash TEXT DEFAULT '',
+            start_line INTEGER,
+            end_line INTEGER
+        );
+
         CREATE TABLE IF NOT EXISTS query_cache (
             id INTEGER PRIMARY KEY AUTOINCREMENT,
             index_id INTEGER NOT NULL REFERENCES index_metadata(id) ON DELETE CASCADE,
@@ -206,8 +264,11 @@ def _ensure_schema(conn: sqlite3.Connection) -> None:
             UNIQUE(model, text_hash)
         );
 
-        CREATE INDEX IF NOT EXISTS idx_indexed_file_order
-            ON indexed_file(index_id, position);
+        CREATE INDEX IF NOT EXISTS idx_indexed_chunk_order
+            ON indexed_chunk(index_id, position);
+
+        CREATE INDEX IF NOT EXISTS idx_indexed_file_lookup
+            ON indexed_file(index_id, rel_path);
 
         CREATE INDEX IF NOT EXISTS idx_query_cache_lookup
             ON query_cache(index_id, query_hash);
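A sketch (not part of the package) of how the version-6 tables fit together: indexed_file holds one row per file, indexed_chunk one row per chunk, with the vector and the preview/line metadata split into chunk_embedding and chunk_meta. The query mirrors the joins used later in load_index; the function name and db_path argument are illustrative:

import sqlite3

def chunks_for_index(db_path: str, index_id: int) -> list[sqlite3.Row]:
    """List chunks for one index in position order, joining file and meta rows."""
    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row
    try:
        return conn.execute(
            """
            SELECT f.rel_path, c.chunk_index, m.preview, m.start_line, m.end_line
            FROM indexed_chunk AS c
            JOIN indexed_file AS f ON f.id = c.file_id
            LEFT JOIN chunk_meta AS m ON m.chunk_id = c.id
            WHERE c.index_id = ?
            ORDER BY c.position ASC
            """,
            (index_id,),
        ).fetchall()
    finally:
        conn.close()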
@@ -216,133 +277,6 @@ def _ensure_schema(conn: sqlite3.Connection) -> None:
             ON embedding_cache(model, text_hash);
         """
     )
-    try:
-        conn.execute(
-            "ALTER TABLE index_metadata ADD COLUMN recursive INTEGER NOT NULL DEFAULT 1"
-        )
-    except sqlite3.OperationalError:
-        # Column already exists; ignore error.
-        pass
-    try:
-        conn.execute(
-            "ALTER TABLE index_metadata ADD COLUMN respect_gitignore INTEGER NOT NULL DEFAULT 1"
-        )
-    except sqlite3.OperationalError:
-        pass
-    try:
-        conn.execute(
-            "ALTER TABLE index_metadata ADD COLUMN mode TEXT NOT NULL DEFAULT 'name'"
-        )
-    except sqlite3.OperationalError:
-        pass
-    try:
-        conn.execute(
-            "ALTER TABLE indexed_file ADD COLUMN preview TEXT DEFAULT ''"
-        )
-    except sqlite3.OperationalError:
-        pass
-    try:
-        conn.execute(
-            "ALTER TABLE indexed_file ADD COLUMN label_hash TEXT DEFAULT ''"
-        )
-    except sqlite3.OperationalError:
-        pass
-    try:
-        conn.execute("ALTER TABLE indexed_file ADD COLUMN start_line INTEGER")
-    except sqlite3.OperationalError:
-        pass
-    try:
-        conn.execute("ALTER TABLE indexed_file ADD COLUMN end_line INTEGER")
-    except sqlite3.OperationalError:
-        pass
-    if not _table_has_column(conn, "indexed_file", "chunk_index"):
-        _upgrade_indexed_file_with_chunk(conn)
-    try:
-        conn.execute(
-            "ALTER TABLE index_metadata ADD COLUMN extensions TEXT DEFAULT ''"
-        )
-    except sqlite3.OperationalError:
-        pass
-    try:
-        conn.execute(
-            "ALTER TABLE index_metadata ADD COLUMN exclude_patterns TEXT DEFAULT ''"
-        )
-    except sqlite3.OperationalError:
-        pass
-    _cleanup_orphan_embeddings(conn)
-
-
-def _table_has_column(conn: sqlite3.Connection, table: str, column: str) -> bool:
-    rows = conn.execute(f"PRAGMA table_info({table})").fetchall()
-    return any(row[1] == column for row in rows)
-
-
-def _upgrade_indexed_file_with_chunk(conn: sqlite3.Connection) -> None:
-    conn.execute("PRAGMA foreign_keys = OFF;")
-    conn.execute("ALTER TABLE indexed_file RENAME TO indexed_file_legacy;")
-    conn.executescript(
-        """
-        CREATE TABLE indexed_file (
-            id INTEGER PRIMARY KEY AUTOINCREMENT,
-            index_id INTEGER NOT NULL REFERENCES index_metadata(id) ON DELETE CASCADE,
-            rel_path TEXT NOT NULL,
-            abs_path TEXT NOT NULL,
-            size_bytes INTEGER NOT NULL,
-            mtime REAL NOT NULL,
-            position INTEGER NOT NULL,
-            preview TEXT DEFAULT '',
-            label_hash TEXT DEFAULT '',
-            chunk_index INTEGER NOT NULL DEFAULT 0,
-            start_line INTEGER,
-            end_line INTEGER,
-            UNIQUE(index_id, rel_path, chunk_index)
-        );
-
-        CREATE INDEX IF NOT EXISTS idx_indexed_file_order
-            ON indexed_file(index_id, position);
-        """
-    )
-    conn.execute(
-        """
-        INSERT INTO indexed_file (
-            id,
-            index_id,
-            rel_path,
-            abs_path,
-            size_bytes,
-            mtime,
-            position,
-            preview,
-            label_hash,
-            chunk_index,
-            start_line,
-            end_line
-        )
-        SELECT
-            id,
-            index_id,
-            rel_path,
-            abs_path,
-            size_bytes,
-            mtime,
-            position,
-            preview,
-            '',
-            0,
-            NULL,
-            NULL
-        FROM indexed_file_legacy;
-        """
-    )
-    conn.execute("DROP TABLE indexed_file_legacy;")
-    conn.execute("PRAGMA foreign_keys = ON;")
-
-
-def _cleanup_orphan_embeddings(conn: sqlite3.Connection) -> None:
-    with conn:
-        conn.execute(
-            "DELETE FROM file_embedding WHERE file_id NOT IN (SELECT id FROM indexed_file)"
-        )
 
 
 def store_index(
@@ -418,32 +352,22 @@ def store_index(
         )
         index_id = cursor.lastrowid
 
-        file_rows: list[tuple] = []
-        vector_blobs: list[bytes] = []
-        for position, entry in enumerate(entries):
+        file_rows_by_rel: dict[str, tuple] = {}
+        for entry in entries:
+            if entry.rel_path in file_rows_by_rel:
+                continue
             size_bytes = entry.size_bytes
             mtime = entry.mtime
             if size_bytes is None or mtime is None:
                 stat = entry.path.stat()
                 size_bytes = stat.st_size
                 mtime = stat.st_mtime
-            file_rows.append(
-                (
-                    index_id,
-                    entry.rel_path,
-                    str(entry.path),
-                    size_bytes,
-                    mtime,
-                    position,
-                    entry.preview,
-                    entry.label_hash,
-                    entry.chunk_index,
-                    entry.start_line,
-                    entry.end_line,
-                )
-            )
-            vector_blobs.append(
-                np.asarray(entry.embedding, dtype=np.float32).tobytes()
+            file_rows_by_rel[entry.rel_path] = (
+                index_id,
+                entry.rel_path,
+                str(entry.path),
+                size_bytes,
+                mtime,
             )
 
         conn.executemany(
@@ -453,29 +377,87 @@ def store_index(
                 rel_path,
                 abs_path,
                 size_bytes,
-                mtime,
-                position,
-                preview,
-                label_hash,
+                mtime
+            ) VALUES (?, ?, ?, ?, ?)
+            """,
+            list(file_rows_by_rel.values()),
+        )
+
+        file_id_map: dict[str, int] = {}
+        rel_paths = list(file_rows_by_rel.keys())
+        for chunk in _chunk_values(rel_paths, 900):
+            placeholders = ", ".join("?" for _ in chunk)
+            rows = conn.execute(
+                f"""
+                SELECT id, rel_path
+                FROM indexed_file
+                WHERE index_id = ? AND rel_path IN ({placeholders})
+                """,
+                (index_id, *chunk),
+            ).fetchall()
+            for row in rows:
+                file_id_map[row["rel_path"]] = int(row["id"])
+
+        chunk_rows: list[tuple] = []
+        vector_blobs: list[bytes] = []
+        meta_rows: list[tuple] = []
+        for position, entry in enumerate(entries):
+            file_id = file_id_map.get(entry.rel_path)
+            if file_id is None:
+                continue
+            chunk_rows.append(
+                (index_id, file_id, entry.chunk_index, position)
+            )
+            vector_blobs.append(
+                np.asarray(entry.embedding, dtype=np.float32).tobytes()
+            )
+            meta_rows.append(
+                (
+                    entry.preview or "",
+                    entry.label_hash or "",
+                    entry.start_line,
+                    entry.end_line,
+                )
+            )
+
+        conn.executemany(
+            """
+            INSERT INTO indexed_chunk (
+                index_id,
+                file_id,
                 chunk_index,
-                start_line,
-                end_line
-            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+                position
+            ) VALUES (?, ?, ?, ?)
             """,
-            file_rows,
+            chunk_rows,
         )
 
         inserted_ids = conn.execute(
-            "SELECT id FROM indexed_file WHERE index_id = ? ORDER BY position ASC",
+            "SELECT id FROM indexed_chunk WHERE index_id = ? ORDER BY position ASC",
             (index_id,),
         ).fetchall()
         conn.executemany(
-            "INSERT OR REPLACE INTO file_embedding (file_id, vector_blob) VALUES (?, ?)",
+            "INSERT OR REPLACE INTO chunk_embedding (chunk_id, vector_blob) VALUES (?, ?)",
             (
                 (row["id"], vector_blobs[idx])
                 for idx, row in enumerate(inserted_ids)
             ),
         )
+        conn.executemany(
+            """
+            INSERT OR REPLACE INTO chunk_meta (
+                chunk_id,
+                preview,
+                label_hash,
+                start_line,
+                end_line
+            ) VALUES (?, ?, ?, ?, ?)
+            """,
+            (
+                (row["id"], *meta_rows[idx])
+                for idx, row in enumerate(inserted_ids)
+            ),
+        )
 
         return db_path
     finally:
@@ -546,112 +528,221 @@ def apply_index_updates(
         if changed_entries:
             chunk_map: dict[str, list[IndexedChunk]] = {}
             for entry in changed_entries:
-                if entry.rel_path not in chunk_map:
-                    chunk_map[entry.rel_path] = []
-                chunk_map[entry.rel_path].append(entry)
+                chunk_map.setdefault(entry.rel_path, []).append(entry)
 
-            for rel_path, chunk_list in chunk_map.items():
+            for rel_path in chunk_map:
                 conn.execute(
                     "DELETE FROM indexed_file WHERE index_id = ? AND rel_path = ?",
                     (index_id, rel_path),
                 )
+
+            file_rows_by_rel: dict[str, tuple] = {}
+            for rel_path, chunk_list in chunk_map.items():
+                chunk_list.sort(key=lambda item: item.chunk_index)
+                sample = chunk_list[0]
+                size_bytes = sample.size_bytes
+                mtime = sample.mtime
+                if size_bytes is None or mtime is None:
+                    stat = sample.path.stat()
+                    size_bytes = stat.st_size
+                    mtime = stat.st_mtime
+                file_rows_by_rel[rel_path] = (
+                    index_id,
+                    rel_path,
+                    str(sample.path),
+                    size_bytes,
+                    mtime,
+                )
+
+            if file_rows_by_rel:
+                conn.executemany(
+                    """
+                    INSERT INTO indexed_file (
+                        index_id,
+                        rel_path,
+                        abs_path,
+                        size_bytes,
+                        mtime
+                    ) VALUES (?, ?, ?, ?, ?)
+                    """,
+                    list(file_rows_by_rel.values()),
+                )
+
+            file_id_map: dict[str, int] = {}
+            rel_paths = list(file_rows_by_rel.keys())
+            for chunk in _chunk_values(rel_paths, 900):
+                placeholders = ", ".join("?" for _ in chunk)
+                rows = conn.execute(
+                    f"""
+                    SELECT id, rel_path
+                    FROM indexed_file
+                    WHERE index_id = ? AND rel_path IN ({placeholders})
+                    """,
+                    (index_id, *chunk),
+                ).fetchall()
+                for row in rows:
+                    file_id_map[row["rel_path"]] = int(row["id"])
+
+            for rel_path, chunk_list in chunk_map.items():
+                file_id = file_id_map.get(rel_path)
+                if file_id is None:
+                    continue
                 chunk_list.sort(key=lambda item: item.chunk_index)
-                file_rows: list[tuple] = []
+                chunk_rows: list[tuple] = []
                 vector_blobs: list[bytes] = []
+                meta_rows: list[tuple] = []
                 for chunk in chunk_list:
                     vector = np.asarray(chunk.embedding, dtype=np.float32)
                     if vector_dimension is None:
                         vector_dimension = vector.shape[0]
-                    size_bytes = chunk.size_bytes
-                    mtime = chunk.mtime
-                    if size_bytes is None or mtime is None:
-                        stat = chunk.path.stat()
-                        size_bytes = stat.st_size
-                        mtime = stat.st_mtime
-                    file_rows.append(
+                    chunk_rows.append(
+                        (index_id, file_id, chunk.chunk_index, 0)
+                    )
+                    vector_blobs.append(vector.tobytes())
+                    meta_rows.append(
                         (
-                            index_id,
-                            rel_path,
-                            str(chunk.path),
-                            size_bytes,
-                            mtime,
-                            0,
-                            chunk.preview,
-                            chunk.label_hash,
-                            chunk.chunk_index,
+                            chunk.preview or "",
+                            chunk.label_hash or "",
                             chunk.start_line,
                             chunk.end_line,
                         )
                     )
-                    vector_blobs.append(vector.tobytes())
 
                 conn.executemany(
                     """
-                    INSERT INTO indexed_file (
+                    INSERT INTO indexed_chunk (
                         index_id,
-                        rel_path,
-                        abs_path,
-                        size_bytes,
-                        mtime,
-                        position,
-                        preview,
-                        label_hash,
+                        file_id,
                         chunk_index,
-                        start_line,
-                        end_line
-                    ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+                        position
+                    ) VALUES (?, ?, ?, ?)
                     """,
-                    file_rows,
+                    chunk_rows,
                 )
 
                 inserted_ids = conn.execute(
                     """
-                    SELECT id FROM indexed_file
-                    WHERE index_id = ? AND rel_path = ?
+                    SELECT id FROM indexed_chunk
+                    WHERE index_id = ? AND file_id = ?
                     ORDER BY chunk_index ASC
                     """,
-                    (index_id, rel_path),
+                    (index_id, file_id),
                 ).fetchall()
                 conn.executemany(
-                    "INSERT INTO file_embedding (file_id, vector_blob) VALUES (?, ?)",
+                    "INSERT OR REPLACE INTO chunk_embedding (chunk_id, vector_blob) VALUES (?, ?)",
                     (
                         (row["id"], vector_blobs[idx])
                         for idx, row in enumerate(inserted_ids)
                     ),
                 )
+                conn.executemany(
+                    """
+                    INSERT OR REPLACE INTO chunk_meta (
+                        chunk_id,
+                        preview,
+                        label_hash,
+                        start_line,
+                        end_line
+                    ) VALUES (?, ?, ?, ?, ?)
+                    """,
+                    (
+                        (row["id"], *meta_rows[idx])
+                        for idx, row in enumerate(inserted_ids)
+                    ),
+                )
 
         if touched_entries:
+            file_updates: dict[str, tuple[int, float]] = {}
+            for (
+                rel_path,
+                _chunk_index,
+                size_bytes,
+                mtime,
+                _preview,
+                _start_line,
+                _end_line,
+                _label_hash,
+            ) in touched_entries:
+                if rel_path not in file_updates:
+                    file_updates[rel_path] = (size_bytes, mtime)
             conn.executemany(
                 """
                 UPDATE indexed_file
-                SET size_bytes = ?, mtime = ?, preview = ?, start_line = ?, end_line = ?, label_hash = ?
-                WHERE index_id = ? AND rel_path = ? AND chunk_index = ?
+                SET size_bytes = ?, mtime = ?
+                WHERE index_id = ? AND rel_path = ?
                 """,
                 (
+                    (size_bytes, mtime, index_id, rel_path)
+                    for rel_path, (size_bytes, mtime) in file_updates.items()
+                ),
+            )
+
+        chunk_id_map: dict[tuple[str, int], int] = {}
+        if ordered_entries or touched_entries:
+            rows = conn.execute(
+                """
+                SELECT c.id, c.chunk_index, f.rel_path
+                FROM indexed_chunk AS c
+                JOIN indexed_file AS f ON f.id = c.file_id
+                WHERE c.index_id = ?
+                """,
+                (index_id,),
+            ).fetchall()
+            for row in rows:
+                chunk_id_map[(row["rel_path"], int(row["chunk_index"]))] = int(
+                    row["id"]
+                )
+
+        if touched_entries and chunk_id_map:
+            meta_rows: list[tuple] = []
+            for (
+                rel_path,
+                chunk_index,
+                _size_bytes,
+                _mtime,
+                preview,
+                start_line,
+                end_line,
+                label_hash,
+            ) in touched_entries:
+                chunk_id = chunk_id_map.get((rel_path, chunk_index))
+                if chunk_id is None:
+                    continue
+                meta_rows.append(
                     (
-                        size_bytes,
-                        mtime,
+                        chunk_id,
                         preview or "",
+                        label_hash or "",
                         start_line,
                         end_line,
-                        label_hash or "",
-                        index_id,
-                        rel_path,
-                        chunk_index,
                     )
-                    for rel_path, chunk_index, size_bytes, mtime, preview, start_line, end_line, label_hash in touched_entries
-                ),
-            )
+                )
+            if meta_rows:
+                conn.executemany(
+                    """
+                    INSERT OR REPLACE INTO chunk_meta (
+                        chunk_id,
+                        preview,
+                        label_hash,
+                        start_line,
+                        end_line
+                    ) VALUES (?, ?, ?, ?, ?)
+                    """,
+                    meta_rows,
+                )
 
-        for position, (rel_path, chunk_index) in enumerate(ordered_entries):
-            conn.execute(
-                """
-                UPDATE indexed_file
-                SET position = ?
-                WHERE index_id = ? AND rel_path = ? AND chunk_index = ?
-                """,
-                (position, index_id, rel_path, chunk_index),
-            )
+        if ordered_entries and chunk_id_map:
+            position_updates = []
+            for position, (rel_path, chunk_index) in enumerate(ordered_entries):
+                chunk_id = chunk_id_map.get((rel_path, chunk_index))
+                if chunk_id is None:
+                    continue
+                position_updates.append((position, chunk_id))
+            if position_updates:
+                conn.executemany(
+                    "UPDATE indexed_chunk SET position = ? WHERE id = ?",
+                    position_updates,
+                )
 
         generated_at = datetime.now(timezone.utc).isoformat()
         new_dimension = vector_dimension or existing_dimension
@@ -716,17 +807,52 @@ def backfill_chunk_lines(
 
     with conn:
         conn.execute("BEGIN IMMEDIATE;")
-        conn.executemany(
-            """
-            UPDATE indexed_file
-            SET start_line = ?, end_line = ?
-            WHERE index_id = ? AND rel_path = ? AND chunk_index = ?
-            """,
-            (
-                (start_line, end_line, index_id, rel_path, chunk_index)
-                for rel_path, chunk_index, start_line, end_line in updates
-            ),
-        )
+        update_rows: list[tuple[int | None, int | None, int]] = []
+        insert_rows: list[tuple[int]] = []
+        if updates:
+            rel_paths = sorted({rel_path for rel_path, *_ in updates})
+            chunk_id_map: dict[tuple[str, int], int] = {}
+            for chunk in _chunk_values(rel_paths, 900):
+                placeholders = ", ".join("?" for _ in chunk)
+                rows = conn.execute(
+                    f"""
+                    SELECT c.id, c.chunk_index, f.rel_path
+                    FROM indexed_chunk AS c
+                    JOIN indexed_file AS f ON f.id = c.file_id
+                    WHERE c.index_id = ? AND f.rel_path IN ({placeholders})
+                    """,
+                    (index_id, *chunk),
+                ).fetchall()
+                for row in rows:
+                    chunk_id_map[(row["rel_path"], int(row["chunk_index"]))] = int(
+                        row["id"]
+                    )
+            for rel_path, chunk_index, start_line, end_line in updates:
+                chunk_id = chunk_id_map.get((rel_path, chunk_index))
+                if chunk_id is None:
+                    continue
+                insert_rows.append((chunk_id,))
+                update_rows.append((start_line, end_line, chunk_id))
+        if insert_rows:
+            conn.executemany(
+                """
+                INSERT OR IGNORE INTO chunk_meta (
+                    chunk_id,
+                    preview,
+                    label_hash
+                ) VALUES (?, '', '')
+                """,
+                insert_rows,
+            )
+        if update_rows:
+            conn.executemany(
+                """
+                UPDATE chunk_meta
+                SET start_line = ?, end_line = ?
+                WHERE chunk_id = ?
+                """,
+                update_rows,
+            )
         generated_at = datetime.now(timezone.utc).isoformat()
         conn.execute(
             """
@@ -782,13 +908,27 @@ def load_index(
         ).fetchone()
         if meta is None:
             raise FileNotFoundError(db_path)
+        version = int(meta["version"] or 0)
+        if version < CACHE_VERSION:
+            raise FileNotFoundError(db_path)
 
         rows = conn.execute(
             """
-            SELECT rel_path, abs_path, size_bytes, mtime, preview, label_hash, chunk_index, start_line, end_line
-            FROM indexed_file
-            WHERE index_id = ?
-            ORDER BY position ASC
+            SELECT
+                f.rel_path,
+                f.abs_path,
+                f.size_bytes,
+                f.mtime,
+                c.chunk_index,
+                m.preview,
+                m.label_hash,
+                m.start_line,
+                m.end_line
+            FROM indexed_chunk AS c
+            JOIN indexed_file AS f ON f.id = c.file_id
+            LEFT JOIN chunk_meta AS m ON m.chunk_id = c.id
+            WHERE c.index_id = ?
+            ORDER BY c.position ASC
             """,
             (meta["id"],),
         ).fetchall()
@@ -878,11 +1018,14 @@ def load_index_vectors(
         ).fetchone()
         if meta is None:
             raise FileNotFoundError(db_path)
+        version = int(meta["version"] or 0)
+        if version < CACHE_VERSION:
+            raise FileNotFoundError(db_path)
 
         index_id = meta["id"]
         dimension = int(meta["dimension"])
         chunk_count = conn.execute(
-            "SELECT COUNT(*) AS count FROM indexed_file WHERE index_id = ?",
+            "SELECT COUNT(*) AS count FROM indexed_chunk WHERE index_id = ?",
             (index_id,),
         ).fetchone()["count"]
         chunk_total = int(chunk_count or 0)
@@ -904,27 +1047,48 @@ def load_index_vectors(
                 "extensions": _deserialize_extensions(meta["extensions"]),
                 "files": [],
                 "chunks": [],
+                "chunk_ids": [],
             }
             return [], empty, metadata
 
         embeddings = np.empty((chunk_total, dimension), dtype=np.float32)
         paths: list[Path] = []
-        chunk_entries: list[dict] = []
-        file_snapshot: dict[str, dict] = {}
+        chunk_ids: list[int] = []
+        file_snapshot: list[dict] = []
+        file_meta_by_rel: dict[str, dict] = {}
+
+        file_rows = conn.execute(
+            """
+            SELECT rel_path, abs_path, size_bytes, mtime
+            FROM indexed_file
+            WHERE index_id = ?
+            """,
+            (index_id,),
+        ).fetchall()
+        for row in file_rows:
+            file_meta_by_rel[row["rel_path"]] = {
+                "path": row["rel_path"],
+                "absolute": row["abs_path"],
+                "mtime": row["mtime"],
+                "size": row["size_bytes"],
+            }
+        seen_files: set[str] = set()
 
         cursor = conn.execute(
             """
-            SELECT f.rel_path, f.abs_path, f.size_bytes, f.mtime, f.preview, f.label_hash, f.chunk_index, f.start_line, f.end_line, e.vector_blob
-            FROM indexed_file AS f
-            JOIN file_embedding AS e ON e.file_id = f.id
-            WHERE f.index_id = ?
-            ORDER BY f.position ASC
+            SELECT c.id AS chunk_id, f.rel_path, e.vector_blob
+            FROM indexed_chunk AS c
+            JOIN indexed_file AS f ON f.id = c.file_id
+            JOIN chunk_embedding AS e ON e.chunk_id = c.id
+            WHERE c.index_id = ?
+            ORDER BY c.position ASC
             """,
            (index_id,),
        )
 
         for idx, row in enumerate(cursor):
             rel_path = row["rel_path"]
+            chunk_id = int(row["chunk_id"])
             vector = np.frombuffer(row["vector_blob"], dtype=np.float32)
             if vector.size != dimension:
                 raise RuntimeError(
@@ -932,27 +1096,12 @@ def load_index_vectors(
                 )
             embeddings[idx] = vector
             paths.append(root / Path(rel_path))
-            chunk_index = int(row["chunk_index"])
-            chunk_entries.append(
-                {
-                    "path": rel_path,
-                    "absolute": row["abs_path"],
-                    "mtime": row["mtime"],
-                    "size": row["size_bytes"],
-                    "preview": row["preview"],
-                    "label_hash": row["label_hash"],
-                    "chunk_index": chunk_index,
-                    "start_line": row["start_line"],
-                    "end_line": row["end_line"],
-                }
-            )
-            if rel_path not in file_snapshot:
-                file_snapshot[rel_path] = {
-                    "path": rel_path,
-                    "absolute": row["abs_path"],
-                    "mtime": row["mtime"],
-                    "size": row["size_bytes"],
-                }
+            chunk_ids.append(chunk_id)
+            if rel_path not in seen_files:
+                meta_row = file_meta_by_rel.get(rel_path)
+                if meta_row is not None:
+                    file_snapshot.append(meta_row)
+                seen_files.add(rel_path)
 
         metadata = {
             "index_id": int(index_id),
@@ -967,14 +1116,73 @@ def load_index_vectors(
             "dimension": meta["dimension"],
             "exclude_patterns": _deserialize_exclude_patterns(meta["exclude_patterns"]),
             "extensions": _deserialize_extensions(meta["extensions"]),
-            "files": list(file_snapshot.values()),
-            "chunks": chunk_entries,
+            "files": file_snapshot,
+            "chunks": [],
+            "chunk_ids": chunk_ids,
         }
         return paths, embeddings, metadata
     finally:
         conn.close()
 
 
+def load_chunk_metadata(
+    chunk_ids: Sequence[int],
+    conn: sqlite3.Connection | None = None,
+) -> dict[int, dict]:
+    """Load cached chunk metadata keyed by chunk_id."""
+
+    if not chunk_ids:
+        return {}
+    unique_ids: list[int] = []
+    seen: set[int] = set()
+    for value in chunk_ids:
+        try:
+            chunk_id = int(value)
+        except (TypeError, ValueError):
+            continue
+        if chunk_id in seen:
+            continue
+        seen.add(chunk_id)
+        unique_ids.append(chunk_id)
+    if not unique_ids:
+        return {}
+    db_path = cache_db_path()
+    owns_connection = conn is None
+    try:
+        connection = conn or _connect(db_path)
+    except sqlite3.OperationalError:
+        return {}
+    try:
+        try:
+            _ensure_schema(connection)
+        except sqlite3.OperationalError:
+            return {}
+        results: dict[int, dict] = {}
+        for chunk in _chunk_values(unique_ids, 900):
+            placeholders = ", ".join("?" for _ in chunk)
+            rows = connection.execute(
+                f"""
+                SELECT c.id AS chunk_id, c.chunk_index, m.preview, m.label_hash, m.start_line, m.end_line
+                FROM indexed_chunk AS c
+                LEFT JOIN chunk_meta AS m ON m.chunk_id = c.id
+                WHERE c.id IN ({placeholders})
+                """,
+                tuple(chunk),
+            ).fetchall()
+            for row in rows:
+                results[int(row["chunk_id"])] = {
+                    "chunk_index": int(row["chunk_index"]),
+                    "preview": row["preview"],
+                    "label_hash": row["label_hash"],
+                    "start_line": row["start_line"],
+                    "end_line": row["end_line"],
+                }
+        return results
+    finally:
+        if owns_connection:
+            connection.close()
+
+
 def load_query_vector(
     index_id: int,
     query_hash: str,
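A usage sketch for the new load_chunk_metadata helper; the vexor.cache import path and the literal chunk ids are illustrative, real ids come back in metadata["chunk_ids"] from load_index_vectors:

from vexor import cache  # assumed import path for vexor/cache.py

# Ids 1-3 stand in for values taken from metadata["chunk_ids"]; ids that are
# not present in the cache simply do not appear in the returned mapping.
details = cache.load_chunk_metadata([1, 2, 3])
for chunk_id, info in sorted(details.items()):
    print(chunk_id, info["chunk_index"], info["start_line"], info["end_line"], info["preview"])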
@@ -1248,7 +1456,7 @@ def list_cache_entries() -> list[dict[str, object]]:
                 exclude_patterns,
                 extensions,
                 (
-                    SELECT COUNT(DISTINCT rel_path)
+                    SELECT COUNT(*)
                     FROM indexed_file
                     WHERE index_id = index_metadata.id
                 ) AS file_count