vexor 0.20.0__py3-none-any.whl → 0.21.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vexor/cache.py CHANGED
@@ -16,7 +16,7 @@ from .utils import collect_files
 
  DEFAULT_CACHE_DIR = Path(os.path.expanduser("~")) / ".vexor"
  CACHE_DIR = DEFAULT_CACHE_DIR
- CACHE_VERSION = 5
+ CACHE_VERSION = 6
  DB_FILENAME = "index.db"
  EMBED_CACHE_TTL_DAYS = 30
  EMBED_CACHE_MAX_ENTRIES = 50_000
@@ -110,7 +110,7 @@ def _deserialize_exclude_patterns(value: str | None) -> tuple[str, ...]:
  return tuple(parts)
 
 
- def _chunk_values(values: Sequence[str], size: int) -> Iterable[Sequence[str]]:
+ def _chunk_values(values: Sequence[object], size: int) -> Iterable[Sequence[object]]:
  for idx in range(0, len(values), size):
  yield values[idx : idx + size]
 
@@ -143,8 +143,17 @@ def cache_file(root: Path, model: str, include_hidden: bool) -> Path: # pragma:
  return cache_db_path()
 
 
- def _connect(db_path: Path) -> sqlite3.Connection:
- conn = sqlite3.connect(db_path)
+ def _connect(
+ db_path: Path,
+ *,
+ readonly: bool = False,
+ query_only: bool = False,
+ ) -> sqlite3.Connection:
+ if readonly:
+ db_uri = f"file:{db_path.as_posix()}?mode=ro"
+ conn = sqlite3.connect(db_uri, uri=True)
+ else:
+ conn = sqlite3.connect(db_path)
  conn.row_factory = sqlite3.Row
  try:
  conn.execute("PRAGMA journal_mode = WAL;")
@@ -155,10 +164,59 @@ def _connect(db_path: Path) -> sqlite3.Connection:
  conn.execute("PRAGMA temp_store = MEMORY;")
  conn.execute("PRAGMA busy_timeout = 5000;")
  conn.execute("PRAGMA foreign_keys = ON;")
+ if readonly or query_only:
+ conn.execute("PRAGMA query_only = ON;")
  return conn
 
 
+ def _ensure_schema_readonly(
+ conn: sqlite3.Connection,
+ *,
+ tables: Sequence[str],
+ ) -> None:
+ if _schema_needs_reset(conn):
+ raise sqlite3.OperationalError("Schema reset required")
+ for table in tables:
+ if not _table_exists(conn, table):
+ raise sqlite3.OperationalError(f"Missing table: {table}")
+
+
+ def _table_exists(conn: sqlite3.Connection, table: str) -> bool:
+ row = conn.execute(
+ "SELECT name FROM sqlite_master WHERE type = 'table' AND name = ?",
+ (table,),
+ ).fetchone()
+ return row is not None
+
+
+ def _schema_needs_reset(conn: sqlite3.Connection) -> bool:
+ if _table_exists(conn, "indexed_chunk"):
+ return False
+ return any(
+ _table_exists(conn, table)
+ for table in ("index_metadata", "indexed_file", "file_embedding", "query_cache")
+ )
+
+
+ def _reset_index_schema(conn: sqlite3.Connection) -> None:
+ conn.execute("PRAGMA foreign_keys = OFF;")
+ conn.executescript(
+ """
+ DROP TABLE IF EXISTS query_cache;
+ DROP TABLE IF EXISTS file_embedding;
+ DROP TABLE IF EXISTS chunk_embedding;
+ DROP TABLE IF EXISTS chunk_meta;
+ DROP TABLE IF EXISTS indexed_chunk;
+ DROP TABLE IF EXISTS indexed_file;
+ DROP TABLE IF EXISTS index_metadata;
+ """
+ )
+ conn.execute("PRAGMA foreign_keys = ON;")
+
+
  def _ensure_schema(conn: sqlite3.Connection) -> None:
+ if _schema_needs_reset(conn):
+ _reset_index_schema(conn)
  conn.executescript(
  """
  CREATE TABLE IF NOT EXISTS index_metadata (
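The read-only path above is worth a quick illustration. The following is a minimal standalone sketch (not vexor code) of the SQLite behaviour the new helpers rely on: opening the database through a file: URI with mode=ro makes every write fail with sqlite3.OperationalError, so the read-only load paths can neither create a missing index.db nor mutate an existing one.

# Standalone sketch: how a read-only SQLite URI connection behaves.
import sqlite3
import tempfile
from pathlib import Path

with tempfile.TemporaryDirectory() as tmp:
    db_path = Path(tmp) / "index.db"
    rw = sqlite3.connect(db_path)
    rw.execute("CREATE TABLE index_metadata (id INTEGER PRIMARY KEY)")
    rw.commit()
    rw.close()

    # Same URI form as _connect(readonly=True) builds above.
    ro = sqlite3.connect(f"file:{db_path.as_posix()}?mode=ro", uri=True)
    try:
        ro.execute("INSERT INTO index_metadata DEFAULT VALUES")
    except sqlite3.OperationalError as exc:
        print("write rejected:", exc)  # e.g. "attempt to write a readonly database"
    finally:
        ro.close()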
@@ -185,20 +243,31 @@ def _ensure_schema(conn: sqlite3.Connection) -> None:
  abs_path TEXT NOT NULL,
  size_bytes INTEGER NOT NULL,
  mtime REAL NOT NULL,
- position INTEGER NOT NULL,
- preview TEXT DEFAULT '',
- label_hash TEXT DEFAULT '',
+ UNIQUE(index_id, rel_path)
+ );
+
+ CREATE TABLE IF NOT EXISTS indexed_chunk (
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
+ index_id INTEGER NOT NULL REFERENCES index_metadata(id) ON DELETE CASCADE,
+ file_id INTEGER NOT NULL REFERENCES indexed_file(id) ON DELETE CASCADE,
  chunk_index INTEGER NOT NULL DEFAULT 0,
- start_line INTEGER,
- end_line INTEGER,
- UNIQUE(index_id, rel_path, chunk_index)
+ position INTEGER NOT NULL,
+ UNIQUE(index_id, file_id, chunk_index)
  );
 
- CREATE TABLE IF NOT EXISTS file_embedding (
- file_id INTEGER PRIMARY KEY REFERENCES indexed_file(id) ON DELETE CASCADE,
+ CREATE TABLE IF NOT EXISTS chunk_embedding (
+ chunk_id INTEGER PRIMARY KEY REFERENCES indexed_chunk(id) ON DELETE CASCADE,
  vector_blob BLOB NOT NULL
  );
 
+ CREATE TABLE IF NOT EXISTS chunk_meta (
+ chunk_id INTEGER PRIMARY KEY REFERENCES indexed_chunk(id) ON DELETE CASCADE,
+ preview TEXT DEFAULT '',
+ label_hash TEXT DEFAULT '',
+ start_line INTEGER,
+ end_line INTEGER
+ );
+
  CREATE TABLE IF NOT EXISTS query_cache (
  id INTEGER PRIMARY KEY AUTOINCREMENT,
  index_id INTEGER NOT NULL REFERENCES index_metadata(id) ON DELETE CASCADE,
@@ -218,8 +287,11 @@ def _ensure_schema(conn: sqlite3.Connection) -> None:
  UNIQUE(model, text_hash)
  );
 
- CREATE INDEX IF NOT EXISTS idx_indexed_file_order
- ON indexed_file(index_id, position);
+ CREATE INDEX IF NOT EXISTS idx_indexed_chunk_order
+ ON indexed_chunk(index_id, position);
+
+ CREATE INDEX IF NOT EXISTS idx_indexed_file_lookup
+ ON indexed_file(index_id, rel_path);
 
  CREATE INDEX IF NOT EXISTS idx_query_cache_lookup
  ON query_cache(index_id, query_hash);
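Taken together, the schema changes replace the old one-row-per-chunk indexed_file table with a normalised layout: one indexed_file row per file, one indexed_chunk row per chunk, and the embedding blob and preview/line metadata split into chunk_embedding and chunk_meta, each keyed 1:1 by chunk_id. An abridged, self-contained sketch of how those tables relate (simplified columns, illustration only):

# Abridged illustration of the reworked table relationships (not vexor code).
import sqlite3

conn = sqlite3.connect(":memory:")
conn.executescript(
    """
    CREATE TABLE indexed_file (id INTEGER PRIMARY KEY, rel_path TEXT UNIQUE);
    CREATE TABLE indexed_chunk (
        id INTEGER PRIMARY KEY,
        file_id INTEGER REFERENCES indexed_file(id),
        chunk_index INTEGER,
        position INTEGER
    );
    CREATE TABLE chunk_meta (chunk_id INTEGER PRIMARY KEY, preview TEXT);
    """
)
# One file, two chunks, previews stored per chunk id.
conn.execute("INSERT INTO indexed_file (id, rel_path) VALUES (1, 'src/cache.py')")
conn.executemany(
    "INSERT INTO indexed_chunk (id, file_id, chunk_index, position) VALUES (?, 1, ?, ?)",
    [(10, 0, 0), (11, 1, 1)],
)
conn.executemany(
    "INSERT INTO chunk_meta (chunk_id, preview) VALUES (?, ?)",
    [(10, "def _connect(..."), (11, "def _ensure_schema(...")],
)
rows = conn.execute(
    """
    SELECT f.rel_path, c.chunk_index, m.preview
    FROM indexed_chunk AS c
    JOIN indexed_file AS f ON f.id = c.file_id
    LEFT JOIN chunk_meta AS m ON m.chunk_id = c.id
    ORDER BY c.position
    """
).fetchall()
print(rows)  # [('src/cache.py', 0, 'def _connect(...'), ('src/cache.py', 1, ...)]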
@@ -228,133 +300,6 @@ def _ensure_schema(conn: sqlite3.Connection) -> None:
  ON embedding_cache(model, text_hash);
  """
  )
- try:
- conn.execute(
- "ALTER TABLE index_metadata ADD COLUMN recursive INTEGER NOT NULL DEFAULT 1"
- )
- except sqlite3.OperationalError:
- # Column already exists; ignore error.
- pass
- try:
- conn.execute(
- "ALTER TABLE index_metadata ADD COLUMN respect_gitignore INTEGER NOT NULL DEFAULT 1"
- )
- except sqlite3.OperationalError:
- pass
- try:
- conn.execute(
- "ALTER TABLE index_metadata ADD COLUMN mode TEXT NOT NULL DEFAULT 'name'"
- )
- except sqlite3.OperationalError:
- pass
- try:
- conn.execute(
- "ALTER TABLE indexed_file ADD COLUMN preview TEXT DEFAULT ''"
- )
- except sqlite3.OperationalError:
- pass
- try:
- conn.execute(
- "ALTER TABLE indexed_file ADD COLUMN label_hash TEXT DEFAULT ''"
- )
- except sqlite3.OperationalError:
- pass
- try:
- conn.execute("ALTER TABLE indexed_file ADD COLUMN start_line INTEGER")
- except sqlite3.OperationalError:
- pass
- try:
- conn.execute("ALTER TABLE indexed_file ADD COLUMN end_line INTEGER")
- except sqlite3.OperationalError:
- pass
- if not _table_has_column(conn, "indexed_file", "chunk_index"):
- _upgrade_indexed_file_with_chunk(conn)
- try:
- conn.execute(
- "ALTER TABLE index_metadata ADD COLUMN extensions TEXT DEFAULT ''"
- )
- except sqlite3.OperationalError:
- pass
- try:
- conn.execute(
- "ALTER TABLE index_metadata ADD COLUMN exclude_patterns TEXT DEFAULT ''"
- )
- except sqlite3.OperationalError:
- pass
- _cleanup_orphan_embeddings(conn)
-
-
- def _table_has_column(conn: sqlite3.Connection, table: str, column: str) -> bool:
- rows = conn.execute(f"PRAGMA table_info({table})").fetchall()
- return any(row[1] == column for row in rows)
-
-
- def _upgrade_indexed_file_with_chunk(conn: sqlite3.Connection) -> None:
- conn.execute("PRAGMA foreign_keys = OFF;")
- conn.execute("ALTER TABLE indexed_file RENAME TO indexed_file_legacy;")
- conn.executescript(
- """
- CREATE TABLE indexed_file (
- id INTEGER PRIMARY KEY AUTOINCREMENT,
- index_id INTEGER NOT NULL REFERENCES index_metadata(id) ON DELETE CASCADE,
- rel_path TEXT NOT NULL,
- abs_path TEXT NOT NULL,
- size_bytes INTEGER NOT NULL,
- mtime REAL NOT NULL,
- position INTEGER NOT NULL,
- preview TEXT DEFAULT '',
- label_hash TEXT DEFAULT '',
- chunk_index INTEGER NOT NULL DEFAULT 0,
- start_line INTEGER,
- end_line INTEGER,
- UNIQUE(index_id, rel_path, chunk_index)
- );
-
- CREATE INDEX IF NOT EXISTS idx_indexed_file_order
- ON indexed_file(index_id, position);
- """
- )
- conn.execute(
- """
- INSERT INTO indexed_file (
- id,
- index_id,
- rel_path,
- abs_path,
- size_bytes,
- mtime,
- position,
- preview,
- label_hash,
- chunk_index,
- start_line,
- end_line
- )
- SELECT
- id,
- index_id,
- rel_path,
- abs_path,
- size_bytes,
- mtime,
- position,
- preview,
- '',
- 0,
- NULL,
- NULL
- FROM indexed_file_legacy;
- """
- )
- conn.execute("DROP TABLE indexed_file_legacy;")
- conn.execute("PRAGMA foreign_keys = ON;")
-
-
- def _cleanup_orphan_embeddings(conn: sqlite3.Connection) -> None:
- with conn:
- conn.execute(
- "DELETE FROM file_embedding WHERE file_id NOT IN (SELECT id FROM indexed_file)"
- )
 
 
  def store_index(
@@ -430,32 +375,22 @@ def store_index(
  )
  index_id = cursor.lastrowid
 
- file_rows: list[tuple] = []
- vector_blobs: list[bytes] = []
- for position, entry in enumerate(entries):
+ file_rows_by_rel: dict[str, tuple] = {}
+ for entry in entries:
+ if entry.rel_path in file_rows_by_rel:
+ continue
  size_bytes = entry.size_bytes
  mtime = entry.mtime
  if size_bytes is None or mtime is None:
  stat = entry.path.stat()
  size_bytes = stat.st_size
  mtime = stat.st_mtime
- file_rows.append(
- (
- index_id,
- entry.rel_path,
- str(entry.path),
- size_bytes,
- mtime,
- position,
- entry.preview,
- entry.label_hash,
- entry.chunk_index,
- entry.start_line,
- entry.end_line,
- )
- )
- vector_blobs.append(
- np.asarray(entry.embedding, dtype=np.float32).tobytes()
+ file_rows_by_rel[entry.rel_path] = (
+ index_id,
+ entry.rel_path,
+ str(entry.path),
+ size_bytes,
+ mtime,
  )
 
  conn.executemany(
@@ -465,29 +400,87 @@ def store_index(
  rel_path,
  abs_path,
  size_bytes,
- mtime,
- position,
- preview,
- label_hash,
+ mtime
+ ) VALUES (?, ?, ?, ?, ?)
+ """,
+ list(file_rows_by_rel.values()),
+ )
+
+ file_id_map: dict[str, int] = {}
+ rel_paths = list(file_rows_by_rel.keys())
+ for chunk in _chunk_values(rel_paths, 900):
+ placeholders = ", ".join("?" for _ in chunk)
+ rows = conn.execute(
+ f"""
+ SELECT id, rel_path
+ FROM indexed_file
+ WHERE index_id = ? AND rel_path IN ({placeholders})
+ """,
+ (index_id, *chunk),
+ ).fetchall()
+ for row in rows:
+ file_id_map[row["rel_path"]] = int(row["id"])
+
+ chunk_rows: list[tuple] = []
+ vector_blobs: list[bytes] = []
+ meta_rows: list[tuple] = []
+ for position, entry in enumerate(entries):
+ file_id = file_id_map.get(entry.rel_path)
+ if file_id is None:
+ continue
+ chunk_rows.append(
+ (index_id, file_id, entry.chunk_index, position)
+ )
+ vector_blobs.append(
+ np.asarray(entry.embedding, dtype=np.float32).tobytes()
+ )
+ meta_rows.append(
+ (
+ entry.preview or "",
+ entry.label_hash or "",
+ entry.start_line,
+ entry.end_line,
+ )
+ )
+
+ conn.executemany(
+ """
+ INSERT INTO indexed_chunk (
+ index_id,
+ file_id,
  chunk_index,
- start_line,
- end_line
- ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+ position
+ ) VALUES (?, ?, ?, ?)
  """,
- file_rows,
+ chunk_rows,
  )
 
  inserted_ids = conn.execute(
- "SELECT id FROM indexed_file WHERE index_id = ? ORDER BY position ASC",
+ "SELECT id FROM indexed_chunk WHERE index_id = ? ORDER BY position ASC",
  (index_id,),
  ).fetchall()
  conn.executemany(
- "INSERT OR REPLACE INTO file_embedding (file_id, vector_blob) VALUES (?, ?)",
+ "INSERT OR REPLACE INTO chunk_embedding (chunk_id, vector_blob) VALUES (?, ?)",
  (
  (row["id"], vector_blobs[idx])
  for idx, row in enumerate(inserted_ids)
  ),
  )
+ conn.executemany(
+ """
+ INSERT OR REPLACE INTO chunk_meta (
+ chunk_id,
+ preview,
+ label_hash,
+ start_line,
+ end_line
+ ) VALUES (?, ?, ?, ?, ?)
+ """,
+ (
+ (row["id"], *meta_rows[idx])
+ for idx, row in enumerate(inserted_ids)
+ ),
+ )
 
  return db_path
  finally:
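One detail in the rewritten store path deserves a note: the rel_path to id lookup is issued through _chunk_values(rel_paths, 900) because SQLite caps the number of ? placeholders in a single statement (999 by default on older builds, 32766 since SQLite 3.32), and one slot is already taken by index_id. A standalone sketch of the same batched IN-clause pattern, using a hypothetical table (illustration only):

# Batched IN-clause lookups that stay under SQLite's bound-parameter limit.
import sqlite3
from typing import Iterable, Sequence


def chunk_values(values: Sequence[object], size: int) -> Iterable[Sequence[object]]:
    # Mirrors the _chunk_values helper above: yield fixed-size slices.
    for idx in range(0, len(values), size):
        yield values[idx : idx + size]


conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE indexed_file (id INTEGER PRIMARY KEY, rel_path TEXT)")
conn.executemany(
    "INSERT INTO indexed_file (rel_path) VALUES (?)",
    [(f"file_{i}.py",) for i in range(2500)],
)

wanted = [f"file_{i}.py" for i in range(2500)]
file_id_map: dict[str, int] = {}
for chunk in chunk_values(wanted, 900):
    placeholders = ", ".join("?" for _ in chunk)
    rows = conn.execute(
        f"SELECT id, rel_path FROM indexed_file WHERE rel_path IN ({placeholders})",
        tuple(chunk),
    ).fetchall()
    for file_id, rel_path in rows:
        file_id_map[rel_path] = file_id

print(len(file_id_map))  # 2500, collected across three batched queries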
@@ -558,112 +551,221 @@ def apply_index_updates(
  if changed_entries:
  chunk_map: dict[str, list[IndexedChunk]] = {}
  for entry in changed_entries:
- if entry.rel_path not in chunk_map:
- chunk_map[entry.rel_path] = []
- chunk_map[entry.rel_path].append(entry)
+ chunk_map.setdefault(entry.rel_path, []).append(entry)
 
- for rel_path, chunk_list in chunk_map.items():
+ for rel_path in chunk_map:
  conn.execute(
  "DELETE FROM indexed_file WHERE index_id = ? AND rel_path = ?",
  (index_id, rel_path),
  )
+
+ file_rows_by_rel: dict[str, tuple] = {}
+ for rel_path, chunk_list in chunk_map.items():
+ chunk_list.sort(key=lambda item: item.chunk_index)
+ sample = chunk_list[0]
+ size_bytes = sample.size_bytes
+ mtime = sample.mtime
+ if size_bytes is None or mtime is None:
+ stat = sample.path.stat()
+ size_bytes = stat.st_size
+ mtime = stat.st_mtime
+ file_rows_by_rel[rel_path] = (
+ index_id,
+ rel_path,
+ str(sample.path),
+ size_bytes,
+ mtime,
+ )
+
+ if file_rows_by_rel:
+ conn.executemany(
+ """
+ INSERT INTO indexed_file (
+ index_id,
+ rel_path,
+ abs_path,
+ size_bytes,
+ mtime
+ ) VALUES (?, ?, ?, ?, ?)
+ """,
+ list(file_rows_by_rel.values()),
+ )
+
+ file_id_map: dict[str, int] = {}
+ rel_paths = list(file_rows_by_rel.keys())
+ for chunk in _chunk_values(rel_paths, 900):
+ placeholders = ", ".join("?" for _ in chunk)
+ rows = conn.execute(
+ f"""
+ SELECT id, rel_path
+ FROM indexed_file
+ WHERE index_id = ? AND rel_path IN ({placeholders})
+ """,
+ (index_id, *chunk),
+ ).fetchall()
+ for row in rows:
+ file_id_map[row["rel_path"]] = int(row["id"])
+
+ for rel_path, chunk_list in chunk_map.items():
+ file_id = file_id_map.get(rel_path)
+ if file_id is None:
+ continue
  chunk_list.sort(key=lambda item: item.chunk_index)
- file_rows: list[tuple] = []
+ chunk_rows: list[tuple] = []
  vector_blobs: list[bytes] = []
+ meta_rows: list[tuple] = []
  for chunk in chunk_list:
  vector = np.asarray(chunk.embedding, dtype=np.float32)
  if vector_dimension is None:
  vector_dimension = vector.shape[0]
- size_bytes = chunk.size_bytes
- mtime = chunk.mtime
- if size_bytes is None or mtime is None:
- stat = chunk.path.stat()
- size_bytes = stat.st_size
- mtime = stat.st_mtime
- file_rows.append(
+ chunk_rows.append(
+ (index_id, file_id, chunk.chunk_index, 0)
+ )
+ vector_blobs.append(vector.tobytes())
+ meta_rows.append(
  (
- index_id,
- rel_path,
- str(chunk.path),
- size_bytes,
- mtime,
- 0,
- chunk.preview,
- chunk.label_hash,
- chunk.chunk_index,
+ chunk.preview or "",
+ chunk.label_hash or "",
  chunk.start_line,
  chunk.end_line,
  )
  )
- vector_blobs.append(vector.tobytes())
 
  conn.executemany(
  """
- INSERT INTO indexed_file (
+ INSERT INTO indexed_chunk (
  index_id,
- rel_path,
- abs_path,
- size_bytes,
- mtime,
- position,
- preview,
- label_hash,
+ file_id,
  chunk_index,
- start_line,
- end_line
- ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+ position
+ ) VALUES (?, ?, ?, ?)
  """,
- file_rows,
+ chunk_rows,
  )
 
  inserted_ids = conn.execute(
  """
- SELECT id FROM indexed_file
- WHERE index_id = ? AND rel_path = ?
+ SELECT id FROM indexed_chunk
+ WHERE index_id = ? AND file_id = ?
  ORDER BY chunk_index ASC
  """,
- (index_id, rel_path),
+ (index_id, file_id),
  ).fetchall()
  conn.executemany(
- "INSERT INTO file_embedding (file_id, vector_blob) VALUES (?, ?)",
+ "INSERT OR REPLACE INTO chunk_embedding (chunk_id, vector_blob) VALUES (?, ?)",
  (
  (row["id"], vector_blobs[idx])
  for idx, row in enumerate(inserted_ids)
  ),
  )
+ conn.executemany(
+ """
+ INSERT OR REPLACE INTO chunk_meta (
+ chunk_id,
+ preview,
+ label_hash,
+ start_line,
+ end_line
+ ) VALUES (?, ?, ?, ?, ?)
+ """,
+ (
+ (row["id"], *meta_rows[idx])
+ for idx, row in enumerate(inserted_ids)
+ ),
+ )
 
  if touched_entries:
+ file_updates: dict[str, tuple[int, float]] = {}
+ for (
+ rel_path,
+ _chunk_index,
+ size_bytes,
+ mtime,
+ _preview,
+ _start_line,
+ _end_line,
+ _label_hash,
+ ) in touched_entries:
+ if rel_path not in file_updates:
+ file_updates[rel_path] = (size_bytes, mtime)
  conn.executemany(
  """
  UPDATE indexed_file
- SET size_bytes = ?, mtime = ?, preview = ?, start_line = ?, end_line = ?, label_hash = ?
- WHERE index_id = ? AND rel_path = ? AND chunk_index = ?
+ SET size_bytes = ?, mtime = ?
+ WHERE index_id = ? AND rel_path = ?
  """,
  (
+ (size_bytes, mtime, index_id, rel_path)
+ for rel_path, (size_bytes, mtime) in file_updates.items()
+ ),
+ )
+
+ chunk_id_map: dict[tuple[str, int], int] = {}
+ if ordered_entries or touched_entries:
+ rows = conn.execute(
+ """
+ SELECT c.id, c.chunk_index, f.rel_path
+ FROM indexed_chunk AS c
+ JOIN indexed_file AS f ON f.id = c.file_id
+ WHERE c.index_id = ?
+ """,
+ (index_id,),
+ ).fetchall()
+ for row in rows:
+ chunk_id_map[(row["rel_path"], int(row["chunk_index"]))] = int(
+ row["id"]
+ )
+
+ if touched_entries and chunk_id_map:
+ meta_rows: list[tuple] = []
+ for (
+ rel_path,
+ chunk_index,
+ _size_bytes,
+ _mtime,
+ preview,
+ start_line,
+ end_line,
+ label_hash,
+ ) in touched_entries:
+ chunk_id = chunk_id_map.get((rel_path, chunk_index))
+ if chunk_id is None:
+ continue
+ meta_rows.append(
  (
- size_bytes,
- mtime,
+ chunk_id,
  preview or "",
+ label_hash or "",
  start_line,
  end_line,
- label_hash or "",
- index_id,
- rel_path,
- chunk_index,
  )
- for rel_path, chunk_index, size_bytes, mtime, preview, start_line, end_line, label_hash in touched_entries
- ),
- )
+ )
+ if meta_rows:
+ conn.executemany(
+ """
+ INSERT OR REPLACE INTO chunk_meta (
+ chunk_id,
+ preview,
+ label_hash,
+ start_line,
+ end_line
+ ) VALUES (?, ?, ?, ?, ?)
+ """,
+ meta_rows,
+ )
 
- for position, (rel_path, chunk_index) in enumerate(ordered_entries):
- conn.execute(
- """
- UPDATE indexed_file
- SET position = ?
- WHERE index_id = ? AND rel_path = ? AND chunk_index = ?
- """,
- (position, index_id, rel_path, chunk_index),
- )
+ if ordered_entries and chunk_id_map:
+ position_updates = []
+ for position, (rel_path, chunk_index) in enumerate(ordered_entries):
+ chunk_id = chunk_id_map.get((rel_path, chunk_index))
+ if chunk_id is None:
+ continue
+ position_updates.append((position, chunk_id))
+ if position_updates:
+ conn.executemany(
+ "UPDATE indexed_chunk SET position = ? WHERE id = ?",
+ position_updates,
+ )
 
  generated_at = datetime.now(timezone.utc).isoformat()
  new_dimension = vector_dimension or existing_dimension
@@ -728,17 +830,52 @@ def backfill_chunk_lines(
 
  with conn:
  conn.execute("BEGIN IMMEDIATE;")
- conn.executemany(
- """
- UPDATE indexed_file
- SET start_line = ?, end_line = ?
- WHERE index_id = ? AND rel_path = ? AND chunk_index = ?
- """,
- (
- (start_line, end_line, index_id, rel_path, chunk_index)
- for rel_path, chunk_index, start_line, end_line in updates
- ),
- )
+ update_rows: list[tuple[int | None, int | None, int]] = []
+ insert_rows: list[tuple[int]] = []
+ if updates:
+ rel_paths = sorted({rel_path for rel_path, *_ in updates})
+ chunk_id_map: dict[tuple[str, int], int] = {}
+ for chunk in _chunk_values(rel_paths, 900):
+ placeholders = ", ".join("?" for _ in chunk)
+ rows = conn.execute(
+ f"""
+ SELECT c.id, c.chunk_index, f.rel_path
+ FROM indexed_chunk AS c
+ JOIN indexed_file AS f ON f.id = c.file_id
+ WHERE c.index_id = ? AND f.rel_path IN ({placeholders})
+ """,
+ (index_id, *chunk),
+ ).fetchall()
+ for row in rows:
+ chunk_id_map[(row["rel_path"], int(row["chunk_index"]))] = int(
+ row["id"]
+ )
+ for rel_path, chunk_index, start_line, end_line in updates:
+ chunk_id = chunk_id_map.get((rel_path, chunk_index))
+ if chunk_id is None:
+ continue
+ insert_rows.append((chunk_id,))
+ update_rows.append((start_line, end_line, chunk_id))
+ if insert_rows:
+ conn.executemany(
+ """
+ INSERT OR IGNORE INTO chunk_meta (
+ chunk_id,
+ preview,
+ label_hash
+ ) VALUES (?, '', '')
+ """,
+ insert_rows,
+ )
+ if update_rows:
+ conn.executemany(
+ """
+ UPDATE chunk_meta
+ SET start_line = ?, end_line = ?
+ WHERE chunk_id = ?
+ """,
+ update_rows,
+ )
  generated_at = datetime.now(timezone.utc).isoformat()
  conn.execute(
  """
@@ -769,9 +906,15 @@ def load_index(
  if not db_path.exists():
  raise FileNotFoundError(db_path)
 
- conn = _connect(db_path)
+ conn = _connect(db_path, readonly=True)
  try:
- _ensure_schema(conn)
+ try:
+ _ensure_schema_readonly(
+ conn,
+ tables=("index_metadata", "indexed_file", "indexed_chunk", "chunk_meta"),
+ )
+ except sqlite3.OperationalError:
+ raise FileNotFoundError(db_path)
  key = _cache_key(
  root,
  include_hidden,
@@ -794,13 +937,27 @@ def load_index(
  ).fetchone()
  if meta is None:
  raise FileNotFoundError(db_path)
+ version = int(meta["version"] or 0)
+ if version < CACHE_VERSION:
+ raise FileNotFoundError(db_path)
 
  rows = conn.execute(
  """
- SELECT rel_path, abs_path, size_bytes, mtime, preview, label_hash, chunk_index, start_line, end_line
- FROM indexed_file
- WHERE index_id = ?
- ORDER BY position ASC
+ SELECT
+ f.rel_path,
+ f.abs_path,
+ f.size_bytes,
+ f.mtime,
+ c.chunk_index,
+ m.preview,
+ m.label_hash,
+ m.start_line,
+ m.end_line
+ FROM indexed_chunk AS c
+ JOIN indexed_file AS f ON f.id = c.file_id
+ LEFT JOIN chunk_meta AS m ON m.chunk_id = c.id
+ WHERE c.index_id = ?
+ ORDER BY c.position ASC
  """,
  (meta["id"],),
  ).fetchall()
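The version check added to the load paths is the flip side of bumping CACHE_VERSION from 5 to 6: an index_metadata row written by vexor 0.20.x is reported the same way as a missing cache, so callers fall back to reindexing rather than migrating the old layout in place. A small sketch of the rule (illustration only; the surrounding call sites live outside this hunk):

CACHE_VERSION = 6  # value required by vexor 0.21.x; 0.20.x wrote version 5


def check_version(stored_version: int | None, db_path: str) -> None:
    # Same shape as the check in load_index / load_index_vectors above.
    version = int(stored_version or 0)
    if version < CACHE_VERSION:
        raise FileNotFoundError(db_path)


check_version(6, "index.db")        # current cache loads normally
try:
    check_version(5, "index.db")    # cache written by an older release
except FileNotFoundError:
    print("stale cache -> rebuild the index")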
@@ -865,9 +1022,20 @@ def load_index_vectors(
  if not db_path.exists():
  raise FileNotFoundError(db_path)
 
- conn = _connect(db_path)
+ conn = _connect(db_path, readonly=True)
  try:
- _ensure_schema(conn)
+ try:
+ _ensure_schema_readonly(
+ conn,
+ tables=(
+ "index_metadata",
+ "indexed_file",
+ "indexed_chunk",
+ "chunk_embedding",
+ ),
+ )
+ except sqlite3.OperationalError:
+ raise FileNotFoundError(db_path)
  key = _cache_key(
  root,
  include_hidden,
@@ -890,11 +1058,14 @@ def load_index_vectors(
  ).fetchone()
  if meta is None:
  raise FileNotFoundError(db_path)
+ version = int(meta["version"] or 0)
+ if version < CACHE_VERSION:
+ raise FileNotFoundError(db_path)
 
  index_id = meta["id"]
  dimension = int(meta["dimension"])
  chunk_count = conn.execute(
- "SELECT COUNT(*) AS count FROM indexed_file WHERE index_id = ?",
+ "SELECT COUNT(*) AS count FROM indexed_chunk WHERE index_id = ?",
  (index_id,),
  ).fetchone()["count"]
  chunk_total = int(chunk_count or 0)
@@ -916,27 +1087,48 @@ def load_index_vectors(
  "extensions": _deserialize_extensions(meta["extensions"]),
  "files": [],
  "chunks": [],
+ "chunk_ids": [],
  }
  return [], empty, metadata
 
  embeddings = np.empty((chunk_total, dimension), dtype=np.float32)
  paths: list[Path] = []
- chunk_entries: list[dict] = []
- file_snapshot: dict[str, dict] = {}
+ chunk_ids: list[int] = []
+ file_snapshot: list[dict] = []
+ file_meta_by_rel: dict[str, dict] = {}
+
+ file_rows = conn.execute(
+ """
+ SELECT rel_path, abs_path, size_bytes, mtime
+ FROM indexed_file
+ WHERE index_id = ?
+ """,
+ (index_id,),
+ ).fetchall()
+ for row in file_rows:
+ file_meta_by_rel[row["rel_path"]] = {
+ "path": row["rel_path"],
+ "absolute": row["abs_path"],
+ "mtime": row["mtime"],
+ "size": row["size_bytes"],
+ }
+ seen_files: set[str] = set()
 
  cursor = conn.execute(
  """
- SELECT f.rel_path, f.abs_path, f.size_bytes, f.mtime, f.preview, f.label_hash, f.chunk_index, f.start_line, f.end_line, e.vector_blob
- FROM indexed_file AS f
- JOIN file_embedding AS e ON e.file_id = f.id
- WHERE f.index_id = ?
- ORDER BY f.position ASC
+ SELECT c.id AS chunk_id, f.rel_path, e.vector_blob
+ FROM indexed_chunk AS c
+ JOIN indexed_file AS f ON f.id = c.file_id
+ JOIN chunk_embedding AS e ON e.chunk_id = c.id
+ WHERE c.index_id = ?
+ ORDER BY c.position ASC
  """,
  (index_id,),
  )
 
  for idx, row in enumerate(cursor):
  rel_path = row["rel_path"]
+ chunk_id = int(row["chunk_id"])
  vector = np.frombuffer(row["vector_blob"], dtype=np.float32)
  if vector.size != dimension:
  raise RuntimeError(
@@ -944,27 +1136,12 @@ def load_index_vectors(
  )
  embeddings[idx] = vector
  paths.append(root / Path(rel_path))
- chunk_index = int(row["chunk_index"])
- chunk_entries.append(
- {
- "path": rel_path,
- "absolute": row["abs_path"],
- "mtime": row["mtime"],
- "size": row["size_bytes"],
- "preview": row["preview"],
- "label_hash": row["label_hash"],
- "chunk_index": chunk_index,
- "start_line": row["start_line"],
- "end_line": row["end_line"],
- }
- )
- if rel_path not in file_snapshot:
- file_snapshot[rel_path] = {
- "path": rel_path,
- "absolute": row["abs_path"],
- "mtime": row["mtime"],
- "size": row["size_bytes"],
- }
+ chunk_ids.append(chunk_id)
+ if rel_path not in seen_files:
+ meta_row = file_meta_by_rel.get(rel_path)
+ if meta_row is not None:
+ file_snapshot.append(meta_row)
+ seen_files.add(rel_path)
 
  metadata = {
  "index_id": int(index_id),
@@ -979,14 +1156,76 @@ def load_index_vectors(
  "dimension": meta["dimension"],
  "exclude_patterns": _deserialize_exclude_patterns(meta["exclude_patterns"]),
  "extensions": _deserialize_extensions(meta["extensions"]),
- "files": list(file_snapshot.values()),
- "chunks": chunk_entries,
+ "files": file_snapshot,
+ "chunks": [],
+ "chunk_ids": chunk_ids,
  }
  return paths, embeddings, metadata
  finally:
  conn.close()
 
 
+ def load_chunk_metadata(
+ chunk_ids: Sequence[int],
+ conn: sqlite3.Connection | None = None,
+ ) -> dict[int, dict]:
+ """Load cached chunk metadata keyed by chunk_id."""
+
+ if not chunk_ids:
+ return {}
+ unique_ids: list[int] = []
+ seen: set[int] = set()
+ for value in chunk_ids:
+ try:
+ chunk_id = int(value)
+ except (TypeError, ValueError):
+ continue
+ if chunk_id in seen:
+ continue
+ seen.add(chunk_id)
+ unique_ids.append(chunk_id)
+ if not unique_ids:
+ return {}
+ db_path = cache_db_path()
+ owns_connection = conn is None
+ try:
+ connection = conn or _connect(db_path, readonly=True)
+ except sqlite3.OperationalError:
+ return {}
+ try:
+ try:
+ _ensure_schema_readonly(
+ connection,
+ tables=("indexed_chunk", "chunk_meta"),
+ )
+ except sqlite3.OperationalError:
+ return {}
+ results: dict[int, dict] = {}
+ for chunk in _chunk_values(unique_ids, 900):
+ placeholders = ", ".join("?" for _ in chunk)
+ rows = connection.execute(
+ f"""
+ SELECT c.id AS chunk_id, c.chunk_index, m.preview, m.label_hash, m.start_line, m.end_line
+ FROM indexed_chunk AS c
+ LEFT JOIN chunk_meta AS m ON m.chunk_id = c.id
+ WHERE c.id IN ({placeholders})
+ """,
+ tuple(chunk),
+ ).fetchall()
+ for row in rows:
+ results[int(row["chunk_id"])] = {
+ "chunk_index": int(row["chunk_index"]),
+ "preview": row["preview"],
+ "label_hash": row["label_hash"],
+ "start_line": row["start_line"],
+ "end_line": row["end_line"],
+ }
+ return results
+ finally:
+ if owns_connection:
+ connection.close()
+
+
  def load_query_vector(
  index_id: int,
  query_hash: str,
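load_index_vectors now returns lightweight metadata: "chunks" is left empty and "chunk_ids" lines up row-for-row with the embedding matrix, while previews and line ranges are fetched on demand through the new load_chunk_metadata. A sketch of the intended usage (assumed caller behaviour; the search code that consumes this lives outside cache.py, and the ids below are hypothetical):

from vexor.cache import load_chunk_metadata

chunk_ids = [42, 7, 913]                  # hypothetical ids of the best-scoring rows
details = load_chunk_metadata(chunk_ids)  # {chunk_id: {"chunk_index": ..., "preview": ...}}
for chunk_id in chunk_ids:
    info = details.get(chunk_id)
    if info is None:
        continue                          # id no longer present in the cache
    print(chunk_id, info["start_line"], info["end_line"], info["preview"])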
@@ -997,12 +1236,12 @@ def load_query_vector(
  db_path = cache_db_path()
  owns_connection = conn is None
  try:
- connection = conn or _connect(db_path)
+ connection = conn or _connect(db_path, readonly=True)
  except sqlite3.OperationalError:
  return None
  try:
  try:
- _ensure_schema(connection)
+ _ensure_schema_readonly(connection, tables=("query_cache",))
  except sqlite3.OperationalError:
  return None
  row = connection.execute(
@@ -1074,12 +1313,12 @@ def load_embedding_cache(
  db_path = cache_db_path()
  owns_connection = conn is None
  try:
- connection = conn or _connect(db_path)
+ connection = conn or _connect(db_path, readonly=True)
  except sqlite3.OperationalError:
  return {}
  try:
  try:
- _ensure_schema(connection)
+ _ensure_schema_readonly(connection, tables=("embedding_cache",))
  except sqlite3.OperationalError:
  return {}
  results: dict[str, np.ndarray] = {}
@@ -1237,12 +1476,12 @@ def list_cache_entries() -> list[dict[str, object]]:
  return []
 
  try:
- conn = _connect(db_path)
+ conn = _connect(db_path, readonly=True)
  except sqlite3.OperationalError:
  return []
  try:
  try:
- _ensure_schema(conn)
+ _ensure_schema_readonly(conn, tables=("index_metadata", "indexed_file"))
  except sqlite3.OperationalError:
  return []
  rows = conn.execute(
@@ -1260,7 +1499,7 @@ def list_cache_entries() -> list[dict[str, object]]:
  exclude_patterns,
  extensions,
  (
- SELECT COUNT(DISTINCT rel_path)
+ SELECT COUNT(*)
  FROM indexed_file
  WHERE index_id = index_metadata.id
  ) AS file_count