vexor 0.20.0__py3-none-any.whl → 0.21.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vexor/__init__.py +1 -1
- vexor/api.py +26 -0
- vexor/cache.py +525 -286
- vexor/cli.py +53 -0
- vexor/config.py +54 -1
- vexor/providers/gemini.py +79 -13
- vexor/providers/openai.py +79 -13
- vexor/services/config_service.py +14 -0
- vexor/services/index_service.py +132 -5
- vexor/services/search_service.py +94 -27
- vexor/text.py +10 -0
- {vexor-0.20.0.dist-info → vexor-0.21.1.dist-info}/METADATA +15 -13
- {vexor-0.20.0.dist-info → vexor-0.21.1.dist-info}/RECORD +16 -16
- {vexor-0.20.0.dist-info → vexor-0.21.1.dist-info}/WHEEL +0 -0
- {vexor-0.20.0.dist-info → vexor-0.21.1.dist-info}/entry_points.txt +0 -0
- {vexor-0.20.0.dist-info → vexor-0.21.1.dist-info}/licenses/LICENSE +0 -0
vexor/cache.py
CHANGED
@@ -16,7 +16,7 @@ from .utils import collect_files

 DEFAULT_CACHE_DIR = Path(os.path.expanduser("~")) / ".vexor"
 CACHE_DIR = DEFAULT_CACHE_DIR
-CACHE_VERSION =
+CACHE_VERSION = 6
 DB_FILENAME = "index.db"
 EMBED_CACHE_TTL_DAYS = 30
 EMBED_CACHE_MAX_ENTRIES = 50_000
@@ -110,7 +110,7 @@ def _deserialize_exclude_patterns(value: str | None) -> tuple[str, ...]:
     return tuple(parts)


-def _chunk_values(values: Sequence[
+def _chunk_values(values: Sequence[object], size: int) -> Iterable[Sequence[object]]:
     for idx in range(0, len(values), size):
         yield values[idx : idx + size]

@@ -143,8 +143,17 @@ def cache_file(root: Path, model: str, include_hidden: bool) -> Path: # pragma:
     return cache_db_path()


-def _connect(
-
+def _connect(
+    db_path: Path,
+    *,
+    readonly: bool = False,
+    query_only: bool = False,
+) -> sqlite3.Connection:
+    if readonly:
+        db_uri = f"file:{db_path.as_posix()}?mode=ro"
+        conn = sqlite3.connect(db_uri, uri=True)
+    else:
+        conn = sqlite3.connect(db_path)
     conn.row_factory = sqlite3.Row
     try:
         conn.execute("PRAGMA journal_mode = WAL;")
@@ -155,10 +164,59 @@ def _connect(db_path: Path) -> sqlite3.Connection:
     conn.execute("PRAGMA temp_store = MEMORY;")
     conn.execute("PRAGMA busy_timeout = 5000;")
     conn.execute("PRAGMA foreign_keys = ON;")
+    if readonly or query_only:
+        conn.execute("PRAGMA query_only = ON;")
     return conn


+def _ensure_schema_readonly(
+    conn: sqlite3.Connection,
+    *,
+    tables: Sequence[str],
+) -> None:
+    if _schema_needs_reset(conn):
+        raise sqlite3.OperationalError("Schema reset required")
+    for table in tables:
+        if not _table_exists(conn, table):
+            raise sqlite3.OperationalError(f"Missing table: {table}")
+
+
+def _table_exists(conn: sqlite3.Connection, table: str) -> bool:
+    row = conn.execute(
+        "SELECT name FROM sqlite_master WHERE type = 'table' AND name = ?",
+        (table,),
+    ).fetchone()
+    return row is not None
+
+
+def _schema_needs_reset(conn: sqlite3.Connection) -> bool:
+    if _table_exists(conn, "indexed_chunk"):
+        return False
+    return any(
+        _table_exists(conn, table)
+        for table in ("index_metadata", "indexed_file", "file_embedding", "query_cache")
+    )
+
+
+def _reset_index_schema(conn: sqlite3.Connection) -> None:
+    conn.execute("PRAGMA foreign_keys = OFF;")
+    conn.executescript(
+        """
+        DROP TABLE IF EXISTS query_cache;
+        DROP TABLE IF EXISTS file_embedding;
+        DROP TABLE IF EXISTS chunk_embedding;
+        DROP TABLE IF EXISTS chunk_meta;
+        DROP TABLE IF EXISTS indexed_chunk;
+        DROP TABLE IF EXISTS indexed_file;
+        DROP TABLE IF EXISTS index_metadata;
+        """
+    )
+    conn.execute("PRAGMA foreign_keys = ON;")
+
+
 def _ensure_schema(conn: sqlite3.Connection) -> None:
+    if _schema_needs_reset(conn):
+        _reset_index_schema(conn)
     conn.executescript(
         """
         CREATE TABLE IF NOT EXISTS index_metadata (
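The reworked `_connect` opens the cache through SQLite's URI syntax when a read-only handle is requested, and also sets `PRAGMA query_only` so the handle itself refuses writes — presumably so the read paths (`load_index`, `load_query_vector`, `load_embedding_cache`, `list_cache_entries`) never take write locks on the cache database. A minimal standalone sketch of the same pattern, assuming nothing about vexor beyond what the hunk shows (the `demo.db` file and `notes` table are made up for illustration):

```python
import sqlite3
from pathlib import Path

db_path = Path("demo.db")  # throwaway example database, not part of vexor
setup = sqlite3.connect(db_path)
setup.execute("CREATE TABLE IF NOT EXISTS notes (id INTEGER PRIMARY KEY, body TEXT)")
setup.commit()
setup.close()

# mode=ro asks SQLite to open the file read-only; PRAGMA query_only additionally
# makes this particular connection reject any write statement.
conn = sqlite3.connect(f"file:{db_path.as_posix()}?mode=ro", uri=True)
conn.execute("PRAGMA query_only = ON;")
try:
    conn.execute("INSERT INTO notes (body) VALUES ('x')")
except sqlite3.OperationalError as exc:
    print("write rejected:", exc)
finally:
    conn.close()
```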
@@ -185,20 +243,31 @@ def _ensure_schema(conn: sqlite3.Connection) -> None:
             abs_path TEXT NOT NULL,
             size_bytes INTEGER NOT NULL,
             mtime REAL NOT NULL,
-
-
-
+            UNIQUE(index_id, rel_path)
+        );
+
+        CREATE TABLE IF NOT EXISTS indexed_chunk (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            index_id INTEGER NOT NULL REFERENCES index_metadata(id) ON DELETE CASCADE,
+            file_id INTEGER NOT NULL REFERENCES indexed_file(id) ON DELETE CASCADE,
             chunk_index INTEGER NOT NULL DEFAULT 0,
-
-
-            UNIQUE(index_id, rel_path, chunk_index)
+            position INTEGER NOT NULL,
+            UNIQUE(index_id, file_id, chunk_index)
         );

-        CREATE TABLE IF NOT EXISTS
-
+        CREATE TABLE IF NOT EXISTS chunk_embedding (
+            chunk_id INTEGER PRIMARY KEY REFERENCES indexed_chunk(id) ON DELETE CASCADE,
             vector_blob BLOB NOT NULL
         );

+        CREATE TABLE IF NOT EXISTS chunk_meta (
+            chunk_id INTEGER PRIMARY KEY REFERENCES indexed_chunk(id) ON DELETE CASCADE,
+            preview TEXT DEFAULT '',
+            label_hash TEXT DEFAULT '',
+            start_line INTEGER,
+            end_line INTEGER
+        );
+
         CREATE TABLE IF NOT EXISTS query_cache (
             id INTEGER PRIMARY KEY AUTOINCREMENT,
             index_id INTEGER NOT NULL REFERENCES index_metadata(id) ON DELETE CASCADE,
@@ -218,8 +287,11 @@ def _ensure_schema(conn: sqlite3.Connection) -> None:
             UNIQUE(model, text_hash)
         );

-        CREATE INDEX IF NOT EXISTS
-        ON
+        CREATE INDEX IF NOT EXISTS idx_indexed_chunk_order
+            ON indexed_chunk(index_id, position);
+
+        CREATE INDEX IF NOT EXISTS idx_indexed_file_lookup
+            ON indexed_file(index_id, rel_path);

         CREATE INDEX IF NOT EXISTS idx_query_cache_lookup
             ON query_cache(index_id, query_hash);
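This release splits what used to be flat per-chunk `indexed_file` rows into one `indexed_file` row per file plus `indexed_chunk` (ordering), `chunk_embedding` (vectors), and `chunk_meta` (preview and line range) rows keyed by chunk id. A trimmed, in-memory sketch of how the new tables relate; the sample row contents are invented and several columns are omitted for brevity:

```python
import sqlite3

# Reduced copies of the CREATE TABLE statements from the hunk above.
conn = sqlite3.connect(":memory:")
conn.executescript(
    """
    CREATE TABLE indexed_file (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        index_id INTEGER NOT NULL,
        rel_path TEXT NOT NULL,
        UNIQUE(index_id, rel_path)
    );
    CREATE TABLE indexed_chunk (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        index_id INTEGER NOT NULL,
        file_id INTEGER NOT NULL REFERENCES indexed_file(id) ON DELETE CASCADE,
        chunk_index INTEGER NOT NULL DEFAULT 0,
        position INTEGER NOT NULL,
        UNIQUE(index_id, file_id, chunk_index)
    );
    CREATE TABLE chunk_meta (
        chunk_id INTEGER PRIMARY KEY REFERENCES indexed_chunk(id) ON DELETE CASCADE,
        preview TEXT DEFAULT '',
        start_line INTEGER,
        end_line INTEGER
    );
    """
)
conn.execute("INSERT INTO indexed_file (index_id, rel_path) VALUES (1, 'src/app.py')")
conn.execute(
    "INSERT INTO indexed_chunk (index_id, file_id, chunk_index, position) VALUES (1, 1, 0, 0)"
)
conn.execute(
    "INSERT INTO chunk_meta (chunk_id, preview, start_line, end_line) VALUES (1, 'def main():', 1, 40)"
)

# One file row fans out to many chunk rows; metadata hangs off the chunk id,
# mirroring the join used by the new load_index query later in this diff.
for row in conn.execute(
    """
    SELECT f.rel_path, c.chunk_index, m.start_line, m.end_line
    FROM indexed_chunk AS c
    JOIN indexed_file AS f ON f.id = c.file_id
    LEFT JOIN chunk_meta AS m ON m.chunk_id = c.id
    ORDER BY c.position
    """
):
    print(row)
```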
@@ -228,133 +300,6 @@ def _ensure_schema(conn: sqlite3.Connection) -> None:
             ON embedding_cache(model, text_hash);
         """
     )
-    try:
-        conn.execute(
-            "ALTER TABLE index_metadata ADD COLUMN recursive INTEGER NOT NULL DEFAULT 1"
-        )
-    except sqlite3.OperationalError:
-        # Column already exists; ignore error.
-        pass
-    try:
-        conn.execute(
-            "ALTER TABLE index_metadata ADD COLUMN respect_gitignore INTEGER NOT NULL DEFAULT 1"
-        )
-    except sqlite3.OperationalError:
-        pass
-    try:
-        conn.execute(
-            "ALTER TABLE index_metadata ADD COLUMN mode TEXT NOT NULL DEFAULT 'name'"
-        )
-    except sqlite3.OperationalError:
-        pass
-    try:
-        conn.execute(
-            "ALTER TABLE indexed_file ADD COLUMN preview TEXT DEFAULT ''"
-        )
-    except sqlite3.OperationalError:
-        pass
-    try:
-        conn.execute(
-            "ALTER TABLE indexed_file ADD COLUMN label_hash TEXT DEFAULT ''"
-        )
-    except sqlite3.OperationalError:
-        pass
-    try:
-        conn.execute("ALTER TABLE indexed_file ADD COLUMN start_line INTEGER")
-    except sqlite3.OperationalError:
-        pass
-    try:
-        conn.execute("ALTER TABLE indexed_file ADD COLUMN end_line INTEGER")
-    except sqlite3.OperationalError:
-        pass
-    if not _table_has_column(conn, "indexed_file", "chunk_index"):
-        _upgrade_indexed_file_with_chunk(conn)
-    try:
-        conn.execute(
-            "ALTER TABLE index_metadata ADD COLUMN extensions TEXT DEFAULT ''"
-        )
-    except sqlite3.OperationalError:
-        pass
-    try:
-        conn.execute(
-            "ALTER TABLE index_metadata ADD COLUMN exclude_patterns TEXT DEFAULT ''"
-        )
-    except sqlite3.OperationalError:
-        pass
-    _cleanup_orphan_embeddings(conn)
-
-
-def _table_has_column(conn: sqlite3.Connection, table: str, column: str) -> bool:
-    rows = conn.execute(f"PRAGMA table_info({table})").fetchall()
-    return any(row[1] == column for row in rows)
-
-
-def _upgrade_indexed_file_with_chunk(conn: sqlite3.Connection) -> None:
-    conn.execute("PRAGMA foreign_keys = OFF;")
-    conn.execute("ALTER TABLE indexed_file RENAME TO indexed_file_legacy;")
-    conn.executescript(
-        """
-        CREATE TABLE indexed_file (
-            id INTEGER PRIMARY KEY AUTOINCREMENT,
-            index_id INTEGER NOT NULL REFERENCES index_metadata(id) ON DELETE CASCADE,
-            rel_path TEXT NOT NULL,
-            abs_path TEXT NOT NULL,
-            size_bytes INTEGER NOT NULL,
-            mtime REAL NOT NULL,
-            position INTEGER NOT NULL,
-            preview TEXT DEFAULT '',
-            label_hash TEXT DEFAULT '',
-            chunk_index INTEGER NOT NULL DEFAULT 0,
-            start_line INTEGER,
-            end_line INTEGER,
-            UNIQUE(index_id, rel_path, chunk_index)
-        );
-
-        CREATE INDEX IF NOT EXISTS idx_indexed_file_order
-            ON indexed_file(index_id, position);
-        """
-    )
-    conn.execute(
-        """
-        INSERT INTO indexed_file (
-            id,
-            index_id,
-            rel_path,
-            abs_path,
-            size_bytes,
-            mtime,
-            position,
-            preview,
-            label_hash,
-            chunk_index,
-            start_line,
-            end_line
-        )
-        SELECT
-            id,
-            index_id,
-            rel_path,
-            abs_path,
-            size_bytes,
-            mtime,
-            position,
-            preview,
-            '',
-            0,
-            NULL,
-            NULL
-        FROM indexed_file_legacy;
-        """
-    )
-    conn.execute("DROP TABLE indexed_file_legacy;")
-    conn.execute("PRAGMA foreign_keys = ON;")
-
-
-def _cleanup_orphan_embeddings(conn: sqlite3.Connection) -> None:
-    with conn:
-        conn.execute(
-            "DELETE FROM file_embedding WHERE file_id NOT IN (SELECT id FROM indexed_file)"
-        )


 def store_index(
@@ -430,32 +375,22 @@ def store_index(
         )
         index_id = cursor.lastrowid

-
-
-
+        file_rows_by_rel: dict[str, tuple] = {}
+        for entry in entries:
+            if entry.rel_path in file_rows_by_rel:
+                continue
             size_bytes = entry.size_bytes
             mtime = entry.mtime
             if size_bytes is None or mtime is None:
                 stat = entry.path.stat()
                 size_bytes = stat.st_size
                 mtime = stat.st_mtime
-
-
-
-
-
-
-                mtime,
-                position,
-                entry.preview,
-                entry.label_hash,
-                entry.chunk_index,
-                entry.start_line,
-                entry.end_line,
-            )
-        )
-        vector_blobs.append(
-            np.asarray(entry.embedding, dtype=np.float32).tobytes()
+            file_rows_by_rel[entry.rel_path] = (
+                index_id,
+                entry.rel_path,
+                str(entry.path),
+                size_bytes,
+                mtime,
             )

         conn.executemany(
@@ -465,29 +400,87 @@ def store_index(
                 rel_path,
                 abs_path,
                 size_bytes,
-                mtime
-
-
-
+                mtime
+            ) VALUES (?, ?, ?, ?, ?)
+            """,
+            list(file_rows_by_rel.values()),
+        )
+
+        file_id_map: dict[str, int] = {}
+        rel_paths = list(file_rows_by_rel.keys())
+        for chunk in _chunk_values(rel_paths, 900):
+            placeholders = ", ".join("?" for _ in chunk)
+            rows = conn.execute(
+                f"""
+                SELECT id, rel_path
+                FROM indexed_file
+                WHERE index_id = ? AND rel_path IN ({placeholders})
+                """,
+                (index_id, *chunk),
+            ).fetchall()
+            for row in rows:
+                file_id_map[row["rel_path"]] = int(row["id"])
+
+        chunk_rows: list[tuple] = []
+        vector_blobs: list[bytes] = []
+        meta_rows: list[tuple] = []
+        for position, entry in enumerate(entries):
+            file_id = file_id_map.get(entry.rel_path)
+            if file_id is None:
+                continue
+            chunk_rows.append(
+                (index_id, file_id, entry.chunk_index, position)
+            )
+            vector_blobs.append(
+                np.asarray(entry.embedding, dtype=np.float32).tobytes()
+            )
+            meta_rows.append(
+                (
+                    entry.preview or "",
+                    entry.label_hash or "",
+                    entry.start_line,
+                    entry.end_line,
+                )
+            )
+
+        conn.executemany(
+            """
+            INSERT INTO indexed_chunk (
+                index_id,
+                file_id,
                 chunk_index,
-
-
-            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+                position
+            ) VALUES (?, ?, ?, ?)
             """,
-
+            chunk_rows,
         )

         inserted_ids = conn.execute(
-            "SELECT id FROM
+            "SELECT id FROM indexed_chunk WHERE index_id = ? ORDER BY position ASC",
             (index_id,),
         ).fetchall()
         conn.executemany(
-            "INSERT OR REPLACE INTO
+            "INSERT OR REPLACE INTO chunk_embedding (chunk_id, vector_blob) VALUES (?, ?)",
             (
                 (row["id"], vector_blobs[idx])
                 for idx, row in enumerate(inserted_ids)
            ),
         )
+        conn.executemany(
+            """
+            INSERT OR REPLACE INTO chunk_meta (
+                chunk_id,
+                preview,
+                label_hash,
+                start_line,
+                end_line
+            ) VALUES (?, ?, ?, ?, ?)
+            """,
+            (
+                (row["id"], *meta_rows[idx])
+                for idx, row in enumerate(inserted_ids)
+            ),
+        )

         return db_path
     finally:
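Both the full-rebuild path in `store_index` and the incremental path in `apply_index_updates` resolve file ids with `rel_path IN (...)` queries batched through `_chunk_values(rel_paths, 900)`, presumably to stay under SQLite's per-statement bound-parameter limit (999 in older builds). A small sketch of that batching idea with made-up paths; the actual execute call is left commented out since it needs a live cache connection:

```python
from typing import Iterable, Sequence


def batched(values: Sequence[object], size: int) -> Iterable[Sequence[object]]:
    # Same shape as the _chunk_values helper shown earlier in this diff.
    for idx in range(0, len(values), size):
        yield values[idx : idx + size]


rel_paths = [f"src/file_{n}.py" for n in range(2500)]  # made-up paths
index_id = 1  # placeholder index id
for batch in batched(rel_paths, 900):
    placeholders = ", ".join("?" for _ in batch)
    sql = (
        "SELECT id, rel_path FROM indexed_file "
        f"WHERE index_id = ? AND rel_path IN ({placeholders})"
    )
    # conn.execute(sql, (index_id, *batch))  # at most 901 bound parameters per statement
    print(f"{len(batch)} paths in this batch")
```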
@@ -558,112 +551,221 @@ def apply_index_updates(
         if changed_entries:
             chunk_map: dict[str, list[IndexedChunk]] = {}
             for entry in changed_entries:
-
-                chunk_map[entry.rel_path] = []
-                chunk_map[entry.rel_path].append(entry)
+                chunk_map.setdefault(entry.rel_path, []).append(entry)

-            for rel_path
+            for rel_path in chunk_map:
                 conn.execute(
                     "DELETE FROM indexed_file WHERE index_id = ? AND rel_path = ?",
                     (index_id, rel_path),
                 )
+
+            file_rows_by_rel: dict[str, tuple] = {}
+            for rel_path, chunk_list in chunk_map.items():
+                chunk_list.sort(key=lambda item: item.chunk_index)
+                sample = chunk_list[0]
+                size_bytes = sample.size_bytes
+                mtime = sample.mtime
+                if size_bytes is None or mtime is None:
+                    stat = sample.path.stat()
+                    size_bytes = stat.st_size
+                    mtime = stat.st_mtime
+                file_rows_by_rel[rel_path] = (
+                    index_id,
+                    rel_path,
+                    str(sample.path),
+                    size_bytes,
+                    mtime,
+                )
+
+            if file_rows_by_rel:
+                conn.executemany(
+                    """
+                    INSERT INTO indexed_file (
+                        index_id,
+                        rel_path,
+                        abs_path,
+                        size_bytes,
+                        mtime
+                    ) VALUES (?, ?, ?, ?, ?)
+                    """,
+                    list(file_rows_by_rel.values()),
+                )
+
+            file_id_map: dict[str, int] = {}
+            rel_paths = list(file_rows_by_rel.keys())
+            for chunk in _chunk_values(rel_paths, 900):
+                placeholders = ", ".join("?" for _ in chunk)
+                rows = conn.execute(
+                    f"""
+                    SELECT id, rel_path
+                    FROM indexed_file
+                    WHERE index_id = ? AND rel_path IN ({placeholders})
+                    """,
+                    (index_id, *chunk),
+                ).fetchall()
+                for row in rows:
+                    file_id_map[row["rel_path"]] = int(row["id"])
+
+            for rel_path, chunk_list in chunk_map.items():
+                file_id = file_id_map.get(rel_path)
+                if file_id is None:
+                    continue
                 chunk_list.sort(key=lambda item: item.chunk_index)
-
+                chunk_rows: list[tuple] = []
                 vector_blobs: list[bytes] = []
+                meta_rows: list[tuple] = []
                 for chunk in chunk_list:
                     vector = np.asarray(chunk.embedding, dtype=np.float32)
                     if vector_dimension is None:
                         vector_dimension = vector.shape[0]
-
-
-
-
-
-                    mtime = stat.st_mtime
-                    file_rows.append(
+                    chunk_rows.append(
+                        (index_id, file_id, chunk.chunk_index, 0)
+                    )
+                    vector_blobs.append(vector.tobytes())
+                    meta_rows.append(
                         (
-
-
-                            str(chunk.path),
-                            size_bytes,
-                            mtime,
-                            0,
-                            chunk.preview,
-                            chunk.label_hash,
-                            chunk.chunk_index,
+                            chunk.preview or "",
+                            chunk.label_hash or "",
                             chunk.start_line,
                             chunk.end_line,
                         )
                     )
-                    vector_blobs.append(vector.tobytes())

                 conn.executemany(
                     """
-                    INSERT INTO
+                    INSERT INTO indexed_chunk (
                         index_id,
-
-                        abs_path,
-                        size_bytes,
-                        mtime,
-                        position,
-                        preview,
-                        label_hash,
+                        file_id,
                         chunk_index,
-
-
-                    ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+                        position
+                    ) VALUES (?, ?, ?, ?)
                     """,
-
+                    chunk_rows,
                 )

                 inserted_ids = conn.execute(
                     """
-                    SELECT id FROM
-                    WHERE index_id = ? AND
+                    SELECT id FROM indexed_chunk
+                    WHERE index_id = ? AND file_id = ?
                     ORDER BY chunk_index ASC
                     """,
-                    (index_id,
+                    (index_id, file_id),
                 ).fetchall()
                 conn.executemany(
-                    "INSERT INTO
+                    "INSERT OR REPLACE INTO chunk_embedding (chunk_id, vector_blob) VALUES (?, ?)",
                     (
                         (row["id"], vector_blobs[idx])
                         for idx, row in enumerate(inserted_ids)
                     ),
                 )
+                conn.executemany(
+                    """
+                    INSERT OR REPLACE INTO chunk_meta (
+                        chunk_id,
+                        preview,
+                        label_hash,
+                        start_line,
+                        end_line
+                    ) VALUES (?, ?, ?, ?, ?)
+                    """,
+                    (
+                        (row["id"], *meta_rows[idx])
+                        for idx, row in enumerate(inserted_ids)
+                    ),
+                )

         if touched_entries:
+            file_updates: dict[str, tuple[int, float]] = {}
+            for (
+                rel_path,
+                _chunk_index,
+                size_bytes,
+                mtime,
+                _preview,
+                _start_line,
+                _end_line,
+                _label_hash,
+            ) in touched_entries:
+                if rel_path not in file_updates:
+                    file_updates[rel_path] = (size_bytes, mtime)
             conn.executemany(
                 """
                 UPDATE indexed_file
-                SET size_bytes = ?, mtime =
-                WHERE index_id = ? AND rel_path = ?
+                SET size_bytes = ?, mtime = ?
+                WHERE index_id = ? AND rel_path = ?
                 """,
                 (
+                    (size_bytes, mtime, index_id, rel_path)
+                    for rel_path, (size_bytes, mtime) in file_updates.items()
+                ),
+            )
+
+            chunk_id_map: dict[tuple[str, int], int] = {}
+            if ordered_entries or touched_entries:
+                rows = conn.execute(
+                    """
+                    SELECT c.id, c.chunk_index, f.rel_path
+                    FROM indexed_chunk AS c
+                    JOIN indexed_file AS f ON f.id = c.file_id
+                    WHERE c.index_id = ?
+                    """,
+                    (index_id,),
+                ).fetchall()
+                for row in rows:
+                    chunk_id_map[(row["rel_path"], int(row["chunk_index"]))] = int(
+                        row["id"]
+                    )
+
+            if touched_entries and chunk_id_map:
+                meta_rows: list[tuple] = []
+                for (
+                    rel_path,
+                    chunk_index,
+                    _size_bytes,
+                    _mtime,
+                    preview,
+                    start_line,
+                    end_line,
+                    label_hash,
+                ) in touched_entries:
+                    chunk_id = chunk_id_map.get((rel_path, chunk_index))
+                    if chunk_id is None:
+                        continue
+                    meta_rows.append(
                         (
-
-                        mtime,
+                            chunk_id,
                             preview or "",
+                            label_hash or "",
                             start_line,
                             end_line,
-                            label_hash or "",
-                            index_id,
-                            rel_path,
-                            chunk_index,
                         )
-
-
-
+                    )
+                if meta_rows:
+                    conn.executemany(
+                        """
+                        INSERT OR REPLACE INTO chunk_meta (
+                            chunk_id,
+                            preview,
+                            label_hash,
+                            start_line,
+                            end_line
+                        ) VALUES (?, ?, ?, ?, ?)
+                        """,
+                        meta_rows,
+                    )

-
-
-
-
-
-
-
-
-
+            if ordered_entries and chunk_id_map:
+                position_updates = []
+                for position, (rel_path, chunk_index) in enumerate(ordered_entries):
+                    chunk_id = chunk_id_map.get((rel_path, chunk_index))
+                    if chunk_id is None:
+                        continue
+                    position_updates.append((position, chunk_id))
+                if position_updates:
+                    conn.executemany(
+                        "UPDATE indexed_chunk SET position = ? WHERE id = ?",
+                        position_updates,
+                    )

         generated_at = datetime.now(timezone.utc).isoformat()
         new_dimension = vector_dimension or existing_dimension
@@ -728,17 +830,52 @@ def backfill_chunk_lines(

     with conn:
         conn.execute("BEGIN IMMEDIATE;")
-
-
-
-
-
-
-
-
-
-
-
+        update_rows: list[tuple[int | None, int | None, int]] = []
+        insert_rows: list[tuple[int]] = []
+        if updates:
+            rel_paths = sorted({rel_path for rel_path, *_ in updates})
+            chunk_id_map: dict[tuple[str, int], int] = {}
+            for chunk in _chunk_values(rel_paths, 900):
+                placeholders = ", ".join("?" for _ in chunk)
+                rows = conn.execute(
+                    f"""
+                    SELECT c.id, c.chunk_index, f.rel_path
+                    FROM indexed_chunk AS c
+                    JOIN indexed_file AS f ON f.id = c.file_id
+                    WHERE c.index_id = ? AND f.rel_path IN ({placeholders})
+                    """,
+                    (index_id, *chunk),
+                ).fetchall()
+                for row in rows:
+                    chunk_id_map[(row["rel_path"], int(row["chunk_index"]))] = int(
+                        row["id"]
+                    )
+            for rel_path, chunk_index, start_line, end_line in updates:
+                chunk_id = chunk_id_map.get((rel_path, chunk_index))
+                if chunk_id is None:
+                    continue
+                insert_rows.append((chunk_id,))
+                update_rows.append((start_line, end_line, chunk_id))
+        if insert_rows:
+            conn.executemany(
+                """
+                INSERT OR IGNORE INTO chunk_meta (
+                    chunk_id,
+                    preview,
+                    label_hash
+                ) VALUES (?, '', '')
+                """,
+                insert_rows,
+            )
+        if update_rows:
+            conn.executemany(
+                """
+                UPDATE chunk_meta
+                SET start_line = ?, end_line = ?
+                WHERE chunk_id = ?
+                """,
+                update_rows,
+            )
         generated_at = datetime.now(timezone.utc).isoformat()
         conn.execute(
             """
@@ -769,9 +906,15 @@ def load_index(
     if not db_path.exists():
         raise FileNotFoundError(db_path)

-    conn = _connect(db_path)
+    conn = _connect(db_path, readonly=True)
     try:
-
+        try:
+            _ensure_schema_readonly(
+                conn,
+                tables=("index_metadata", "indexed_file", "indexed_chunk", "chunk_meta"),
+            )
+        except sqlite3.OperationalError:
+            raise FileNotFoundError(db_path)
         key = _cache_key(
             root,
             include_hidden,
@@ -794,13 +937,27 @@ def load_index(
         ).fetchone()
         if meta is None:
             raise FileNotFoundError(db_path)
+        version = int(meta["version"] or 0)
+        if version < CACHE_VERSION:
+            raise FileNotFoundError(db_path)

         rows = conn.execute(
             """
-            SELECT
-
-
-
+            SELECT
+                f.rel_path,
+                f.abs_path,
+                f.size_bytes,
+                f.mtime,
+                c.chunk_index,
+                m.preview,
+                m.label_hash,
+                m.start_line,
+                m.end_line
+            FROM indexed_chunk AS c
+            JOIN indexed_file AS f ON f.id = c.file_id
+            LEFT JOIN chunk_meta AS m ON m.chunk_id = c.id
+            WHERE c.index_id = ?
+            ORDER BY c.position ASC
             """,
             (meta["id"],),
         ).fetchall()
@@ -865,9 +1022,20 @@ def load_index_vectors(
     if not db_path.exists():
         raise FileNotFoundError(db_path)

-    conn = _connect(db_path)
+    conn = _connect(db_path, readonly=True)
     try:
-
+        try:
+            _ensure_schema_readonly(
+                conn,
+                tables=(
+                    "index_metadata",
+                    "indexed_file",
+                    "indexed_chunk",
+                    "chunk_embedding",
+                ),
+            )
+        except sqlite3.OperationalError:
+            raise FileNotFoundError(db_path)
         key = _cache_key(
             root,
             include_hidden,
@@ -890,11 +1058,14 @@ def load_index_vectors(
         ).fetchone()
         if meta is None:
             raise FileNotFoundError(db_path)
+        version = int(meta["version"] or 0)
+        if version < CACHE_VERSION:
+            raise FileNotFoundError(db_path)

         index_id = meta["id"]
         dimension = int(meta["dimension"])
         chunk_count = conn.execute(
-            "SELECT COUNT(*) AS count FROM
+            "SELECT COUNT(*) AS count FROM indexed_chunk WHERE index_id = ?",
             (index_id,),
         ).fetchone()["count"]
         chunk_total = int(chunk_count or 0)
@@ -916,27 +1087,48 @@ def load_index_vectors(
                 "extensions": _deserialize_extensions(meta["extensions"]),
                 "files": [],
                 "chunks": [],
+                "chunk_ids": [],
             }
             return [], empty, metadata

         embeddings = np.empty((chunk_total, dimension), dtype=np.float32)
         paths: list[Path] = []
-
-        file_snapshot:
+        chunk_ids: list[int] = []
+        file_snapshot: list[dict] = []
+        file_meta_by_rel: dict[str, dict] = {}
+
+        file_rows = conn.execute(
+            """
+            SELECT rel_path, abs_path, size_bytes, mtime
+            FROM indexed_file
+            WHERE index_id = ?
+            """,
+            (index_id,),
+        ).fetchall()
+        for row in file_rows:
+            file_meta_by_rel[row["rel_path"]] = {
+                "path": row["rel_path"],
+                "absolute": row["abs_path"],
+                "mtime": row["mtime"],
+                "size": row["size_bytes"],
+            }
+        seen_files: set[str] = set()

         cursor = conn.execute(
             """
-            SELECT
-            FROM
-            JOIN
-
-
+            SELECT c.id AS chunk_id, f.rel_path, e.vector_blob
+            FROM indexed_chunk AS c
+            JOIN indexed_file AS f ON f.id = c.file_id
+            JOIN chunk_embedding AS e ON e.chunk_id = c.id
+            WHERE c.index_id = ?
+            ORDER BY c.position ASC
             """,
             (index_id,),
         )

         for idx, row in enumerate(cursor):
             rel_path = row["rel_path"]
+            chunk_id = int(row["chunk_id"])
             vector = np.frombuffer(row["vector_blob"], dtype=np.float32)
             if vector.size != dimension:
                 raise RuntimeError(
@@ -944,27 +1136,12 @@ def load_index_vectors(
                 )
             embeddings[idx] = vector
             paths.append(root / Path(rel_path))
-
-
-
-
-
-
-                    "size": row["size_bytes"],
-                    "preview": row["preview"],
-                    "label_hash": row["label_hash"],
-                    "chunk_index": chunk_index,
-                    "start_line": row["start_line"],
-                    "end_line": row["end_line"],
-                }
-            )
-            if rel_path not in file_snapshot:
-                file_snapshot[rel_path] = {
-                    "path": rel_path,
-                    "absolute": row["abs_path"],
-                    "mtime": row["mtime"],
-                    "size": row["size_bytes"],
-                }
+            chunk_ids.append(chunk_id)
+            if rel_path not in seen_files:
+                meta_row = file_meta_by_rel.get(rel_path)
+                if meta_row is not None:
+                    file_snapshot.append(meta_row)
+                seen_files.add(rel_path)

         metadata = {
             "index_id": int(index_id),
@@ -979,14 +1156,76 @@ def load_index_vectors(
             "dimension": meta["dimension"],
             "exclude_patterns": _deserialize_exclude_patterns(meta["exclude_patterns"]),
             "extensions": _deserialize_extensions(meta["extensions"]),
-            "files":
-            "chunks":
+            "files": file_snapshot,
+            "chunks": [],
+            "chunk_ids": chunk_ids,
         }
         return paths, embeddings, metadata
     finally:
         conn.close()


+def load_chunk_metadata(
+    chunk_ids: Sequence[int],
+    conn: sqlite3.Connection | None = None,
+) -> dict[int, dict]:
+    """Load cached chunk metadata keyed by chunk_id."""
+
+    if not chunk_ids:
+        return {}
+    unique_ids: list[int] = []
+    seen: set[int] = set()
+    for value in chunk_ids:
+        try:
+            chunk_id = int(value)
+        except (TypeError, ValueError):
+            continue
+        if chunk_id in seen:
+            continue
+        seen.add(chunk_id)
+        unique_ids.append(chunk_id)
+    if not unique_ids:
+        return {}
+    db_path = cache_db_path()
+    owns_connection = conn is None
+    try:
+        connection = conn or _connect(db_path, readonly=True)
+    except sqlite3.OperationalError:
+        return {}
+    try:
+        try:
+            _ensure_schema_readonly(
+                connection,
+                tables=("indexed_chunk", "chunk_meta"),
+            )
+        except sqlite3.OperationalError:
+            return {}
+        results: dict[int, dict] = {}
+        for chunk in _chunk_values(unique_ids, 900):
+            placeholders = ", ".join("?" for _ in chunk)
+            rows = connection.execute(
+                f"""
+                SELECT c.id AS chunk_id, c.chunk_index, m.preview, m.label_hash, m.start_line, m.end_line
+                FROM indexed_chunk AS c
+                LEFT JOIN chunk_meta AS m ON m.chunk_id = c.id
+                WHERE c.id IN ({placeholders})
+                """,
+                tuple(chunk),
+            ).fetchall()
+            for row in rows:
+                results[int(row["chunk_id"])] = {
+                    "chunk_index": int(row["chunk_index"]),
+                    "preview": row["preview"],
+                    "label_hash": row["label_hash"],
+                    "start_line": row["start_line"],
+                    "end_line": row["end_line"],
+                }
+        return results
+    finally:
+        if owns_connection:
+            connection.close()
+
+
 def load_query_vector(
     index_id: int,
     query_hash: str,
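`load_index_vectors` now reports the backing `chunk_ids` in its metadata instead of inlining every preview, and the new `load_chunk_metadata` pulls preview text and line ranges on demand for just the chunks a caller needs. A hedged usage sketch; the chunk id values are placeholders and the import path is inferred from the module layout shown in this diff:

```python
from vexor import cache

# chunk_ids as they would appear in the metadata returned by load_index_vectors().
hits = [42, 43, 99]  # placeholder ids

meta = cache.load_chunk_metadata(hits)  # returns {} if the cache is missing or stale
for chunk_id in hits:
    info = meta.get(chunk_id)
    if info is None:
        continue  # chunk no longer present in the cache
    print(chunk_id, info["start_line"], info["end_line"], info["preview"])
```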
@@ -997,12 +1236,12 @@ def load_query_vector(
     db_path = cache_db_path()
     owns_connection = conn is None
     try:
-        connection = conn or _connect(db_path)
+        connection = conn or _connect(db_path, readonly=True)
     except sqlite3.OperationalError:
         return None
     try:
         try:
-
+            _ensure_schema_readonly(connection, tables=("query_cache",))
         except sqlite3.OperationalError:
             return None
         row = connection.execute(
@@ -1074,12 +1313,12 @@ def load_embedding_cache(
     db_path = cache_db_path()
     owns_connection = conn is None
     try:
-        connection = conn or _connect(db_path)
+        connection = conn or _connect(db_path, readonly=True)
     except sqlite3.OperationalError:
         return {}
     try:
         try:
-
+            _ensure_schema_readonly(connection, tables=("embedding_cache",))
         except sqlite3.OperationalError:
             return {}
         results: dict[str, np.ndarray] = {}
@@ -1237,12 +1476,12 @@ def list_cache_entries() -> list[dict[str, object]]:
         return []

     try:
-        conn = _connect(db_path)
+        conn = _connect(db_path, readonly=True)
     except sqlite3.OperationalError:
         return []
     try:
         try:
-
+            _ensure_schema_readonly(conn, tables=("index_metadata", "indexed_file"))
         except sqlite3.OperationalError:
             return []
         rows = conn.execute(
@@ -1260,7 +1499,7 @@ def list_cache_entries() -> list[dict[str, object]]:
                 exclude_patterns,
                 extensions,
                 (
-                    SELECT COUNT(
+                    SELECT COUNT(*)
                     FROM indexed_file
                     WHERE index_id = index_metadata.id
                 ) AS file_count