codegraph-cli 2.1.0__py3-none-any.whl → 2.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codegraph_cli/__init__.py +1 -1
- codegraph_cli/agents.py +59 -3
- codegraph_cli/chat_agent.py +58 -11
- codegraph_cli/cli.py +569 -54
- codegraph_cli/cli_chat.py +204 -94
- codegraph_cli/cli_diagnose.py +13 -2
- codegraph_cli/cli_docs.py +207 -0
- codegraph_cli/cli_explore.py +1053 -0
- codegraph_cli/cli_export.py +941 -0
- codegraph_cli/cli_groups.py +33 -0
- codegraph_cli/cli_health.py +316 -0
- codegraph_cli/cli_history.py +213 -0
- codegraph_cli/cli_onboard.py +380 -0
- codegraph_cli/cli_quickstart.py +256 -0
- codegraph_cli/cli_refactor.py +17 -3
- codegraph_cli/cli_setup.py +12 -12
- codegraph_cli/cli_suggestions.py +90 -0
- codegraph_cli/cli_test.py +17 -3
- codegraph_cli/cli_tui.py +210 -0
- codegraph_cli/cli_v2.py +24 -4
- codegraph_cli/cli_watch.py +158 -0
- codegraph_cli/cli_workflows.py +255 -0
- codegraph_cli/codegen_agent.py +15 -1
- codegraph_cli/config.py +18 -5
- codegraph_cli/context_manager.py +117 -15
- codegraph_cli/crew_agents.py +32 -8
- codegraph_cli/crew_chat.py +146 -13
- codegraph_cli/crew_tools.py +30 -2
- codegraph_cli/embeddings.py +95 -5
- codegraph_cli/llm.py +42 -55
- codegraph_cli/project_context.py +64 -1
- codegraph_cli/rag.py +282 -19
- codegraph_cli/storage.py +310 -14
- codegraph_cli/vector_store.py +110 -8
- {codegraph_cli-2.1.0.dist-info → codegraph_cli-2.1.2.dist-info}/METADATA +75 -21
- codegraph_cli-2.1.2.dist-info/RECORD +55 -0
- codegraph_cli-2.1.2.dist-info/entry_points.txt +2 -0
- codegraph_cli-2.1.0.dist-info/RECORD +0 -43
- codegraph_cli-2.1.0.dist-info/entry_points.txt +0 -2
- {codegraph_cli-2.1.0.dist-info → codegraph_cli-2.1.2.dist-info}/WHEEL +0 -0
- {codegraph_cli-2.1.0.dist-info → codegraph_cli-2.1.2.dist-info}/licenses/LICENSE +0 -0
- {codegraph_cli-2.1.0.dist-info → codegraph_cli-2.1.2.dist-info}/top_level.txt +0 -0
codegraph_cli/storage.py
CHANGED
|
@@ -13,6 +13,7 @@ from __future__ import annotations
|
|
|
13
13
|
|
|
14
14
|
import json
|
|
15
15
|
import logging
|
|
16
|
+
import re
|
|
16
17
|
import sqlite3
|
|
17
18
|
from pathlib import Path
|
|
18
19
|
from typing import Any, Dict, Iterable, List, Optional, Tuple
|
|
@@ -109,7 +110,7 @@ class GraphStore:
|
|
|
109
110
|
self.conn.row_factory = sqlite3.Row
|
|
110
111
|
self._init_schema()
|
|
111
112
|
|
|
112
|
-
# Initialise LanceDB vector store
|
|
113
|
+
# Initialise LanceDB vector store (default / legacy table)
|
|
113
114
|
self.vector_store: Optional[VectorStore] = None
|
|
114
115
|
if VECTOR_STORE_AVAILABLE:
|
|
115
116
|
try:
|
|
@@ -117,6 +118,9 @@ class GraphStore:
|
|
|
117
118
|
except Exception as exc:
|
|
118
119
|
logger.warning("LanceDB vector store unavailable: %s", exc)
|
|
119
120
|
|
|
121
|
+
# Per-model vector store cache: model_key → VectorStore
|
|
122
|
+
self._model_vector_stores: Dict[str, "VectorStore"] = {}
|
|
123
|
+
|
|
120
124
|
def close(self) -> None:
|
|
121
125
|
self.conn.close()
|
|
122
126
|
|
|
@@ -186,12 +190,19 @@ class GraphStore:
|
|
|
186
190
|
# Insert
|
|
187
191
|
# ------------------------------------------------------------------
|
|
188
192
|
|
|
189
|
-
def insert_nodes(
|
|
193
|
+
def insert_nodes(
|
|
194
|
+
self,
|
|
195
|
+
rows: Iterable[Tuple[Node, List[float]]],
|
|
196
|
+
model_key: Optional[str] = None,
|
|
197
|
+
) -> None:
|
|
190
198
|
"""Insert nodes with their embedding vectors.
|
|
191
199
|
|
|
192
200
|
Each element of *rows* is a ``(Node, embedding)`` tuple. Data is
|
|
193
201
|
written to both SQLite (for structured queries) and LanceDB (for
|
|
194
202
|
vector search).
|
|
203
|
+
|
|
204
|
+
When *model_key* is provided the embeddings are also written to
|
|
205
|
+
the model-specific LanceDB table (``code_nodes_{model_key}``).
|
|
195
206
|
"""
|
|
196
207
|
rows_list = list(rows)
|
|
197
208
|
if not rows_list:
|
|
@@ -225,26 +236,42 @@ class GraphStore:
|
|
|
225
236
|
self.conn.commit()
|
|
226
237
|
|
|
227
238
|
# ---- LanceDB (vector store) ------------------------------------
|
|
239
|
+
node_ids = [node.node_id for node, _ in rows_list]
|
|
240
|
+
embeddings = [emb for _, emb in rows_list]
|
|
241
|
+
metadatas = [
|
|
242
|
+
{
|
|
243
|
+
"node_type": node.node_type,
|
|
244
|
+
"file_path": node.file_path,
|
|
245
|
+
"qualname": node.qualname,
|
|
246
|
+
"name": node.name,
|
|
247
|
+
}
|
|
248
|
+
for node, _ in rows_list
|
|
249
|
+
]
|
|
250
|
+
documents = [node.code for node, _ in rows_list]
|
|
251
|
+
|
|
252
|
+
# Write to legacy table (backward compat)
|
|
228
253
|
if self.vector_store is not None:
|
|
229
254
|
try:
|
|
230
|
-
node_ids = [node.node_id for node, _ in rows_list]
|
|
231
|
-
embeddings = [emb for _, emb in rows_list]
|
|
232
|
-
metadatas = [
|
|
233
|
-
{
|
|
234
|
-
"node_type": node.node_type,
|
|
235
|
-
"file_path": node.file_path,
|
|
236
|
-
"qualname": node.qualname,
|
|
237
|
-
"name": node.name,
|
|
238
|
-
}
|
|
239
|
-
for node, _ in rows_list
|
|
240
|
-
]
|
|
241
|
-
documents = [node.code for node, _ in rows_list]
|
|
242
255
|
self.vector_store.add_nodes(
|
|
243
256
|
node_ids, embeddings, metadatas, documents,
|
|
244
257
|
)
|
|
245
258
|
except Exception as exc:
|
|
246
259
|
logger.warning("Failed to sync nodes to LanceDB: %s", exc)
|
|
247
260
|
|
|
261
|
+
# Write to model-specific table
|
|
262
|
+
if model_key:
|
|
263
|
+
model_vs = self.get_vector_store_for_model(model_key)
|
|
264
|
+
if model_vs is not None:
|
|
265
|
+
try:
|
|
266
|
+
model_vs.add_nodes(
|
|
267
|
+
node_ids, embeddings, metadatas, documents,
|
|
268
|
+
)
|
|
269
|
+
except Exception as exc:
|
|
270
|
+
logger.warning(
|
|
271
|
+
"Failed to sync nodes to model table '%s': %s",
|
|
272
|
+
model_key, exc,
|
|
273
|
+
)
|
|
274
|
+
|
|
248
275
|
def insert_edges(self, edges: Iterable[Edge]) -> None:
|
|
249
276
|
cur = self.conn.cursor()
|
|
250
277
|
cur.executemany(
|
|
@@ -253,6 +280,132 @@ class GraphStore:
|
|
|
253
280
|
)
|
|
254
281
|
self.conn.commit()
|
|
255
282
|
|
|
283
|
+
# ------------------------------------------------------------------
|
|
284
|
+
# Incremental index (single-file add / remove)
|
|
285
|
+
# ------------------------------------------------------------------
|
|
286
|
+
|
|
287
|
+
def remove_nodes_for_file(self, rel_path: str) -> int:
    """Remove all nodes and related edges for a specific file.

    Clears data from SQLite **and** every known LanceDB table
    (legacy + per-model).  Vector-store clean-up is best effort:
    failures are logged at debug level and never raised.

    Args:
        rel_path: Relative file path as stored in the ``file_path``
                  column (e.g. ``"src/utils.py"``).

    Returns:
        Number of SQLite node rows deleted.
    """
    # 1. Collect node IDs that belong to this file
    cur = self.conn.cursor()
    rows = cur.execute(
        "SELECT node_id FROM nodes WHERE file_path = ?", (rel_path,),
    ).fetchall()
    node_ids = [r[0] for r in rows]

    if not node_ids:
        return 0

    # 2. Delete edges (src OR dst) and the nodes themselves in bounded
    #    batches.  Each edge statement binds 2 * len(batch) parameters;
    #    400 IDs keeps that under the 999-variable default of older
    #    SQLite builds, which one huge IN (...) list could exceed.
    batch_size = 400
    for start in range(0, len(node_ids), batch_size):
        batch = node_ids[start:start + batch_size]
        placeholders = ",".join("?" * len(batch))
        cur.execute(
            f"DELETE FROM edges WHERE src IN ({placeholders}) OR dst IN ({placeholders})",
            batch + batch,
        )
        cur.execute(
            f"DELETE FROM nodes WHERE node_id IN ({placeholders})",
            batch,
        )
    self.conn.commit()

    # 3. Remove from legacy LanceDB table
    if self.vector_store is not None:
        try:
            self.vector_store.delete_by_file_path(rel_path)
        except Exception as exc:
            logger.debug("Legacy vector delete for '%s': %s", rel_path, exc)

    # 4. Remove from all per-model LanceDB tables that are already open
    for key, vs in self._model_vector_stores.items():
        try:
            vs.delete_by_file_path(rel_path)
        except Exception as exc:
            logger.debug(
                "Vector delete for '%s' in model table '%s': %s",
                rel_path, key, exc,
            )

    # 5. Also try per-model tables that haven't been opened yet
    if VECTOR_STORE_AVAILABLE:
        try:
            probe = VectorStore(self.project_dir, model_key="")
            unopened = [
                mk for mk in probe.list_model_tables()
                if mk and mk not in self._model_vector_stores
            ]
        except Exception as exc:
            logger.debug("Could not list model tables: %s", exc)
            unopened = []

        for mk in unopened:
            try:
                VectorStore(self.project_dir, model_key=mk).delete_by_file_path(rel_path)
            except Exception as exc:
                logger.debug(
                    "Vector delete for '%s' in model table '%s': %s",
                    rel_path, mk, exc,
                )

    return len(node_ids)
|
|
352
|
+
|
|
353
|
+
def index_single_file(
    self,
    file_path: Path,
    project_root: Path,
    embedder: Any,
    model_key: str = "",
) -> int:
    """Parse and index a single file incrementally.

    Removes old nodes/edges for the file, parses it fresh,
    embeds the new nodes, and inserts them.  Stale data is removed
    *before* parsing, so a file that fails to parse (or yields no
    nodes) ends up with no entries in the index.

    Args:
        file_path: Absolute path to the source file.
        project_root: Project root (for computing relative paths).
        embedder: Object with ``embed_text(str) -> List[float]``.
        model_key: Embedding model identifier.

    Returns:
        Number of nodes indexed for this file (0 on parse failure or
        when the parser returns no nodes).
    """
    from .parser import PythonGraphParser
    from .agents import _build_chunk_text

    rel_path = str(file_path.relative_to(project_root))

    # Remove stale data for this file
    self.remove_nodes_for_file(rel_path)

    # Parse the single file
    parser = PythonGraphParser(project_root)
    try:
        nodes, edges = parser.parse_file(file_path)
    except Exception as exc:
        logger.warning("Failed to parse %s: %s", file_path, exc)
        return 0

    if not nodes:
        return 0

    # Embed each node's chunk text and insert nodes, then edges
    node_payload = [
        (node, embedder.embed_text(_build_chunk_text(node)))
        for node in nodes
    ]
    self.insert_nodes(node_payload, model_key=model_key)
    self.insert_edges(edges)

    logger.info(
        "Incremental index: %d nodes, %d edges for %s",
        len(nodes), len(edges), rel_path,
    )
    return len(nodes)
|
|
408
|
+
|
|
256
409
|
# ------------------------------------------------------------------
|
|
257
410
|
# Read (structured)
|
|
258
411
|
# ------------------------------------------------------------------
|
|
@@ -388,3 +541,146 @@ class GraphStore:
|
|
|
388
541
|
edge_rows,
|
|
389
542
|
)
|
|
390
543
|
self.conn.commit()
|
|
544
|
+
|
|
545
|
+
# ------------------------------------------------------------------
|
|
546
|
+
# Per-model vector stores (auto re-ingestion)
|
|
547
|
+
# ------------------------------------------------------------------
|
|
548
|
+
|
|
549
|
+
def get_vector_store_for_model(self, model_key: str) -> Optional["VectorStore"]:
    """Get (or create) a LanceDB vector store for a specific embedding model.

    Each embedding model gets its own LanceDB table so that
    different dimensionalities never collide.  The table is named
    ``code_nodes_{model_key}``.

    Stores are cached per ``model_key`` in ``self._model_vector_stores``,
    so repeated calls return the same object.

    Returns ``None`` when LanceDB is not available or the store
    cannot be created (the failure is logged as a warning).
    """
    if not VECTOR_STORE_AVAILABLE:
        return None

    if model_key in self._model_vector_stores:
        return self._model_vector_stores[model_key]

    try:
        vs = VectorStore(self.project_dir, model_key=model_key)
    except Exception as exc:
        logger.warning(
            "Cannot create vector store for model '%s': %s", model_key, exc,
        )
        return None

    self._model_vector_stores[model_key] = vs
    return vs
|
|
571
|
+
|
|
572
|
+
def reingest_for_model(
    self,
    model_key: str,
    embedder: Any,
    chunk_builder: Any = None,
) -> int:
    """Re-embed all SQLite nodes into a model-specific LanceDB table.

    Reads raw code/metadata from the SQLite ``nodes`` table,
    computes embeddings with *embedder*, and writes them into the
    LanceDB table for *model_key*.

    Embeddings are computed *before* the existing table is cleared,
    so an exception raised by the embedder leaves the previous
    contents of the table untouched.

    Args:
        model_key: Embedding model identifier (e.g. ``"minilm"``).
        embedder:  Object with an ``embed_text(str) -> List[float]``
                   method (and optionally ``embed_documents``).
        chunk_builder: Optional callable ``(dict) -> str`` that builds
                       the text chunk from a node row dict.  Falls back
                       to an internal default.

    Returns:
        Number of nodes ingested (0 when LanceDB is unavailable, there
        are no nodes, or writing to the table fails).
    """
    vs = self.get_vector_store_for_model(model_key)
    if vs is None:
        return 0

    rows = self.get_nodes()
    if not rows:
        return 0

    if chunk_builder is None:
        chunk_builder = _default_chunk_builder

    node_ids: List[str] = []
    metadatas: List[Dict[str, str]] = []
    documents: List[str] = []
    texts: List[str] = []

    for row in rows:
        row_dict = dict(row)
        texts.append(chunk_builder(row_dict))
        node_ids.append(row_dict["node_id"])
        metadatas.append({
            "node_type": row_dict["node_type"],
            "file_path": row_dict["file_path"],
            "qualname": row_dict["qualname"],
            "name": row_dict["name"],
        })
        documents.append(row_dict["code"])

    # Batch-embed when possible, single-embed otherwise.  Done before
    # the table is dropped so an embedder failure cannot wipe the index.
    if hasattr(embedder, "embed_documents"):
        embeddings = embedder.embed_documents(texts)
    else:
        embeddings = [embedder.embed_text(t) for t in texts]

    # Clear old data for this model's table and re-open
    vs.clear()
    self._model_vector_stores.pop(model_key, None)
    vs = self.get_vector_store_for_model(model_key)
    if vs is None:
        return 0

    try:
        vs.add_nodes(node_ids, embeddings, metadatas, documents)
        logger.info(
            "Re-ingested %d nodes into table for embedding model '%s'.",
            len(node_ids), model_key,
        )
    except Exception as exc:
        logger.warning("Re-ingestion for model '%s' failed: %s", model_key, exc)
        return 0

    return len(node_ids)
|
|
649
|
+
|
|
650
|
+
|
|
651
|
+
# ===================================================================
|
|
652
|
+
# Helpers
|
|
653
|
+
# ===================================================================
|
|
654
|
+
|
|
655
|
+
# Regex to strip bare import lines from chunk text (mirrors agents._IMPORT_RE)
_CHUNK_IMPORT_RE = re.compile(r"^(?:from\s+\S+\s+)?import\s+.+$", re.MULTILINE)
# Maximum number of code characters kept in a single embedding chunk
_MAX_CHUNK_CODE = 1500


def _default_chunk_builder(row: Dict[str, Any]) -> str:
    """Build embedding text from a SQLite node row dict.

    Mirrors :func:`codegraph_cli.agents._build_chunk_text` but works
    with plain dicts instead of :class:`Node` objects.

    The result is a newline-joined chunk: ``file:``, ``symbol:`` and
    ``type:`` header lines, an optional ``doc:`` line, then the code.
    For non-module nodes, import lines that start at column 0 are
    stripped and over-long code is cut to ``_MAX_CHUNK_CODE`` characters
    with a ``# ... (truncated)`` marker.  Module code is cut to the same
    length without a marker.

    Args:
        row: Node row with ``file_path``, ``qualname`` and ``node_type``
             keys; ``docstring`` and ``code`` are optional and may be
             ``None``.

    Returns:
        The text to embed for this node.
    """
    parts: List[str] = [
        f"file: {row['file_path']}",
        f"symbol: {row['qualname']}",
        f"type: {row['node_type']}",
    ]
    docstring = row.get("docstring") or ""
    if docstring.strip():
        parts.append(f"doc: {docstring.strip()}")

    # ``or ""`` (not a .get default) so an explicit None value, e.g. a
    # NULL column, is treated as empty instead of crashing re.sub/slicing.
    code: str = row.get("code") or ""
    if row["node_type"] != "module":
        code = _CHUNK_IMPORT_RE.sub("", code).strip()
    else:
        code = code[:_MAX_CHUNK_CODE]

    if len(code) > _MAX_CHUNK_CODE:
        code = code[:_MAX_CHUNK_CODE] + "\n# ... (truncated)"
    if code:
        parts.append(code)

    return "\n".join(parts)
|
codegraph_cli/vector_store.py
CHANGED
|
@@ -49,22 +49,26 @@ class VectorStore:
|
|
|
49
49
|
======== ============ =====================================
|
|
50
50
|
"""
|
|
51
51
|
|
|
52
|
-
def __init__(self, project_dir: Path) -> None:
|
|
52
|
+
def __init__(self, project_dir: Path, model_key: str = "") -> None:
|
|
53
53
|
if not LANCE_AVAILABLE:
|
|
54
54
|
raise ImportError(
|
|
55
55
|
"lancedb is not installed. Install with: pip install lancedb pyarrow"
|
|
56
56
|
)
|
|
57
57
|
|
|
58
58
|
self.project_dir = project_dir
|
|
59
|
+
self.model_key = model_key
|
|
59
60
|
self._lance_dir = project_dir / "lancedb"
|
|
60
61
|
self._lance_dir.mkdir(exist_ok=True, parents=True)
|
|
61
62
|
|
|
63
|
+
# Each embedding model gets its own table to avoid dimension conflicts
|
|
64
|
+
self._table_name = f"code_nodes_{model_key}" if model_key else "code_nodes"
|
|
65
|
+
|
|
62
66
|
self._db: Any = lancedb.connect(str(self._lance_dir))
|
|
63
67
|
self._table: Optional[Any] = None
|
|
64
68
|
|
|
65
69
|
# Try to open existing table
|
|
66
70
|
try:
|
|
67
|
-
self._table = self._db.open_table(
|
|
71
|
+
self._table = self._db.open_table(self._table_name)
|
|
68
72
|
except Exception:
|
|
69
73
|
self._table = None
|
|
70
74
|
|
|
@@ -106,7 +110,7 @@ class VectorStore:
|
|
|
106
110
|
if self._table is None:
|
|
107
111
|
# First insert – create the table (schema inferred from data)
|
|
108
112
|
self._table = self._db.create_table(
|
|
109
|
-
|
|
113
|
+
self._table_name, data=rows, mode="overwrite",
|
|
110
114
|
)
|
|
111
115
|
else:
|
|
112
116
|
# Subsequent inserts – upsert by deleting old IDs first
|
|
@@ -150,7 +154,12 @@ class VectorStore:
|
|
|
150
154
|
return empty
|
|
151
155
|
|
|
152
156
|
try:
|
|
153
|
-
query =
|
|
157
|
+
query = (
|
|
158
|
+
self._table
|
|
159
|
+
.search(query_embedding)
|
|
160
|
+
.metric("cosine")
|
|
161
|
+
.limit(n_results)
|
|
162
|
+
)
|
|
154
163
|
|
|
155
164
|
# Apply metadata filters as SQL WHERE clause
|
|
156
165
|
if where:
|
|
@@ -171,8 +180,11 @@ class VectorStore:
|
|
|
171
180
|
docs: List[str] = []
|
|
172
181
|
|
|
173
182
|
for row in results:
|
|
183
|
+
# With cosine metric, _distance is the *cosine distance*
|
|
184
|
+
# (1 − cos_sim), so values are in [0, 2].
|
|
185
|
+
dist = row.get("_distance", 0.0)
|
|
174
186
|
ids.append(row.get("id", ""))
|
|
175
|
-
distances.append(
|
|
187
|
+
distances.append(dist)
|
|
176
188
|
metas.append({
|
|
177
189
|
"node_type": row.get("node_type", ""),
|
|
178
190
|
"file_path": row.get("file_path", ""),
|
|
@@ -209,7 +221,12 @@ class VectorStore:
|
|
|
209
221
|
return []
|
|
210
222
|
|
|
211
223
|
try:
|
|
212
|
-
query =
|
|
224
|
+
query = (
|
|
225
|
+
self._table
|
|
226
|
+
.search(query_embedding)
|
|
227
|
+
.metric("cosine")
|
|
228
|
+
.limit(n_results)
|
|
229
|
+
)
|
|
213
230
|
if where_sql:
|
|
214
231
|
query = query.where(where_sql)
|
|
215
232
|
return query.to_list()
|
|
@@ -226,7 +243,7 @@ class VectorStore:
|
|
|
226
243
|
if self._table is None:
|
|
227
244
|
return None
|
|
228
245
|
try:
|
|
229
|
-
import pandas as pd # type: ignore[import-untyped]
|
|
246
|
+
import pandas as pd # type: ignore[import-untyped] # noqa: F811
|
|
230
247
|
df: pd.DataFrame = self._table.to_pandas()
|
|
231
248
|
match = df[df["id"] == node_id]
|
|
232
249
|
if match.empty:
|
|
@@ -260,14 +277,58 @@ class VectorStore:
|
|
|
260
277
|
except Exception:
|
|
261
278
|
pass
|
|
262
279
|
|
|
280
|
+
def delete_by_file_path(self, file_path: str) -> int:
    """Delete all nodes belonging to a specific file.

    Args:
        file_path: Relative file path (must match the ``file_path``
                   column stored during indexing).

    Returns:
        Number of rows deleted, computed as the row-count difference
        before and after the delete (0 if the table is missing or the
        delete fails; failures are logged as warnings).
    """
    if self._table is None:
        return 0

    try:
        before = self._table.count_rows()
        # Escape single quotes in the path to avoid SQL injection
        safe_path = file_path.replace("'", "''")
        self._table.delete(f"file_path = '{safe_path}'")
        after = self._table.count_rows()
    except Exception as exc:
        logger.warning(
            "delete_by_file_path('%s') failed: %s", file_path, exc,
        )
        return 0

    return max(0, before - after)
|
|
304
|
+
|
|
263
305
|
def clear(self) -> None:
    """Drop this store's LanceDB table and forget the table handle.

    The table is *not* recreated here: ``self._table`` is reset to
    ``None``, and the insert path creates the table again when it
    finds ``self._table`` unset.  A failed drop (for example when the
    table does not exist yet) is logged at debug level and otherwise
    ignored.
    """
    try:
        self._db.drop_table(self._table_name)
    except Exception as exc:
        # Best effort: a missing table is the expected failure here.
        logger.debug("drop_table('%s') skipped: %s", self._table_name, exc)
    self._table = None
|
|
270
312
|
|
|
313
|
+
def list_model_tables(self) -> List[str]:
    """Return model keys for which a LanceDB table exists.

    Tables are named ``code_nodes_{model_key}``; this method strips
    the prefix and returns just the model keys.  The legacy table
    ``code_nodes`` is reported as the empty string ``""``.  Tables
    with unrelated names are ignored.

    Returns:
        Model keys in the order the database lists its tables, or an
        empty list when the table names cannot be read.
    """
    try:
        all_tables = self._db.table_names()
    except Exception:
        return []

    prefix = "code_nodes_"
    models: List[str] = []
    for name in all_tables:
        if name == "code_nodes":
            models.append("")  # legacy table
        elif name.startswith(prefix):
            models.append(name[len(prefix):])
    return models
|
|
331
|
+
|
|
271
332
|
# ------------------------------------------------------------------
|
|
272
333
|
# Informational
|
|
273
334
|
# ------------------------------------------------------------------
|
|
@@ -291,3 +352,44 @@ class VectorStore:
|
|
|
291
352
|
return df.head(limit).to_dict(orient="records")
|
|
292
353
|
except Exception:
|
|
293
354
|
return []
|
|
355
|
+
|
|
356
|
+
def debug_search(
    self,
    query_embedding: List[float],
    n_results: int = 5,
) -> List[Dict[str, Any]]:
    """Diagnostic search returning raw scores and distance details.

    Unlike :meth:`search`, this returns a flat list of dicts that
    includes the raw ``_distance`` value, the derived similarity
    score, and key metadata — useful for inspecting retrieval
    quality from the CLI.

    Args:
        query_embedding: Query vector to search with.
        n_results: Maximum number of rows to return.

    Returns:
        One dict per hit with ``id``, ``name``, ``qualname``,
        ``node_type``, ``file_path``, ``cosine_distance``,
        ``similarity_score`` (``1 - distance`` clamped at 0) and a
        120-character ``document_preview``.  Empty when the table is
        missing or the search fails (failures are logged as warnings).
    """
    if self._table is None:
        return []

    try:
        results = (
            self._table
            .search(query_embedding)
            .metric("cosine")
            .limit(n_results)
            .to_list()
        )
    except Exception as exc:
        logger.warning("debug_search failed: %s", exc)
        return []

    out: List[Dict[str, Any]] = []
    for row in results:
        dist = row.get("_distance", 0.0)
        out.append({
            "id": row.get("id", ""),
            "name": row.get("name", ""),
            "qualname": row.get("qualname", ""),
            "node_type": row.get("node_type", ""),
            "file_path": row.get("file_path", ""),
            "cosine_distance": round(dist, 5),
            "similarity_score": round(max(0.0, 1.0 - dist), 5),
            "document_preview": (row.get("document", "") or "")[:120],
        })
    return out
|