coderay 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- coderay/__init__.py +1 -0
- coderay/chunking/__init__.py +0 -0
- coderay/chunking/chunker.py +127 -0
- coderay/chunking/registry.py +190 -0
- coderay/cli/__init__.py +3 -0
- coderay/cli/commands.py +475 -0
- coderay/core/__init__.py +0 -0
- coderay/core/config.py +73 -0
- coderay/core/lock.py +36 -0
- coderay/core/models.py +71 -0
- coderay/core/timing.py +45 -0
- coderay/core/utils.py +35 -0
- coderay/embedding/__init__.py +0 -0
- coderay/embedding/base.py +60 -0
- coderay/embedding/local.py +68 -0
- coderay/embedding/openai.py +87 -0
- coderay/graph/__init__.py +19 -0
- coderay/graph/builder.py +128 -0
- coderay/graph/code_graph.py +311 -0
- coderay/graph/extractor.py +315 -0
- coderay/mcp_server/__init__.py +0 -0
- coderay/mcp_server/server.py +178 -0
- coderay/pipeline/__init__.py +0 -0
- coderay/pipeline/indexer.py +417 -0
- coderay/pipeline/watcher.py +318 -0
- coderay/retrieval/__init__.py +3 -0
- coderay/retrieval/boosting.py +80 -0
- coderay/retrieval/search.py +121 -0
- coderay/skeleton/__init__.py +0 -0
- coderay/skeleton/extractor.py +140 -0
- coderay/state/__init__.py +8 -0
- coderay/state/machine.py +242 -0
- coderay/state/version.py +47 -0
- coderay/storage/__init__.py +0 -0
- coderay/storage/lancedb.py +268 -0
- coderay/vcs/__init__.py +0 -0
- coderay/vcs/git.py +193 -0
- coderay-1.0.0.dist-info/METADATA +145 -0
- coderay-1.0.0.dist-info/RECORD +42 -0
- coderay-1.0.0.dist-info/WHEEL +5 -0
- coderay-1.0.0.dist-info/entry_points.txt +3 -0
- coderay-1.0.0.dist-info/top_level.txt +1 -0
coderay/state/machine.py
ADDED
|
@@ -0,0 +1,242 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import time
|
|
5
|
+
from dataclasses import asdict, dataclass, field
|
|
6
|
+
from enum import Enum
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
META_FILENAME = "meta.json"
|
|
10
|
+
FILE_HASHES_FILENAME = "file_hashes.json"
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class MetaState(str, Enum):
    """Lifecycle states of an indexing run, as stored in ``meta.json``.

    Members are also ``str`` instances, so they compare equal to (and can be
    built from) the raw string values persisted on disk.
    """

    # A run has been started and has not finished yet.
    IN_PROGRESS = "in_progress"
    # The run completed and its results were persisted.
    DONE = "done"
    # The run stopped because of an error.
    ERRORED = "errored"
    # A previous run did not finish.
    INCOMPLETE = "incomplete"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass
class CurrentRun:
    """Checkpoint data describing how far the current indexing run has got.

    Persisted under the ``current_run`` key of ``meta.json`` so that an
    interrupted run can be resumed.
    """

    # Relative paths belonging to this run; a fresh list per instance.
    paths_to_process: list[str] = field(default_factory=list)
    # Processed-item counter recorded at the last checkpoint.
    processed_count: int = 0
    # Error message recorded for this run, or None when there is none.
    error: str | None = None
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@dataclass(frozen=True)
class IndexMeta:
    """Read-only view of what ``meta.json`` holds for one index.

    Instances are frozen: a state change is expressed by building a new
    ``IndexMeta`` rather than mutating an existing one.
    """

    # Lifecycle state of the run.
    state: MetaState
    # time.time() timestamp taken when the run was started.
    started_at: float
    # Commit identifier associated with the index ("" when unknown).
    last_commit: str
    # Branch name associated with the index ("" when unknown).
    branch: str
    # time.time() timestamp of the most recent start/finish of indexing.
    indexed_at: float
    # Resume checkpoint for the run.
    current_run: CurrentRun
    # Top-level error message; None unless the run errored.
    error: str | None = None

    def is_in_progress(self) -> bool:
        """Tell whether the recorded state equals ``MetaState.IN_PROGRESS``."""
        return self.state == MetaState.IN_PROGRESS

    def is_incomplete(self) -> bool:
        """Tell whether the recorded state equals ``MetaState.INCOMPLETE``.

        INCOMPLETE means an earlier run stopped before finishing.
        """
        return self.state == MetaState.INCOMPLETE
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class StateMachine:
    """Manages index metadata and file hashes on disk.

    Two JSON files inside ``index_dir`` back this object:

    * ``meta.json`` - the serialized ``IndexMeta`` (run state, timestamps,
      commit/branch and the resume checkpoint);
    * ``file_hashes.json`` - a mapping of relative path -> content hash.

    Both files are loaded once in ``__init__``; state-changing methods
    rebuild the immutable ``IndexMeta`` snapshot and write it back.
    """

    def __init__(self, index_dir: Path | str) -> None:
        """Initialize with index dir (contains meta.json and file_hashes.json).

        Missing or unreadable files are tolerated: the state becomes ``None``
        and the hash mapping becomes ``{}``.
        """
        self._index_dir = Path(index_dir)
        self._meta_path = self._index_dir / META_FILENAME
        self._file_hashes_path = self._index_dir / FILE_HASHES_FILENAME
        self._current_state: IndexMeta | None = self._load_meta()
        self._file_hashes: dict[str, str] = self._load_file_hashes()

    @property
    def index_dir(self) -> Path:
        """Path to the index directory."""
        return self._index_dir

    @property
    def meta_path(self) -> Path:
        """Path to meta.json."""
        return self._meta_path

    @property
    def current_state(self) -> IndexMeta | None:
        """Current meta state; None if meta.json is missing or invalid."""
        return self._current_state

    @current_state.setter
    def current_state(self, value: IndexMeta) -> None:
        # In-memory only: nothing is written to disk by this assignment.
        self._current_state = value

    @property
    def file_hashes(self) -> dict[str, str]:
        """Mapping of relative path -> content hash."""
        return self._file_hashes

    @file_hashes.setter
    def file_hashes(self, value: dict[str, str]) -> None:
        # In-memory only: persisted later by finish().
        self._file_hashes = value

    @staticmethod
    def _write_text_atomic(path: Path, text: str) -> None:
        """Write ``text`` to ``path`` via a sibling temp file plus rename.

        Overwriting the target in place would leave a truncated JSON file if
        the process died mid-write, and the loaders would then treat the
        index as having no state at all. Writing next to the target and
        renaming keeps the previous content until the new one is complete.
        """
        path.parent.mkdir(parents=True, exist_ok=True)
        tmp_path = path.with_name(path.name + ".tmp")
        tmp_path.write_text(text, encoding="utf-8")
        tmp_path.replace(path)

    def _load_meta(self) -> IndexMeta | None:
        """Load meta from disk. Returns None if file missing or invalid."""
        if not self._meta_path.exists():
            return None
        try:
            data = json.loads(self._meta_path.read_text(encoding="utf-8"))
            # Raises ValueError for an unknown state string -> treated as invalid.
            data["state"] = MetaState(data["state"])
            cr = data.get("current_run") or {}
            data["current_run"] = CurrentRun(
                paths_to_process=cr.get("paths_to_process") or [],
                processed_count=int(cr.get("processed_count") or 0),
                error=cr.get("error"),
            )
            # Unknown or missing keys make this raise TypeError -> invalid.
            return IndexMeta(**data)
        except Exception:
            # Deliberately best-effort: any unreadable/malformed meta.json is
            # reported to callers as "no state".
            return None

    def _load_file_hashes(self) -> dict[str, str]:
        """Load file hashes from disk. Returns {} if missing or invalid."""
        if not self._file_hashes_path.exists():
            return {}
        try:
            data = json.loads(self._file_hashes_path.read_text(encoding="utf-8"))
            return dict(data) if isinstance(data, dict) else {}
        except Exception:
            # Best-effort, same policy as _load_meta.
            return {}

    @property
    def is_in_progress(self) -> bool:
        """True if state is in_progress or incomplete."""
        state = self._current_state
        if state is None:
            return False
        return state.is_in_progress() or state.is_incomplete()

    @property
    def has_partial_progress(self) -> bool:
        """True if we have a run with paths and non-zero processed count."""
        if self._current_state is None:
            return False
        run = self._current_state.current_run
        return bool(run.paths_to_process and run.processed_count > 0)

    def set_incomplete(self) -> None:
        """Mark the current run as incomplete and persist it.

        No-op when there is no state or the state is neither in_progress nor
        incomplete. The resume checkpoint is carried over unchanged.
        """
        current = self._current_state
        if current is None or not self.is_in_progress:
            return
        run = current.current_run
        self._current_state = IndexMeta(
            state=MetaState.INCOMPLETE,
            started_at=current.started_at,
            last_commit=current.last_commit,
            branch=current.branch,
            indexed_at=current.indexed_at,
            current_run=CurrentRun(
                paths_to_process=list(run.paths_to_process),
                processed_count=run.processed_count,
                error=run.error,
            ),
        )
        self._save_meta()

    def set_errored(self, exc: str) -> None:
        """Mark the current run as errored with message ``exc`` and persist it.

        The message is stored both on the snapshot and on its checkpoint;
        the checkpoint's paths and processed count are kept. No-op when
        there is no state.
        """
        current = self._current_state
        if current is None:
            return
        run = current.current_run
        self._current_state = IndexMeta(
            state=MetaState.ERRORED,
            error=exc,
            started_at=current.started_at,
            last_commit=current.last_commit,
            branch=current.branch,
            indexed_at=current.indexed_at,
            current_run=CurrentRun(
                paths_to_process=list(run.paths_to_process),
                processed_count=run.processed_count,
                error=exc,
            ),
        )
        self._save_meta()

    def start(self, branch: str | None, last_commit: str | None) -> None:
        """Start a new indexing run.

        Replaces any existing state with a fresh IN_PROGRESS snapshot (empty
        checkpoint, both timestamps set to now) and persists it. ``None``
        for ``branch`` / ``last_commit`` is stored as ``""``.
        """
        now = time.time()
        self._current_state = IndexMeta(
            state=MetaState.IN_PROGRESS,
            started_at=now,
            last_commit=last_commit or "",
            branch=branch or "",
            indexed_at=now,
            current_run=CurrentRun(),
        )
        self._save_meta()

    def save_progress(
        self,
        full_rel_paths: list[str],
        processed_count: int,
    ) -> None:
        """Save resume checkpoint with processed count.

        No-op unless a run is in progress (or incomplete). The run state and
        timestamps are left as they are; only the checkpoint is replaced.
        """
        current = self._current_state
        if current is None or not self.is_in_progress:
            return
        run = CurrentRun(
            # Copy so later mutation of the caller's list cannot alter the
            # stored snapshot.
            paths_to_process=list(full_rel_paths),
            processed_count=processed_count,
        )
        self._current_state = IndexMeta(
            state=current.state,
            started_at=current.started_at,
            last_commit=current.last_commit,
            branch=current.branch,
            indexed_at=current.indexed_at,
            current_run=run,
        )
        self._save_meta()

    def finish(
        self,
        last_commit: str | None = None,
        branch: str | None = None,
    ) -> None:
        """Mark the current run as done and persist state.

        File hashes are always written, even when there is no current state.
        ``last_commit`` / ``branch`` override the stored values only when
        they are not ``None``. The checkpoint is reset and ``indexed_at`` is
        set to the current time.
        """
        self._save_file_hashes()
        current = self._current_state
        if current is None:
            return
        commit = last_commit if last_commit is not None else current.last_commit
        br = branch if branch is not None else current.branch
        self._current_state = IndexMeta(
            state=MetaState.DONE,
            started_at=current.started_at,
            last_commit=commit,
            branch=br,
            indexed_at=time.time(),
            current_run=CurrentRun(),
        )
        self._save_meta()

    def _save_meta(self) -> None:
        """Write current state to meta.json (no-op when there is no state)."""
        state = self._current_state
        if state is None:
            return
        data = asdict(state)
        # Store the plain string value rather than the enum member.
        data["state"] = state.state.value
        self._write_text_atomic(self._meta_path, json.dumps(data, indent=2))

    def _save_file_hashes(self) -> None:
        """Write file_hashes to file_hashes.json."""
        # sort_keys keeps the output deterministic for identical mappings.
        self._write_text_atomic(
            self._file_hashes_path,
            json.dumps(self._file_hashes, indent=0, sort_keys=True),
        )
|
coderay/state/version.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import logging
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
logger = logging.getLogger(__name__)
|
|
8
|
+
|
|
9
|
+
INDEX_SCHEMA_VERSION = 2
|
|
10
|
+
VERSION_FILENAME = "version.json"
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class IndexVersionError(Exception):
    """Error type for an index whose schema version differs from the code's."""
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def write_index_version(index_dir: str | Path) -> None:
    """Record the code's schema version in ``index_dir/version.json``.

    The directory (including missing parents) is created first. The file
    holds a single JSON object with the key ``schema_version``.
    """
    version_file = Path(index_dir) / VERSION_FILENAME
    version_file.parent.mkdir(parents=True, exist_ok=True)
    payload = {"schema_version": INDEX_SCHEMA_VERSION}
    version_file.write_text(json.dumps(payload))
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def read_index_version(index_dir: str | Path) -> int | None:
    """Return the schema version stored in ``index_dir/version.json``.

    Returns ``None`` when the file does not exist or cannot be read and
    parsed. A readable JSON object without a ``schema_version`` key yields
    ``0``.
    """
    version_file = Path(index_dir) / VERSION_FILENAME
    if not version_file.is_file():
        return None
    try:
        payload = json.loads(version_file.read_text())
        raw_version = payload.get("schema_version", 0)
        return int(raw_version)
    except Exception:
        # Unreadable or malformed content is reported the same as "missing".
        return None
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def check_index_version(index_dir: str | Path) -> None:
    """Log a warning when the stored schema version differs from the code's.

    Nothing happens when no version can be read or when the versions match.
    This function only warns; it does not raise.
    """
    found = read_index_version(index_dir)
    if found is None or found == INDEX_SCHEMA_VERSION:
        return
    logger.warning(
        "Index schema version mismatch: index=%s, code=%s. "
        "Consider rebuilding with 'coderay build --full'.",
        found,
        INDEX_SCHEMA_VERSION,
    )
|
|
File without changes
|
|
@@ -0,0 +1,268 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
import lancedb
|
|
8
|
+
|
|
9
|
+
from coderay.core.models import Chunk
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
DEFAULT_DIMENSIONS = 384
|
|
14
|
+
TABLE_NAME = "chunks"
|
|
15
|
+
DEFAULT_DISTANCE_METRIC = "cosine"
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def index_exists(index_dir: str | Path) -> bool:
    """Report whether ``index_dir`` contains the chunks table directory.

    The check is purely on the filesystem: it looks for a directory named
    ``<TABLE_NAME>.lance`` directly inside ``index_dir``.
    """
    table_dir = Path(index_dir) / f"{TABLE_NAME}.lance"
    return table_dir.is_dir()
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class Store:
    """LanceDB-backed vector store for code chunks.

    All rows live in a single table named ``TABLE_NAME``. Each row carries
    the chunk's location (``path``, ``start_line``, ``end_line``), its
    ``symbol``, ``language`` and ``content``, plus the embedding ``vector``.
    """

    def __init__(self, db_path: str | Path, dimensions: int = DEFAULT_DIMENSIONS):
        """Initialize the LanceDB store.

        Creates ``db_path`` if needed and connects to it. ``dimensions`` is
        the embedding length that every inserted vector must have.
        """
        self.db_path = Path(db_path)
        self.dimensions = dimensions
        self._ensure_dir()
        self._db = lancedb.connect(str(self.db_path))
        # Caches a positive "table exists" answer to skip repeated listings.
        self._table_known = False
        # True when the full-text index may not reflect the table contents.
        self._fts_stale = True

    def _ensure_dir(self) -> None:
        """Create the database directory (and parents) if it is missing."""
        self.db_path.mkdir(parents=True, exist_ok=True)

    def _table_exists(self) -> bool:
        """Return True if the chunks table exists; positive answers are cached."""
        if self._table_known:
            return True
        resp = self._db.list_tables()
        # The listing may be an object exposing `.tables` or a plain iterable.
        tables = resp.tables if hasattr(resp, "tables") else list(resp)
        exists = TABLE_NAME in tables
        if exists:
            self._table_known = True
        return exists

    def _rows_from_chunks_embeddings(
        self,
        chunks: list[Chunk],
        embeddings: list[list[float]],
    ) -> list[dict[str, Any]]:
        """Pair each chunk with its embedding and build table rows.

        Raises:
            ValueError: if an embedding's length differs from ``dimensions``.
        """
        rows = []
        for chunk, emb in zip(chunks, embeddings):
            if len(emb) != self.dimensions:
                raise ValueError(
                    f"Embedding dimension {len(emb)} "
                    f"!= store dimension {self.dimensions}"
                )
            rows.append(
                {
                    "path": chunk.path,
                    "start_line": chunk.start_line,
                    "end_line": chunk.end_line,
                    "symbol": chunk.symbol,
                    "language": chunk.language,
                    "content": chunk.content,
                    "vector": emb,
                }
            )
        return rows

    def _get_table(self):
        """Open the chunks table; callers check ``_table_exists()`` first."""
        return self._db.open_table(TABLE_NAME)

    def _ensure_fts_index(self, table) -> None:
        """Create or replace the full-text search index on the content column."""
        try:
            table.create_fts_index("content", replace=True)
        except Exception as exc:
            # Best-effort: hybrid search degrades to vector search without it.
            logger.debug("FTS index creation skipped: %s", exc)

    def insert_chunks(
        self,
        chunks: list[Chunk],
        embeddings: list[list[float]],
    ) -> None:
        """Insert chunks and their embeddings. Lengths must match.

        Creates the table on first insert, otherwise appends. Empty input is
        a no-op.

        Raises:
            ValueError: if the two lists differ in length, or an embedding
                has the wrong dimension.
        """
        if len(chunks) != len(embeddings):
            raise ValueError("chunks and embeddings length mismatch")
        if not chunks:
            return
        rows = self._rows_from_chunks_embeddings(chunks, embeddings)
        if not self._table_exists():
            self._db.create_table(TABLE_NAME, rows)
            self._table_known = True
        else:
            self._get_table().add(rows)
        # New content is not covered by the existing full-text index.
        self._fts_stale = True

    def delete_by_paths(self, paths: list[str]) -> None:
        """Remove all chunks whose path is in the given list."""
        if not paths:
            return
        if not self._table_exists():
            return
        table = self._get_table()
        # Double single quotes so each path is a valid SQL string literal.
        safe = [p.replace("'", "''") for p in paths]
        quoted = ", ".join(f"'{p}'" for p in safe)
        table.delete(f"path IN ({quoted})")

    def search(
        self,
        query_embedding: list[float],
        top_k: int = 10,
        path_prefix: str | None = None,
        language: str | None = None,
        query_text: str | None = None,
    ) -> list[dict[str, Any]]:
        """Nearest-neighbor search with optional hybrid scoring.

        Args:
            query_embedding: Query vector.
            top_k: Maximum number of rows to return.
            path_prefix: If given, only rows whose path lies under this
                directory prefix are returned.
            language: If given, only rows with this exact language.
            query_text: If non-empty, a hybrid (vector + full-text) search is
                attempted; on failure it falls back to vector-only search.

        Returns:
            Row dicts without the ``vector`` column, each extended with
            ``score`` (rounded to 4 decimals) and ``score_type`` (``"rrf"``
            for hybrid results, ``"cosine"`` otherwise). Empty list when the
            table does not exist.
        """
        if not self._table_exists():
            return []
        table = self._get_table()

        # Combine every predicate into ONE filter string. Chaining several
        # `.where()` calls is not relied upon, because the builder keeps a
        # single filter expression and a later call may replace an earlier
        # one, silently dropping the path filter.
        conditions = []
        if path_prefix:
            # NOTE(review): `%` and `_` inside the prefix still act as LIKE
            # wildcards, so a prefix such as "my_pkg" can over-match.
            prefix = (path_prefix.rstrip("/") + "/").replace("'", "''")
            conditions.append(f"path LIKE '{prefix}%'")
        if language:
            lang = language.replace("'", "''")
            conditions.append(f"language = '{lang}'")
        where_clause = " AND ".join(conditions)

        def run(query):
            """Apply the shared filter and limit, then execute the query."""
            if where_clause:
                query = query.where(where_clause)
            return query.limit(top_k).to_list()

        use_hybrid = bool(query_text)
        rows = None
        if use_hybrid:
            if self._fts_stale:
                self._ensure_fts_index(table)
                self._fts_stale = False
            try:
                # Execute inside the try: failures such as a missing FTS
                # index may only surface when the query runs, not while it
                # is being built.
                rows = run(
                    table.search(query_type="hybrid")
                    .vector(query_embedding)
                    .distance_type(DEFAULT_DISTANCE_METRIC)
                    .text(query_text)
                )
            except Exception as exc:
                logger.debug(
                    "Hybrid search failed, falling back to vector search: %s", exc
                )
                use_hybrid = False
        if rows is None:
            rows = run(
                table.search(query_embedding).distance_type(DEFAULT_DISTANCE_METRIC)
            )

        results = []
        for r in rows:
            row = dict(r)
            if "_relevance_score" in row:
                score = row.pop("_relevance_score")
                row.pop("_distance", None)
            elif "_distance" in row:
                # Convert the reported distance into a similarity-style score.
                score = 1.0 - row.pop("_distance")
            else:
                score = row.pop("distance", 0.0)
            row["score"] = round(float(score), 4)
            row["score_type"] = "rrf" if use_hybrid else "cosine"
            row.pop("vector", None)
            results.append(row)

        return results

    def chunk_count(self) -> int:
        """Total number of chunks in the store."""
        if not self._table_exists():
            return 0
        return self._get_table().count_rows()

    def list_chunks(
        self,
        limit: int = 500,
        path_prefix: str | None = None,
    ) -> list[dict[str, Any]]:
        """List indexed chunks (no vectors). For visualization / debugging.

        Returns at most ``limit`` dicts with the keys ``path``,
        ``start_line``, ``end_line``, ``symbol`` and ``language``. With
        ``path_prefix`` only chunks under that directory prefix are listed.
        """
        if not self._table_exists():
            return []
        table = self._get_table()
        n = table.count_rows()
        if n == 0:
            return []

        col_names = ["path", "start_line", "end_line", "symbol", "language"]

        if path_prefix:
            prefix = (path_prefix.rstrip("/") + "/").replace("'", "''")
            try:
                arrow = (
                    table.search()
                    .where(f"path LIKE '{prefix}%'")
                    .select(col_names)
                    .limit(limit)
                    .to_arrow()
                )
            except Exception:
                # Fallback: read a bounded head of the table and filter in
                # Python. Matches beyond the first `limit * 2` rows are missed.
                arrow = table.head(min(n, limit * 2))
                arrow = arrow.select(col_names)
                rows = arrow.to_pylist()
                pfix = path_prefix.rstrip("/") + "/"
                return [r for r in rows if str(r.get("path", "")).startswith(pfix)][
                    :limit
                ]
        else:
            to_read = min(n, limit)
            arrow = table.head(to_read)
            arrow = arrow.select(col_names)

        return arrow.to_pylist()[:limit]

    def chunks_by_path(self) -> dict[str, int]:
        """Return mapping of file path -> chunk count for the whole index.

        Rows with a null path are counted under the key ``"?"``.
        """
        if not self._table_exists():
            return {}
        table = self._get_table()
        n = table.count_rows()
        if n == 0:
            return {}
        arrow = table.head(n).select(["path"])
        paths = arrow.column("path").to_pylist()
        counts: dict[str, int] = {}
        for p in paths:
            key = str(p) if p is not None else "?"
            counts[key] = counts.get(key, 0) + 1
        return counts

    def maintain(self) -> dict[str, Any]:
        """Run maintenance on the chunks table to reclaim space.

        Returns a dict with the booleans ``cleanup_done`` and
        ``compact_done``; failures add ``error_cleanup`` / ``error_compact``
        with the exception text. The two steps are attempted independently,
        and no exception from them propagates.
        """
        result: dict[str, Any] = {"cleanup_done": False, "compact_done": False}
        if not self._table_exists():
            return result
        table = self._get_table()
        try:
            dataset = table.to_lance()
        except Exception as e:
            # Without the underlying dataset neither step can run.
            logger.warning("to_lance failed (install pylance?): %s", e)
            result["error_cleanup"] = str(e)
            return result
        try:
            dataset.cleanup_old_versions(retain_versions=1)
            result["cleanup_done"] = True
            logger.info("Cleaned up old table versions")
        except Exception as e:
            logger.warning("cleanup_old_versions failed: %s", e)
            result["error_cleanup"] = str(e)
        try:
            dataset.optimize.compact_files()
            result["compact_done"] = True
            logger.info("Compacted table fragments")
        except Exception as e:
            logger.warning("compact_files failed: %s", e)
            result["error_compact"] = str(e)
        return result

    def clear(self) -> None:
        """Drop table so next insert_chunks creates a fresh one (full rebuild)."""
        if self._table_exists():
            self._db.drop_table(TABLE_NAME)
        # Reset cached knowledge so the next insert recreates the table and
        # the next hybrid search rebuilds the full-text index.
        self._table_known = False
        self._fts_stale = True
|
coderay/vcs/__init__.py
ADDED
|
File without changes
|