coderay 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,242 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import time
5
+ from dataclasses import asdict, dataclass, field
6
+ from enum import Enum
7
+ from pathlib import Path
8
+
9
# Names of the JSON files kept inside the index directory.
META_FILENAME = "meta.json"  # indexer run metadata
FILE_HASHES_FILENAME = "file_hashes.json"  # relative path -> content hash
11
+
12
+
13
class MetaState(str, Enum):
    """Lifecycle states recorded for an indexer run.

    Members subclass ``str``, so each one compares equal to its raw value
    and can be rebuilt from it with ``MetaState(value)``.
    """

    IN_PROGRESS = "in_progress"
    DONE = "done"
    ERRORED = "errored"
    INCOMPLETE = "incomplete"
20
+
21
+
22
@dataclass
class CurrentRun:
    """Checkpoint data used to resume the indexing run in flight."""

    # Paths recorded for this run; every instance gets its own fresh list.
    paths_to_process: list[str] = field(default_factory=list)
    # Processed-path counter stored with the checkpoint.
    processed_count: int = 0
    # Error text attached to the run, or None when nothing was recorded.
    error: str | None = None
29
+
30
+
31
@dataclass(frozen=True)
class IndexMeta:
    """Frozen snapshot of what is stored in meta.json."""

    state: MetaState
    started_at: float
    last_commit: str
    branch: str
    indexed_at: float
    current_run: CurrentRun
    error: str | None = None

    def is_in_progress(self) -> bool:
        """Tell whether the recorded state is IN_PROGRESS."""
        current = self.state
        return current == MetaState.IN_PROGRESS

    def is_incomplete(self) -> bool:
        """Tell whether the recorded state is INCOMPLETE (an earlier run did not finish)."""
        current = self.state
        return current == MetaState.INCOMPLETE
50
+
51
+
52
class StateMachine:
    """Manage index metadata and file hashes on disk.

    State lives in two JSON files inside the index directory: meta.json
    (an ``IndexMeta`` snapshot) and file_hashes.json (relative path ->
    content hash). Both are loaded once when the object is constructed.

    Saves write to a temporary sibling file and then rename it over the
    target, so an interrupted write does not truncate the previous file and
    lose the resume checkpoint.
    """

    def __init__(self, index_dir: Path | str) -> None:
        """Bind to ``index_dir`` and load whatever state is already stored there.

        Args:
            index_dir: Directory holding meta.json and file_hashes.json. It
                does not have to exist yet; it is created on the first save.
        """
        self._index_dir = Path(index_dir)
        self._meta_path = self._index_dir / META_FILENAME
        self._file_hashes_path = self._index_dir / FILE_HASHES_FILENAME
        self._current_state: IndexMeta | None = self._load_meta()
        self._file_hashes: dict[str, str] = self._load_file_hashes()

    @property
    def index_dir(self) -> Path:
        """Path to the index directory."""
        return self._index_dir

    @property
    def meta_path(self) -> Path:
        """Path to meta.json."""
        return self._meta_path

    @property
    def current_state(self) -> IndexMeta | None:
        """Current meta state; None if meta.json is missing or invalid."""
        return self._current_state

    @current_state.setter
    def current_state(self, value: IndexMeta) -> None:
        # In-memory only: nothing is written to disk until a save happens.
        self._current_state = value

    @property
    def file_hashes(self) -> dict[str, str]:
        """Mapping of relative path -> content hash."""
        return self._file_hashes

    @file_hashes.setter
    def file_hashes(self, value: dict[str, str]) -> None:
        # In-memory only: persisted by finish() via _save_file_hashes().
        self._file_hashes = value

    @staticmethod
    def _write_text_atomic(path: Path, text: str) -> None:
        """Write ``text`` to ``path`` through a temp file plus rename."""
        path.parent.mkdir(parents=True, exist_ok=True)
        tmp_path = path.with_name(path.name + ".tmp")
        tmp_path.write_text(text, encoding="utf-8")
        # Path.replace overwrites the destination in a single rename step.
        tmp_path.replace(path)

    def _load_meta(self) -> IndexMeta | None:
        """Load meta from disk. Returns None if the file is missing or invalid."""
        try:
            data = json.loads(self._meta_path.read_text(encoding="utf-8"))
            data["state"] = MetaState(data["state"])
            cr = data.get("current_run") or {}
            data["current_run"] = CurrentRun(
                paths_to_process=cr.get("paths_to_process") or [],
                processed_count=int(cr.get("processed_count") or 0),
                error=cr.get("error"),
            )
            return IndexMeta(**data)
        except Exception:
            # Deliberately best-effort: a missing file, malformed JSON, an
            # unknown state value, or unexpected keys all mean "no usable meta".
            return None

    def _load_file_hashes(self) -> dict[str, str]:
        """Load file hashes from disk. Returns {} if missing or invalid."""
        try:
            data = json.loads(self._file_hashes_path.read_text(encoding="utf-8"))
        except Exception:
            # Best-effort, same as _load_meta: unreadable means "no hashes".
            return {}
        return dict(data) if isinstance(data, dict) else {}

    @property
    def is_in_progress(self) -> bool:
        """True if state is in_progress or incomplete."""
        state = self._current_state
        if state is None:
            return False
        return state.is_in_progress() or state.is_incomplete()

    @property
    def has_partial_progress(self) -> bool:
        """True if we have a run with paths and non-zero processed count."""
        if self._current_state is None:
            return False
        run = self._current_state.current_run
        return bool(run.paths_to_process and run.processed_count > 0)

    def set_incomplete(self) -> None:
        """Mark the current run as incomplete and persist it.

        Does nothing when there is no state or the state is neither
        in_progress nor incomplete.
        """
        state = self._current_state
        if state is None or not self.is_in_progress:
            return
        run = state.current_run
        self._current_state = IndexMeta(
            state=MetaState.INCOMPLETE,
            started_at=state.started_at,
            last_commit=state.last_commit,
            branch=state.branch,
            indexed_at=state.indexed_at,
            current_run=CurrentRun(
                paths_to_process=list(run.paths_to_process),
                processed_count=run.processed_count,
                error=run.error,
            ),
        )
        self._save_meta()

    def set_errored(self, exc: str) -> None:
        """Mark the current run as errored and persist it.

        The message is stored both on the snapshot and on its current run;
        the run's paths and processed count are kept. Does nothing when
        there is no current state.

        Args:
            exc: Error message to record.
        """
        state = self._current_state
        if state is None:
            return
        run = state.current_run
        self._current_state = IndexMeta(
            state=MetaState.ERRORED,
            error=exc,
            started_at=state.started_at,
            last_commit=state.last_commit,
            branch=state.branch,
            indexed_at=state.indexed_at,
            current_run=CurrentRun(
                paths_to_process=list(run.paths_to_process),
                processed_count=run.processed_count,
                error=exc,
            ),
        )
        self._save_meta()

    def start(self, branch: str | None, last_commit: str | None) -> None:
        """Start a new indexing run and persist it.

        Args:
            branch: Branch name; None is stored as "".
            last_commit: Commit identifier; None is stored as "".
        """
        now = time.time()
        self._current_state = IndexMeta(
            state=MetaState.IN_PROGRESS,
            started_at=now,
            last_commit=last_commit or "",
            branch=branch or "",
            indexed_at=now,
            current_run=CurrentRun(),
        )
        self._save_meta()

    def save_progress(
        self,
        full_rel_paths: list[str],
        processed_count: int,
    ) -> None:
        """Save a resume checkpoint with the processed count.

        Does nothing unless a run is in progress or incomplete.

        Args:
            full_rel_paths: Paths recorded as the run's ``paths_to_process``.
            processed_count: Processed-path counter to record.
        """
        state = self._current_state
        if state is None or not self.is_in_progress:
            return
        run = CurrentRun(
            # Copy so later mutation of the caller's list cannot make the
            # in-memory snapshot drift from what was written to disk.
            paths_to_process=list(full_rel_paths),
            processed_count=processed_count,
        )
        self._current_state = IndexMeta(
            state=state.state,
            started_at=state.started_at,
            last_commit=state.last_commit,
            branch=state.branch,
            indexed_at=state.indexed_at,
            current_run=run,
        )
        self._save_meta()

    def finish(
        self,
        last_commit: str | None = None,
        branch: str | None = None,
    ) -> None:
        """Persist file hashes, then mark the current run as done.

        File hashes are written even when there is no current state; the
        meta update is skipped in that case.

        Args:
            last_commit: Commit to record; None keeps the stored value.
            branch: Branch to record; None keeps the stored value.
        """
        self._save_file_hashes()
        state = self._current_state
        if state is None:
            return
        commit = last_commit if last_commit is not None else state.last_commit
        br = branch if branch is not None else state.branch
        self._current_state = IndexMeta(
            state=MetaState.DONE,
            started_at=state.started_at,
            last_commit=commit,
            branch=br,
            indexed_at=time.time(),
            current_run=CurrentRun(),
        )
        self._save_meta()

    def _save_meta(self) -> None:
        """Write current state to meta.json; no-op when there is no state."""
        state = self._current_state
        if state is None:
            # asdict(None) would raise TypeError; there is nothing to persist.
            return
        data = asdict(state)
        # Store the raw enum value so the file does not depend on how json
        # happens to serialize str-based Enum members.
        data["state"] = state.state.value
        self._write_text_atomic(self._meta_path, json.dumps(data, indent=2))

    def _save_file_hashes(self) -> None:
        """Write file_hashes to file_hashes.json (sorted keys, one entry per line)."""
        self._write_text_atomic(
            self._file_hashes_path,
            json.dumps(self._file_hashes, indent=0, sort_keys=True),
        )
@@ -0,0 +1,47 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import logging
5
+ from pathlib import Path
6
+
7
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Schema version this code writes to, and compares against, the index.
INDEX_SCHEMA_VERSION = 2
# File inside the index directory that stores the schema version.
VERSION_FILENAME = "version.json"
11
+
12
+
13
class IndexVersionError(Exception):
    """Error type for an index whose schema version differs from the current code."""
15
+
16
+
17
def write_index_version(index_dir: str | Path) -> None:
    """Store the current schema version in ``index_dir``/version.json.

    The directory is created (with parents) when it does not exist yet.
    """
    target = Path(index_dir).joinpath(VERSION_FILENAME)
    target.parent.mkdir(parents=True, exist_ok=True)
    payload = {"schema_version": INDEX_SCHEMA_VERSION}
    target.write_text(json.dumps(payload))
22
+
23
+
24
def read_index_version(index_dir: str | Path) -> int | None:
    """Return the schema version stored in ``index_dir``/version.json.

    Returns None when the file is absent or cannot be read and parsed, and
    0 when the file parses but carries no ``schema_version`` key.
    """
    version_file = Path(index_dir).joinpath(VERSION_FILENAME)
    if version_file.is_file():
        try:
            payload = json.loads(version_file.read_text())
            return int(payload.get("schema_version", 0))
        except Exception:
            # Any read or parse problem is reported as "no version".
            return None
    return None
34
+
35
+
36
def check_index_version(index_dir: str | Path) -> None:
    """Log a warning when the stored index version differs from the code's.

    Stays silent when no version can be read or when the versions match.
    """
    found = read_index_version(index_dir)
    if found is None or found == INDEX_SCHEMA_VERSION:
        return
    logger.warning(
        "Index schema version mismatch: index=%s, code=%s. "
        "Consider rebuilding with 'coderay build --full'.",
        found,
        INDEX_SCHEMA_VERSION,
    )
File without changes
@@ -0,0 +1,268 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ from pathlib import Path
5
+ from typing import Any
6
+
7
+ import lancedb
8
+
9
+ from coderay.core.models import Chunk
10
+
11
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Embedding width a Store expects unless the caller overrides it.
DEFAULT_DIMENSIONS = 384
# Name of the LanceDB table that holds the chunks.
TABLE_NAME = "chunks"
# Distance type passed to vector searches.
DEFAULT_DISTANCE_METRIC = "cosine"
16
+
17
+
18
def index_exists(index_dir: str | Path) -> bool:
    """Report whether ``index_dir`` contains the chunks table directory.

    The check looks for a ``<TABLE_NAME>.lance`` directory directly under
    ``index_dir``.
    """
    table_dir = Path(index_dir).joinpath(TABLE_NAME + ".lance")
    return table_dir.is_dir()
22
+
23
+
24
class Store:
    """LanceDB-backed vector store for code chunks.

    All rows live in a single table (``TABLE_NAME``). The table is created
    lazily by the first ``insert_chunks`` call, and the full-text index used
    for hybrid search is rebuilt lazily on the first hybrid search after an
    insert or a clear.
    """

    def __init__(self, db_path: str | Path, dimensions: int = DEFAULT_DIMENSIONS):
        """Open (creating the directory if needed) the LanceDB database.

        Args:
            db_path: Directory of the LanceDB database.
            dimensions: Required length of every embedding vector.
        """
        self.db_path = Path(db_path)
        self.dimensions = dimensions
        self._ensure_dir()
        self._db = lancedb.connect(str(self.db_path))
        # Caches a positive table-existence check to skip list_tables() calls.
        self._table_known = False
        # True while the FTS index may be missing rows added since it was built.
        self._fts_stale = True

    def _ensure_dir(self) -> None:
        """Create the database directory (and parents) if missing."""
        self.db_path.mkdir(parents=True, exist_ok=True)

    def _table_exists(self) -> bool:
        """Return True if the chunks table exists, caching a positive answer."""
        if self._table_known:
            return True
        resp = self._db.list_tables()
        # list_tables() may return an object exposing .tables or a plain iterable.
        tables = resp.tables if hasattr(resp, "tables") else list(resp)
        exists = TABLE_NAME in tables
        if exists:
            self._table_known = True
        return exists

    def _rows_from_chunks_embeddings(
        self,
        chunks: list[Chunk],
        embeddings: list[list[float]],
    ) -> list[dict[str, Any]]:
        """Pair each chunk with its embedding and build table rows.

        Raises:
            ValueError: If an embedding's length differs from ``self.dimensions``.
        """
        rows = []
        for chunk, emb in zip(chunks, embeddings):
            if len(emb) != self.dimensions:
                raise ValueError(
                    f"Embedding dimension {len(emb)} "
                    f"!= store dimension {self.dimensions}"
                )
            rows.append(
                {
                    "path": chunk.path,
                    "start_line": chunk.start_line,
                    "end_line": chunk.end_line,
                    "symbol": chunk.symbol,
                    "language": chunk.language,
                    "content": chunk.content,
                    "vector": emb,
                }
            )
        return rows

    def _get_table(self):
        """Open the chunks table."""
        return self._db.open_table(TABLE_NAME)

    def _ensure_fts_index(self, table) -> None:
        """Create or replace the full-text search index on the content column."""
        try:
            table.create_fts_index("content", replace=True)
        except Exception as exc:
            # Best-effort: hybrid search falls back to vector search without it.
            logger.debug("FTS index creation skipped: %s", exc)

    @staticmethod
    def _sql_quote(value: str) -> str:
        """Escape single quotes for embedding ``value`` in a SQL string literal."""
        return value.replace("'", "''")

    @staticmethod
    def _build_where(path_prefix: str | None, language: str | None) -> str | None:
        """Combine the optional filters into one SQL predicate (None if no filter).

        A single predicate is built because a second ``where()`` call on a
        LanceDB query builder replaces the first filter instead of adding to it.
        """
        clauses = []
        if path_prefix:
            prefix = Store._sql_quote(path_prefix.rstrip("/") + "/")
            clauses.append(f"path LIKE '{prefix}%'")
        if language:
            clauses.append(f"language = '{Store._sql_quote(language)}'")
        return " AND ".join(clauses) if clauses else None

    @staticmethod
    def _run_query(query, where: str | None, top_k: int) -> list[dict[str, Any]]:
        """Apply the filter and limit to ``query`` and execute it."""
        if where:
            query = query.where(where)
        return query.limit(top_k).to_list()

    def insert_chunks(
        self,
        chunks: list[Chunk],
        embeddings: list[list[float]],
    ) -> None:
        """Insert chunks and their embeddings. Lengths must match.

        Creates the table on first insert. Empty input is a no-op.

        Raises:
            ValueError: On a length mismatch or a wrong embedding dimension.
        """
        if len(chunks) != len(embeddings):
            raise ValueError("chunks and embeddings length mismatch")
        if not chunks:
            return
        rows = self._rows_from_chunks_embeddings(chunks, embeddings)
        if not self._table_exists():
            self._db.create_table(TABLE_NAME, rows)
            self._table_known = True
        else:
            self._get_table().add(rows)
        self._fts_stale = True

    def delete_by_paths(self, paths: list[str]) -> None:
        """Remove all chunks whose path is in the given list."""
        if not paths or not self._table_exists():
            return
        quoted = ", ".join(f"'{self._sql_quote(p)}'" for p in paths)
        self._get_table().delete(f"path IN ({quoted})")

    def search(
        self,
        query_embedding: list[float],
        top_k: int = 10,
        path_prefix: str | None = None,
        language: str | None = None,
        query_text: str | None = None,
    ) -> list[dict[str, Any]]:
        """Nearest-neighbor search with optional hybrid scoring.

        Args:
            query_embedding: Query vector.
            top_k: Maximum number of rows requested from the table.
            path_prefix: Keep only chunks under this directory prefix.
            language: Keep only chunks with this language value.
            query_text: When non-empty, run a hybrid (vector + full-text)
                search; if that fails, fall back to vector-only search.

        Returns:
            Rows without the ``vector`` column, each carrying ``score``
            (rounded to 4 decimals) and ``score_type`` ("rrf" for hybrid
            results, "cosine" otherwise). May hold fewer than ``top_k`` rows.
        """
        if not self._table_exists():
            return []
        table = self._get_table()

        where = self._build_where(path_prefix, language)
        # LIKE treats "_" and "%" inside the prefix as wildcards, so matches
        # are re-checked below with an exact startswith().
        exact_prefix = (path_prefix.rstrip("/") + "/") if path_prefix else None

        use_hybrid = bool(query_text)
        rows = None
        if use_hybrid:
            if self._fts_stale:
                self._ensure_fts_index(table)
                self._fts_stale = False
            try:
                hybrid_query = (
                    table.search(query_type="hybrid")
                    .vector(query_embedding)
                    .distance_type(DEFAULT_DISTANCE_METRIC)
                    .text(query_text)
                )
                # Execute inside the try: a missing FTS index only surfaces
                # when the query runs, not while it is being built.
                rows = self._run_query(hybrid_query, where, top_k)
            except Exception as exc:
                logger.debug(
                    "Hybrid search failed, falling back to vector search: %s", exc
                )
                use_hybrid = False
        if rows is None:
            vector_query = table.search(query_embedding).distance_type(
                DEFAULT_DISTANCE_METRIC
            )
            rows = self._run_query(vector_query, where, top_k)

        results = []
        for r in rows:
            row = dict(r)
            if exact_prefix is not None and not str(row.get("path", "")).startswith(
                exact_prefix
            ):
                continue
            if "_relevance_score" in row:
                score = row.pop("_relevance_score")
                row.pop("_distance", None)
            elif "_distance" in row:
                # Convert the reported distance into a similarity-style score.
                score = 1.0 - row.pop("_distance")
            else:
                score = row.pop("distance", 0.0)
            row["score"] = round(float(score), 4)
            row["score_type"] = "rrf" if use_hybrid else "cosine"
            row.pop("vector", None)
            results.append(row)

        return results

    def chunk_count(self) -> int:
        """Total number of chunks in the store."""
        if not self._table_exists():
            return 0
        return self._get_table().count_rows()

    def list_chunks(
        self,
        limit: int = 500,
        path_prefix: str | None = None,
    ) -> list[dict[str, Any]]:
        """List indexed chunks (no vectors). For visualization / debugging.

        Args:
            limit: Maximum number of rows returned.
            path_prefix: Keep only chunks under this directory prefix.
        """
        if not self._table_exists():
            return []
        table = self._get_table()
        n = table.count_rows()
        if n == 0:
            return []

        col_names = ["path", "start_line", "end_line", "symbol", "language"]

        if not path_prefix:
            arrow = table.head(min(n, limit)).select(col_names)
            return arrow.to_pylist()[:limit]

        exact_prefix = path_prefix.rstrip("/") + "/"
        try:
            arrow = (
                table.search()
                .where(f"path LIKE '{self._sql_quote(exact_prefix)}%'")
                .select(col_names)
                .limit(limit)
                .to_arrow()
            )
        except Exception:
            # Fallback: scan a bounded head of the table and filter in Python.
            arrow = table.head(min(n, limit * 2)).select(col_names)
        # Exact check in both paths: LIKE wildcards ("_" / "%") in the prefix
        # can let through paths that do not literally start with it.
        matching = [
            r
            for r in arrow.to_pylist()
            if str(r.get("path", "")).startswith(exact_prefix)
        ]
        return matching[:limit]

    def chunks_by_path(self) -> dict[str, int]:
        """Return mapping of file path -> chunk count for the whole index."""
        if not self._table_exists():
            return {}
        table = self._get_table()
        n = table.count_rows()
        if n == 0:
            return {}
        arrow = table.head(n).select(["path"])
        paths = arrow.column("path").to_pylist()
        counts: dict[str, int] = {}
        for p in paths:
            # Rows with a null path are grouped under "?".
            key = str(p) if p is not None else "?"
            counts[key] = counts.get(key, 0) + 1
        return counts

    def maintain(self) -> dict[str, Any]:
        """Run maintenance on the chunks table to reclaim space.

        Returns:
            A dict with ``cleanup_done`` and ``compact_done`` flags, plus
            ``error_cleanup`` / ``error_compact`` messages for failed steps.
            Failures are logged and reported rather than raised.
        """
        result: dict[str, Any] = {"cleanup_done": False, "compact_done": False}
        if not self._table_exists():
            return result
        table = self._get_table()
        try:
            dataset = table.to_lance()
        except Exception as e:
            logger.warning("to_lance failed (install pylance?): %s", e)
            result["error_cleanup"] = str(e)
            return result
        try:
            dataset.cleanup_old_versions(retain_versions=1)
            result["cleanup_done"] = True
            logger.info("Cleaned up old table versions")
        except Exception as e:
            logger.warning("cleanup_old_versions failed: %s", e)
            result["error_cleanup"] = str(e)
        try:
            dataset.optimize.compact_files()
            result["compact_done"] = True
            logger.info("Compacted table fragments")
        except Exception as e:
            logger.warning("compact_files failed: %s", e)
            result["error_compact"] = str(e)
        return result

    def clear(self) -> None:
        """Drop table so next insert_chunks creates a fresh one (full rebuild)."""
        if self._table_exists():
            self._db.drop_table(TABLE_NAME)
        self._table_known = False
        self._fts_stale = True
File without changes