codebase-retrieval-context-engine 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codebase_retrieval_context_engine-2.0.0.dist-info/METADATA +505 -0
- codebase_retrieval_context_engine-2.0.0.dist-info/RECORD +46 -0
- codebase_retrieval_context_engine-2.0.0.dist-info/WHEEL +4 -0
- codebase_retrieval_context_engine-2.0.0.dist-info/entry_points.txt +3 -0
- codebase_retrieval_context_engine-2.0.0.dist-info/licenses/LICENSE +201 -0
- corbell/__init__.py +6 -0
- corbell/cli/__init__.py +1 -0
- corbell/cli/commands/__init__.py +1 -0
- corbell/cli/commands/index.py +86 -0
- corbell/cli/commands/query.py +71 -0
- corbell/cli/main.py +57 -0
- corbell/core/__init__.py +1 -0
- corbell/core/constants.py +52 -0
- corbell/core/embeddings/__init__.py +6 -0
- corbell/core/embeddings/base.py +68 -0
- corbell/core/embeddings/extractor.py +201 -0
- corbell/core/embeddings/factory.py +48 -0
- corbell/core/embeddings/model.py +401 -0
- corbell/core/embeddings/search_cache.py +95 -0
- corbell/core/embeddings/sqlite_store.py +271 -0
- corbell/core/gitignore.py +76 -0
- corbell/core/graph/__init__.py +1 -0
- corbell/core/graph/builder.py +696 -0
- corbell/core/graph/method_graph.py +1077 -0
- corbell/core/graph/providers/__init__.py +6 -0
- corbell/core/graph/providers/aws_patterns.py +62 -0
- corbell/core/graph/providers/azure_patterns.py +64 -0
- corbell/core/graph/providers/gcp_patterns.py +59 -0
- corbell/core/graph/schema.py +175 -0
- corbell/core/graph/sqlite_store.py +500 -0
- corbell/core/indexing/__init__.py +1 -0
- corbell/core/indexing/builder.py +608 -0
- corbell/core/indexing/lock.py +150 -0
- corbell/core/indexing/tracker.py +245 -0
- corbell/core/llm_client.py +677 -0
- corbell/core/mcp/__init__.py +1 -0
- corbell/core/mcp/server.py +214 -0
- corbell/core/query/__init__.py +1 -0
- corbell/core/query/diagnostics.py +38 -0
- corbell/core/query/engine.py +321 -0
- corbell/core/query/enhancer.py +102 -0
- corbell/core/query/formatter.py +98 -0
- corbell/core/query/graph_expander.py +284 -0
- corbell/core/query/merger.py +171 -0
- corbell/core/query/reranker.py +131 -0
- corbell/core/workspace.py +408 -0
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
"""Cross-platform file lock for index build serialization."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import sys
|
|
6
|
+
import time
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from types import TracebackType
|
|
9
|
+
from typing import Optional
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class IndexLock:
    """A cross-platform file lock that prevents concurrent index builds.

    Uses ``fcntl.flock`` on Unix and ``msvcrt.locking`` on Windows.  Both are
    requested in non-blocking mode and retried in a polling loop, so callers
    don't need to worry about platform differences.

    Usage::

        lock = IndexLock(db_path.parent / "index.lock")
        with lock:
            # only one process at a time reaches here
            ...

    Args:
        lock_path: Path to the lock file (created if absent).
        timeout: Maximum seconds to wait before raising TimeoutError.
    """

    # Upper bound, in seconds, for the pause between two acquisition attempts.
    _POLL_INTERVAL = 0.25

    def __init__(self, lock_path: Path | str, timeout: int = 300) -> None:
        self.lock_path = Path(lock_path)
        self.timeout = timeout
        self._fh: Optional[object] = None  # file handle, platform-specific type

    # ------------------------------------------------------------------ #
    # Public interface                                                     #
    # ------------------------------------------------------------------ #

    def acquire(self) -> None:
        """Block until the lock is acquired or timeout is reached.

        The parent directory of the lock file is created if necessary.  Each
        failed attempt is followed by a short sleep that never extends past
        the deadline.

        Raises:
            TimeoutError: If the lock cannot be acquired within ``timeout``
                seconds.  The ``OSError`` raised by the last failed attempt is
                attached as ``__cause__`` so the real reason (lock contention,
                permission problem, ...) stays visible in the traceback.
        """
        self.lock_path.parent.mkdir(parents=True, exist_ok=True)
        deadline = time.monotonic() + self.timeout

        while True:
            try:
                self._acquire_once()
                return  # success
            except OSError as exc:  # IOError is an alias of OSError
                # Most likely the lock is held by another process; remember
                # the error so it is not lost if we eventually give up.
                last_error = exc

            remaining = deadline - time.monotonic()
            if remaining <= 0:
                raise TimeoutError(
                    f"Could not acquire index lock at {self.lock_path} "
                    f"within {self.timeout}s. Another build may be running."
                ) from last_error
            # Never sleep past the deadline the caller asked for.
            time.sleep(min(self._POLL_INTERVAL, remaining))

    def release(self) -> None:
        """Release the lock and close the file handle.

        Safe to call even if the lock was never acquired.  Release is
        best-effort: errors while unlocking or closing are swallowed so that
        ``__exit__`` never masks an exception raised inside the ``with`` body.
        """
        if self._fh is None:
            return
        try:
            if sys.platform == "win32":
                self._release_windows()
            else:
                self._release_unix()
        except Exception:
            # Deliberately ignored: the handle is closed below regardless of
            # whether the explicit unlock succeeded.
            pass
        finally:
            try:
                self._fh.close()  # type: ignore[union-attr]
            except Exception:
                pass
            self._fh = None

    # ------------------------------------------------------------------ #
    # Context manager                                                      #
    # ------------------------------------------------------------------ #

    def __enter__(self) -> "IndexLock":
        """Acquire the lock and return this instance."""
        self.acquire()
        return self

    def __exit__(
        self,
        exc_type: Optional[type],
        exc_val: Optional[BaseException],
        exc_tb: Optional[TracebackType],
    ) -> None:
        """Release the lock; exceptions from the ``with`` body propagate."""
        self.release()

    # ------------------------------------------------------------------ #
    # Platform-specific internals                                          #
    # ------------------------------------------------------------------ #

    def _acquire_once(self) -> None:
        """Attempt a single non-blocking lock acquisition.

        Raises OSError/IOError if the lock is held by another process.
        """
        if sys.platform == "win32":
            self._acquire_windows()
        else:
            self._acquire_unix()

    def _acquire_unix(self) -> None:
        """Open the lock file and take an exclusive, non-blocking ``flock``."""
        import fcntl

        fh = open(self.lock_path, "w")  # noqa: WPS515
        try:
            fcntl.flock(fh.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
        except (OSError, IOError):
            # Do not leak the handle when the lock could not be taken.
            fh.close()
            raise
        self._fh = fh

    def _release_unix(self) -> None:
        """Drop the ``flock`` held on the stored file handle."""
        import fcntl

        fcntl.flock(self._fh.fileno(), fcntl.LOCK_UN)  # type: ignore[union-attr]

    def _acquire_windows(self) -> None:
        """Open the lock file and lock its first byte without blocking."""
        import msvcrt

        fh = open(self.lock_path, "w")  # noqa: WPS515
        try:
            # Lock the first byte of the file (non-blocking)
            msvcrt.locking(fh.fileno(), msvcrt.LK_NBLCK, 1)
        except (OSError, IOError):
            # Do not leak the handle when the lock could not be taken.
            fh.close()
            raise
        self._fh = fh

    def _release_windows(self) -> None:
        """Unlock the first byte of the stored file handle."""
        import msvcrt

        try:
            # Seek back to start before unlocking
            self._fh.seek(0)  # type: ignore[union-attr]
        except Exception:
            pass
        msvcrt.locking(self._fh.fileno(), msvcrt.LK_UNLCK, 1)  # type: ignore[union-attr]
|
|
@@ -0,0 +1,245 @@
|
|
|
1
|
+
"""Index tracking: file-level mtime tracking and global index metadata."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import sqlite3
|
|
6
|
+
import time
|
|
7
|
+
from dataclasses import dataclass, field
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Dict, List, Optional, TYPE_CHECKING
|
|
10
|
+
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
from corbell.core.workspace import WorkspaceConfig
|
|
13
|
+
|
|
14
|
+
_CREATE_FILE_META = """
|
|
15
|
+
CREATE TABLE IF NOT EXISTS file_index_meta (
|
|
16
|
+
file_path TEXT NOT NULL,
|
|
17
|
+
repo_id TEXT NOT NULL,
|
|
18
|
+
mtime REAL NOT NULL,
|
|
19
|
+
indexed_at REAL NOT NULL,
|
|
20
|
+
PRIMARY KEY (file_path, repo_id)
|
|
21
|
+
);
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
_CREATE_INDEX_META = """
|
|
25
|
+
CREATE TABLE IF NOT EXISTS index_meta (
|
|
26
|
+
key TEXT PRIMARY KEY,
|
|
27
|
+
value TEXT NOT NULL
|
|
28
|
+
);
|
|
29
|
+
"""
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass
class StaleResult:
    """Outcome of comparing the tracked index state with the files on disk.

    Each list holds ``(file_path, repo_id)`` tuples.
    """

    added: List[tuple] = field(default_factory=list)  # [(file_path, repo_id), ...]
    modified: List[tuple] = field(default_factory=list)  # [(file_path, repo_id), ...]
    deleted: List[tuple] = field(default_factory=list)  # [(file_path, repo_id), ...]

    @property
    def has_changes(self) -> bool:
        """Whether at least one file was added, modified, or deleted."""
        return any((self.added, self.modified, self.deleted))

    @property
    def changed_repo_ids(self) -> set:
        """Repo IDs that appear in any of the three change lists."""
        return {
            repo_id
            for bucket in (self.added, self.modified, self.deleted)
            for _, repo_id in bucket
        }
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
class IndexTracker:
    """Manages file_index_meta and index_meta tables for incremental indexing.

    The file_index_meta table records each indexed file's path, repo_id, and
    mtime so the builder can detect added/modified/deleted files on subsequent
    runs without a full re-scan of every file's content.

    The index_meta table stores global key-value metadata (embedding_model,
    last_build_at, chunk_size, overlap).

    Every operation opens a short-lived SQLite connection and closes it again
    before returning, so no database handle outlives a method call.
    """

    def __init__(self, db_path: Path | str) -> None:
        self.db_path = Path(db_path)
        self.db_path.parent.mkdir(parents=True, exist_ok=True)
        self._create_tables()

    def _conn(self) -> sqlite3.Connection:
        """Open a new connection whose rows are addressable by column name."""
        conn = sqlite3.connect(str(self.db_path))
        conn.row_factory = sqlite3.Row
        return conn

    def _session(self):
        """Return a context manager that yields a connection and closes it on exit.

        ``sqlite3.Connection`` used directly as a context manager only commits
        or rolls back; it never closes the connection.  Wrapping it in
        ``contextlib.closing`` guarantees the handle is released.  Writers must
        call ``commit()`` themselves; closing without a commit discards any
        pending changes.
        """
        from contextlib import closing

        return closing(self._conn())

    def _create_tables(self) -> None:
        """Ensure the tracking tables exist."""
        with self._session() as conn:
            conn.execute(_CREATE_FILE_META)
            conn.execute(_CREATE_INDEX_META)
            conn.commit()

    # Alias for external callers that prefer create_tables()
    def create_tables(self) -> None:
        """Public alias of ``_create_tables``."""
        self._create_tables()

    def get_last_build_at(self) -> Optional[float]:
        """Return the Unix timestamp of the last successful build, or None.

        None is also returned when the stored value cannot be parsed as a float.
        """
        with self._session() as conn:
            row = conn.execute(
                "SELECT value FROM index_meta WHERE key = 'last_build_at'"
            ).fetchone()
        if not row:
            return None
        try:
            return float(row["value"])
        except (ValueError, TypeError):
            return None

    def get_stored_model(self) -> Optional[str]:
        """Return the model name stored at last build, or None."""
        with self._session() as conn:
            row = conn.execute(
                "SELECT value FROM index_meta WHERE key = 'embedding_model'"
            ).fetchone()
        return row["value"] if row else None

    def set_meta(self, key: str, value: str) -> None:
        """Upsert a key-value pair in index_meta."""
        with self._session() as conn:
            conn.execute(
                "INSERT INTO index_meta (key, value) VALUES (?, ?) "
                "ON CONFLICT(key) DO UPDATE SET value = excluded.value",
                (key, value),
            )
            conn.commit()

    def _load_tracked(self) -> Dict[tuple, float]:
        """Return ``{(file_path, repo_id): mtime}`` for every tracked file."""
        with self._session() as conn:
            rows = conn.execute(
                "SELECT file_path, repo_id, mtime FROM file_index_meta"
            ).fetchall()
        return {(row["file_path"], row["repo_id"]): row["mtime"] for row in rows}

    def get_stale_files(
        self,
        repos: List,
        config: "WorkspaceConfig",
    ) -> "StaleResult":
        """Detect added, modified, and deleted files across all repos.

        A file is considered for indexing when none of its relative path
        components is a skipped directory name, it is not matched by the
        repo's gitignore spec, its suffix is a key of ``EXTENSION_LANG``, and
        its size does not exceed ``config.indexing.max_file_bytes``.

        Note that every tracked entry that is not seen during this scan is
        reported as deleted.  That includes entries belonging to repos whose
        path is missing or that are not part of ``repos``.

        Args:
            repos: List of RepoConfig objects with resolved_path set.
            config: WorkspaceConfig for skip_dirs and file extension settings.

        Returns:
            StaleResult with added, modified, deleted lists of (file_path, repo_id).
        """
        from corbell.core.constants import EXTENSION_LANG, SKIP_DIRS
        from corbell.core.gitignore import load_gitignore
        from corbell.core.workspace import IndexingConfig

        indexing: IndexingConfig = config.indexing
        all_skip = SKIP_DIRS | set(indexing.skip_dirs)

        result = StaleResult()

        # Load all currently tracked entries from DB
        tracked = self._load_tracked()

        # Scan current files on disk
        seen_keys: set = set()
        for repo in repos:
            repo_id = repo.id
            repo_path = repo.resolved_path
            if not repo_path or not repo_path.exists():
                continue

            gitignore_spec = load_gitignore(repo_path)

            for fp in repo_path.rglob("*"):
                if not fp.is_file():
                    continue
                # Check skip dirs
                rel = fp.relative_to(repo_path)
                if any(part in all_skip for part in rel.parts):
                    continue
                # Check gitignore (the spec is matched against a "/"-separated path)
                if gitignore_spec.match_file(str(rel).replace("\\", "/")):
                    continue
                # Check extension
                if fp.suffix not in EXTENSION_LANG:
                    continue
                # Check file size; files that cannot be stat'ed are ignored
                try:
                    stat = fp.stat()
                    if stat.st_size > indexing.max_file_bytes:
                        continue
                    current_mtime = stat.st_mtime
                except OSError:
                    continue

                key = (str(rel), repo_id)
                seen_keys.add(key)

                if key not in tracked:
                    result.added.append(key)
                elif abs(current_mtime - tracked[key]) > 0.001:
                    # Small tolerance so float round-tripping through the DB
                    # does not flag an unchanged file as modified.
                    result.modified.append(key)

        # Detect deleted files
        result.deleted.extend(key for key in tracked if key not in seen_keys)

        return result

    def mark_indexed(self, file_path: str, repo_id: str, mtime: float) -> None:
        """Record a file as indexed with its current mtime.

        Should be called AFTER the embedding chunks have been committed to DB
        to ensure crash safety (next run will re-detect the file if mtime not updated).

        Args:
            file_path: Relative file path within the repo.
            repo_id: The repository ID.
            mtime: File modification time (from os.path.getmtime).
        """
        with self._session() as conn:
            conn.execute(
                "INSERT INTO file_index_meta (file_path, repo_id, mtime, indexed_at) "
                "VALUES (?, ?, ?, ?) "
                "ON CONFLICT(file_path, repo_id) DO UPDATE SET "
                "mtime = excluded.mtime, indexed_at = excluded.indexed_at",
                (file_path, repo_id, mtime, time.time()),
            )
            conn.commit()

    def remove_tracked(self, file_paths: List[tuple]) -> None:
        """Remove tracking entries for deleted files.

        Does nothing (and opens no connection) when ``file_paths`` is empty.

        Args:
            file_paths: List of ``(file_path, repo_id)`` tuples to remove.
        """
        if not file_paths:
            return
        params = [(file_path, repo_id) for file_path, repo_id in file_paths]
        with self._session() as conn:
            conn.executemany(
                "DELETE FROM file_index_meta WHERE file_path = ? AND repo_id = ?",
                params,
            )
            conn.commit()

    def clear_all(self) -> None:
        """Remove all tracking data (used during --rebuild)."""
        with self._session() as conn:
            conn.execute("DELETE FROM file_index_meta")
            conn.execute("DELETE FROM index_meta")
            conn.commit()
|