codebase-retrieval-context-engine 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. codebase_retrieval_context_engine-2.0.0.dist-info/METADATA +505 -0
  2. codebase_retrieval_context_engine-2.0.0.dist-info/RECORD +46 -0
  3. codebase_retrieval_context_engine-2.0.0.dist-info/WHEEL +4 -0
  4. codebase_retrieval_context_engine-2.0.0.dist-info/entry_points.txt +3 -0
  5. codebase_retrieval_context_engine-2.0.0.dist-info/licenses/LICENSE +201 -0
  6. corbell/__init__.py +6 -0
  7. corbell/cli/__init__.py +1 -0
  8. corbell/cli/commands/__init__.py +1 -0
  9. corbell/cli/commands/index.py +86 -0
  10. corbell/cli/commands/query.py +71 -0
  11. corbell/cli/main.py +57 -0
  12. corbell/core/__init__.py +1 -0
  13. corbell/core/constants.py +52 -0
  14. corbell/core/embeddings/__init__.py +6 -0
  15. corbell/core/embeddings/base.py +68 -0
  16. corbell/core/embeddings/extractor.py +201 -0
  17. corbell/core/embeddings/factory.py +48 -0
  18. corbell/core/embeddings/model.py +401 -0
  19. corbell/core/embeddings/search_cache.py +95 -0
  20. corbell/core/embeddings/sqlite_store.py +271 -0
  21. corbell/core/gitignore.py +76 -0
  22. corbell/core/graph/__init__.py +1 -0
  23. corbell/core/graph/builder.py +696 -0
  24. corbell/core/graph/method_graph.py +1077 -0
  25. corbell/core/graph/providers/__init__.py +6 -0
  26. corbell/core/graph/providers/aws_patterns.py +62 -0
  27. corbell/core/graph/providers/azure_patterns.py +64 -0
  28. corbell/core/graph/providers/gcp_patterns.py +59 -0
  29. corbell/core/graph/schema.py +175 -0
  30. corbell/core/graph/sqlite_store.py +500 -0
  31. corbell/core/indexing/__init__.py +1 -0
  32. corbell/core/indexing/builder.py +608 -0
  33. corbell/core/indexing/lock.py +150 -0
  34. corbell/core/indexing/tracker.py +245 -0
  35. corbell/core/llm_client.py +677 -0
  36. corbell/core/mcp/__init__.py +1 -0
  37. corbell/core/mcp/server.py +214 -0
  38. corbell/core/query/__init__.py +1 -0
  39. corbell/core/query/diagnostics.py +38 -0
  40. corbell/core/query/engine.py +321 -0
  41. corbell/core/query/enhancer.py +102 -0
  42. corbell/core/query/formatter.py +98 -0
  43. corbell/core/query/graph_expander.py +284 -0
  44. corbell/core/query/merger.py +171 -0
  45. corbell/core/query/reranker.py +131 -0
  46. corbell/core/workspace.py +408 -0
@@ -0,0 +1,150 @@
1
+ """Cross-platform file lock for index build serialization."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import sys
6
+ import time
7
+ from pathlib import Path
8
+ from types import TracebackType
9
+ from typing import Optional
10
+
11
+
12
class IndexLock:
    """A cross-platform file lock that prevents concurrent index builds.

    Uses ``fcntl.flock`` on Unix and ``msvcrt.locking`` on Windows. Every
    attempt is non-blocking; :meth:`acquire` polls until the lock is obtained
    or ``timeout`` seconds have elapsed, so callers don't need to worry about
    platform differences.

    Usage::

        lock = IndexLock(db_path.parent / "index.lock")
        with lock:
            # only one process at a time reaches here
            ...

    Args:
        lock_path: Path to the lock file (created if absent).
        timeout: Maximum seconds to wait before raising TimeoutError.
    """

    # Seconds to wait between two non-blocking acquisition attempts.
    _POLL_INTERVAL = 0.25

    def __init__(self, lock_path: Path | str, timeout: float = 300) -> None:
        self.lock_path = Path(lock_path)
        self.timeout = timeout
        self._fh: Optional[object] = None  # open lock-file handle while held

    # ------------------------------------------------------------------ #
    # Public interface                                                     #
    # ------------------------------------------------------------------ #

    def acquire(self) -> None:
        """Block until the lock is acquired or the timeout is reached.

        The parent directory of the lock file is created if necessary. Any
        ``OSError`` raised by a single attempt is treated as "lock not
        available" and retried until the deadline.

        Raises:
            TimeoutError: If the lock cannot be acquired within ``timeout``
                seconds. The ``OSError`` from the last failed attempt is
                attached as ``__cause__`` so the real reason (contention,
                permissions, ...) stays visible in the traceback.
        """
        self.lock_path.parent.mkdir(parents=True, exist_ok=True)
        deadline = time.monotonic() + self.timeout

        while True:
            try:
                self._acquire_once()
                return  # success
            except OSError as exc:
                # Keep the error: the name bound by ``except`` is cleared
                # when the handler exits.
                last_error = exc

            remaining = deadline - time.monotonic()
            if remaining <= 0:
                raise TimeoutError(
                    f"Could not acquire index lock at {self.lock_path} "
                    f"within {self.timeout}s. Another build may be running."
                ) from last_error
            # Never sleep past the deadline.
            time.sleep(min(self._POLL_INTERVAL, remaining))

    def release(self) -> None:
        """Release the lock and close the file handle.

        Safe to call even if the lock was never acquired. Errors raised while
        unlocking or closing are deliberately ignored (best-effort release).
        """
        if self._fh is None:
            return
        try:
            if sys.platform == "win32":
                self._release_windows()
            else:
                self._release_unix()
        except Exception:
            pass  # best effort: still close the handle below
        finally:
            try:
                self._fh.close()  # type: ignore[union-attr]
            except Exception:
                pass
            self._fh = None

    # ------------------------------------------------------------------ #
    # Context manager                                                      #
    # ------------------------------------------------------------------ #

    def __enter__(self) -> "IndexLock":
        self.acquire()
        return self

    def __exit__(
        self,
        exc_type: Optional[type],
        exc_val: Optional[BaseException],
        exc_tb: Optional[TracebackType],
    ) -> None:
        self.release()

    # ------------------------------------------------------------------ #
    # Platform-specific internals                                          #
    # ------------------------------------------------------------------ #

    def _acquire_once(self) -> None:
        """Attempt a single non-blocking lock acquisition.

        Raises:
            OSError: If the lock is held elsewhere or the file cannot be opened.
        """
        if sys.platform == "win32":
            self._acquire_windows()
        else:
            self._acquire_unix()

    def _acquire_unix(self) -> None:
        """Open the lock file and take an exclusive, non-blocking ``flock``."""
        import fcntl

        fh = open(self.lock_path, "w")  # noqa: WPS515
        try:
            fcntl.flock(fh.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
        except OSError:
            fh.close()  # do not leak the handle of a failed attempt
            raise
        self._fh = fh

    def _release_unix(self) -> None:
        """Drop the ``flock`` held on the open lock-file handle."""
        import fcntl

        fcntl.flock(self._fh.fileno(), fcntl.LOCK_UN)  # type: ignore[union-attr]

    def _acquire_windows(self) -> None:
        """Open the lock file and lock its first byte without blocking."""
        import msvcrt

        fh = open(self.lock_path, "w")  # noqa: WPS515
        try:
            # Lock the first byte of the file (non-blocking)
            msvcrt.locking(fh.fileno(), msvcrt.LK_NBLCK, 1)
        except OSError:
            fh.close()  # do not leak the handle of a failed attempt
            raise
        self._fh = fh

    def _release_windows(self) -> None:
        """Unlock the first byte of the open lock-file handle."""
        import msvcrt

        try:
            # Seek back to start before unlocking
            self._fh.seek(0)  # type: ignore[union-attr]
        except Exception:
            pass
        msvcrt.locking(self._fh.fileno(), msvcrt.LK_UNLCK, 1)  # type: ignore[union-attr]
@@ -0,0 +1,245 @@
1
+ """Index tracking: file-level mtime tracking and global index metadata."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import sqlite3
6
+ import time
7
+ from dataclasses import dataclass, field
8
+ from pathlib import Path
9
+ from typing import Dict, List, Optional, TYPE_CHECKING
10
+
11
+ if TYPE_CHECKING:
12
+ from corbell.core.workspace import WorkspaceConfig
13
+
14
# DDL for the per-file tracking table: one row per (file_path, repo_id) pair,
# holding the file's recorded mtime and the time the row was written.
_CREATE_FILE_META = """
CREATE TABLE IF NOT EXISTS file_index_meta (
    file_path TEXT NOT NULL,
    repo_id TEXT NOT NULL,
    mtime REAL NOT NULL,
    indexed_at REAL NOT NULL,
    PRIMARY KEY (file_path, repo_id)
);
"""

# DDL for the global key/value metadata table; values are stored as text.
_CREATE_INDEX_META = """
CREATE TABLE IF NOT EXISTS index_meta (
    key TEXT PRIMARY KEY,
    value TEXT NOT NULL
);
"""
30
+
31
+
32
@dataclass
class StaleResult:
    """Outcome of a stale-file detection scan.

    Each list holds ``(file_path, repo_id)`` tuples.
    """

    added: List[tuple] = field(default_factory=list)
    modified: List[tuple] = field(default_factory=list)
    deleted: List[tuple] = field(default_factory=list)

    @property
    def has_changes(self) -> bool:
        """Whether at least one of the three lists is non-empty."""
        return any((self.added, self.modified, self.deleted))

    @property
    def changed_repo_ids(self) -> set:
        """Repo IDs appearing in any added, modified, or deleted entry."""
        buckets = (self.added, self.modified, self.deleted)
        return {repo_id for bucket in buckets for _, repo_id in bucket}
56
+
57
+
58
class IndexTracker:
    """Manages the file_index_meta and index_meta tables for incremental indexing.

    The file_index_meta table records each indexed file's path, repo_id, and
    mtime so that added/modified/deleted files can be detected on subsequent
    runs by comparing mtimes instead of reading file contents.

    The index_meta table stores global key-value metadata, such as
    ``embedding_model`` and ``last_build_at``.

    Every operation opens its own short-lived SQLite connection and closes it
    before returning.
    """

    def __init__(self, db_path: Path | str) -> None:
        self.db_path = Path(db_path)
        self.db_path.parent.mkdir(parents=True, exist_ok=True)
        self._create_tables()

    def _conn(self) -> sqlite3.Connection:
        """Open a new connection whose rows support access by column name.

        The caller owns the returned connection and must close it.
        """
        conn = sqlite3.connect(str(self.db_path))
        conn.row_factory = sqlite3.Row
        return conn

    def _session(self):
        """Return a context manager yielding a connection that is closed on exit.

        ``sqlite3.Connection`` used directly in a ``with`` statement only
        commits or rolls back; it does not close the connection, so each call
        would otherwise leave a connection open.
        """
        from contextlib import closing

        return closing(self._conn())

    def _create_tables(self) -> None:
        """Ensure the tracking tables exist."""
        with self._session() as conn:
            conn.execute(_CREATE_FILE_META)
            conn.execute(_CREATE_INDEX_META)
            conn.commit()

    # Alias for external callers that prefer create_tables()
    def create_tables(self) -> None:
        """Public alias of :meth:`_create_tables`."""
        self._create_tables()

    def get_last_build_at(self) -> Optional[float]:
        """Return the stored ``last_build_at`` value as a float, or None.

        None is returned when the key is absent or its value cannot be
        converted to a float.
        """
        with self._session() as conn:
            row = conn.execute(
                "SELECT value FROM index_meta WHERE key = 'last_build_at'"
            ).fetchone()
        if row is None:
            return None
        try:
            return float(row["value"])
        except (ValueError, TypeError):
            return None

    def get_stored_model(self) -> Optional[str]:
        """Return the stored ``embedding_model`` value, or None if absent."""
        with self._session() as conn:
            row = conn.execute(
                "SELECT value FROM index_meta WHERE key = 'embedding_model'"
            ).fetchone()
        return row["value"] if row else None

    def set_meta(self, key: str, value: str) -> None:
        """Upsert a key-value pair in index_meta."""
        with self._session() as conn:
            conn.execute(
                "INSERT INTO index_meta (key, value) VALUES (?, ?) "
                "ON CONFLICT(key) DO UPDATE SET value = excluded.value",
                (key, value),
            )
            conn.commit()

    def get_stale_files(
        self,
        repos: List,
        config: "WorkspaceConfig",
    ) -> StaleResult:
        """Detect added, modified, and deleted files across all repos.

        A file on disk is considered only if it is a regular file, none of its
        path components is a skipped directory, it is not matched by the
        repo's gitignore spec, its suffix is in ``EXTENSION_LANG``, and its
        size does not exceed ``config.indexing.max_file_bytes``.

        Args:
            repos: Objects exposing ``id`` and ``resolved_path``; repos whose
                path is unset or does not exist are skipped.
            config: WorkspaceConfig whose ``indexing`` section supplies the
                extra ``skip_dirs`` and ``max_file_bytes``.

        Returns:
            StaleResult with added, modified, deleted lists of (file_path, repo_id).
        """
        from corbell.core.constants import EXTENSION_LANG, SKIP_DIRS
        from corbell.core.gitignore import load_gitignore
        from corbell.core.workspace import IndexingConfig

        indexing: IndexingConfig = config.indexing
        all_skip = SKIP_DIRS | set(indexing.skip_dirs)

        result = StaleResult()

        # Load all currently tracked entries from DB
        with self._session() as conn:
            rows = conn.execute(
                "SELECT file_path, repo_id, mtime FROM file_index_meta"
            ).fetchall()

        tracked: Dict[tuple, float] = {
            (row["file_path"], row["repo_id"]): row["mtime"] for row in rows
        }

        # Scan current files on disk
        seen_keys: set = set()
        for repo in repos:
            repo_id = repo.id
            repo_path = repo.resolved_path
            if not repo_path or not repo_path.exists():
                continue

            gitignore_spec = load_gitignore(repo_path)

            for fp in repo_path.rglob("*"):
                if not fp.is_file():
                    continue
                # Check skip dirs
                rel = fp.relative_to(repo_path)
                if any(part in all_skip for part in rel.parts):
                    continue
                # Check gitignore (patterns are matched against "/"-separated paths)
                if gitignore_spec.match_file(str(rel).replace("\\", "/")):
                    continue
                # Check extension
                if fp.suffix not in EXTENSION_LANG:
                    continue
                # Check file size; files that cannot be stat'ed are ignored
                try:
                    stat = fp.stat()
                except OSError:
                    continue
                if stat.st_size > indexing.max_file_bytes:
                    continue

                # NOTE(review): the key uses the OS-native separator
                # (str(rel)), unlike the gitignore check above -- confirm
                # this matches the paths passed to mark_indexed().
                key = (str(rel), repo_id)
                seen_keys.add(key)

                if key not in tracked:
                    result.added.append(key)
                elif abs(stat.st_mtime - tracked[key]) > 0.001:
                    # Tolerance absorbs float round-trip noise in stored mtimes.
                    result.modified.append(key)

        # Detect deleted files: tracked entries that were not seen on disk
        result.deleted.extend(key for key in tracked if key not in seen_keys)

        return result

    def mark_indexed(self, file_path: str, repo_id: str, mtime: float) -> None:
        """Record a file as indexed with its current mtime.

        Should be called AFTER the embedding chunks have been committed to DB
        to ensure crash safety (next run will re-detect the file if mtime not updated).

        Args:
            file_path: Relative file path within the repo.
            repo_id: The repository ID.
            mtime: File modification time (from os.path.getmtime).
        """
        with self._session() as conn:
            conn.execute(
                "INSERT INTO file_index_meta (file_path, repo_id, mtime, indexed_at) "
                "VALUES (?, ?, ?, ?) "
                "ON CONFLICT(file_path, repo_id) DO UPDATE SET "
                "mtime = excluded.mtime, indexed_at = excluded.indexed_at",
                (file_path, repo_id, mtime, time.time()),
            )
            conn.commit()

    def remove_tracked(self, file_paths: List[tuple]) -> None:
        """Remove tracking entries for deleted files.

        Args:
            file_paths: List of ``(file_path, repo_id)`` tuples to remove.
                An empty list is a no-op and opens no connection.
        """
        if not file_paths:
            return
        with self._session() as conn:
            conn.executemany(
                "DELETE FROM file_index_meta WHERE file_path = ? AND repo_id = ?",
                file_paths,
            )
            conn.commit()

    def clear_all(self) -> None:
        """Remove all tracking data (used during --rebuild)."""
        with self._session() as conn:
            conn.execute("DELETE FROM file_index_meta")
            conn.execute("DELETE FROM index_meta")
            conn.commit()