codebase-retrieval-context-engine 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. codebase_retrieval_context_engine-2.0.0.dist-info/METADATA +505 -0
  2. codebase_retrieval_context_engine-2.0.0.dist-info/RECORD +46 -0
  3. codebase_retrieval_context_engine-2.0.0.dist-info/WHEEL +4 -0
  4. codebase_retrieval_context_engine-2.0.0.dist-info/entry_points.txt +3 -0
  5. codebase_retrieval_context_engine-2.0.0.dist-info/licenses/LICENSE +201 -0
  6. corbell/__init__.py +6 -0
  7. corbell/cli/__init__.py +1 -0
  8. corbell/cli/commands/__init__.py +1 -0
  9. corbell/cli/commands/index.py +86 -0
  10. corbell/cli/commands/query.py +71 -0
  11. corbell/cli/main.py +57 -0
  12. corbell/core/__init__.py +1 -0
  13. corbell/core/constants.py +52 -0
  14. corbell/core/embeddings/__init__.py +6 -0
  15. corbell/core/embeddings/base.py +68 -0
  16. corbell/core/embeddings/extractor.py +201 -0
  17. corbell/core/embeddings/factory.py +48 -0
  18. corbell/core/embeddings/model.py +401 -0
  19. corbell/core/embeddings/search_cache.py +95 -0
  20. corbell/core/embeddings/sqlite_store.py +271 -0
  21. corbell/core/gitignore.py +76 -0
  22. corbell/core/graph/__init__.py +1 -0
  23. corbell/core/graph/builder.py +696 -0
  24. corbell/core/graph/method_graph.py +1077 -0
  25. corbell/core/graph/providers/__init__.py +6 -0
  26. corbell/core/graph/providers/aws_patterns.py +62 -0
  27. corbell/core/graph/providers/azure_patterns.py +64 -0
  28. corbell/core/graph/providers/gcp_patterns.py +59 -0
  29. corbell/core/graph/schema.py +175 -0
  30. corbell/core/graph/sqlite_store.py +500 -0
  31. corbell/core/indexing/__init__.py +1 -0
  32. corbell/core/indexing/builder.py +608 -0
  33. corbell/core/indexing/lock.py +150 -0
  34. corbell/core/indexing/tracker.py +245 -0
  35. corbell/core/llm_client.py +677 -0
  36. corbell/core/mcp/__init__.py +1 -0
  37. corbell/core/mcp/server.py +214 -0
  38. corbell/core/query/__init__.py +1 -0
  39. corbell/core/query/diagnostics.py +38 -0
  40. corbell/core/query/engine.py +321 -0
  41. corbell/core/query/enhancer.py +102 -0
  42. corbell/core/query/formatter.py +98 -0
  43. corbell/core/query/graph_expander.py +284 -0
  44. corbell/core/query/merger.py +171 -0
  45. corbell/core/query/reranker.py +131 -0
  46. corbell/core/workspace.py +408 -0
@@ -0,0 +1,408 @@
1
+ """Workspace configuration for Corbell — env-var driven, no YAML required."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ import shutil
7
+ import subprocess
8
+ import tempfile
9
+ from pathlib import Path
10
+ from typing import List, Optional
11
+
12
+ from pydantic import BaseModel, Field
13
+
14
+
15
class RepoConfig(BaseModel):
    """Describe one repository that belongs to the workspace.

    Keys that are not declared below are ignored at construction time
    (``extra="ignore"``).
    """

    # Identifier of the repository.
    id: str
    # Location of the repository, kept as a plain string.
    path: str
    # Optional language label; ``None`` when not provided.
    language: Optional[str] = None
    # Optional ``Path`` form of the location; excluded from serialized output.
    resolved_path: Optional[Path] = Field(default=None, exclude=True)

    model_config = {"extra": "ignore"}
24
+
25
+
26
class StorageConfig(BaseModel):
    """Storage settings (one SQLite file holds both graph and embeddings)."""

    # Embedding model name used when no environment override is present.
    model: str = "all-MiniLM-L6-v2"

    model_config = {"extra": "ignore"}

    def resolved_model(self) -> str:
        """Return the embedding model name that is actually in effect.

        A non-empty ``CORBELL_EMBEDDING_MODEL`` environment variable wins;
        otherwise the ``model`` field is returned.
        """
        override = os.environ.get("CORBELL_EMBEDDING_MODEL")
        if override:
            return override
        return self.model
41
+
42
+
43
class QueryConfig(BaseModel):
    """Settings for the query pipeline.

    Undeclared keys are ignored at construction time (``extra="ignore"``).
    """

    # Populated from CORBELL_TOP_K by build_config.
    top_k: int = 50
    # Populated from CORBELL_EXPAND_CALL_DEPTH by build_config.
    expand_call_depth: int = 2
    # Populated from CORBELL_EXPAND_MAX_CHUNKS by build_config.
    expand_max_chunks: int = 30
    # Populated from CORBELL_RERANK by build_config.
    rerank: bool = True

    model_config = {"extra": "ignore"}
52
+
53
+
54
class IndexingConfig(BaseModel):
    """Settings for the indexing pipeline.

    Undeclared keys are ignored at construction time (``extra="ignore"``).
    """

    # Populated from the comma-separated CORBELL_SKIP_DIRS by build_config.
    skip_dirs: List[str] = Field(default_factory=list)
    # Populated from CORBELL_MAX_FILE_BYTES by build_config.
    max_file_bytes: int = 1024 * 1024  # 1 MB
    # Populated from CORBELL_CHUNK_SIZE by build_config.
    chunk_size: int = 50
    # Populated from CORBELL_CHUNK_OVERLAP by build_config.
    chunk_overlap: int = 10

    model_config = {"extra": "ignore"}
63
+
64
+
65
class LLMConfig(BaseModel):
    """LLM provider configuration.

    Local providers: openai, anthropic, ollama, google.
    Cloud providers: aws (Bedrock), azure (Azure OpenAI), gcp (Vertex AI).

    The provider name is matched case-insensitively by both resolver methods.

    API key resolved via env vars:
        ANTHROPIC_API_KEY, OPENAI_API_KEY, AZURE_OPENAI_API_KEY,
        GOOGLE_API_KEY, CORBELL_LLM_API_KEY

    Model can be overridden via env vars (checked in order):
        1. Provider-specific: ANTHROPIC_MODEL, OPENAI_MODEL, GOOGLE_MODEL, etc.
        2. Generic: CORBELL_LLM_MODEL
        3. ``model`` field default
    """

    provider: str = "anthropic"
    model: str = "claude-sonnet-4-5"
    # Not consulted by resolved_api_key(), which reads environment variables only.
    api_key: Optional[str] = None

    # AWS Bedrock
    aws_region: Optional[str] = None

    # Azure OpenAI
    azure_endpoint: Optional[str] = None
    azure_deployment: Optional[str] = None
    azure_api_version: Optional[str] = None

    # GCP Vertex AI
    gcp_project: Optional[str] = None
    gcp_region: Optional[str] = None

    model_config = {"extra": "ignore"}

    def resolved_model(self) -> str:
        """Return the effective LLM model name.

        Resolution order:
            1. Provider-specific env var (e.g. ``ANTHROPIC_MODEL``, ``GOOGLE_MODEL``)
            2. ``CORBELL_LLM_MODEL`` env var
            3. ``model`` field default

        Empty environment values are treated as unset.
        """
        provider_env_map = {
            "anthropic": "ANTHROPIC_MODEL",
            "openai": "OPENAI_MODEL",
            "google": "GOOGLE_MODEL",
            "ollama": "OLLAMA_MODEL",
            "aws": "AWS_MODEL",
            "azure": "AZURE_MODEL",
            "gcp": "GCP_MODEL",
        }
        provider_var = provider_env_map.get(self.provider.lower())
        if provider_var:
            val = os.environ.get(provider_var)
            if val:
                return val
        return os.environ.get("CORBELL_LLM_MODEL") or self.model

    def resolved_api_key(self) -> Optional[str]:
        """Return the API key from env vars, or ``None`` when none applies.

        Returns:
            ``None`` for ``aws``/``gcp`` (they use their own credential
            chains) and for ``ollama`` (no key variable is mapped).
            Otherwise the provider's well-known variable, falling back to
            ``CORBELL_LLM_API_KEY``. Providers not listed in the map use
            ``CORBELL_LLM_API_KEY`` directly.
        """
        # Normalise once so "AWS" / "Gcp" take the same branch as "aws" / "gcp",
        # matching the case-insensitive lookups used everywhere else.
        provider = self.provider.lower()

        # Cloud providers use their own credential chains (no API key needed)
        if provider in ("aws", "gcp"):
            return None

        env_map = {
            "openai": "OPENAI_API_KEY",
            "anthropic": "ANTHROPIC_API_KEY",
            "azure": "AZURE_OPENAI_API_KEY",
            "google": "GOOGLE_API_KEY",
            "ollama": None,
        }
        env_var = env_map.get(provider, "CORBELL_LLM_API_KEY")
        if not env_var:
            return None
        return os.environ.get(env_var) or os.environ.get("CORBELL_LLM_API_KEY")
139
+
140
+
141
class WorkspaceConfig(BaseModel):
    """Top-level workspace configuration, filled in from environment variables.

    Every section defaults to a freshly constructed sub-config, and
    undeclared keys are ignored (``extra="ignore"``).
    """

    version: str = "1"
    # Repositories that make up the workspace.
    repos: List[RepoConfig] = Field(default_factory=list)
    # Embedding/storage settings.
    storage: StorageConfig = Field(default_factory=StorageConfig)
    # Query pipeline settings.
    query: QueryConfig = Field(default_factory=QueryConfig)
    # Indexing pipeline settings.
    indexing: IndexingConfig = Field(default_factory=IndexingConfig)
    # LLM provider settings.
    llm: LLMConfig = Field(default_factory=LLMConfig)

    model_config = {"extra": "ignore"}
152
+
153
+
154
+ # ---------------------------------------------------------------------------
155
+ # Path helpers
156
+ # ---------------------------------------------------------------------------
157
+
158
+
159
def sanitize_path(workspace_path: Path) -> str:
    """Turn a workspace path into a string usable as a single directory name.

    The path is made absolute, trailing ``/`` and ``\\`` are removed, every
    ``/``, ``\\`` and ``:`` becomes ``-``, and finally any leading ``-`` is
    dropped.

    Examples:
        /home/user/projects/my-app -> home-user-projects-my-app
        D:\\projects\\Python\\local-context-engine -> D--projects-Python-local-context-engine
    """
    absolute = str(workspace_path.resolve()).rstrip("/\\")
    # One pass maps both separator styles and the Windows drive colon to "-".
    dashed = absolute.translate(str.maketrans({"\\": "-", "/": "-", ":": "-"}))
    # An absolute POSIX path starts with "/", which is now a leading dash.
    return dashed.lstrip("-")
180
+
181
+
182
def resolve_embedding_dimension(model_name: str) -> int:
    """Look up the embedding vector size for *model_name* without loading it.

    Resolution order:
        1. ``CORBELL_EMBEDDING_DIM`` env var, when non-blank (parsed with ``int``)
        2. Prefix rule: ``voyage-*`` -> 1024, ``gemini-*`` -> 768
        3. Exact match against the built-in table of known models
        4. Fallback of 384

    Raises:
        ValueError: if ``CORBELL_EMBEDDING_DIM`` is set to a non-integer.
    """
    override = os.environ.get("CORBELL_EMBEDDING_DIM", "").strip()
    if override:
        return int(override)

    for prefix, size in (("voyage-", 1024), ("gemini-", 768)):
        if model_name.startswith(prefix):
            return size

    exact_sizes = {
        "all-MiniLM-L6-v2": 384,
        "all-MiniLM-L12-v2": 384,
        "all-mpnet-base-v2": 768,
    }
    try:
        return exact_sizes[model_name]
    except KeyError:
        return 384
206
+
207
+
208
def detect_git_branch(workspace_path: Path) -> str:
    """Detect the current git branch for *workspace_path*.

    Args:
        workspace_path: Directory in which ``git`` is executed.

    Returns:
        The branch name, ``"detached-<short-sha>"`` for a detached HEAD, or
        ``"_no_git"`` when git cannot be run there (git missing, path missing,
        path not a directory, permission denied, timeout) or the directory is
        not a repository.
    """

    def _git(*args: str) -> subprocess.CompletedProcess:
        # Each call is capped at 5 seconds so a hung git never blocks the caller.
        return subprocess.run(
            ["git", *args],
            cwd=str(workspace_path),
            capture_output=True,
            text=True,
            timeout=5,
        )

    try:
        named = _git("rev-parse", "--abbrev-ref", "HEAD")
        if named.returncode == 0:
            branch = named.stdout.strip()
            # A detached HEAD is reported as the literal string "HEAD".
            if branch and branch != "HEAD":
                return branch
        short_sha = _git("rev-parse", "--short", "HEAD")
        if short_sha.returncode == 0:
            return f"detached-{short_sha.stdout.strip()}"
    except (OSError, subprocess.TimeoutExpired):
        # OSError covers a missing git binary (FileNotFoundError) as well as a
        # cwd that is a regular file or unreadable (NotADirectoryError,
        # PermissionError), all of which mean "no usable git here".
        pass
    return "_no_git"
238
+
239
+
240
+ def _seed_from_sibling(base_dir: Path, target_namespace: str, model_dim_prefix: str) -> None:
241
+ """Copy the most-recently-modified sibling DB into *target_namespace* if it doesn't exist yet.
242
+
243
+ Looks for sibling directories under *base_dir* that share the same
244
+ ``model--dimension`` prefix but differ in branch name. The most recent DB
245
+ is copied atomically (temp file + os.replace) so a crash mid-copy never
246
+ leaves a partial database file.
247
+
248
+ Args:
249
+ base_dir: Parent directory that contains per-namespace subdirectories.
250
+ target_namespace: The namespace directory name to seed (``model--dim--branch``).
251
+ model_dim_prefix: The ``model--dimension`` prefix used to identify siblings
252
+ (e.g. ``"all-MiniLM-L6-v2--384"``). Passed explicitly to avoid
253
+ ambiguous parsing when branch names contain ``--``.
254
+ """
255
+ target_dir = base_dir / target_namespace
256
+ target_db = target_dir / "workspace.db"
257
+ if target_db.exists():
258
+ return
259
+
260
+ # Find sibling dirs with same model+dim but different branch
261
+ candidates = []
262
+ if base_dir.exists():
263
+ for d in base_dir.iterdir():
264
+ if not d.is_dir() or d.name == target_namespace:
265
+ continue
266
+ if not d.name.startswith(model_dim_prefix + "--"):
267
+ continue
268
+ db_file = d / "workspace.db"
269
+ if db_file.exists():
270
+ try:
271
+ candidates.append((db_file.stat().st_mtime, db_file))
272
+ except OSError:
273
+ continue
274
+
275
+ if not candidates:
276
+ return
277
+
278
+ candidates.sort(reverse=True)
279
+ best_db = candidates[0][1]
280
+
281
+ target_dir.mkdir(parents=True, exist_ok=True)
282
+
283
+ # Atomic: write to temp file in same directory, then rename
284
+ fd, tmp_path = tempfile.mkstemp(dir=str(target_dir), suffix=".db.tmp")
285
+ try:
286
+ os.close(fd)
287
+ shutil.copy2(str(best_db), tmp_path)
288
+ os.replace(tmp_path, str(target_db))
289
+ except Exception:
290
+ try:
291
+ os.unlink(tmp_path)
292
+ except OSError:
293
+ pass
294
+
295
+
296
def db_path_for_workspace(workspace_path: Path, model: Optional[str] = None) -> Path:
    """Compute (and prepare) the SQLite DB location for a workspace.

    The file lives at
    ``~/.vibervn/context-engine/{sanitized}/{model}--{dim}--{branch}/workspace.db``
    and its parent directories are created on demand. Slashes and backslashes
    in the model and branch names are replaced with ``_``.

    Keying the directory on model, dimension and git branch keeps indexes for
    different models or branches apart. When the DB for this namespace does
    not exist yet, ``_seed_from_sibling`` is asked to copy the most recent
    sibling DB (same model+dim, different branch) as a warm start.

    Args:
        workspace_path: Path to the workspace root directory.
        model: Embedding model name. Falls back to ``CORBELL_EMBEDDING_MODEL``
            env var, then ``"all-MiniLM-L6-v2"``.

    Returns:
        Path of ``workspace.db`` inside the (now existing) namespace directory.
    """

    def _flatten(label: str) -> str:
        # Keep the namespace a single path component.
        return label.replace("/", "_").replace("\\", "_")

    if model:
        model_name = model
    else:
        model_name = os.environ.get("CORBELL_EMBEDDING_MODEL") or "all-MiniLM-L6-v2"
    dimension = resolve_embedding_dimension(model_name)
    branch = detect_git_branch(workspace_path)

    model_dim_prefix = f"{_flatten(model_name)}--{dimension}"
    namespace = f"{model_dim_prefix}--{_flatten(branch)}"

    workspace_key = sanitize_path(workspace_path)
    base_dir = Path.home() / ".vibervn" / "context-engine" / workspace_key
    db_dir = base_dir / namespace
    db_file = db_dir / "workspace.db"

    # First use of this namespace: try to warm-start from a sibling branch.
    if not db_file.exists():
        _seed_from_sibling(base_dir, namespace, model_dim_prefix)

    db_dir.mkdir(parents=True, exist_ok=True)
    return db_file
332
+
333
+
334
+ def _detect_language(path: Path) -> str:
335
+ """Detect the most likely language of a project directory based on key files."""
336
+ if (path / "package.json").exists() or (path / "tsconfig.json").exists():
337
+ return "typescript"
338
+ if (
339
+ (path / "requirements.txt").exists()
340
+ or (path / "pyproject.toml").exists()
341
+ or (path / "Pipfile").exists()
342
+ or (path / "setup.py").exists()
343
+ ):
344
+ return "python"
345
+ if (path / "go.mod").exists():
346
+ return "go"
347
+ if (path / "pom.xml").exists() or (path / "build.gradle").exists():
348
+ return "java"
349
+ if (path / "Cargo.toml").exists():
350
+ return "rust"
351
+ return "python"
352
+
353
+
354
def _env_int(name: str, default: int) -> int:
    """Read integer env var *name*; unset or blank values yield *default*."""
    raw = os.environ.get(name, "").strip()
    if not raw:
        return default
    try:
        return int(raw)
    except ValueError as exc:
        raise ValueError(
            f"Environment variable {name} must be an integer, got {raw!r}"
        ) from exc


def build_config(workspace_path: Path) -> WorkspaceConfig:
    """Build a WorkspaceConfig from environment variables and a workspace path.

    Reads the ``CORBELL_*`` env vars listed below, then constructs a single
    RepoConfig from the workspace_path (id = basename, path = workspace_path).
    A variable that is unset or blank falls back to its default.

    Env vars read (default):
        CORBELL_TOP_K (50), CORBELL_CHUNK_SIZE (50), CORBELL_CHUNK_OVERLAP (10),
        CORBELL_EXPAND_CALL_DEPTH (2), CORBELL_EXPAND_MAX_CHUNKS (30),
        CORBELL_RERANK (true; only ``false``/``0``/``no`` disable it),
        CORBELL_EMBEDDING_MODEL (all-MiniLM-L6-v2),
        CORBELL_MAX_FILE_BYTES (1048576),
        CORBELL_SKIP_DIRS (comma-separated, empty),
        CORBELL_LLM_MODEL (claude-sonnet-4-5).

    Args:
        workspace_path: Path to the workspace (repository) root directory;
            it is resolved to an absolute path before use.

    Returns:
        Fully populated WorkspaceConfig ready for use by the indexer and query engine.

    Raises:
        ValueError: if an integer-valued variable holds a non-integer.
    """
    workspace_path = workspace_path.resolve()

    # Anything other than an explicit "false"/"0"/"no" keeps reranking on.
    rerank_str = os.environ.get("CORBELL_RERANK", "true").strip().lower()
    rerank = rerank_str not in ("false", "0", "no")

    # `or` (not a .get default) so a set-but-empty variable counts as unset,
    # matching how db_path_for_workspace resolves the embedding model.
    embedding_model = os.environ.get("CORBELL_EMBEDDING_MODEL") or "all-MiniLM-L6-v2"
    llm_model = os.environ.get("CORBELL_LLM_MODEL") or "claude-sonnet-4-5"

    skip_dirs = [
        entry.strip()
        for entry in os.environ.get("CORBELL_SKIP_DIRS", "").split(",")
        if entry.strip()
    ]

    # Single repo: workspace root IS the repo
    repo = RepoConfig(
        id=workspace_path.name,
        path=str(workspace_path),
        language=_detect_language(workspace_path),
        resolved_path=workspace_path,
    )

    return WorkspaceConfig(
        repos=[repo],
        storage=StorageConfig(model=embedding_model),
        query=QueryConfig(
            top_k=_env_int("CORBELL_TOP_K", 50),
            expand_call_depth=_env_int("CORBELL_EXPAND_CALL_DEPTH", 2),
            expand_max_chunks=_env_int("CORBELL_EXPAND_MAX_CHUNKS", 30),
            rerank=rerank,
        ),
        indexing=IndexingConfig(
            skip_dirs=skip_dirs,
            max_file_bytes=_env_int("CORBELL_MAX_FILE_BYTES", 1024 * 1024),
            chunk_size=_env_int("CORBELL_CHUNK_SIZE", 50),
            chunk_overlap=_env_int("CORBELL_CHUNK_OVERLAP", 10),
        ),
        llm=LLMConfig(model=llm_model),
    )