codebase-retrieval-context-engine 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. codebase_retrieval_context_engine-2.0.0.dist-info/METADATA +505 -0
  2. codebase_retrieval_context_engine-2.0.0.dist-info/RECORD +46 -0
  3. codebase_retrieval_context_engine-2.0.0.dist-info/WHEEL +4 -0
  4. codebase_retrieval_context_engine-2.0.0.dist-info/entry_points.txt +3 -0
  5. codebase_retrieval_context_engine-2.0.0.dist-info/licenses/LICENSE +201 -0
  6. corbell/__init__.py +6 -0
  7. corbell/cli/__init__.py +1 -0
  8. corbell/cli/commands/__init__.py +1 -0
  9. corbell/cli/commands/index.py +86 -0
  10. corbell/cli/commands/query.py +71 -0
  11. corbell/cli/main.py +57 -0
  12. corbell/core/__init__.py +1 -0
  13. corbell/core/constants.py +52 -0
  14. corbell/core/embeddings/__init__.py +6 -0
  15. corbell/core/embeddings/base.py +68 -0
  16. corbell/core/embeddings/extractor.py +201 -0
  17. corbell/core/embeddings/factory.py +48 -0
  18. corbell/core/embeddings/model.py +401 -0
  19. corbell/core/embeddings/search_cache.py +95 -0
  20. corbell/core/embeddings/sqlite_store.py +271 -0
  21. corbell/core/gitignore.py +76 -0
  22. corbell/core/graph/__init__.py +1 -0
  23. corbell/core/graph/builder.py +696 -0
  24. corbell/core/graph/method_graph.py +1077 -0
  25. corbell/core/graph/providers/__init__.py +6 -0
  26. corbell/core/graph/providers/aws_patterns.py +62 -0
  27. corbell/core/graph/providers/azure_patterns.py +64 -0
  28. corbell/core/graph/providers/gcp_patterns.py +59 -0
  29. corbell/core/graph/schema.py +175 -0
  30. corbell/core/graph/sqlite_store.py +500 -0
  31. corbell/core/indexing/__init__.py +1 -0
  32. corbell/core/indexing/builder.py +608 -0
  33. corbell/core/indexing/lock.py +150 -0
  34. corbell/core/indexing/tracker.py +245 -0
  35. corbell/core/llm_client.py +677 -0
  36. corbell/core/mcp/__init__.py +1 -0
  37. corbell/core/mcp/server.py +214 -0
  38. corbell/core/query/__init__.py +1 -0
  39. corbell/core/query/diagnostics.py +38 -0
  40. corbell/core/query/engine.py +321 -0
  41. corbell/core/query/enhancer.py +102 -0
  42. corbell/core/query/formatter.py +98 -0
  43. corbell/core/query/graph_expander.py +284 -0
  44. corbell/core/query/merger.py +171 -0
  45. corbell/core/query/reranker.py +131 -0
  46. corbell/core/workspace.py +408 -0
@@ -0,0 +1,201 @@
1
+ """Code chunk extractor for embedding indexing.
2
+
3
+ Extracts code chunks (functions, classes, methods) from source files.
4
+ Python files are chunked per function/class/method using ``ast``; all other languages use a generic overlapping line-based split.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import ast
10
+ from dataclasses import dataclass
11
+ from pathlib import Path
12
+ from typing import List, Optional
13
+
14
+ import pathspec
15
+
16
+ from corbell.core.constants import EXTENSION_LANG as _SUPPORTED, SKIP_DIRS as _SKIP_DIRS
17
+ from corbell.core.gitignore import load_gitignore
18
+
19
+
20
@dataclass
class EmbeddingRecord:
    """One extracted piece of source code, ready to be embedded and stored.

    The first nine fields are required and positional; ``symbol`` and
    ``embedding`` default to ``None``.
    """

    # --- identity -------------------------------------------------------
    id: str
    service_id: str
    repo: str
    # Path of the source file, relative to the repository root.
    file_path: str

    # --- location and payload ------------------------------------------
    start_line: int
    end_line: int
    content: str
    language: str
    # One of: function | class | method | block
    chunk_type: str

    # --- optional metadata ---------------------------------------------
    # Function/class name when it is known.
    symbol: Optional[str] = None
    # Embedding vector; ``None`` until one is attached.
    embedding: Optional[List[float]] = None
35
+
36
+
37
class CodeChunkExtractor:
    """Extract meaningful code chunks from source files in a repo.

    Python sources are chunked per class / function / method via ``ast``.
    Every other supported language, and any Python file that fails to parse
    or defines nothing, is split into overlapping line-based blocks.

    Produces :class:`EmbeddingRecord` instances that can be stored in any
    embedding backend.
    """

    def __init__(self, chunk_size: int = 50, overlap: int = 10):
        """Initialize the extractor.

        Args:
            chunk_size: Lines per generic block chunk.
            overlap: Overlap between consecutive generic chunks.
        """
        self.chunk_size = chunk_size
        self.overlap = overlap

    def extract_from_repo(
        self,
        repo_path: Path | str,
        service_id: str,
        max_file_bytes: int = 1024 * 1024,
        gitignore_spec: Optional[pathspec.PathSpec] = None,
    ) -> List[EmbeddingRecord]:
        """Walk a repo and extract all code chunks.

        Args:
            repo_path: Root directory of the repository.
            service_id: ID of the owning service.
            max_file_bytes: Skip files larger than this.
            gitignore_spec: Pre-loaded PathSpec for gitignore filtering.
                If None, it is loaded from the repo automatically.

        Returns:
            List of :class:`EmbeddingRecord` ready for embedding.
        """
        repo_path = Path(repo_path)
        if gitignore_spec is None:
            gitignore_spec = load_gitignore(repo_path)
        records: List[EmbeddingRecord] = []

        for fp in repo_path.rglob("*"):
            if not fp.is_file():
                continue
            # Pass the root so skip-dir matching only looks at path components
            # inside the repo, never at the directories the repo lives under.
            if self._should_skip(fp, max_file_bytes, root=repo_path):
                continue
            lang = _SUPPORTED.get(fp.suffix)
            if not lang:
                continue
            rel = str(fp.relative_to(repo_path))
            # Normalise Windows separators before gitignore matching.
            if gitignore_spec.match_file(rel.replace("\\", "/")):
                continue
            records.extend(
                self._extract_file(fp, rel, lang, service_id, str(repo_path))
            )

        return records

    # ------------------------------------------------------------------ #
    # Internal helpers                                                     #
    # ------------------------------------------------------------------ #

    def _should_skip(
        self, fp: Path, max_bytes: int, root: Optional[Path] = None
    ) -> bool:
        """Return True when ``fp`` must not be indexed.

        A file is skipped when one of its path components is listed in the
        skip-dir set, when it is larger than ``max_bytes``, or when it cannot
        be stat'ed.

        Args:
            fp: Candidate file.
            max_bytes: Size limit in bytes.
            root: Repository root. When given, only the components of ``fp``
                relative to ``root`` are checked against the skip list, so a
                repo checked out under e.g. ``/home/me/build/`` is not
                discarded wholesale. When omitted (or when ``fp`` is not under
                ``root``) the full path is checked, as before.
        """
        parts = fp.parts
        if root is not None:
            try:
                parts = fp.relative_to(root).parts
            except ValueError:
                # fp is not located under root; fall back to the full path.
                pass
        if any(part in _SKIP_DIRS for part in parts):
            return True
        try:
            return fp.stat().st_size > max_bytes
        except OSError:
            # Unreadable metadata: treat as not indexable.
            return True

    def _extract_file(
        self, fp: Path, rel: str, lang: str, service_id: str, repo: str
    ) -> List[EmbeddingRecord]:
        """Read one file and dispatch to the language-specific chunker.

        Undecodable bytes are dropped (``errors="ignore"``); a file that
        cannot be read at all contributes no records.
        """
        try:
            content = fp.read_text(encoding="utf-8", errors="ignore")
        except Exception:
            # Deliberate best-effort: one unreadable file must not abort the walk.
            return []

        if lang == "python":
            return self._extract_python(content, rel, service_id, repo)
        return self._extract_generic(content, rel, lang, service_id, repo)

    def _extract_python(
        self, content: str, rel: str, service_id: str, repo: str
    ) -> List[EmbeddingRecord]:
        """Use Python's ast to extract function/class definitions.

        Emits one record per class and per function. Functions found while
        inside a class body are tagged ``method`` and get a dotted symbol such
        as ``Outer.Inner.method``. Function bodies are not descended into, so
        nested functions stay part of their parent's chunk.

        Falls back to :meth:`_extract_generic` when the file has a syntax
        error or contains no definitions.
        """
        records: List[EmbeddingRecord] = []
        lines = content.splitlines()

        try:
            tree = ast.parse(content)
        except SyntaxError:
            return self._extract_generic(content, rel, "python", service_id, repo)

        # Number of times each base id has been emitted for this file.
        id_counts = {}

        class _Visitor(ast.NodeVisitor):
            def __init__(self_v):
                self_v.class_stack: List[str] = []

            def _emit(self_v, node, name: str, chunk_type: str):
                line_start = node.lineno
                line_end = getattr(node, "end_lineno", node.lineno)
                chunk_content = "\n".join(lines[line_start - 1 : line_end])
                symbol = ".".join(self_v.class_stack + [name])

                # Several definitions can share a qualified name (property
                # getter/setter pairs, typing.overload stubs, conditional
                # definitions). Keep the first id unchanged and suffix later
                # ones so the ids stay unique within the file. "#" cannot
                # occur in an identifier, so the suffix never clashes with a
                # real symbol.
                base_id = f"{service_id}::{rel}::{symbol}"
                occurrence = id_counts.get(base_id, 0) + 1
                id_counts[base_id] = occurrence
                rec_id = base_id if occurrence == 1 else f"{base_id}#{occurrence}"

                records.append(
                    EmbeddingRecord(
                        id=rec_id,
                        service_id=service_id,
                        repo=repo,
                        file_path=rel,
                        start_line=line_start,
                        end_line=line_end,
                        content=chunk_content,
                        language="python",
                        chunk_type=chunk_type,
                        symbol=symbol,
                    )
                )

            def visit_ClassDef(self_v, node):
                self_v._emit(node, node.name, "class")
                # Track nesting so members get a dotted, class-qualified symbol.
                self_v.class_stack.append(node.name)
                self_v.generic_visit(node)
                self_v.class_stack.pop()

            def visit_FunctionDef(self_v, node):
                chunk_type = "method" if self_v.class_stack else "function"
                self_v._emit(node, node.name, chunk_type)

            visit_AsyncFunctionDef = visit_FunctionDef

        _Visitor().visit(tree)
        # Fall back to generic if nothing found
        if not records:
            return self._extract_generic(content, rel, "python", service_id, repo)
        return records

    def _extract_generic(
        self, content: str, rel: str, lang: str, service_id: str, repo: str
    ) -> List[EmbeddingRecord]:
        """Split file into overlapping line-based blocks.

        Blocks start every ``chunk_size - overlap`` lines (at least 1) and
        span up to ``chunk_size`` lines. Blocks containing only whitespace are
        dropped. Iteration stops as soon as a block reaches the end of the
        file, so no trailing block is emitted whose lines are all already
        contained in the previous one.
        """
        lines = content.splitlines()
        total = len(lines)
        records: List[EmbeddingRecord] = []
        step = max(1, self.chunk_size - self.overlap)

        for i in range(0, total, step):
            end = min(i + self.chunk_size, total)
            chunk_lines = lines[i:end]
            if any(line.strip() for line in chunk_lines):
                records.append(
                    EmbeddingRecord(
                        id=f"{service_id}::{rel}::block_{i}",
                        service_id=service_id,
                        repo=repo,
                        file_path=rel,
                        start_line=i + 1,  # 1-based, inclusive
                        end_line=end,
                        content="\n".join(chunk_lines),
                        language=lang,
                        chunk_type="block",
                    )
                )
            if end >= total:
                # This block already covers the tail of the file; any further
                # block would be a strict subset of it.
                break

        return records
@@ -0,0 +1,48 @@
1
+ """Factory for creating EmbeddingStore instances by backend name.
2
+
3
+ To add a new backend:
4
+ 1. Create a class that implements :class:`~corbell.core.embeddings.base.EmbeddingStore`.
5
+ 2. Add an ``elif backend == "<name>":`` branch below.
6
+ 3. Users opt in via ``storage.embeddings.backend: <name>`` in workspace.yaml.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from pathlib import Path
12
+
13
+ from corbell.core.embeddings.base import EmbeddingStore
14
+
15
+ _SUPPORTED_BACKENDS = ("sqlite",)
16
+
17
+
18
def get_embedding_store(backend: str, db_path: Path) -> EmbeddingStore:
    """Build the :class:`EmbeddingStore` that matches a backend name.

    The name is matched case-insensitively and with surrounding whitespace
    removed.

    Args:
        backend: Backend identifier string (e.g. ``"sqlite"``).
        db_path: Path to the storage file / directory.

    Returns:
        A concrete :class:`EmbeddingStore` instance.

    Raises:
        ValueError: If ``backend`` is not a recognised backend name.
    """
    backend = backend.lower().strip()

    # Guard clause: reject anything that is not a known backend up front.
    # When a new backend is added, extend this check and add its own
    # lazily-imported construction branch below, e.g. for "kuzu":
    #     from corbell.core.embeddings.kuzu_store import KuzuEmbeddingStore
    #     return KuzuEmbeddingStore(db_path)
    if backend != "sqlite":
        raise ValueError(
            f"Unknown embedding backend: {backend!r}. "
            f"Supported backends: {', '.join(_SUPPORTED_BACKENDS)}. "
            f"Set 'storage.embeddings.backend' in workspace.yaml."
        )

    # Imported lazily so the store module is only loaded when it is used.
    from corbell.core.embeddings.sqlite_store import SQLiteEmbeddingStore

    return SQLiteEmbeddingStore(db_path)
@@ -0,0 +1,401 @@
1
+ """Embedding model interface + SentenceTransformers implementation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ import os
7
+ import random
8
+ from abc import ABC, abstractmethod
9
+ from typing import List, Optional
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
class EmbeddingModel(ABC):
    """Abstract embedding model interface.

    Concrete models must provide both :meth:`encode` and the
    :attr:`dimension` property before they can be instantiated.
    """

    @property
    @abstractmethod
    def dimension(self) -> int:
        """Return the embedding dimension."""

    @abstractmethod
    def encode(self, texts: List[str]) -> List[List[float]]:
        """Encode a list of texts into embedding vectors.

        Args:
            texts: List of text strings to encode.

        Returns:
            List of float vectors (one per input text).
        """
34
+
35
+
36
class SentenceTransformerModel(EmbeddingModel):
    """Wraps ``sentence-transformers`` with lazy loading.

    Uses ``all-MiniLM-L6-v2`` by default (384-dim, fast, no API key).

    ``model_name`` may be:
      - a bare name (``"all-MiniLM-L6-v2"``), resolved under the
        ``sentence-transformers/`` namespace;
      - a fully-qualified id containing ``/`` (``"BAAI/bge-small-en"``),
        passed through unchanged;
      - a path that exists on disk, passed through unchanged.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        self.model_name = model_name
        self._model = None  # lazy-loaded on first use

    def _resolve_model_id(self) -> str:
        """Return the identifier handed to ``SentenceTransformer``.

        Only bare names get the ``sentence-transformers/`` prefix. Adding it
        to an id that already names an organisation, or to a local path,
        would produce an identifier that does not exist.
        """
        name = self.model_name
        if "/" in name or os.path.exists(name):
            return name
        return f"sentence-transformers/{name}"

    def _get_model(self):
        """Load the underlying model on first call and cache it."""
        if self._model is None:
            # Imported here so the dependency is only needed when this model is used.
            from sentence_transformers import SentenceTransformer

            self._model = SentenceTransformer(self._resolve_model_id())
        return self._model

    def encode(self, texts: List[str]) -> List[List[float]]:
        """Encode ``texts`` and return one plain-list vector per input.

        An empty input returns ``[]`` without loading the model.
        """
        if not texts:
            return []
        model = self._get_model()
        vecs = model.encode(texts, show_progress_bar=False)
        return [v.tolist() for v in vecs]

    @property
    def dimension(self) -> int:
        """Return the embedding dimension reported by the loaded model."""
        return self._get_model().get_sentence_embedding_dimension()
60
+
61
+
62
+ def _is_voyage_rate_limit_error(e: Exception) -> bool:
63
+ """Return True when a Voyage API error is a 429 rate limit."""
64
+ status = getattr(e, "status_code", None)
65
+ if status == 429:
66
+ return True
67
+ # Some Voyage SDK versions use a different attribute
68
+ code = getattr(e, "code", None)
69
+ return code == 429
70
+
71
+
72
+ def _is_google_key_error(e: Exception) -> bool:
73
+ """Return True when a Google API error is caused by the key, not the request."""
74
+ code = getattr(e, "code", None)
75
+ if code in (401, 403, 429):
76
+ return True
77
+ if code == 400:
78
+ msg = (getattr(e, "message", None) or str(e)).lower()
79
+ return "api key" in msg
80
+ return False
81
+
82
+
83
+ def _is_rate_limit_error(e: Exception) -> bool:
84
+ """Return True when a Google API error is a 429 RESOURCE_EXHAUSTED rate limit."""
85
+ return getattr(e, "code", None) == 429
86
+
87
+
88
+ def _parse_gemini_version(model_name: str) -> int:
89
+ """Parse the version number from a gemini-embedding model name.
90
+
91
+ Examples:
92
+ ``gemini-embedding-001`` → 1
93
+ ``gemini-embedding-2`` → 2
94
+
95
+ Returns 0 if the version cannot be parsed.
96
+ """
97
+ prefix = "gemini-embedding-"
98
+ if not model_name.startswith(prefix):
99
+ return 0
100
+ suffix = model_name[len(prefix):]
101
+ try:
102
+ return int(suffix)
103
+ except ValueError:
104
+ return 0
105
+
106
+
107
class GoogleEmbeddingModel(EmbeddingModel):
    """Google AI (Gemini) embedding model via the google-genai SDK.

    Uses ``gemini-embedding-001`` by default (768-dim, text-only).
    Requires ``pip install corbell[google]`` and ``GOOGLE_API_KEY``.

    Supports a comma-separated list of API keys for round-robin distribution
    and automatic failover when a key is invalid or quota-exhausted.

    Supports ``task_type`` to improve retrieval quality:
      - ``RETRIEVAL_DOCUMENT`` for indexing (default)
      - ``RETRIEVAL_QUERY`` for query-time encoding

    For ``gemini-embedding-2`` and later, inline text prefixes are used
    instead of relying solely on ``task_type`` for better retrieval quality.
    Use ``prepare_query`` and ``prepare_document`` to format texts before
    passing them to ``encode``.
    """

    _BATCH_SIZE = 100     # texts sent per embed_content request
    _BASE_DELAY = 2.0     # first back-off delay in seconds; doubles per attempt
    _MAX_BACKOFF = 60.0   # upper bound for a single back-off delay, in seconds

    def __init__(self, model_name: str = "gemini-embedding-001", api_key: Optional[str] = None):
        """Store the model name and parse the API key list.

        Args:
            model_name: Gemini embedding model identifier.
            api_key: One key or a comma-separated list of keys. Falls back to
                the ``GOOGLE_API_KEY`` environment variable when omitted.

        Raises:
            ValueError: If no non-empty key is available.
        """
        self.model_name = model_name
        raw = api_key or os.environ.get("GOOGLE_API_KEY") or ""
        self._api_keys: List[str] = [k.strip() for k in raw.split(",") if k.strip()]
        if not self._api_keys:
            raise ValueError(
                "GOOGLE_API_KEY is not set. "
                "Set it in your environment or workspace.yaml:\n"
                "  export GOOGLE_API_KEY=AIza...\n"
                "Or use a local embedding model (e.g. all-MiniLM-L6-v2) in storage.model."
            )
        # Start the rotation at a random key so instances do not all begin on key 0.
        self._key_index: int = random.randrange(len(self._api_keys))
        # kept for backwards-compat with tests that read _api_key directly
        self._api_key: str = self._api_keys[0]

    @property
    def uses_prefix_format(self) -> bool:
        """Return True when the model requires inline text prefixes for best quality.

        Activated for ``gemini-embedding-2`` and all later versions (version >= 2).
        """
        return _parse_gemini_version(self.model_name) >= 2

    def prepare_query(self, query: str) -> str:
        """Format a query string with a task prefix for retrieval.

        Only applies the prefix when ``uses_prefix_format`` is True.
        """
        if self.uses_prefix_format:
            return f"task: code retrieval | query: {query}"
        return query

    def prepare_document(self, content: str, title: Optional[str] = None) -> str:
        """Format a document chunk with a title prefix for indexing.

        Only applies the prefix when ``uses_prefix_format`` is True.

        Args:
            content: Raw chunk text.
            title: Descriptive title, typically ``"{file_path}:{symbol}"``
                or ``"{file_path}:L{start}-{end}"``. Defaults to ``"none"``.
        """
        if self.uses_prefix_format:
            resolved_title = title or "none"
            return f"title: {resolved_title} | text: {content}"
        return content

    def encode(self, texts: List[str], task_type: str = "RETRIEVAL_DOCUMENT") -> List[List[float]]:
        """Encode a list of texts into embedding vectors.

        Batches requests (100 texts/batch) and retries on rate limit (429)
        with exponential backoff.

        Args:
            texts: List of text strings to encode.
            task_type: Task type hint for the embedding model.
                Use ``RETRIEVAL_DOCUMENT`` when indexing, ``RETRIEVAL_QUERY`` at query time.

        Returns:
            List of float vectors (one per input text).

        Raises:
            ImportError: If the google-genai SDK is not installed.
            RuntimeError: If no API key is usable (see
                :meth:`_embed_batch_with_retry`).
        """
        try:
            from google import genai
            from google.genai import types
        except ImportError as exc:
            # Chain the cause so the missing module name stays visible.
            raise ImportError("pip install corbell[google]") from exc

        all_embeddings: List[List[float]] = []
        for batch_start in range(0, len(texts), self._BATCH_SIZE):
            batch = texts[batch_start:batch_start + self._BATCH_SIZE]
            contents = [types.Content(parts=[types.Part(text=t)]) for t in batch]
            all_embeddings.extend(
                self._embed_batch_with_retry(contents, task_type, genai, types)
            )

        return all_embeddings

    def _embed_batch_with_retry(
        self, contents, task_type: str, genai, types
    ) -> List[List[float]]:
        """Embed a single batch, rotating keys and retrying on rate limit.

        - A key that fails with an auth error (401/403/400+apikey) is dropped
          for the rest of this call.
        - If every key still in play is rate-limited (429), waits with capped
          exponential backoff and retries indefinitely until the quota is
          restored.
        - If any key fails with a non-key error, raises immediately.
        - Raises ``RuntimeError`` only once no key is left that could still
          succeed, i.e. every key has failed with an auth error. A mix of
          throttled and invalid keys therefore keeps retrying on the
          throttled ones instead of being reported as an auth failure.
        """
        import time

        key_count = len(self._api_keys)
        start = self._key_index
        rate_limit_attempt = 0
        # key index -> formatted auth error, in the order the keys were tried
        auth_errors = {}

        while True:
            any_rate_limited = False

            for offset in range(key_count):
                idx = (start + offset) % key_count
                if idx in auth_errors:
                    # Already known to be invalid; retrying it cannot help.
                    continue
                try:
                    client = genai.Client(api_key=self._api_keys[idx])
                    result = client.models.embed_content(
                        model=self.model_name,
                        contents=contents,
                        config=types.EmbedContentConfig(
                            task_type=task_type,
                            output_dimensionality=self.dimension,
                        ),
                    )
                    # Next call starts on the key after the one that worked.
                    self._key_index = (idx + 1) % key_count
                    return [emb.values for emb in result.embeddings]
                except Exception as e:
                    if not _is_google_key_error(e):
                        # Problem with the request itself; another key will not fix it.
                        raise
                    if _is_rate_limit_error(e):
                        any_rate_limited = True
                    else:
                        # Auth failure (401/403/400+apikey) — not a transient error
                        auth_errors[idx] = f"key[{idx}]: {e}"

            if not any_rate_limited:
                raise RuntimeError(
                    f"All {len(self._api_keys)} Google API key(s) failed with auth errors:\n"
                    + "\n".join(auth_errors.values())
                )

            # Every key still in play is rate-limited (429) — wait and retry.
            delay = min(self._BASE_DELAY * (2 ** rate_limit_attempt), self._MAX_BACKOFF)
            rate_limit_attempt += 1
            logger.warning(
                "All %d Google API key(s) rate-limited (429). "
                "Retrying in %.0fs (attempt %d)...",
                key_count - len(auth_errors),
                delay,
                rate_limit_attempt,
            )
            time.sleep(delay)

    @property
    def dimension(self) -> int:
        """Return the requested output dimensionality.

        Reads ``CORBELL_EMBEDDING_DIM`` from the environment and falls back
        to 768 when it is unset or blank.

        Raises:
            ValueError: If ``CORBELL_EMBEDDING_DIM`` is set but not an integer.
        """
        dim_env = os.environ.get("CORBELL_EMBEDDING_DIM", "").strip()
        if not dim_env:
            return 768
        try:
            return int(dim_env)
        except ValueError as exc:
            raise ValueError(
                f"CORBELL_EMBEDDING_DIM must be an integer, got {dim_env!r}"
            ) from exc
275
+
276
+
277
class VoyageEmbeddingModel(EmbeddingModel):
    """Voyage AI embedding model via the voyageai SDK.

    Uses ``voyage-code-3`` by default (1024-dim, optimized for code retrieval).
    Requires ``pip install corbell[voyage]`` and ``VOYAGE_API_KEY``.

    Supports a comma-separated list of API keys for round-robin distribution
    and automatic failover when a key is quota-exhausted.

    The ``input_type`` parameter improves retrieval quality:
      - ``"document"`` for indexing (default)
      - ``"query"`` for query-time encoding

    Use ``prepare_query`` and ``prepare_document`` which return the text unchanged
    (Voyage handles task differentiation via ``input_type``).
    """

    _BATCH_SIZE = 1000    # texts sent per embed request
    _BASE_DELAY = 2.0     # first back-off delay in seconds; doubles per attempt
    _MAX_BACKOFF = 60.0   # upper bound for a single back-off delay, in seconds

    def __init__(self, model_name: str = "voyage-code-3", api_key: Optional[str] = None):
        """Store the model name and parse the API key list.

        Args:
            model_name: Voyage embedding model identifier.
            api_key: One key or a comma-separated list of keys. Falls back to
                the ``VOYAGE_API_KEY`` environment variable when omitted.

        Raises:
            ValueError: If no non-empty key is available.
        """
        self.model_name = model_name
        raw = api_key or os.environ.get("VOYAGE_API_KEY") or ""
        self._api_keys: List[str] = [k.strip() for k in raw.split(",") if k.strip()]
        if not self._api_keys:
            raise ValueError(
                "VOYAGE_API_KEY is not set. "
                "Set it in your environment or workspace.yaml:\n"
                "  export VOYAGE_API_KEY=pa-...\n"
                "Or use a local embedding model (e.g. all-MiniLM-L6-v2) in storage.model."
            )
        # Start the rotation at a random key so instances do not all begin on key 0.
        self._key_index: int = random.randrange(len(self._api_keys))
        # kept for backwards-compat with tests that read _api_key directly
        self._api_key: str = self._api_keys[0]

    def prepare_query(self, query: str) -> str:
        """Return query unchanged; Voyage handles task differentiation via input_type."""
        return query

    def prepare_document(self, content: str, title: Optional[str] = None) -> str:
        """Return content unchanged; Voyage handles task differentiation via input_type."""
        return content

    def encode(self, texts: List[str], input_type: str = "document") -> List[List[float]]:
        """Encode a list of texts into embedding vectors.

        Batches requests (1000 texts/batch) and retries on rate limit (429)
        with exponential backoff.

        Args:
            texts: List of text strings to encode.
            input_type: Voyage input type hint. Use ``"document"`` when indexing,
                ``"query"`` at query time.

        Returns:
            List of float vectors (one per input text).

        Raises:
            ImportError: If the voyageai SDK is not installed.
        """
        try:
            import voyageai
        except ImportError as exc:
            # Chain the cause so the missing module name stays visible.
            raise ImportError("pip install corbell[voyage]") from exc

        all_embeddings: List[List[float]] = []
        for batch_start in range(0, len(texts), self._BATCH_SIZE):
            batch = texts[batch_start:batch_start + self._BATCH_SIZE]
            all_embeddings.extend(
                self._embed_batch_with_retry(batch, input_type, voyageai)
            )

        return all_embeddings

    def _embed_batch_with_retry(
        self, batch: List[str], input_type: str, voyageai
    ) -> List[List[float]]:
        """Embed a single batch, rotating keys and retrying on rate limit.

        - If all keys fail with 429 (rate limit), waits with capped exponential
          backoff and retries indefinitely until the quota is restored.
        - If any key fails with a non-rate-limit error, raises immediately.
        """
        import time

        key_count = len(self._api_keys)
        start = self._key_index
        rate_limit_attempt = 0

        while True:
            for offset in range(key_count):
                idx = (start + offset) % key_count
                try:
                    vo = voyageai.Client(api_key=self._api_keys[idx])
                    result = vo.embed(
                        batch,
                        model=self.model_name,
                        input_type=input_type,
                        output_dimension=self.dimension,
                    )
                    # Next call starts on the key after the one that worked.
                    self._key_index = (idx + 1) % key_count
                    return result.embeddings
                except Exception as e:
                    if not _is_voyage_rate_limit_error(e):
                        raise
                    # Throttled: record it for diagnostics and try the next key.
                    logger.debug("Voyage key[%d] rate-limited: %s", idx, e)

            # All keys are rate-limited (429) — wait and retry indefinitely
            delay = min(self._BASE_DELAY * (2 ** rate_limit_attempt), self._MAX_BACKOFF)
            rate_limit_attempt += 1
            logger.warning(
                "All %d Voyage API key(s) rate-limited (429). "
                "Retrying in %.0fs (attempt %d)...",
                key_count,
                delay,
                rate_limit_attempt,
            )
            time.sleep(delay)

    @property
    def dimension(self) -> int:
        """Return the requested output dimension.

        Reads ``CORBELL_EMBEDDING_DIM`` from the environment and falls back
        to 1024 when it is unset or blank.

        Raises:
            ValueError: If ``CORBELL_EMBEDDING_DIM`` is set but not an integer.
        """
        dim_env = os.environ.get("CORBELL_EMBEDDING_DIM", "").strip()
        if not dim_env:
            return 1024
        try:
            return int(dim_env)
        except ValueError as exc:
            raise ValueError(
                f"CORBELL_EMBEDDING_DIM must be an integer, got {dim_env!r}"
            ) from exc