codebase-retrieval-context-engine 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codebase_retrieval_context_engine-2.0.0.dist-info/METADATA +505 -0
- codebase_retrieval_context_engine-2.0.0.dist-info/RECORD +46 -0
- codebase_retrieval_context_engine-2.0.0.dist-info/WHEEL +4 -0
- codebase_retrieval_context_engine-2.0.0.dist-info/entry_points.txt +3 -0
- codebase_retrieval_context_engine-2.0.0.dist-info/licenses/LICENSE +201 -0
- corbell/__init__.py +6 -0
- corbell/cli/__init__.py +1 -0
- corbell/cli/commands/__init__.py +1 -0
- corbell/cli/commands/index.py +86 -0
- corbell/cli/commands/query.py +71 -0
- corbell/cli/main.py +57 -0
- corbell/core/__init__.py +1 -0
- corbell/core/constants.py +52 -0
- corbell/core/embeddings/__init__.py +6 -0
- corbell/core/embeddings/base.py +68 -0
- corbell/core/embeddings/extractor.py +201 -0
- corbell/core/embeddings/factory.py +48 -0
- corbell/core/embeddings/model.py +401 -0
- corbell/core/embeddings/search_cache.py +95 -0
- corbell/core/embeddings/sqlite_store.py +271 -0
- corbell/core/gitignore.py +76 -0
- corbell/core/graph/__init__.py +1 -0
- corbell/core/graph/builder.py +696 -0
- corbell/core/graph/method_graph.py +1077 -0
- corbell/core/graph/providers/__init__.py +6 -0
- corbell/core/graph/providers/aws_patterns.py +62 -0
- corbell/core/graph/providers/azure_patterns.py +64 -0
- corbell/core/graph/providers/gcp_patterns.py +59 -0
- corbell/core/graph/schema.py +175 -0
- corbell/core/graph/sqlite_store.py +500 -0
- corbell/core/indexing/__init__.py +1 -0
- corbell/core/indexing/builder.py +608 -0
- corbell/core/indexing/lock.py +150 -0
- corbell/core/indexing/tracker.py +245 -0
- corbell/core/llm_client.py +677 -0
- corbell/core/mcp/__init__.py +1 -0
- corbell/core/mcp/server.py +214 -0
- corbell/core/query/__init__.py +1 -0
- corbell/core/query/diagnostics.py +38 -0
- corbell/core/query/engine.py +321 -0
- corbell/core/query/enhancer.py +102 -0
- corbell/core/query/formatter.py +98 -0
- corbell/core/query/graph_expander.py +284 -0
- corbell/core/query/merger.py +171 -0
- corbell/core/query/reranker.py +131 -0
- corbell/core/workspace.py +408 -0
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
"""Code chunk extractor for embedding indexing.
|
|
2
|
+
|
|
3
|
+
Extracts code chunks (functions, classes, methods) from source files.
|
|
4
|
+
Extracts function/class/method chunks using Python ast; generic line-split for others.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import ast
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import List, Optional
|
|
13
|
+
|
|
14
|
+
import pathspec
|
|
15
|
+
|
|
16
|
+
from corbell.core.constants import EXTENSION_LANG as _SUPPORTED, SKIP_DIRS as _SKIP_DIRS
|
|
17
|
+
from corbell.core.gitignore import load_gitignore
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass
class EmbeddingRecord:
    """One chunk of source code together with the metadata locating it.

    ``symbol`` and ``embedding`` are optional and default to ``None``.
    """

    # Identifier of this chunk.
    id: str
    # Identifier of the service that owns the chunk.
    service_id: str
    # Repository the chunk was read from.
    repo: str
    # Path of the source file, relative to the repository root.
    file_path: str
    # First line of the chunk in the source file.
    start_line: int
    # Last line of the chunk in the source file.
    end_line: int
    # Text of the chunk.
    content: str
    # Language of the source file.
    language: str
    # One of: function | class | method | block.
    chunk_type: str
    # Function/class name, when known.
    symbol: Optional[str] = None
    # Embedding vector; None until one is assigned.
    embedding: Optional[List[float]] = None
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class CodeChunkExtractor:
    """Extract meaningful code chunks from source files in a repo.

    Python files are parsed with :mod:`ast` and split into class, function
    and method chunks. Every other supported file, and any Python file that
    cannot be parsed or defines nothing, is split into overlapping
    line-based blocks.

    Produces :class:`EmbeddingRecord` instances that can be stored in any
    embedding backend.
    """

    def __init__(self, chunk_size: int = 50, overlap: int = 10):
        """Initialize the extractor.

        Args:
            chunk_size: Lines per generic block chunk.
            overlap: Overlap between consecutive generic chunks.
        """
        self.chunk_size = chunk_size
        self.overlap = overlap

    def extract_from_repo(
        self,
        repo_path: Path | str,
        service_id: str,
        max_file_bytes: int = 1024 * 1024,
        gitignore_spec: Optional[pathspec.PathSpec] = None,
    ) -> List[EmbeddingRecord]:
        """Walk a repo and extract all code chunks.

        Args:
            repo_path: Root directory of the repository.
            service_id: ID of the owning service.
            max_file_bytes: Skip files larger than this.
            gitignore_spec: Pre-loaded PathSpec for gitignore filtering.
                If None, it is loaded from the repo automatically.

        Returns:
            List of :class:`EmbeddingRecord` ready for embedding.
        """
        repo_path = Path(repo_path)
        if gitignore_spec is None:
            gitignore_spec = load_gitignore(repo_path)
        records: List[EmbeddingRecord] = []

        for fp in repo_path.rglob("*"):
            # Cheapest filter first: the suffix lookup needs no filesystem access.
            lang = _SUPPORTED.get(fp.suffix)
            if not lang:
                continue
            if not fp.is_file():
                continue
            # Pass the root so that only components *inside* the repo are
            # compared against the skip list.
            if self._should_skip(fp, max_file_bytes, root=repo_path):
                continue
            rel = str(fp.relative_to(repo_path))
            # The gitignore spec is matched against a forward-slash path.
            if gitignore_spec.match_file(rel.replace("\\", "/")):
                continue
            records.extend(
                self._extract_file(fp, rel, lang, service_id, str(repo_path))
            )

        return records

    # ------------------------------------------------------------------ #
    # Internal helpers                                                     #
    # ------------------------------------------------------------------ #

    def _should_skip(
        self, fp: Path, max_bytes: int, root: Optional[Path] = None
    ) -> bool:
        """Return True when *fp* must not be indexed.

        A file is skipped when one of its path components is listed in
        ``_SKIP_DIRS``, when it is larger than *max_bytes*, or when it
        cannot be stat'ed.

        Args:
            fp: File to check.
            max_bytes: Size limit in bytes.
            root: Repository root. When given, only the components of *fp*
                below *root* are compared against ``_SKIP_DIRS``, so a
                checkout that itself lives under a directory with a skipped
                name is not excluded wholesale. When omitted, every
                component of *fp* is checked.
        """
        parts = fp.parts
        if root is not None:
            try:
                parts = fp.relative_to(root).parts
            except ValueError:
                # fp is not located under root; judge the full path instead.
                pass
        if any(part in _SKIP_DIRS for part in parts):
            return True
        try:
            return fp.stat().st_size > max_bytes
        except OSError:
            # A file that cannot be stat'ed cannot be read either.
            return True

    def _extract_file(
        self, fp: Path, rel: str, lang: str, service_id: str, repo: str
    ) -> List[EmbeddingRecord]:
        """Read *fp* and dispatch to the extractor matching *lang*.

        Undecodable bytes are dropped while reading. A file that cannot be
        read at all yields no records (best effort).
        """
        try:
            content = fp.read_text(encoding="utf-8", errors="ignore")
        except Exception:
            return []

        if lang == "python":
            return self._extract_python(content, rel, service_id, repo)
        return self._extract_generic(content, rel, lang, service_id, repo)

    def _extract_python(
        self, content: str, rel: str, service_id: str, repo: str
    ) -> List[EmbeddingRecord]:
        """Use Python's ast to extract function/class definitions.

        Classes are emitted as a whole and additionally descended into, so
        their methods (and nested classes) become records of their own with
        a dotted ``symbol``. Function bodies are not descended into.

        Falls back to :meth:`_extract_generic` when the source cannot be
        parsed or contains no class/function definition.
        """
        try:
            tree = ast.parse(content)
        except (SyntaxError, ValueError, RecursionError):
            # ValueError is what compile() documents for source containing
            # null bytes; RecursionError covers pathologically nested code.
            # Neither should abort the indexing of the whole repository.
            return self._extract_generic(content, rel, "python", service_id, repo)

        # Split on the line terminators the Python tokenizer counts
        # (\r\n, \r, \n). str.splitlines() also splits on form feed and
        # friends, which would shift every chunk after such a character.
        lines = content.replace("\r\n", "\n").replace("\r", "\n").split("\n")
        records: List[EmbeddingRecord] = []
        used_ids = set()

        class _Visitor(ast.NodeVisitor):
            def __init__(self_v):
                self_v.class_stack: List[str] = []

            def _emit(self_v, node, name: str, chunk_type: str):
                line_start = node.lineno
                line_end = getattr(node, "end_lineno", None) or line_start
                symbol = ".".join(self_v.class_stack + [name])
                rec_id = f"{service_id}::{rel}::{symbol}"
                if rec_id in used_ids:
                    # Same-named definitions (property getter/setter pairs,
                    # conditional redefinitions) would otherwise share one
                    # id; the first keeps the plain id, later ones are
                    # disambiguated by their start line.
                    rec_id = f"{rec_id}@L{line_start}"
                used_ids.add(rec_id)
                records.append(
                    EmbeddingRecord(
                        id=rec_id,
                        service_id=service_id,
                        repo=repo,
                        file_path=rel,
                        start_line=line_start,
                        end_line=line_end,
                        content="\n".join(lines[line_start - 1 : line_end]),
                        language="python",
                        chunk_type=chunk_type,
                        symbol=symbol,
                    )
                )

            def visit_ClassDef(self_v, node):
                self_v._emit(node, node.name, "class")
                self_v.class_stack.append(node.name)
                self_v.generic_visit(node)
                self_v.class_stack.pop()

            def visit_FunctionDef(self_v, node):
                chunk_type = "method" if self_v.class_stack else "function"
                self_v._emit(node, node.name, chunk_type)

            visit_AsyncFunctionDef = visit_FunctionDef

        _Visitor().visit(tree)
        # Fall back to generic if nothing found
        if not records:
            return self._extract_generic(content, rel, "python", service_id, repo)
        return records

    def _extract_generic(
        self, content: str, rel: str, lang: str, service_id: str, repo: str
    ) -> List[EmbeddingRecord]:
        """Split file into overlapping line-based blocks.

        Windows of ``chunk_size`` lines are taken every
        ``chunk_size - overlap`` lines (at least 1). Windows containing only
        whitespace are dropped, and iteration stops as soon as a window
        reaches the end of the file.
        """
        lines = content.splitlines()
        total = len(lines)
        records: List[EmbeddingRecord] = []
        step = max(1, self.chunk_size - self.overlap)

        for start in range(0, total, step):
            end = min(start + self.chunk_size, total)
            chunk_lines = lines[start:end]
            if any(line.strip() for line in chunk_lines):
                records.append(
                    EmbeddingRecord(
                        id=f"{service_id}::{rel}::block_{start}",
                        service_id=service_id,
                        repo=repo,
                        file_path=rel,
                        start_line=start + 1,
                        end_line=end,
                        content="\n".join(chunk_lines),
                        language=lang,
                        chunk_type="block",
                    )
                )
            if end >= total:
                # Any further window would only repeat the tail of this one.
                break

        return records
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
"""Factory for creating EmbeddingStore instances by backend name.
|
|
2
|
+
|
|
3
|
+
To add a new backend:
|
|
4
|
+
1. Create a class that implements :class:`~corbell.core.embeddings.base.EmbeddingStore`.
|
|
5
|
+
2. Add an ``elif backend == "<name>":`` branch below.
|
|
6
|
+
3. Users opt in via ``storage.embeddings.backend: <name>`` in workspace.yaml.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
from corbell.core.embeddings.base import EmbeddingStore
|
|
14
|
+
|
|
15
|
+
_SUPPORTED_BACKENDS = ("sqlite",)


def get_embedding_store(backend: str, db_path: Path) -> EmbeddingStore:
    """Return an :class:`EmbeddingStore` for the requested backend.

    The backend name is matched case-insensitively and with surrounding
    whitespace ignored.

    Args:
        backend: Backend identifier string (e.g. ``"sqlite"``).
        db_path: Path to the storage file / directory.

    Returns:
        A concrete :class:`EmbeddingStore` instance.

    Raises:
        ValueError: If ``backend`` is not a recognised backend name. This
            includes non-string values such as ``None`` (e.g. an empty
            key in the configuration).
    """
    # Normalise only real strings; anything else falls through to the
    # ValueError below instead of failing with an AttributeError here.
    if isinstance(backend, str):
        backend = backend.lower().strip()

    if backend == "sqlite":
        # Imported lazily so the backend module is only loaded when used.
        from corbell.core.embeddings.sqlite_store import SQLiteEmbeddingStore

        return SQLiteEmbeddingStore(db_path)

    # ------------------------------------------------------------------ #
    # Future backends — add branches here, e.g.:                          #
    #   elif backend == "kuzu":                                            #
    #       from corbell.core.embeddings.kuzu_store import KuzuEmbeddingStore
    #       return KuzuEmbeddingStore(db_path)                             #
    # ------------------------------------------------------------------ #

    raise ValueError(
        f"Unknown embedding backend: {backend!r}. "
        f"Supported backends: {', '.join(_SUPPORTED_BACKENDS)}. "
        f"Set 'storage.embeddings.backend' in workspace.yaml."
    )
|
|
@@ -0,0 +1,401 @@
|
|
|
1
|
+
"""Embedding model interface + SentenceTransformers implementation."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
import os
|
|
7
|
+
import random
|
|
8
|
+
from abc import ABC, abstractmethod
|
|
9
|
+
from typing import List, Optional
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class EmbeddingModel(ABC):
    """Interface shared by all embedding models.

    Concrete models must provide both :attr:`dimension` and :meth:`encode`.
    """

    @property
    @abstractmethod
    def dimension(self) -> int:
        """Return the embedding dimension."""
        ...

    @abstractmethod
    def encode(self, texts: List[str]) -> List[List[float]]:
        """Encode a list of texts into embedding vectors.

        Args:
            texts: The strings to encode.

        Returns:
            One float vector per input text.
        """
        ...
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class SentenceTransformerModel(EmbeddingModel):
    """Wraps ``sentence-transformers`` with lazy loading.

    Uses ``all-MiniLM-L6-v2`` by default (384-dim, fast, no API key).

    A bare model name is looked up under the ``sentence-transformers/``
    namespace. A name that already contains a path separator (an
    organisation-qualified id such as ``org/model``, or a filesystem path)
    is passed to the library unchanged.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        self.model_name = model_name
        self._model = None  # lazy-loaded

    def _resolve_model_id(self) -> str:
        """Return the identifier handed to ``SentenceTransformer``."""
        name = self.model_name
        if "/" in name or "\\" in name:
            # Already qualified (org/model) or a path: prefixing it would
            # produce an identifier that does not exist.
            return name
        return f"sentence-transformers/{name}"

    def _get_model(self):
        """Load the underlying model on first use and cache it."""
        if self._model is None:
            # Imported here so the heavy dependency is only needed when a
            # local model is actually used.
            from sentence_transformers import SentenceTransformer

            self._model = SentenceTransformer(self._resolve_model_id())
        return self._model

    def encode(self, texts: List[str]) -> List[List[float]]:
        """Encode a list of texts into embedding vectors.

        Args:
            texts: List of text strings to encode.

        Returns:
            List of float vectors (one per input text); empty for empty
            input, in which case the model is not loaded.
        """
        if not texts:
            return []
        vecs = self._get_model().encode(texts, show_progress_bar=False)
        return [v.tolist() for v in vecs]

    @property
    def dimension(self) -> int:
        """Return the embedding dimension reported by the loaded model."""
        return self._get_model().get_sentence_embedding_dimension()
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _is_voyage_rate_limit_error(e: Exception) -> bool:
|
|
63
|
+
"""Return True when a Voyage API error is a 429 rate limit."""
|
|
64
|
+
status = getattr(e, "status_code", None)
|
|
65
|
+
if status == 429:
|
|
66
|
+
return True
|
|
67
|
+
# Some Voyage SDK versions use a different attribute
|
|
68
|
+
code = getattr(e, "code", None)
|
|
69
|
+
return code == 429
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _is_google_key_error(e: Exception) -> bool:
|
|
73
|
+
"""Return True when a Google API error is caused by the key, not the request."""
|
|
74
|
+
code = getattr(e, "code", None)
|
|
75
|
+
if code in (401, 403, 429):
|
|
76
|
+
return True
|
|
77
|
+
if code == 400:
|
|
78
|
+
msg = (getattr(e, "message", None) or str(e)).lower()
|
|
79
|
+
return "api key" in msg
|
|
80
|
+
return False
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def _is_rate_limit_error(e: Exception) -> bool:
|
|
84
|
+
"""Return True when a Google API error is a 429 RESOURCE_EXHAUSTED rate limit."""
|
|
85
|
+
return getattr(e, "code", None) == 429
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def _parse_gemini_version(model_name: str) -> int:
|
|
89
|
+
"""Parse the version number from a gemini-embedding model name.
|
|
90
|
+
|
|
91
|
+
Examples:
|
|
92
|
+
``gemini-embedding-001`` → 1
|
|
93
|
+
``gemini-embedding-2`` → 2
|
|
94
|
+
|
|
95
|
+
Returns 0 if the version cannot be parsed.
|
|
96
|
+
"""
|
|
97
|
+
prefix = "gemini-embedding-"
|
|
98
|
+
if not model_name.startswith(prefix):
|
|
99
|
+
return 0
|
|
100
|
+
suffix = model_name[len(prefix):]
|
|
101
|
+
try:
|
|
102
|
+
return int(suffix)
|
|
103
|
+
except ValueError:
|
|
104
|
+
return 0
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
class GoogleEmbeddingModel(EmbeddingModel):
    """Google AI (Gemini) embedding model via the google-genai SDK.

    Uses ``gemini-embedding-001`` by default (768-dim, text-only).
    Requires ``pip install corbell[google]`` and ``GOOGLE_API_KEY``.

    Supports a comma-separated list of API keys for round-robin distribution
    and automatic failover when a key is invalid or quota-exhausted.

    Supports ``task_type`` to improve retrieval quality:
    - ``RETRIEVAL_DOCUMENT`` for indexing (default)
    - ``RETRIEVAL_QUERY`` for query-time encoding

    For ``gemini-embedding-2`` and later, inline text prefixes are used
    instead of relying solely on ``task_type`` for better retrieval quality.
    Use ``prepare_query`` and ``prepare_document`` to format texts before
    passing them to ``encode``.
    """

    _BATCH_SIZE = 100     # texts per embed_content request
    _BASE_DELAY = 2.0     # first back-off delay, in seconds
    _MAX_BACKOFF = 60.0   # upper bound for the back-off delay, in seconds

    def __init__(self, model_name: str = "gemini-embedding-001", api_key: Optional[str] = None):
        """Store the model name and the API key(s).

        Args:
            model_name: Name of the embedding model.
            api_key: One key or a comma-separated list of keys. Falls back
                to the ``GOOGLE_API_KEY`` environment variable.

        Raises:
            ValueError: If no non-empty key is available.
        """
        self.model_name = model_name
        raw = api_key or os.environ.get("GOOGLE_API_KEY") or ""
        self._api_keys: List[str] = [k.strip() for k in raw.split(",") if k.strip()]
        if not self._api_keys:
            raise ValueError(
                "GOOGLE_API_KEY is not set. "
                "Set it in your environment or workspace.yaml:\n"
                "  export GOOGLE_API_KEY=AIza...\n"
                "Or use a local embedding model (e.g. all-MiniLM-L6-v2) in storage.model."
            )
        # Start the rotation at a random key.
        self._key_index: int = random.randrange(len(self._api_keys))
        # kept for backwards-compat with tests that read _api_key directly
        self._api_key: str = self._api_keys[0]

    @property
    def uses_prefix_format(self) -> bool:
        """Return True when the model requires inline text prefixes for best quality.

        Activated for ``gemini-embedding-2`` and all later versions (version >= 2).
        """
        return _parse_gemini_version(self.model_name) >= 2

    def prepare_query(self, query: str) -> str:
        """Format a query string with a task prefix for retrieval.

        Only applies the prefix when ``uses_prefix_format`` is True.
        """
        if self.uses_prefix_format:
            return f"task: code retrieval | query: {query}"
        return query

    def prepare_document(self, content: str, title: Optional[str] = None) -> str:
        """Format a document chunk with a title prefix for indexing.

        Only applies the prefix when ``uses_prefix_format`` is True.

        Args:
            content: Raw chunk text.
            title: Descriptive title, typically ``"{file_path}:{symbol}"``
                   or ``"{file_path}:L{start}-{end}"``. Defaults to ``"none"``.
        """
        if self.uses_prefix_format:
            resolved_title = title or "none"
            return f"title: {resolved_title} | text: {content}"
        return content

    def encode(self, texts: List[str], task_type: str = "RETRIEVAL_DOCUMENT") -> List[List[float]]:
        """Encode a list of texts into embedding vectors.

        Batches requests (100 texts/batch) and retries on rate limit (429)
        with exponential backoff.

        Args:
            texts: List of text strings to encode.
            task_type: Task type hint for the embedding model.
                Use ``RETRIEVAL_DOCUMENT`` when indexing, ``RETRIEVAL_QUERY`` at query time.

        Returns:
            List of float vectors (one per input text).

        Raises:
            ImportError: If the google-genai SDK is not installed.
        """
        try:
            from google import genai
            from google.genai import types
        except ImportError as exc:
            raise ImportError("pip install corbell[google]") from exc

        all_embeddings: List[List[float]] = []
        for batch_start in range(0, len(texts), self._BATCH_SIZE):
            batch = texts[batch_start:batch_start + self._BATCH_SIZE]
            contents = [types.Content(parts=[types.Part(text=t)]) for t in batch]
            all_embeddings.extend(
                self._embed_batch_with_retry(contents, task_type, genai, types)
            )

        return all_embeddings

    def _embed_batch_with_retry(
        self, contents, task_type: str, genai, types
    ) -> List[List[float]]:
        """Embed a single batch, rotating keys and retrying on rate limit.

        - A key that fails with an auth error (401/403/400+apikey) is set
          aside and not tried again for this batch.
        - If every key has failed with an auth error, raises immediately.
        - If the remaining keys are all rate-limited (429), waits with capped
          exponential backoff and retries them indefinitely until the quota
          is restored.
        - If any key fails with a non-key error, raises immediately.

        Raises:
            RuntimeError: When all keys were rejected with auth errors.
        """
        import time

        key_count = len(self._api_keys)
        start = self._key_index
        # Built once: neither value depends on the key being tried.
        config = types.EmbedContentConfig(
            task_type=task_type,
            output_dimensionality=self.dimension,
        )
        # idx -> message, in the order the keys were rejected.
        auth_errors = {}
        rate_limit_attempt = 0

        while True:
            for offset in range(key_count):
                idx = (start + offset) % key_count
                if idx in auth_errors:
                    # Rejected for auth reasons; waiting will not fix it.
                    continue
                try:
                    client = genai.Client(api_key=self._api_keys[idx])
                    result = client.models.embed_content(
                        model=self.model_name,
                        contents=contents,
                        config=config,
                    )
                except Exception as e:
                    if not _is_google_key_error(e):
                        raise
                    if not _is_rate_limit_error(e):
                        # Auth failure (401/403/400+apikey) — not a transient error
                        auth_errors[idx] = f"key[{idx}]: {e}"
                    continue
                # Next batch starts with the key after the one that worked.
                self._key_index = (idx + 1) % key_count
                return [emb.values for emb in result.embeddings]

            if len(auth_errors) == key_count:
                raise RuntimeError(
                    f"All {key_count} Google API key(s) failed with auth errors:\n"
                    + "\n".join(auth_errors.values())
                )

            # Every key still in rotation is rate-limited (429) — wait and retry
            delay = min(self._BASE_DELAY * (2 ** rate_limit_attempt), self._MAX_BACKOFF)
            rate_limit_attempt += 1
            logger.warning(
                "All %d Google API key(s) rate-limited (429). "
                "Retrying in %.0fs (attempt %d)...",
                key_count - len(auth_errors),
                delay,
                rate_limit_attempt,
            )
            time.sleep(delay)

    @property
    def dimension(self) -> int:
        """Return the embedding dimension.

        Taken from the ``CORBELL_EMBEDDING_DIM`` environment variable when it
        is set to a non-blank value, otherwise 768.

        Raises:
            ValueError: If ``CORBELL_EMBEDDING_DIM`` is not an integer.
        """
        dim_env = os.environ.get("CORBELL_EMBEDDING_DIM", "").strip()
        if not dim_env:
            return 768
        try:
            return int(dim_env)
        except ValueError as exc:
            raise ValueError(
                f"CORBELL_EMBEDDING_DIM must be an integer, got {dim_env!r}"
            ) from exc
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
class VoyageEmbeddingModel(EmbeddingModel):
    """Voyage AI embedding model via the voyageai SDK.

    Uses ``voyage-code-3`` by default (1024-dim, optimized for code retrieval).
    Requires ``pip install corbell[voyage]`` and ``VOYAGE_API_KEY``.

    Supports a comma-separated list of API keys for round-robin distribution
    and automatic failover when a key is quota-exhausted.

    The ``input_type`` parameter improves retrieval quality:
    - ``"document"`` for indexing (default)
    - ``"query"`` for query-time encoding

    Use ``prepare_query`` and ``prepare_document`` which return the text unchanged
    (Voyage handles task differentiation via ``input_type``).
    """

    _BATCH_SIZE = 1000    # texts per embed request
    _BASE_DELAY = 2.0     # first back-off delay, in seconds
    _MAX_BACKOFF = 60.0   # upper bound for the back-off delay, in seconds

    def __init__(self, model_name: str = "voyage-code-3", api_key: Optional[str] = None):
        """Store the model name and the API key(s).

        Args:
            model_name: Name of the embedding model.
            api_key: One key or a comma-separated list of keys. Falls back
                to the ``VOYAGE_API_KEY`` environment variable.

        Raises:
            ValueError: If no non-empty key is available.
        """
        self.model_name = model_name
        raw = api_key or os.environ.get("VOYAGE_API_KEY") or ""
        self._api_keys: List[str] = [k.strip() for k in raw.split(",") if k.strip()]
        if not self._api_keys:
            raise ValueError(
                "VOYAGE_API_KEY is not set. "
                "Set it in your environment or workspace.yaml:\n"
                "  export VOYAGE_API_KEY=pa-...\n"
                "Or use a local embedding model (e.g. all-MiniLM-L6-v2) in storage.model."
            )
        # Start the rotation at a random key.
        self._key_index: int = random.randrange(len(self._api_keys))
        # kept for backwards-compat with tests that read _api_key directly
        self._api_key: str = self._api_keys[0]

    def prepare_query(self, query: str) -> str:
        """Return query unchanged; Voyage handles task differentiation via input_type."""
        return query

    def prepare_document(self, content: str, title: Optional[str] = None) -> str:
        """Return content unchanged; Voyage handles task differentiation via input_type."""
        return content

    def encode(self, texts: List[str], input_type: str = "document") -> List[List[float]]:
        """Encode a list of texts into embedding vectors.

        Batches requests (1000 texts/batch) and retries on rate limit (429)
        with exponential backoff.

        Args:
            texts: List of text strings to encode.
            input_type: Voyage input type hint. Use ``"document"`` when indexing,
                ``"query"`` at query time.

        Returns:
            List of float vectors (one per input text).

        Raises:
            ImportError: If the voyageai SDK is not installed.
        """
        try:
            import voyageai
        except ImportError as exc:
            raise ImportError("pip install corbell[voyage]") from exc

        all_embeddings: List[List[float]] = []
        for batch_start in range(0, len(texts), self._BATCH_SIZE):
            batch = texts[batch_start:batch_start + self._BATCH_SIZE]
            all_embeddings.extend(
                self._embed_batch_with_retry(batch, input_type, voyageai)
            )

        return all_embeddings

    def _embed_batch_with_retry(
        self, batch: List[str], input_type: str, voyageai
    ) -> List[List[float]]:
        """Embed a single batch, rotating keys and retrying on rate limit.

        - If all keys fail with 429 (rate limit), waits with capped exponential
          backoff and retries indefinitely until the quota is restored.
        - If any key fails with a non-rate-limit error, raises immediately.
        """
        import time

        key_count = len(self._api_keys)
        start = self._key_index
        # Resolved once: the value does not depend on the key being tried.
        output_dimension = self.dimension
        rate_limit_attempt = 0

        while True:
            for offset in range(key_count):
                idx = (start + offset) % key_count
                try:
                    vo = voyageai.Client(api_key=self._api_keys[idx])
                    result = vo.embed(
                        batch,
                        model=self.model_name,
                        input_type=input_type,
                        output_dimension=output_dimension,
                    )
                except Exception as e:
                    if not _is_voyage_rate_limit_error(e):
                        raise
                    # Rate-limited: move on to the next key.
                    continue
                # Next batch starts with the key after the one that worked.
                self._key_index = (idx + 1) % key_count
                return result.embeddings

            # All keys are rate-limited (429) — wait and retry indefinitely
            delay = min(self._BASE_DELAY * (2 ** rate_limit_attempt), self._MAX_BACKOFF)
            rate_limit_attempt += 1
            logger.warning(
                "All %d Voyage API key(s) rate-limited (429). "
                "Retrying in %.0fs (attempt %d)...",
                key_count,
                delay,
                rate_limit_attempt,
            )
            time.sleep(delay)

    @property
    def dimension(self) -> int:
        """Return the embedding dimension.

        Taken from the ``CORBELL_EMBEDDING_DIM`` environment variable when it
        is set to a non-blank value, otherwise 1024.

        Raises:
            ValueError: If ``CORBELL_EMBEDDING_DIM`` is not an integer.
        """
        dim_env = os.environ.get("CORBELL_EMBEDDING_DIM", "").strip()
        if not dim_env:
            return 1024
        try:
            return int(dim_env)
        except ValueError as exc:
            raise ValueError(
                f"CORBELL_EMBEDDING_DIM must be an integer, got {dim_env!r}"
            ) from exc
|