vexor-0.22.0-py3-none-any.whl → vexor-0.23.0rc1-py3-none-any.whl
This diff shows the changes between two publicly released package versions as they appear in their public registry; it is provided for informational purposes only.
- vexor/__init__.py +1 -1
- vexor/api.py +55 -0
- vexor/cache.py +45 -13
- vexor/cli.py +59 -2
- vexor/config.py +150 -3
- vexor/providers/openai.py +14 -4
- vexor/search.py +16 -1
- vexor/services/config_service.py +30 -2
- vexor/services/index_service.py +56 -4
- vexor/services/init_service.py +12 -2
- vexor/services/search_service.py +63 -6
- vexor/text.py +17 -3
- {vexor-0.22.0.dist-info → vexor-0.23.0rc1.dist-info}/METADATA +28 -5
- {vexor-0.22.0.dist-info → vexor-0.23.0rc1.dist-info}/RECORD +17 -17
- {vexor-0.22.0.dist-info → vexor-0.23.0rc1.dist-info}/WHEEL +0 -0
- {vexor-0.22.0.dist-info → vexor-0.23.0rc1.dist-info}/entry_points.txt +0 -0
- {vexor-0.22.0.dist-info → vexor-0.23.0rc1.dist-info}/licenses/LICENSE +0 -0
vexor/config.py
CHANGED
@@ -23,6 +23,7 @@ _CONFIG_DIR_OVERRIDE: ContextVar[Path | None] = ContextVar(
 )
 DEFAULT_MODEL = "text-embedding-3-small"
 DEFAULT_GEMINI_MODEL = "gemini-embedding-001"
+DEFAULT_VOYAGE_MODEL = "voyage-3-large"
 DEFAULT_LOCAL_MODEL = "intfloat/multilingual-e5-small"
 DEFAULT_BATCH_SIZE = 64
 DEFAULT_EMBED_CONCURRENCY = 4
@@ -32,13 +33,22 @@ DEFAULT_PROVIDER = "openai"
 DEFAULT_RERANK = "off"
 DEFAULT_FLASHRANK_MODEL = "ms-marco-TinyBERT-L-2-v2"
 DEFAULT_FLASHRANK_MAX_LENGTH = 256
-SUPPORTED_PROVIDERS: tuple[str, ...] = (DEFAULT_PROVIDER, "gemini", "custom", "local")
+VOYAGE_BASE_URL = "https://api.voyageai.com/v1"
+SUPPORTED_PROVIDERS: tuple[str, ...] = (DEFAULT_PROVIDER, "gemini", "voyageai", "custom", "local")
 SUPPORTED_RERANKERS: tuple[str, ...] = ("off", "bm25", "flashrank", "remote")
 SUPPORTED_EXTRACT_BACKENDS: tuple[str, ...] = ("auto", "thread", "process")
+# Models that support the dimensions parameter (model prefix/name -> supported dimensions)
+DIMENSION_SUPPORTED_MODELS: dict[str, tuple[int, ...]] = {
+    "text-embedding-3-small": (256, 512, 1024, 1536),
+    "text-embedding-3-large": (256, 512, 1024, 1536, 3072),
+    "voyage-3": (256, 512, 1024, 2048),
+    "voyage-code-3": (256, 512, 1024, 2048),
+}
 ENV_API_KEY = "VEXOR_API_KEY"
 REMOTE_RERANK_ENV = "VEXOR_REMOTE_RERANK_API_KEY"
 LEGACY_GEMINI_ENV = "GOOGLE_GENAI_API_KEY"
 OPENAI_ENV = "OPENAI_API_KEY"
+VOYAGE_ENV = "VOYAGE_API_KEY"
 
 
 @dataclass
@@ -63,6 +73,7 @@ class Config:
     rerank: str = DEFAULT_RERANK
     flashrank_model: str | None = None
     remote_rerank: RemoteRerankConfig | None = None
+    embedding_dimensions: int | None = None
 
 
 def _parse_remote_rerank(raw: object) -> RemoteRerankConfig | None:
@@ -133,6 +144,7 @@ def load_config() -> Config:
         rerank=rerank,
         flashrank_model=raw.get("flashrank_model") or None,
         remote_rerank=_parse_remote_rerank(raw.get("remote_rerank")),
+        embedding_dimensions=_coerce_optional_int(raw.get("embedding_dimensions")),
     )
 
 
@@ -157,6 +169,8 @@ def save_config(config: Config) -> None:
     data["rerank"] = config.rerank
     if config.flashrank_model:
         data["flashrank_model"] = config.flashrank_model
+    if config.embedding_dimensions is not None:
+        data["embedding_dimensions"] = config.embedding_dimensions
     if config.remote_rerank is not None:
         remote_data: Dict[str, Any] = {}
         if config.remote_rerank.base_url:
@@ -223,9 +237,11 @@ def set_api_key(value: str | None) -> None:
     save_config(config)
 
 
-def set_model(value: str) -> None:
+def set_model(value: str, *, validate_embedding_dimensions: bool = True) -> None:
     config = load_config()
     config.model = value
+    if validate_embedding_dimensions:
+        _validate_config_embedding_dimensions(config)
     save_config(config)
 
 
@@ -253,9 +269,11 @@ def set_extract_backend(value: str) -> None:
     save_config(config)
 
 
-def set_provider(value: str) -> None:
+def set_provider(value: str, *, validate_embedding_dimensions: bool = True) -> None:
     config = load_config()
     config.provider = value
+    if validate_embedding_dimensions:
+        _validate_config_embedding_dimensions(config)
     save_config(config)
 
 
@@ -293,6 +311,43 @@ def set_flashrank_model(value: str | None) -> None:
     save_config(config)
 
 
+def set_embedding_dimensions(
+    value: int | None,
+    model: str | None = None,
+    provider: str | None = None,
+) -> None:
+    """Set the embedding dimensions for providers that support it (e.g., Voyage AI).
+
+    Args:
+        value: The dimension to set, or None/0 to clear
+        model: Optional model to validate against. If not provided, uses config model.
+        provider: Optional provider to resolve effective model. If not provided, uses config provider.
+
+    Raises:
+        ValueError: If value is negative, model doesn't support dimensions,
+            or dimension is not valid for the model.
+    """
+    config = load_config()
+
+    # Reject negative values explicitly
+    if value is not None and value < 0:
+        raise ValueError(f"embedding_dimensions must be non-negative, got {value}")
+
+    # Treat 0 and None as "clear"
+    if not value or value <= 0:
+        config.embedding_dimensions = None
+        save_config(config)
+        return
+
+    # Validate against effective model (resolved from provider + model)
+    effective_provider = provider if provider else config.provider
+    effective_model = resolve_default_model(effective_provider, model if model else config.model)
+    validate_embedding_dimensions_for_model(value, effective_model)
+
+    config.embedding_dimensions = value
+    save_config(config)
+
+
 def update_remote_rerank(
     *,
     base_url: str | None = None,
@@ -345,11 +400,72 @@ def resolve_default_model(provider: str | None, model: str | None) -> str:
     normalized = (provider or DEFAULT_PROVIDER).lower()
     if normalized == "gemini" and (not clean_model or clean_model == DEFAULT_MODEL):
         return DEFAULT_GEMINI_MODEL
+    if normalized == "voyageai" and (not clean_model or clean_model == DEFAULT_MODEL):
+        return DEFAULT_VOYAGE_MODEL
     if clean_model:
         return clean_model
     return DEFAULT_MODEL
 
 
+def resolve_base_url(provider: str | None, configured_url: str | None) -> str | None:
+    """Return the effective base URL for the selected provider."""
+    if configured_url:
+        return configured_url
+    normalized = (provider or DEFAULT_PROVIDER).lower()
+    if normalized == "voyageai":
+        return VOYAGE_BASE_URL
+    return None
+
+
+def supports_dimensions(model: str) -> bool:
+    """Check if a model supports the dimensions parameter."""
+    return get_supported_dimensions(model) is not None
+
+
+def get_supported_dimensions(model: str) -> tuple[int, ...] | None:
+    """Return the supported dimensions for a model, or None if not supported."""
+    model_lower = model.lower()
+    for prefix, dims in DIMENSION_SUPPORTED_MODELS.items():
+        if model_lower.startswith(prefix):
+            return dims
+    return None
+
+
+def validate_embedding_dimensions_for_model(value: int | None, model: str) -> None:
+    """Validate that `value` is supported by `model` when value is set."""
+    if value is None:
+        return
+    supported = get_supported_dimensions(model)
+    if not supported:
+        raise ValueError(
+            f"Model '{model}' does not support custom dimensions. "
+            f"Supported model names/prefixes: {', '.join(DIMENSION_SUPPORTED_MODELS.keys())}"
+        )
+    if value not in supported:
+        raise ValueError(
+            f"Dimension {value} is not supported for model '{model}'. "
+            f"Supported dimensions: {supported}"
+        )
+
+
+def _validate_config_embedding_dimensions(config: Config) -> None:
+    """Ensure stored embedding dimensions remain compatible with provider/model."""
+    if config.embedding_dimensions is None:
+        return
+    effective_model = resolve_default_model(config.provider, config.model)
+    try:
+        validate_embedding_dimensions_for_model(
+            config.embedding_dimensions,
+            effective_model,
+        )
+    except ValueError as exc:
+        raise ValueError(
+            f"Current embedding_dimensions ({config.embedding_dimensions}) is incompatible with "
+            f"model '{effective_model}'. Clear it with "
+            "`vexor config --clear-embedding-dimensions` or set a supported value."
        ) from exc
+
+
 def resolve_api_key(configured: str | None, provider: str) -> str | None:
     """Return the first available API key from config or environment."""
 
@@ -365,6 +481,10 @@ def resolve_api_key(configured: str | None, provider: str) -> str | None:
         legacy = os.getenv(LEGACY_GEMINI_ENV)
         if legacy:
             return legacy
+    if normalized == "voyageai":
+        voyage_key = os.getenv(VOYAGE_ENV)
+        if voyage_key:
+            return voyage_key
     if normalized in {"openai", "custom"}:
         openai_key = os.getenv(OPENAI_ENV)
         if openai_key:
@@ -422,6 +542,7 @@ def _clone_config(config: Config) -> Config:
                 model=remote.model,
             )
         ),
+        embedding_dimensions=config.embedding_dimensions,
     )
 
 
@@ -466,6 +587,8 @@ def _apply_config_payload(config: Config, payload: Mapping[str, object]) -> None
         )
     if "remote_rerank" in payload:
         config.remote_rerank = _coerce_remote_rerank(payload["remote_rerank"])
+    if "embedding_dimensions" in payload:
+        config.embedding_dimensions = _coerce_optional_int(payload["embedding_dimensions"])
 
 
 def _coerce_optional_str(value: object, field: str) -> str | None:
@@ -522,6 +645,30 @@ def _coerce_bool(value: object, field: str) -> bool:
     raise ValueError(Messages.ERROR_CONFIG_VALUE_INVALID.format(field=field))
 
 
+def _coerce_optional_int(value: object) -> int | None:
+    """Coerce a value to an optional integer, returning None for empty/null values."""
+    if value is None:
+        return None
+    if isinstance(value, bool):
+        return None
+    if isinstance(value, int):
+        return value if value > 0 else None
+    if isinstance(value, float):
+        if value.is_integer() and value > 0:
+            return int(value)
+        return None
+    if isinstance(value, str):
+        cleaned = value.strip()
+        if not cleaned:
+            return None
+        try:
+            parsed = int(cleaned)
+            return parsed if parsed > 0 else None
+        except ValueError:
+            return None
+    return None
+
+
 def _normalize_extract_backend(value: object) -> str:
     if value is None:
         return DEFAULT_EXTRACT_BACKEND
vexor/providers/openai.py
CHANGED
@@ -24,12 +24,14 @@ class OpenAIEmbeddingBackend:
         chunk_size: int | None = None,
         concurrency: int = 1,
         base_url: str | None = None,
+        dimensions: int | None = None,
     ) -> None:
         load_dotenv()
         self.model_name = model_name
         self.chunk_size = chunk_size if chunk_size and chunk_size > 0 else None
         self.concurrency = max(int(concurrency or 1), 1)
         self.api_key = api_key
+        self.dimensions = dimensions if dimensions and dimensions > 0 else None
         if not self.api_key:
             raise RuntimeError(Messages.ERROR_API_KEY_MISSING)
         client_kwargs: dict[str, object] = {"api_key": self.api_key}
@@ -73,10 +75,18 @@ class OpenAIEmbeddingBackend:
         attempt = 0
         while True:
             try:
-                response = self._client.embeddings.create(
-                    model=self.model_name,
-                    input=list(batch),
-                )
+                create_kwargs: dict[str, object] = {
+                    "model": self.model_name,
+                    "input": list(batch),
+                }
+                if self.dimensions is not None:
+                    # Voyage AI uses output_dimension, OpenAI uses dimensions
+                    if self.model_name.startswith("voyage"):
+                        # Pass Voyage-specific params via extra_body
+                        create_kwargs["extra_body"] = {"output_dimension": self.dimensions}
+                    else:
+                        create_kwargs["dimensions"] = self.dimensions
+                response = self._client.embeddings.create(**create_kwargs)
                 break
             except Exception as exc:  # pragma: no cover - API client variations
                 if _should_retry_openai_error(exc) and attempt < _MAX_RETRIES:
vexor/search.py
CHANGED
@@ -15,6 +15,7 @@ from .config import (
     DEFAULT_PROVIDER,
     SUPPORTED_PROVIDERS,
     resolve_api_key,
+    resolve_base_url,
 )
 from .providers.gemini import GeminiEmbeddingBackend
 from .providers.local import LocalEmbeddingBackend
@@ -56,14 +57,16 @@ class VexorSearcher:
         base_url: str | None = None,
         api_key: str | None = None,
         local_cuda: bool = False,
+        embedding_dimensions: int | None = None,
     ) -> None:
         self.model_name = model_name
         self.batch_size = max(batch_size, 0)
         self.embed_concurrency = max(int(embed_concurrency or 1), 1)
         self.provider = (provider or DEFAULT_PROVIDER).lower()
-        self.base_url = base_url
+        self.base_url = resolve_base_url(self.provider, base_url)
         self.api_key = resolve_api_key(api_key, self.provider)
         self.local_cuda = bool(local_cuda)
+        self.embedding_dimensions = embedding_dimensions if embedding_dimensions and embedding_dimensions > 0 else None
         if backend is not None:
             self._backend = backend
             self._device = getattr(backend, "device", "Custom embedding backend")
@@ -142,6 +145,16 @@ class VexorSearcher:
                 concurrency=self.embed_concurrency,
                 cuda=self.local_cuda,
             )
+        if self.provider == "voyageai":
+            self._device = f"{self.model_name} via Voyage AI API"
+            return OpenAIEmbeddingBackend(
+                model_name=self.model_name,
+                chunk_size=self.batch_size,
+                concurrency=self.embed_concurrency,
+                base_url=self.base_url,
+                api_key=self.api_key,
+                dimensions=self.embedding_dimensions,
+            )
         if self.provider == "custom":
             base_url = (self.base_url or "").strip()
             if not base_url:
@@ -155,6 +168,7 @@ class VexorSearcher:
                 concurrency=self.embed_concurrency,
                 base_url=base_url,
                 api_key=self.api_key,
+                dimensions=self.embedding_dimensions,
             )
         if self.provider == "openai":
             self._device = f"{self.model_name} via OpenAI API"
@@ -164,6 +178,7 @@ class VexorSearcher:
                 concurrency=self.embed_concurrency,
                 base_url=self.base_url,
                 api_key=self.api_key,
+                dimensions=self.embedding_dimensions,
             )
         allowed = ", ".join(SUPPORTED_PROVIDERS)
         raise RuntimeError(
vexor/services/config_service.py
CHANGED
@@ -11,6 +11,7 @@ from ..config import (
     set_base_url,
     set_batch_size,
     set_embed_concurrency,
+    set_embedding_dimensions,
     set_extract_concurrency,
     set_extract_backend,
     set_auto_index,
@@ -43,6 +44,8 @@ class ConfigUpdateResult:
     remote_rerank_model_set: bool = False
     remote_rerank_api_key_set: bool = False
     remote_rerank_cleared: bool = False
+    embedding_dimensions_set: bool = False
+    embedding_dimensions_cleared: bool = False
 
     @property
     def changed(self) -> bool:
@@ -66,6 +69,8 @@ class ConfigUpdateResult:
                 self.remote_rerank_model_set,
                 self.remote_rerank_api_key_set,
                 self.remote_rerank_cleared,
+                self.embedding_dimensions_set,
+                self.embedding_dimensions_cleared,
             )
         )
 
@@ -90,6 +95,8 @@ def apply_config_updates(
     remote_rerank_model: str | None = None,
     remote_rerank_api_key: str | None = None,
     clear_remote_rerank: bool = False,
+    embedding_dimensions: int | None = None,
+    clear_embedding_dimensions: bool = False,
 ) -> ConfigUpdateResult:
     """Apply config mutations and report which fields were updated."""
 
@@ -101,7 +108,12 @@ def apply_config_updates(
         set_api_key(None)
         result.api_key_cleared = True
     if model is not None:
-        set_model(model)
+        set_model(
+            model,
+            validate_embedding_dimensions=not (
+                embedding_dimensions is not None or clear_embedding_dimensions
+            ),
+        )
         result.model_set = True
     if batch_size is not None:
         set_batch_size(batch_size)
@@ -116,7 +128,12 @@ def apply_config_updates(
         set_extract_backend(extract_backend)
         result.extract_backend_set = True
     if provider is not None:
-        set_provider(provider)
+        set_provider(
+            provider,
+            validate_embedding_dimensions=not (
+                embedding_dimensions is not None or clear_embedding_dimensions
+            ),
+        )
         result.provider_set = True
     if base_url is not None:
         set_base_url(base_url)
@@ -152,6 +169,17 @@ def apply_config_updates(
     result.remote_rerank_model_set = remote_rerank_model is not None
     result.remote_rerank_api_key_set = remote_rerank_api_key is not None
     result.remote_rerank_cleared = clear_remote_rerank
+    if embedding_dimensions is not None:
+        if embedding_dimensions > 0:
+            set_embedding_dimensions(embedding_dimensions)
+            result.embedding_dimensions_set = True
+        else:
+            set_embedding_dimensions(None)
+            result.embedding_dimensions_cleared = True
+    if clear_embedding_dimensions:
+        if not result.embedding_dimensions_cleared:
+            set_embedding_dimensions(None)
+            result.embedding_dimensions_cleared = True
     return result
 
 
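The `validate_embedding_dimensions=not (...)` arguments exist so a single update can switch provider or model and dimension together: the stale-dimension check is skipped on the provider/model write only when a new dimension (or a clear) arrives in the same batch, and the dimension itself is validated afterwards. A sketch of the intended call pattern, assuming the remaining keyword arguments default to None (values are illustrative, and the calls persist to the real config file):

from vexor.services.config_service import apply_config_updates

# One call: switch to Voyage AI and pick a matching dimension. set_provider()
# skips the stale-dimension validation because embedding_dimensions is also
# updated in this batch; set_embedding_dimensions() then validates 512
# against the effective model (voyage-3-large).
result = apply_config_updates(provider="voyageai", embedding_dimensions=512)
assert result.provider_set and result.embedding_dimensions_set

# Passing 0 (or clear_embedding_dimensions=True) clears the setting instead.
result = apply_config_updates(embedding_dimensions=0)
assert result.embedding_dimensions_cleared
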
vexor/services/index_service.py
CHANGED
@@ -142,6 +142,7 @@ def build_index(
     exclude_patterns: Sequence[str] | None = None,
     extensions: Sequence[str] | None = None,
     no_cache: bool = False,
+    embedding_dimensions: int | None = None,
 ) -> IndexResult:
     """Create or refresh the cached index for *directory*."""
 
@@ -183,8 +184,24 @@ def build_index(
         base_url=base_url,
         api_key=api_key,
         local_cuda=local_cuda,
+        embedding_dimensions=embedding_dimensions,
     )
 
+    # Check if dimensions changed - if so, force full rebuild with no embedding cache
+    # Only detect mismatch when user explicitly requests a specific dimension that differs
+    force_no_cache = False
+    if cached_files:
+        cached_dimension = existing_meta.get("dimension") if existing_meta else None
+        dimension_changed = (
+            cached_dimension is not None
+            and embedding_dimensions is not None
+            and cached_dimension != embedding_dimensions
+        )
+        if dimension_changed:
+            # Dimensions changed, need full rebuild without embedding cache
+            # (cached embeddings have wrong dimensions)
+            cached_files = []
+            force_no_cache = True
     if cached_files:
         cached_version = int(existing_meta.get("version", 0) or 0) if existing_meta else 0
         full_max_bytes = (
@@ -291,6 +308,8 @@ def build_index(
         extensions=extensions,
         stat_cache=stat_cache,
         no_cache=no_cache,
+        embedding_dimensions=embedding_dimensions,
+        cached_index_dimension=existing_meta.get("dimension") if existing_meta else None,
     )
 
     line_backfill_targets = missing_line_files - changed_rel_paths - removed_rel_paths
@@ -333,7 +352,8 @@ def build_index(
         searcher=searcher,
         model_name=model_name,
         labels=file_labels,
-        no_cache=no_cache,
+        no_cache=no_cache or force_no_cache,
+        embedding_dimension=embedding_dimensions,
     )
     entries = _build_index_entries(payloads, embeddings, directory, stat_cache=stat_cache)
 
@@ -374,6 +394,7 @@ def build_index_in_memory(
     exclude_patterns: Sequence[str] | None = None,
     extensions: Sequence[str] | None = None,
     no_cache: bool = False,
+    embedding_dimensions: int | None = None,
 ) -> tuple[list[Path], np.ndarray, dict]:
     """Build an index in memory without writing to disk."""
 
@@ -418,6 +439,7 @@ def build_index_in_memory(
         base_url=base_url,
         api_key=api_key,
         local_cuda=local_cuda,
+        embedding_dimensions=embedding_dimensions,
     )
     payloads = _payloads_for_files(
         strategy,
@@ -455,6 +477,7 @@ def build_index_in_memory(
         searcher=searcher,
         model_name=model_name,
         labels=labels,
+        embedding_dimension=embedding_dimensions,
     )
     entries = _build_index_entries(
         payloads,
@@ -634,6 +657,8 @@ def _apply_incremental_update(
     extensions: Sequence[str] | None,
     stat_cache: MutableMapping[Path, os.stat_result] | None = None,
     no_cache: bool = False,
+    embedding_dimensions: int | None = None,
+    cached_index_dimension: int | None = None,
 ) -> Path:
     payloads_to_embed, payloads_to_touch = _split_payloads_by_label(
         changed_payloads,
@@ -655,7 +680,20 @@ def _apply_incremental_update(
         model_name=model_name,
         labels=labels,
         no_cache=no_cache,
+        embedding_dimension=embedding_dimensions,
     )
+
+    # Validate dimension compatibility with existing index
+    if cached_index_dimension is not None and embeddings.size > 0:
+        new_dimension = embeddings.shape[1] if embeddings.ndim == 2 else 0
+        if new_dimension != cached_index_dimension:
+            raise ValueError(
+                f"Embedding dimension mismatch: existing index has {cached_index_dimension}-dim vectors, "
+                f"but new embeddings are {new_dimension}-dim. "
+                f"This typically happens when embedding_dimensions config was changed. "
+                f"Clear the index and rebuild: vexor index --clear {directory}"
+            )
+
     changed_entries = _build_index_entries(
         payloads_to_embed,
         embeddings,
@@ -693,7 +731,18 @@ def _embed_labels_with_cache(
     model_name: str,
     labels: Sequence[str],
     no_cache: bool = False,
+    embedding_dimension: int | None = None,
 ) -> np.ndarray:
+    """Embed labels with caching support.
+
+    Args:
+        searcher: The embedding searcher instance
+        model_name: Name of the embedding model
+        labels: Sequence of label strings to embed
+        no_cache: If True, bypass cache entirely
+        embedding_dimension: Embedding dimension for cache segmentation (prevents
+            cross-dimension cache pollution when dimension settings change)
+    """
     if not labels:
         return np.empty((0, 0), dtype=np.float32)
     if no_cache:
@@ -701,8 +750,9 @@ def _embed_labels_with_cache(
         return np.asarray(vectors, dtype=np.float32)
     from ..cache import embedding_cache_key, load_embedding_cache, store_embedding_cache
 
-    hashes = [embedding_cache_key(label) for label in labels]
-    cached = load_embedding_cache(model_name, hashes)
+    # Include dimension in cache key to prevent cross-dimension cache pollution
+    hashes = [embedding_cache_key(label, dimension=embedding_dimension) for label in labels]
+    cached = load_embedding_cache(model_name, hashes, dimension=embedding_dimension)
     missing: dict[str, str] = {}
     for label, text_hash in zip(labels, hashes):
         vector = cached.get(text_hash)
@@ -719,7 +769,9 @@ def _embed_labels_with_cache(
             vector = np.asarray(new_vectors[idx], dtype=np.float32)
             cached[text_hash] = vector
             stored[text_hash] = vector
-        store_embedding_cache(model=model_name, embeddings=stored)
+        store_embedding_cache(
+            model=model_name, embeddings=stored, dimension=embedding_dimension
+        )
 
     vectors = [cached[text_hash] for text_hash in hashes]
     return np.vstack([np.asarray(vector, dtype=np.float32) for vector in vectors])
vexor/services/init_service.py
CHANGED
@@ -231,6 +231,11 @@ def _collect_remote_settings() -> dict[str, object]:
     )
     _print_option(
         "C",
+        Messages.INIT_OPTION_PROVIDER_VOYAGEAI,
+        Messages.INIT_OPTION_PROVIDER_VOYAGEAI_DESC,
+    )
+    _print_option(
+        "D",
         Messages.INIT_OPTION_PROVIDER_CUSTOM,
         Messages.INIT_OPTION_PROVIDER_CUSTOM_DESC,
     )
@@ -242,11 +247,14 @@ def _collect_remote_settings() -> dict[str, object]:
             "openai": "openai",
             "b": "gemini",
             "gemini": "gemini",
-            "c": "custom",
+            "c": "voyageai",
+            "voyageai": "voyageai",
+            "voyage": "voyageai",
+            "d": "custom",
             "custom": "custom",
         },
         default="A",
-        allowed="A/B/C",
+        allowed="A/B/C/D",
     )
     console.print()
 
@@ -266,6 +274,8 @@ def _collect_remote_settings() -> dict[str, object]:
 
     if provider == "gemini":
         api_key = _prompt_api_key(Messages.INIT_PROMPT_API_KEY_GEMINI, provider)
+    elif provider == "voyageai":
+        api_key = _prompt_api_key(Messages.INIT_PROMPT_API_KEY_VOYAGE, provider)
     else:
         api_key = _prompt_api_key(Messages.INIT_PROMPT_API_KEY_OPENAI, provider)
     updates["api_key"] = api_key