vexor 0.22.0__py3-none-any.whl → 0.23.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vexor/config.py CHANGED
@@ -23,6 +23,7 @@ _CONFIG_DIR_OVERRIDE: ContextVar[Path | None] = ContextVar(
 )
 DEFAULT_MODEL = "text-embedding-3-small"
 DEFAULT_GEMINI_MODEL = "gemini-embedding-001"
+DEFAULT_VOYAGE_MODEL = "voyage-3-large"
 DEFAULT_LOCAL_MODEL = "intfloat/multilingual-e5-small"
 DEFAULT_BATCH_SIZE = 64
 DEFAULT_EMBED_CONCURRENCY = 4
@@ -32,13 +33,22 @@ DEFAULT_PROVIDER = "openai"
 DEFAULT_RERANK = "off"
 DEFAULT_FLASHRANK_MODEL = "ms-marco-TinyBERT-L-2-v2"
 DEFAULT_FLASHRANK_MAX_LENGTH = 256
-SUPPORTED_PROVIDERS: tuple[str, ...] = (DEFAULT_PROVIDER, "gemini", "custom", "local")
+VOYAGE_BASE_URL = "https://api.voyageai.com/v1"
+SUPPORTED_PROVIDERS: tuple[str, ...] = (DEFAULT_PROVIDER, "gemini", "voyageai", "custom", "local")
 SUPPORTED_RERANKERS: tuple[str, ...] = ("off", "bm25", "flashrank", "remote")
 SUPPORTED_EXTRACT_BACKENDS: tuple[str, ...] = ("auto", "thread", "process")
+# Models that support the dimensions parameter (model prefix/name -> supported dimensions)
+DIMENSION_SUPPORTED_MODELS: dict[str, tuple[int, ...]] = {
+    "text-embedding-3-small": (256, 512, 1024, 1536),
+    "text-embedding-3-large": (256, 512, 1024, 1536, 3072),
+    "voyage-3": (256, 512, 1024, 2048),
+    "voyage-code-3": (256, 512, 1024, 2048),
+}
 ENV_API_KEY = "VEXOR_API_KEY"
 REMOTE_RERANK_ENV = "VEXOR_REMOTE_RERANK_API_KEY"
 LEGACY_GEMINI_ENV = "GOOGLE_GENAI_API_KEY"
 OPENAI_ENV = "OPENAI_API_KEY"
+VOYAGE_ENV = "VOYAGE_API_KEY"
 
 
 @dataclass
@@ -63,6 +73,7 @@ class Config:
     rerank: str = DEFAULT_RERANK
     flashrank_model: str | None = None
     remote_rerank: RemoteRerankConfig | None = None
+    embedding_dimensions: int | None = None
 
 
 def _parse_remote_rerank(raw: object) -> RemoteRerankConfig | None:
@@ -133,6 +144,7 @@ def load_config() -> Config:
         rerank=rerank,
         flashrank_model=raw.get("flashrank_model") or None,
         remote_rerank=_parse_remote_rerank(raw.get("remote_rerank")),
+        embedding_dimensions=_coerce_optional_int(raw.get("embedding_dimensions")),
     )
 
 
@@ -157,6 +169,8 @@ def save_config(config: Config) -> None:
     data["rerank"] = config.rerank
     if config.flashrank_model:
         data["flashrank_model"] = config.flashrank_model
+    if config.embedding_dimensions is not None:
+        data["embedding_dimensions"] = config.embedding_dimensions
     if config.remote_rerank is not None:
         remote_data: Dict[str, Any] = {}
         if config.remote_rerank.base_url:
@@ -223,9 +237,11 @@ def set_api_key(value: str | None) -> None:
     save_config(config)
 
 
-def set_model(value: str) -> None:
+def set_model(value: str, *, validate_embedding_dimensions: bool = True) -> None:
     config = load_config()
     config.model = value
+    if validate_embedding_dimensions:
+        _validate_config_embedding_dimensions(config)
     save_config(config)
 
 
@@ -253,9 +269,11 @@ def set_extract_backend(value: str) -> None:
     save_config(config)
 
 
-def set_provider(value: str) -> None:
+def set_provider(value: str, *, validate_embedding_dimensions: bool = True) -> None:
    config = load_config()
    config.provider = value
+    if validate_embedding_dimensions:
+        _validate_config_embedding_dimensions(config)
    save_config(config)
 
 
@@ -293,6 +311,43 @@ def set_flashrank_model(value: str | None) -> None:
     save_config(config)
 
 
+def set_embedding_dimensions(
+    value: int | None,
+    model: str | None = None,
+    provider: str | None = None,
+) -> None:
+    """Set the embedding dimensions for providers that support it (e.g., Voyage AI).
+
+    Args:
+        value: The dimension to set, or None/0 to clear
+        model: Optional model to validate against. If not provided, uses config model.
+        provider: Optional provider to resolve effective model. If not provided, uses config provider.
+
+    Raises:
+        ValueError: If value is negative, model doesn't support dimensions,
+            or dimension is not valid for the model.
+    """
+    config = load_config()
+
+    # Reject negative values explicitly
+    if value is not None and value < 0:
+        raise ValueError(f"embedding_dimensions must be non-negative, got {value}")
+
+    # Treat 0 and None as "clear"
+    if not value or value <= 0:
+        config.embedding_dimensions = None
+        save_config(config)
+        return
+
+    # Validate against effective model (resolved from provider + model)
+    effective_provider = provider if provider else config.provider
+    effective_model = resolve_default_model(effective_provider, model if model else config.model)
+    validate_embedding_dimensions_for_model(value, effective_model)
+
+    config.embedding_dimensions = value
+    save_config(config)
+
+
 def update_remote_rerank(
     *,
     base_url: str | None = None,
@@ -345,11 +400,72 @@ def resolve_default_model(provider: str | None, model: str | None) -> str:
     normalized = (provider or DEFAULT_PROVIDER).lower()
     if normalized == "gemini" and (not clean_model or clean_model == DEFAULT_MODEL):
         return DEFAULT_GEMINI_MODEL
+    if normalized == "voyageai" and (not clean_model or clean_model == DEFAULT_MODEL):
+        return DEFAULT_VOYAGE_MODEL
     if clean_model:
         return clean_model
     return DEFAULT_MODEL
 
 
+def resolve_base_url(provider: str | None, configured_url: str | None) -> str | None:
+    """Return the effective base URL for the selected provider."""
+    if configured_url:
+        return configured_url
+    normalized = (provider or DEFAULT_PROVIDER).lower()
+    if normalized == "voyageai":
+        return VOYAGE_BASE_URL
+    return None
+
+
+def supports_dimensions(model: str) -> bool:
+    """Check if a model supports the dimensions parameter."""
+    return get_supported_dimensions(model) is not None
+
+
+def get_supported_dimensions(model: str) -> tuple[int, ...] | None:
+    """Return the supported dimensions for a model, or None if not supported."""
+    model_lower = model.lower()
+    for prefix, dims in DIMENSION_SUPPORTED_MODELS.items():
+        if model_lower.startswith(prefix):
+            return dims
+    return None
+
+
+def validate_embedding_dimensions_for_model(value: int | None, model: str) -> None:
+    """Validate that `value` is supported by `model` when value is set."""
+    if value is None:
+        return
+    supported = get_supported_dimensions(model)
+    if not supported:
+        raise ValueError(
+            f"Model '{model}' does not support custom dimensions. "
+            f"Supported model names/prefixes: {', '.join(DIMENSION_SUPPORTED_MODELS.keys())}"
+        )
+    if value not in supported:
+        raise ValueError(
+            f"Dimension {value} is not supported for model '{model}'. "
+            f"Supported dimensions: {supported}"
+        )
+
+
+def _validate_config_embedding_dimensions(config: Config) -> None:
+    """Ensure stored embedding dimensions remain compatible with provider/model."""
+    if config.embedding_dimensions is None:
+        return
+    effective_model = resolve_default_model(config.provider, config.model)
+    try:
+        validate_embedding_dimensions_for_model(
+            config.embedding_dimensions,
+            effective_model,
+        )
+    except ValueError as exc:
+        raise ValueError(
+            f"Current embedding_dimensions ({config.embedding_dimensions}) is incompatible with "
+            f"model '{effective_model}'. Clear it with "
+            "`vexor config --clear-embedding-dimensions` or set a supported value."
+        ) from exc
+
+
 def resolve_api_key(configured: str | None, provider: str) -> str | None:
     """Return the first available API key from config or environment."""
 
@@ -365,6 +481,10 @@ def resolve_api_key(configured: str | None, provider: str) -> str | None:
         legacy = os.getenv(LEGACY_GEMINI_ENV)
         if legacy:
             return legacy
+    if normalized == "voyageai":
+        voyage_key = os.getenv(VOYAGE_ENV)
+        if voyage_key:
+            return voyage_key
     if normalized in {"openai", "custom"}:
         openai_key = os.getenv(OPENAI_ENV)
         if openai_key:
@@ -422,6 +542,7 @@ def _clone_config(config: Config) -> Config:
                 model=remote.model,
             )
         ),
+        embedding_dimensions=config.embedding_dimensions,
     )
 
 
@@ -466,6 +587,8 @@ def _apply_config_payload(config: Config, payload: Mapping[str, object]) -> None
     )
     if "remote_rerank" in payload:
         config.remote_rerank = _coerce_remote_rerank(payload["remote_rerank"])
+    if "embedding_dimensions" in payload:
+        config.embedding_dimensions = _coerce_optional_int(payload["embedding_dimensions"])
 
 
 def _coerce_optional_str(value: object, field: str) -> str | None:
@@ -522,6 +645,30 @@ def _coerce_bool(value: object, field: str) -> bool:
     raise ValueError(Messages.ERROR_CONFIG_VALUE_INVALID.format(field=field))
 
 
+def _coerce_optional_int(value: object) -> int | None:
+    """Coerce a value to an optional integer, returning None for empty/null values."""
+    if value is None:
+        return None
+    if isinstance(value, bool):
+        return None
+    if isinstance(value, int):
+        return value if value > 0 else None
+    if isinstance(value, float):
+        if value.is_integer() and value > 0:
+            return int(value)
+        return None
+    if isinstance(value, str):
+        cleaned = value.strip()
+        if not cleaned:
+            return None
+        try:
+            parsed = int(cleaned)
+            return parsed if parsed > 0 else None
+        except ValueError:
+            return None
+    return None
+
+
 def _normalize_extract_backend(value: object) -> str:
     if value is None:
         return DEFAULT_EXTRACT_BACKEND
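Taken together, the config changes form a small validation pipeline for reduced-dimension embeddings: `get_supported_dimensions` does a prefix match against `DIMENSION_SUPPORTED_MODELS`, and `set_embedding_dimensions` resolves the effective provider/model pair before validating. A minimal usage sketch (assuming `vexor.config` is importable; the values are illustrative, not taken from this diff):

```python
from vexor.config import (
    get_supported_dimensions,
    set_embedding_dimensions,
    set_provider,
)

# Prefix lookup: "voyage-3-large" matches the "voyage-3" entry.
get_supported_dimensions("voyage-3-large")        # (256, 512, 1024, 2048)
get_supported_dimensions("gemini-embedding-001")  # None

set_provider("voyageai")        # default model now resolves to voyage-3-large
set_embedding_dimensions(1024)  # accepted: 1024 is listed for the voyage-3 prefix
set_embedding_dimensions(0)     # treated as "clear"; stores None
set_embedding_dimensions(384)   # raises ValueError: not in (256, 512, 1024, 2048)
```

Note that prefix matching means `voyage-3-large` (the new default) inherits the `voyage-3` dimension list without needing its own table entry.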
vexor/providers/openai.py CHANGED
@@ -24,12 +24,14 @@ class OpenAIEmbeddingBackend:
         chunk_size: int | None = None,
         concurrency: int = 1,
         base_url: str | None = None,
+        dimensions: int | None = None,
     ) -> None:
         load_dotenv()
         self.model_name = model_name
         self.chunk_size = chunk_size if chunk_size and chunk_size > 0 else None
         self.concurrency = max(int(concurrency or 1), 1)
         self.api_key = api_key
+        self.dimensions = dimensions if dimensions and dimensions > 0 else None
         if not self.api_key:
             raise RuntimeError(Messages.ERROR_API_KEY_MISSING)
         client_kwargs: dict[str, object] = {"api_key": self.api_key}
@@ -73,10 +75,18 @@ class OpenAIEmbeddingBackend:
         attempt = 0
         while True:
             try:
-                response = self._client.embeddings.create(
-                    model=self.model_name,
-                    input=list(batch),
-                )
+                create_kwargs: dict[str, object] = {
+                    "model": self.model_name,
+                    "input": list(batch),
+                }
+                if self.dimensions is not None:
+                    # Voyage AI uses output_dimension, OpenAI uses dimensions
+                    if self.model_name.startswith("voyage"):
+                        # Pass Voyage-specific params via extra_body
+                        create_kwargs["extra_body"] = {"output_dimension": self.dimensions}
+                    else:
+                        create_kwargs["dimensions"] = self.dimensions
+                response = self._client.embeddings.create(**create_kwargs)
                 break
             except Exception as exc:  # pragma: no cover - API client variations
                 if _should_retry_openai_error(exc) and attempt < _MAX_RETRIES:
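Because Voyage AI exposes an OpenAI-compatible endpoint but names the truncation parameter `output_dimension` rather than `dimensions`, the backend branches on the model prefix and routes the Voyage form through the client's `extra_body` escape hatch. A construction sketch (hypothetical wiring; the key is a placeholder, and in vexor this backend is normally built by `VexorSearcher`):

```python
from vexor.providers.openai import OpenAIEmbeddingBackend

backend = OpenAIEmbeddingBackend(
    model_name="voyage-3",
    api_key="...",                           # placeholder, not a real credential
    base_url="https://api.voyageai.com/v1",  # VOYAGE_BASE_URL from config.py
    dimensions=1024,
)
# For "voyage*" models each request carries extra_body={"output_dimension": 1024};
# for OpenAI models the same setting is sent natively as dimensions=1024.
```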
vexor/search.py CHANGED
@@ -15,6 +15,7 @@ from .config import (
     DEFAULT_PROVIDER,
     SUPPORTED_PROVIDERS,
     resolve_api_key,
+    resolve_base_url,
 )
 from .providers.gemini import GeminiEmbeddingBackend
 from .providers.local import LocalEmbeddingBackend
@@ -56,14 +57,16 @@ class VexorSearcher:
         base_url: str | None = None,
         api_key: str | None = None,
         local_cuda: bool = False,
+        embedding_dimensions: int | None = None,
     ) -> None:
         self.model_name = model_name
         self.batch_size = max(batch_size, 0)
         self.embed_concurrency = max(int(embed_concurrency or 1), 1)
         self.provider = (provider or DEFAULT_PROVIDER).lower()
-        self.base_url = base_url
+        self.base_url = resolve_base_url(self.provider, base_url)
         self.api_key = resolve_api_key(api_key, self.provider)
         self.local_cuda = bool(local_cuda)
+        self.embedding_dimensions = embedding_dimensions if embedding_dimensions and embedding_dimensions > 0 else None
         if backend is not None:
             self._backend = backend
             self._device = getattr(backend, "device", "Custom embedding backend")
@@ -142,6 +145,16 @@ class VexorSearcher:
                 concurrency=self.embed_concurrency,
                 cuda=self.local_cuda,
             )
+        if self.provider == "voyageai":
+            self._device = f"{self.model_name} via Voyage AI API"
+            return OpenAIEmbeddingBackend(
+                model_name=self.model_name,
+                chunk_size=self.batch_size,
+                concurrency=self.embed_concurrency,
+                base_url=self.base_url,
+                api_key=self.api_key,
+                dimensions=self.embedding_dimensions,
+            )
         if self.provider == "custom":
             base_url = (self.base_url or "").strip()
             if not base_url:
@@ -155,6 +168,7 @@
                 concurrency=self.embed_concurrency,
                 base_url=base_url,
                 api_key=self.api_key,
+                dimensions=self.embedding_dimensions,
             )
         if self.provider == "openai":
             self._device = f"{self.model_name} via OpenAI API"
@@ -164,6 +178,7 @@
                 concurrency=self.embed_concurrency,
                 base_url=self.base_url,
                 api_key=self.api_key,
+                dimensions=self.embedding_dimensions,
             )
         allowed = ", ".join(SUPPORTED_PROVIDERS)
         raise RuntimeError(
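With `resolve_base_url` wired into the constructor, selecting the Voyage provider needs no explicit base URL: the searcher fills in `VOYAGE_BASE_URL`, resolves the key via `VEXOR_API_KEY`/`VOYAGE_API_KEY`, and reuses `OpenAIEmbeddingBackend`. A hedged usage sketch (assumes `VOYAGE_API_KEY` is exported and that the constructor arguments omitted here have defaults):

```python
from vexor.search import VexorSearcher

searcher = VexorSearcher(
    model_name="voyage-3",
    provider="voyageai",
    embedding_dimensions=1024,  # values <= 0 (or None) are normalized to None
)
# searcher.base_url -> "https://api.voyageai.com/v1" (via resolve_base_url)
# device string once the backend is built: "voyage-3 via Voyage AI API"
```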
@@ -11,6 +11,7 @@ from ..config import (
     set_base_url,
     set_batch_size,
     set_embed_concurrency,
+    set_embedding_dimensions,
     set_extract_concurrency,
     set_extract_backend,
     set_auto_index,
@@ -43,6 +44,8 @@ class ConfigUpdateResult:
     remote_rerank_model_set: bool = False
     remote_rerank_api_key_set: bool = False
     remote_rerank_cleared: bool = False
+    embedding_dimensions_set: bool = False
+    embedding_dimensions_cleared: bool = False
 
     @property
     def changed(self) -> bool:
@@ -66,6 +69,8 @@ class ConfigUpdateResult:
                 self.remote_rerank_model_set,
                 self.remote_rerank_api_key_set,
                 self.remote_rerank_cleared,
+                self.embedding_dimensions_set,
+                self.embedding_dimensions_cleared,
             )
         )
 
@@ -90,6 +95,8 @@ def apply_config_updates(
     remote_rerank_model: str | None = None,
     remote_rerank_api_key: str | None = None,
     clear_remote_rerank: bool = False,
+    embedding_dimensions: int | None = None,
+    clear_embedding_dimensions: bool = False,
 ) -> ConfigUpdateResult:
     """Apply config mutations and report which fields were updated."""
 
@@ -101,7 +108,12 @@
         set_api_key(None)
         result.api_key_cleared = True
     if model is not None:
-        set_model(model)
+        set_model(
+            model,
+            validate_embedding_dimensions=not (
+                embedding_dimensions is not None or clear_embedding_dimensions
+            ),
+        )
         result.model_set = True
     if batch_size is not None:
         set_batch_size(batch_size)
@@ -116,7 +128,12 @@
         set_extract_backend(extract_backend)
         result.extract_backend_set = True
     if provider is not None:
-        set_provider(provider)
+        set_provider(
+            provider,
+            validate_embedding_dimensions=not (
+                embedding_dimensions is not None or clear_embedding_dimensions
+            ),
+        )
         result.provider_set = True
     if base_url is not None:
         set_base_url(base_url)
@@ -152,6 +169,17 @@
     result.remote_rerank_model_set = remote_rerank_model is not None
     result.remote_rerank_api_key_set = remote_rerank_api_key is not None
     result.remote_rerank_cleared = clear_remote_rerank
+    if embedding_dimensions is not None:
+        if embedding_dimensions > 0:
+            set_embedding_dimensions(embedding_dimensions)
+            result.embedding_dimensions_set = True
+        else:
+            set_embedding_dimensions(None)
+            result.embedding_dimensions_cleared = True
+    if clear_embedding_dimensions:
+        if not result.embedding_dimensions_cleared:
+            set_embedding_dimensions(None)
+            result.embedding_dimensions_cleared = True
     return result
 
 
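The `validate_embedding_dimensions=not (...)` expression addresses an ordering problem: a single update can switch model or provider *and* change the dimension, and validating the previously stored dimension against the new model would fail spuriously. Validation is therefore deferred to the `set_embedding_dimensions` call at the end of the batch. Illustrative calls (signature as in this diff; behavior inferred from the hunks above):

```python
# Switch provider, model, and dimension together: set_model/set_provider skip
# dimension validation because a dimension change is part of the same batch,
# then set_embedding_dimensions(2048) validates against voyage-code-3.
result = apply_config_updates(
    provider="voyageai",
    model="voyage-code-3",
    embedding_dimensions=2048,
)
assert result.embedding_dimensions_set

# Passing 0 (or clear_embedding_dimensions=True) clears the stored value.
result = apply_config_updates(embedding_dimensions=0)
assert result.embedding_dimensions_cleared
```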
@@ -142,6 +142,7 @@ def build_index(
     exclude_patterns: Sequence[str] | None = None,
     extensions: Sequence[str] | None = None,
     no_cache: bool = False,
+    embedding_dimensions: int | None = None,
 ) -> IndexResult:
     """Create or refresh the cached index for *directory*."""
 
@@ -183,8 +184,24 @@
         base_url=base_url,
         api_key=api_key,
         local_cuda=local_cuda,
+        embedding_dimensions=embedding_dimensions,
     )
 
+    # Check if dimensions changed - if so, force full rebuild with no embedding cache
+    # Only detect mismatch when user explicitly requests a specific dimension that differs
+    force_no_cache = False
+    if cached_files:
+        cached_dimension = existing_meta.get("dimension") if existing_meta else None
+        dimension_changed = (
+            cached_dimension is not None
+            and embedding_dimensions is not None
+            and cached_dimension != embedding_dimensions
+        )
+        if dimension_changed:
+            # Dimensions changed, need full rebuild without embedding cache
+            # (cached embeddings have wrong dimensions)
+            cached_files = []
+            force_no_cache = True
     if cached_files:
         cached_version = int(existing_meta.get("version", 0) or 0) if existing_meta else 0
         full_max_bytes = (
@@ -291,6 +308,8 @@
         extensions=extensions,
         stat_cache=stat_cache,
         no_cache=no_cache,
+        embedding_dimensions=embedding_dimensions,
+        cached_index_dimension=existing_meta.get("dimension") if existing_meta else None,
     )
 
     line_backfill_targets = missing_line_files - changed_rel_paths - removed_rel_paths
@@ -333,7 +352,8 @@
         searcher=searcher,
         model_name=model_name,
         labels=file_labels,
-        no_cache=no_cache,
+        no_cache=no_cache or force_no_cache,
+        embedding_dimension=embedding_dimensions,
     )
     entries = _build_index_entries(payloads, embeddings, directory, stat_cache=stat_cache)
 
@@ -374,6 +394,7 @@ def build_index_in_memory(
     exclude_patterns: Sequence[str] | None = None,
     extensions: Sequence[str] | None = None,
     no_cache: bool = False,
+    embedding_dimensions: int | None = None,
 ) -> tuple[list[Path], np.ndarray, dict]:
     """Build an index in memory without writing to disk."""
 
@@ -418,6 +439,7 @@
         base_url=base_url,
         api_key=api_key,
         local_cuda=local_cuda,
+        embedding_dimensions=embedding_dimensions,
     )
     payloads = _payloads_for_files(
         strategy,
@@ -455,6 +477,7 @@
         searcher=searcher,
         model_name=model_name,
         labels=labels,
+        embedding_dimension=embedding_dimensions,
     )
     entries = _build_index_entries(
         payloads,
@@ -634,6 +657,8 @@
     extensions: Sequence[str] | None,
     stat_cache: MutableMapping[Path, os.stat_result] | None = None,
     no_cache: bool = False,
+    embedding_dimensions: int | None = None,
+    cached_index_dimension: int | None = None,
 ) -> Path:
     payloads_to_embed, payloads_to_touch = _split_payloads_by_label(
         changed_payloads,
@@ -655,7 +680,20 @@
         model_name=model_name,
         labels=labels,
         no_cache=no_cache,
+        embedding_dimension=embedding_dimensions,
     )
+
+    # Validate dimension compatibility with existing index
+    if cached_index_dimension is not None and embeddings.size > 0:
+        new_dimension = embeddings.shape[1] if embeddings.ndim == 2 else 0
+        if new_dimension != cached_index_dimension:
+            raise ValueError(
+                f"Embedding dimension mismatch: existing index has {cached_index_dimension}-dim vectors, "
+                f"but new embeddings are {new_dimension}-dim. "
+                f"This typically happens when embedding_dimensions config was changed. "
+                f"Clear the index and rebuild: vexor index --clear {directory}"
+            )
+
     changed_entries = _build_index_entries(
         payloads_to_embed,
         embeddings,
@@ -693,7 +731,18 @@
     model_name: str,
     labels: Sequence[str],
     no_cache: bool = False,
+    embedding_dimension: int | None = None,
 ) -> np.ndarray:
+    """Embed labels with caching support.
+
+    Args:
+        searcher: The embedding searcher instance
+        model_name: Name of the embedding model
+        labels: Sequence of label strings to embed
+        no_cache: If True, bypass cache entirely
+        embedding_dimension: Embedding dimension for cache segmentation (prevents
+            cross-dimension cache pollution when dimension settings change)
+    """
     if not labels:
         return np.empty((0, 0), dtype=np.float32)
     if no_cache:
@@ -701,8 +750,9 @@
         return np.asarray(vectors, dtype=np.float32)
     from ..cache import embedding_cache_key, load_embedding_cache, store_embedding_cache
 
-    hashes = [embedding_cache_key(label) for label in labels]
-    cached = load_embedding_cache(model_name, hashes)
+    # Include dimension in cache key to prevent cross-dimension cache pollution
+    hashes = [embedding_cache_key(label, dimension=embedding_dimension) for label in labels]
+    cached = load_embedding_cache(model_name, hashes, dimension=embedding_dimension)
     missing: dict[str, str] = {}
     for label, text_hash in zip(labels, hashes):
         vector = cached.get(text_hash)
@@ -719,7 +769,9 @@
             vector = np.asarray(new_vectors[idx], dtype=np.float32)
             cached[text_hash] = vector
             stored[text_hash] = vector
-        store_embedding_cache(model=model_name, embeddings=stored)
+        store_embedding_cache(
+            model=model_name, embeddings=stored, dimension=embedding_dimension
+        )
 
     vectors = [cached[text_hash] for text_hash in hashes]
     return np.vstack([np.asarray(vector, dtype=np.float32) for vector in vectors])
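Threading `dimension` through `embedding_cache_key`, `load_embedding_cache`, and `store_embedding_cache` keeps 1024-dim and 2048-dim vectors for the same label under distinct cache entries. The real key derivation lives in `vexor.cache` and is not part of this diff; the toy sketch below only illustrates the segmentation idea (the `hashlib` scheme is an assumption, not vexor's actual implementation):

```python
import hashlib

def toy_embedding_cache_key(text: str, dimension: int | None = None) -> str:
    # Assumption: mixing the requested dimension into the hashed material
    # gives differently-sized vectors of the same text distinct cache keys.
    material = text if dimension is None else f"{text}\x00dim={dimension}"
    return hashlib.sha256(material.encode("utf-8")).hexdigest()

assert toy_embedding_cache_key("src/config.py") != toy_embedding_cache_key("src/config.py", 1024)
```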
@@ -231,6 +231,11 @@ def _collect_remote_settings() -> dict[str, object]:
     )
     _print_option(
         "C",
+        Messages.INIT_OPTION_PROVIDER_VOYAGEAI,
+        Messages.INIT_OPTION_PROVIDER_VOYAGEAI_DESC,
+    )
+    _print_option(
+        "D",
         Messages.INIT_OPTION_PROVIDER_CUSTOM,
         Messages.INIT_OPTION_PROVIDER_CUSTOM_DESC,
     )
@@ -242,11 +247,14 @@ def _collect_remote_settings() -> dict[str, object]:
             "openai": "openai",
             "b": "gemini",
             "gemini": "gemini",
-            "c": "custom",
+            "c": "voyageai",
+            "voyageai": "voyageai",
+            "voyage": "voyageai",
+            "d": "custom",
             "custom": "custom",
         },
         default="A",
-        allowed="A/B/C",
+        allowed="A/B/C/D",
     )
     console.print()
 
@@ -266,6 +274,8 @@ def _collect_remote_settings() -> dict[str, object]:
 
     if provider == "gemini":
         api_key = _prompt_api_key(Messages.INIT_PROMPT_API_KEY_GEMINI, provider)
+    elif provider == "voyageai":
+        api_key = _prompt_api_key(Messages.INIT_PROMPT_API_KEY_VOYAGE, provider)
     else:
         api_key = _prompt_api_key(Messages.INIT_PROMPT_API_KEY_OPENAI, provider)
     updates["api_key"] = api_key