vexor 0.21.1__py3-none-any.whl → 0.23.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vexor/__init__.py +17 -2
- vexor/api.py +851 -86
- vexor/cache.py +140 -16
- vexor/cli.py +59 -2
- vexor/config.py +197 -9
- vexor/providers/openai.py +14 -4
- vexor/search.py +16 -1
- vexor/services/config_service.py +30 -2
- vexor/services/content_extract_service.py +6 -0
- vexor/services/index_service.py +56 -4
- vexor/services/init_service.py +12 -2
- vexor/services/search_service.py +105 -30
- vexor/text.py +17 -3
- {vexor-0.21.1.dist-info → vexor-0.23.0rc1.dist-info}/METADATA +41 -5
- vexor-0.23.0rc1.dist-info/RECORD +33 -0
- vexor-0.21.1.dist-info/RECORD +0 -33
- {vexor-0.21.1.dist-info → vexor-0.23.0rc1.dist-info}/WHEEL +0 -0
- {vexor-0.21.1.dist-info → vexor-0.23.0rc1.dist-info}/entry_points.txt +0 -0
- {vexor-0.21.1.dist-info → vexor-0.23.0rc1.dist-info}/licenses/LICENSE +0 -0
vexor/cache.py
CHANGED
|
@@ -5,9 +5,13 @@ from __future__ import annotations
|
|
|
5
5
|
import hashlib
|
|
6
6
|
import os
|
|
7
7
|
import sqlite3
|
|
8
|
+
from collections import OrderedDict
|
|
8
9
|
from dataclasses import dataclass
|
|
10
|
+
from contextlib import contextmanager
|
|
11
|
+
from contextvars import ContextVar
|
|
9
12
|
from datetime import datetime, timezone, timedelta
|
|
10
13
|
from pathlib import Path
|
|
14
|
+
from threading import Lock
|
|
11
15
|
from typing import Iterable, Mapping, Sequence
|
|
12
16
|
|
|
13
17
|
import numpy as np
|
|
@@ -16,10 +20,18 @@ from .utils import collect_files
|
|
|
16
20
|
|
|
17
21
|
DEFAULT_CACHE_DIR = Path(os.path.expanduser("~")) / ".vexor"
|
|
18
22
|
CACHE_DIR = DEFAULT_CACHE_DIR
|
|
23
|
+
_CACHE_DIR_OVERRIDE: ContextVar[Path | None] = ContextVar(
|
|
24
|
+
"vexor_cache_dir_override",
|
|
25
|
+
default=None,
|
|
26
|
+
)
|
|
19
27
|
CACHE_VERSION = 6
|
|
20
28
|
DB_FILENAME = "index.db"
|
|
21
29
|
EMBED_CACHE_TTL_DAYS = 30
|
|
22
30
|
EMBED_CACHE_MAX_ENTRIES = 50_000
|
|
31
|
+
EMBED_MEMORY_CACHE_MAX_ENTRIES = 2_048
|
|
32
|
+
|
|
33
|
+
_EMBED_MEMORY_CACHE: "OrderedDict[tuple[str, int | None, str], np.ndarray]" = OrderedDict()
|
|
34
|
+
_EMBED_MEMORY_LOCK = Lock()
|
|
23
35
|
|
|
24
36
|
|
|
25
37
|
@dataclass(slots=True)
|
|
@@ -77,11 +89,73 @@ def query_cache_key(query: str, model: str) -> str:
|
|
|
77
89
|
return hashlib.sha1(base.encode("utf-8")).hexdigest()
|
|
78
90
|
|
|
79
91
|
|
|
80
|
-
def embedding_cache_key(text: str) -> str:
|
|
81
|
-
"""Return a stable hash for embedding cache lookups.
|
|
92
|
+
def embedding_cache_key(text: str, dimension: int | None = None) -> str:
|
|
93
|
+
"""Return a stable hash for embedding cache lookups.
|
|
82
94
|
|
|
95
|
+
Args:
|
|
96
|
+
text: The text to hash
|
|
97
|
+
dimension: Optional embedding dimension (included in hash for dimension-aware caching)
|
|
98
|
+
"""
|
|
83
99
|
clean_text = text or ""
|
|
84
|
-
|
|
100
|
+
# Include dimension in hash to prevent cross-dimension cache pollution
|
|
101
|
+
if dimension is not None:
|
|
102
|
+
base = f"{clean_text}|dim={dimension}"
|
|
103
|
+
else:
|
|
104
|
+
base = clean_text
|
|
105
|
+
return hashlib.sha1(base.encode("utf-8")).hexdigest()
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def _clear_embedding_memory_cache() -> None:
|
|
109
|
+
if EMBED_MEMORY_CACHE_MAX_ENTRIES <= 0:
|
|
110
|
+
return
|
|
111
|
+
with _EMBED_MEMORY_LOCK:
|
|
112
|
+
_EMBED_MEMORY_CACHE.clear()
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def _load_embedding_memory_cache(
|
|
116
|
+
model: str,
|
|
117
|
+
text_hashes: Sequence[str],
|
|
118
|
+
dimension: int | None = None,
|
|
119
|
+
) -> dict[str, np.ndarray]:
|
|
120
|
+
if EMBED_MEMORY_CACHE_MAX_ENTRIES <= 0:
|
|
121
|
+
return {}
|
|
122
|
+
results: dict[str, np.ndarray] = {}
|
|
123
|
+
with _EMBED_MEMORY_LOCK:
|
|
124
|
+
for text_hash in text_hashes:
|
|
125
|
+
if not text_hash:
|
|
126
|
+
continue
|
|
127
|
+
# Include dimension in cache key to prevent cross-dimension pollution
|
|
128
|
+
key = (model, dimension, text_hash)
|
|
129
|
+
vector = _EMBED_MEMORY_CACHE.pop(key, None)
|
|
130
|
+
if vector is None:
|
|
131
|
+
continue
|
|
132
|
+
_EMBED_MEMORY_CACHE[key] = vector
|
|
133
|
+
results[text_hash] = vector
|
|
134
|
+
return results
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def _store_embedding_memory_cache(
|
|
138
|
+
*,
|
|
139
|
+
model: str,
|
|
140
|
+
embeddings: Mapping[str, np.ndarray],
|
|
141
|
+
dimension: int | None = None,
|
|
142
|
+
) -> None:
|
|
143
|
+
if EMBED_MEMORY_CACHE_MAX_ENTRIES <= 0 or not embeddings:
|
|
144
|
+
return
|
|
145
|
+
with _EMBED_MEMORY_LOCK:
|
|
146
|
+
for text_hash, vector in embeddings.items():
|
|
147
|
+
if not text_hash:
|
|
148
|
+
continue
|
|
149
|
+
array = np.asarray(vector, dtype=np.float32)
|
|
150
|
+
if array.size == 0:
|
|
151
|
+
continue
|
|
152
|
+
# Include dimension in cache key to prevent cross-dimension pollution
|
|
153
|
+
key = (model, dimension, text_hash)
|
|
154
|
+
if key in _EMBED_MEMORY_CACHE:
|
|
155
|
+
_EMBED_MEMORY_CACHE.pop(key, None)
|
|
156
|
+
_EMBED_MEMORY_CACHE[key] = array
|
|
157
|
+
while len(_EMBED_MEMORY_CACHE) > EMBED_MEMORY_CACHE_MAX_ENTRIES:
|
|
158
|
+
_EMBED_MEMORY_CACHE.popitem(last=False)
|
|
85
159
|
|
|
86
160
|
|
|
87
161
|
def _serialize_extensions(extensions: Sequence[str] | None) -> str:
|
|
@@ -115,9 +189,32 @@ def _chunk_values(values: Sequence[object], size: int) -> Iterable[Sequence[obje
|
|
|
115
189
|
yield values[idx : idx + size]
|
|
116
190
|
|
|
117
191
|
|
|
192
|
+
def _resolve_cache_dir() -> Path:
|
|
193
|
+
override = _CACHE_DIR_OVERRIDE.get()
|
|
194
|
+
return override if override is not None else CACHE_DIR
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
@contextmanager
|
|
198
|
+
def cache_dir_context(path: Path | str | None):
|
|
199
|
+
"""Temporarily override the cache directory for the current context."""
|
|
200
|
+
|
|
201
|
+
if path is None:
|
|
202
|
+
yield
|
|
203
|
+
return
|
|
204
|
+
dir_path = Path(path).expanduser().resolve()
|
|
205
|
+
if dir_path.exists() and not dir_path.is_dir():
|
|
206
|
+
raise NotADirectoryError(f"Path is not a directory: {dir_path}")
|
|
207
|
+
token = _CACHE_DIR_OVERRIDE.set(dir_path)
|
|
208
|
+
try:
|
|
209
|
+
yield
|
|
210
|
+
finally:
|
|
211
|
+
_CACHE_DIR_OVERRIDE.reset(token)
|
|
212
|
+
|
|
213
|
+
|
|
118
214
|
def ensure_cache_dir() -> Path:
|
|
119
|
-
|
|
120
|
-
|
|
215
|
+
cache_dir = _resolve_cache_dir()
|
|
216
|
+
cache_dir.mkdir(parents=True, exist_ok=True)
|
|
217
|
+
return cache_dir
|
|
121
218
|
|
|
122
219
|
|
|
123
220
|
def set_cache_dir(path: Path | str | None) -> None:
|
|
@@ -134,8 +231,8 @@ def set_cache_dir(path: Path | str | None) -> None:
|
|
|
134
231
|
def cache_db_path() -> Path:
|
|
135
232
|
"""Return the absolute path to the shared SQLite cache database."""
|
|
136
233
|
|
|
137
|
-
ensure_cache_dir()
|
|
138
|
-
return
|
|
234
|
+
cache_dir = ensure_cache_dir()
|
|
235
|
+
return cache_dir / DB_FILENAME
|
|
139
236
|
|
|
140
237
|
|
|
141
238
|
def cache_file(root: Path, model: str, include_hidden: bool) -> Path: # pragma: no cover - kept for API parity
|
|
@@ -1304,25 +1401,38 @@ def load_embedding_cache(
|
|
|
1304
1401
|
model: str,
|
|
1305
1402
|
text_hashes: Sequence[str],
|
|
1306
1403
|
conn: sqlite3.Connection | None = None,
|
|
1404
|
+
*,
|
|
1405
|
+
dimension: int | None = None,
|
|
1307
1406
|
) -> dict[str, np.ndarray]:
|
|
1308
|
-
"""Load cached embeddings keyed by (model, text_hash).
|
|
1309
|
-
|
|
1407
|
+
"""Load cached embeddings keyed by (model, text_hash).
|
|
1408
|
+
|
|
1409
|
+
Args:
|
|
1410
|
+
model: The embedding model name
|
|
1411
|
+
text_hashes: Sequence of text hashes to look up (should be generated with
|
|
1412
|
+
embedding_cache_key() using the same dimension parameter)
|
|
1413
|
+
conn: Optional database connection
|
|
1414
|
+
dimension: Embedding dimension (used for memory cache segmentation)
|
|
1415
|
+
"""
|
|
1310
1416
|
unique_hashes = list(dict.fromkeys([value for value in text_hashes if value]))
|
|
1311
1417
|
if not unique_hashes:
|
|
1312
1418
|
return {}
|
|
1419
|
+
results = _load_embedding_memory_cache(model, unique_hashes, dimension=dimension)
|
|
1420
|
+
missing = [value for value in unique_hashes if value not in results]
|
|
1421
|
+
if not missing:
|
|
1422
|
+
return results
|
|
1313
1423
|
db_path = cache_db_path()
|
|
1314
1424
|
owns_connection = conn is None
|
|
1315
1425
|
try:
|
|
1316
1426
|
connection = conn or _connect(db_path, readonly=True)
|
|
1317
1427
|
except sqlite3.OperationalError:
|
|
1318
|
-
return
|
|
1428
|
+
return results
|
|
1319
1429
|
try:
|
|
1320
1430
|
try:
|
|
1321
1431
|
_ensure_schema_readonly(connection, tables=("embedding_cache",))
|
|
1322
1432
|
except sqlite3.OperationalError:
|
|
1323
|
-
return
|
|
1324
|
-
|
|
1325
|
-
for chunk in _chunk_values(
|
|
1433
|
+
return results
|
|
1434
|
+
disk_results: dict[str, np.ndarray] = {}
|
|
1435
|
+
for chunk in _chunk_values(missing, 900):
|
|
1326
1436
|
placeholders = ", ".join("?" for _ in chunk)
|
|
1327
1437
|
rows = connection.execute(
|
|
1328
1438
|
f"""
|
|
@@ -1339,7 +1449,12 @@ def load_embedding_cache(
|
|
|
1339
1449
|
vector = np.frombuffer(blob, dtype=np.float32)
|
|
1340
1450
|
if vector.size == 0:
|
|
1341
1451
|
continue
|
|
1342
|
-
|
|
1452
|
+
disk_results[row["text_hash"]] = vector
|
|
1453
|
+
if disk_results:
|
|
1454
|
+
_store_embedding_memory_cache(
|
|
1455
|
+
model=model, embeddings=disk_results, dimension=dimension
|
|
1456
|
+
)
|
|
1457
|
+
results.update(disk_results)
|
|
1343
1458
|
return results
|
|
1344
1459
|
finally:
|
|
1345
1460
|
if owns_connection:
|
|
@@ -1351,11 +1466,20 @@ def store_embedding_cache(
|
|
|
1351
1466
|
model: str,
|
|
1352
1467
|
embeddings: Mapping[str, np.ndarray],
|
|
1353
1468
|
conn: sqlite3.Connection | None = None,
|
|
1469
|
+
dimension: int | None = None,
|
|
1354
1470
|
) -> None:
|
|
1355
|
-
"""Store embedding vectors keyed by (model, text_hash).
|
|
1356
|
-
|
|
1471
|
+
"""Store embedding vectors keyed by (model, text_hash).
|
|
1472
|
+
|
|
1473
|
+
Args:
|
|
1474
|
+
model: The embedding model name
|
|
1475
|
+
embeddings: Dict mapping text_hash -> vector (hashes should be generated with
|
|
1476
|
+
embedding_cache_key() using the same dimension parameter)
|
|
1477
|
+
conn: Optional database connection
|
|
1478
|
+
dimension: Embedding dimension (used for memory cache segmentation)
|
|
1479
|
+
"""
|
|
1357
1480
|
if not embeddings:
|
|
1358
1481
|
return
|
|
1482
|
+
_store_embedding_memory_cache(model=model, embeddings=embeddings, dimension=dimension)
|
|
1359
1483
|
db_path = cache_db_path()
|
|
1360
1484
|
owns_connection = conn is None
|
|
1361
1485
|
connection = conn or _connect(db_path)
|
vexor/cli.py
CHANGED
|
@@ -31,14 +31,18 @@ from .config import (
|
|
|
31
31
|
DEFAULT_MODEL,
|
|
32
32
|
DEFAULT_PROVIDER,
|
|
33
33
|
DEFAULT_RERANK,
|
|
34
|
+
DEFAULT_VOYAGE_MODEL,
|
|
35
|
+
DIMENSION_SUPPORTED_MODELS,
|
|
34
36
|
SUPPORTED_EXTRACT_BACKENDS,
|
|
35
37
|
SUPPORTED_PROVIDERS,
|
|
36
38
|
SUPPORTED_RERANKERS,
|
|
37
39
|
flashrank_cache_dir,
|
|
40
|
+
get_supported_dimensions,
|
|
38
41
|
load_config,
|
|
39
42
|
normalize_remote_rerank_url,
|
|
40
43
|
resolve_remote_rerank_api_key,
|
|
41
44
|
resolve_default_model,
|
|
45
|
+
supports_dimensions,
|
|
42
46
|
)
|
|
43
47
|
from .modes import available_modes, get_strategy
|
|
44
48
|
from .services.cache_service import is_cache_current, load_index_metadata_safe
|
|
@@ -454,6 +458,7 @@ def search(
|
|
|
454
458
|
rerank=rerank,
|
|
455
459
|
flashrank_model=flashrank_model,
|
|
456
460
|
remote_rerank=remote_rerank,
|
|
461
|
+
embedding_dimensions=config.embedding_dimensions,
|
|
457
462
|
)
|
|
458
463
|
if output_format == SearchOutputFormat.rich:
|
|
459
464
|
if no_cache:
|
|
@@ -488,7 +493,7 @@ def search(
|
|
|
488
493
|
else:
|
|
489
494
|
typer.echo(message, err=True)
|
|
490
495
|
raise typer.Exit(code=1)
|
|
491
|
-
except RuntimeError as exc:
|
|
496
|
+
except (RuntimeError, ValueError) as exc:
|
|
492
497
|
if output_format == SearchOutputFormat.rich:
|
|
493
498
|
console.print(_styled(str(exc), Styles.ERROR))
|
|
494
499
|
else:
|
|
@@ -688,8 +693,9 @@ def index(
|
|
|
688
693
|
local_cuda=bool(config.local_cuda),
|
|
689
694
|
exclude_patterns=normalized_excludes,
|
|
690
695
|
extensions=normalized_exts,
|
|
696
|
+
embedding_dimensions=config.embedding_dimensions,
|
|
691
697
|
)
|
|
692
|
-
except RuntimeError as exc:
|
|
698
|
+
except (RuntimeError, ValueError) as exc:
|
|
693
699
|
console.print(_styled(str(exc), Styles.ERROR))
|
|
694
700
|
raise typer.Exit(code=1)
|
|
695
701
|
if result.status == IndexStatus.EMPTY:
|
|
@@ -768,6 +774,16 @@ def config(
|
|
|
768
774
|
"--clear-base-url",
|
|
769
775
|
help=Messages.HELP_CLEAR_BASE_URL,
|
|
770
776
|
),
|
|
777
|
+
set_embedding_dimensions_option: int | None = typer.Option(
|
|
778
|
+
None,
|
|
779
|
+
"--set-embedding-dimensions",
|
|
780
|
+
help=Messages.HELP_SET_EMBEDDING_DIMENSIONS,
|
|
781
|
+
),
|
|
782
|
+
clear_embedding_dimensions: bool = typer.Option(
|
|
783
|
+
False,
|
|
784
|
+
"--clear-embedding-dimensions",
|
|
785
|
+
help=Messages.HELP_CLEAR_EMBEDDING_DIMENSIONS,
|
|
786
|
+
),
|
|
771
787
|
set_auto_index_option: str | None = typer.Option(
|
|
772
788
|
None,
|
|
773
789
|
"--set-auto-index",
|
|
@@ -989,6 +1005,33 @@ def config(
|
|
|
989
1005
|
except ValueError as exc:
|
|
990
1006
|
raise typer.BadParameter(str(exc)) from exc
|
|
991
1007
|
|
|
1008
|
+
effective_embedding_dimensions = set_embedding_dimensions_option
|
|
1009
|
+
effective_clear_embedding_dimensions = clear_embedding_dimensions
|
|
1010
|
+
if effective_embedding_dimensions == 0:
|
|
1011
|
+
effective_embedding_dimensions = None
|
|
1012
|
+
effective_clear_embedding_dimensions = True
|
|
1013
|
+
|
|
1014
|
+
# Validate embedding dimensions if set
|
|
1015
|
+
if effective_embedding_dimensions is not None:
|
|
1016
|
+
if effective_embedding_dimensions < 0:
|
|
1017
|
+
raise typer.BadParameter(
|
|
1018
|
+
f"--set-embedding-dimensions must be non-negative, got {effective_embedding_dimensions}"
|
|
1019
|
+
)
|
|
1020
|
+
if effective_embedding_dimensions > 0:
|
|
1021
|
+
# Resolve effective model from provider + model to account for provider defaults
|
|
1022
|
+
effective_model = resolve_default_model(pending_provider, pending_model)
|
|
1023
|
+
if not supports_dimensions(effective_model):
|
|
1024
|
+
raise typer.BadParameter(
|
|
1025
|
+
f"Model '{effective_model}' does not support custom dimensions. "
|
|
1026
|
+
f"Supported model names/prefixes: {', '.join(DIMENSION_SUPPORTED_MODELS.keys())}"
|
|
1027
|
+
)
|
|
1028
|
+
supported = get_supported_dimensions(effective_model)
|
|
1029
|
+
if supported and effective_embedding_dimensions not in supported:
|
|
1030
|
+
raise typer.BadParameter(
|
|
1031
|
+
f"Dimension {effective_embedding_dimensions} is not supported for model '{effective_model}'. "
|
|
1032
|
+
f"Supported dimensions: {supported}"
|
|
1033
|
+
)
|
|
1034
|
+
|
|
992
1035
|
updates = apply_config_updates(
|
|
993
1036
|
api_key=set_api_key_option,
|
|
994
1037
|
clear_api_key=clear_api_key,
|
|
@@ -1007,6 +1050,8 @@ def config(
|
|
|
1007
1050
|
remote_rerank_model=set_remote_rerank_model_option,
|
|
1008
1051
|
remote_rerank_api_key=set_remote_rerank_api_key_option,
|
|
1009
1052
|
clear_remote_rerank=clear_remote_rerank,
|
|
1053
|
+
embedding_dimensions=effective_embedding_dimensions,
|
|
1054
|
+
clear_embedding_dimensions=effective_clear_embedding_dimensions,
|
|
1010
1055
|
)
|
|
1011
1056
|
|
|
1012
1057
|
if updates.api_key_set:
|
|
@@ -1109,6 +1154,17 @@ def config(
|
|
|
1109
1154
|
console.print(_styled(Messages.INFO_REMOTE_RERANK_API_KEY_SET, Styles.SUCCESS))
|
|
1110
1155
|
if updates.remote_rerank_cleared and clear_remote_rerank:
|
|
1111
1156
|
console.print(_styled(Messages.INFO_REMOTE_RERANK_CLEARED, Styles.SUCCESS))
|
|
1157
|
+
if updates.embedding_dimensions_set and effective_embedding_dimensions is not None:
|
|
1158
|
+
console.print(
|
|
1159
|
+
_styled(
|
|
1160
|
+
Messages.INFO_EMBEDDING_DIMENSIONS_SET.format(
|
|
1161
|
+
value=effective_embedding_dimensions
|
|
1162
|
+
),
|
|
1163
|
+
Styles.SUCCESS,
|
|
1164
|
+
)
|
|
1165
|
+
)
|
|
1166
|
+
if updates.embedding_dimensions_cleared:
|
|
1167
|
+
console.print(_styled(Messages.INFO_EMBEDDING_DIMENSIONS_CLEARED, Styles.SUCCESS))
|
|
1112
1168
|
|
|
1113
1169
|
if clear_flashrank:
|
|
1114
1170
|
cache_dir = flashrank_cache_dir(create=False)
|
|
@@ -1188,6 +1244,7 @@ def config(
|
|
|
1188
1244
|
api="yes" if cfg.api_key else "no",
|
|
1189
1245
|
provider=provider,
|
|
1190
1246
|
model=resolve_default_model(provider, cfg.model),
|
|
1247
|
+
embedding_dimensions=cfg.embedding_dimensions if cfg.embedding_dimensions else "default",
|
|
1191
1248
|
batch=cfg.batch_size if cfg.batch_size is not None else DEFAULT_BATCH_SIZE,
|
|
1192
1249
|
concurrency=cfg.embed_concurrency,
|
|
1193
1250
|
extract_concurrency=cfg.extract_concurrency,
|