vexor-0.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vexor/__init__.py +12 -0
- vexor/__main__.py +18 -0
- vexor/cache.py +286 -0
- vexor/cli.py +466 -0
- vexor/config.py +62 -0
- vexor/search.py +152 -0
- vexor/text.py +82 -0
- vexor/utils.py +50 -0
- vexor-0.2.0.dist-info/METADATA +102 -0
- vexor-0.2.0.dist-info/RECORD +13 -0
- vexor-0.2.0.dist-info/WHEEL +4 -0
- vexor-0.2.0.dist-info/entry_points.txt +2 -0
- vexor-0.2.0.dist-info/licenses/LICENSE +21 -0
vexor/__init__.py
ADDED
vexor/__main__.py
ADDED
@@ -0,0 +1,18 @@
+"""Entry point for `python -m vexor` and frozen builds."""
+
+from __future__ import annotations
+
+try:
+    # Normal package execution path
+    from .cli import run
+except ImportError:  # pragma: no cover - happens in frozen single-file builds
+    from vexor.cli import run  # type: ignore[import]
+
+
+def main() -> None:
+    """Execute the Typer application."""
+    run()
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
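`main()` returns `None`, so `raise SystemExit(main())` exits with status 0 once the Typer app finishes. The same dispatch can also be driven programmatically through `cli.run`, which forwards an explicit argument list to the app (a minimal sketch; Typer raises `SystemExit` when the command completes):

```python
from vexor.cli import run

# Equivalent to running `vexor --help` in a shell; expect SystemExit(0).
run(["--help"])
```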
vexor/cache.py
ADDED
@@ -0,0 +1,286 @@
+"""Index cache helpers for Vexor backed by SQLite."""
+
+from __future__ import annotations
+
+import hashlib
+import os
+import sqlite3
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Sequence
+
+import numpy as np
+
+from .utils import collect_files
+
+CACHE_DIR = Path(os.path.expanduser("~")) / ".vexor"
+CACHE_VERSION = 1
+DB_FILENAME = "index.db"
+
+
+def _cache_key(root: Path, include_hidden: bool) -> str:
+    digest = hashlib.sha1(
+        f"{root.resolve()}|hidden={include_hidden}".encode("utf-8")
+    ).hexdigest()
+    return digest
+
+
+def ensure_cache_dir() -> Path:
+    CACHE_DIR.mkdir(parents=True, exist_ok=True)
+    return CACHE_DIR
+
+
+def cache_file(root: Path, model: str, include_hidden: bool) -> Path:  # pragma: no cover - kept for API parity
+    """Return the on-disk cache artifact path (single SQLite DB)."""
+    ensure_cache_dir()
+    return CACHE_DIR / DB_FILENAME
+
+
+def _connect(db_path: Path) -> sqlite3.Connection:
+    conn = sqlite3.connect(db_path)
+    conn.row_factory = sqlite3.Row
+    conn.execute("PRAGMA foreign_keys = ON;")
+    return conn
+
+
+def _ensure_schema(conn: sqlite3.Connection) -> None:
+    conn.executescript(
+        """
+        CREATE TABLE IF NOT EXISTS index_metadata (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            cache_key TEXT NOT NULL,
+            root_path TEXT NOT NULL,
+            model TEXT NOT NULL,
+            include_hidden INTEGER NOT NULL,
+            dimension INTEGER NOT NULL,
+            version INTEGER NOT NULL,
+            generated_at TEXT NOT NULL,
+            UNIQUE(cache_key, model)
+        );
+
+        CREATE TABLE IF NOT EXISTS indexed_file (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            index_id INTEGER NOT NULL REFERENCES index_metadata(id) ON DELETE CASCADE,
+            rel_path TEXT NOT NULL,
+            abs_path TEXT NOT NULL,
+            size_bytes INTEGER NOT NULL,
+            mtime REAL NOT NULL,
+            position INTEGER NOT NULL,
+            UNIQUE(index_id, rel_path)
+        );
+
+        CREATE TABLE IF NOT EXISTS file_embedding (
+            file_id INTEGER PRIMARY KEY REFERENCES indexed_file(id) ON DELETE CASCADE,
+            vector_blob BLOB NOT NULL
+        );
+
+        CREATE INDEX IF NOT EXISTS idx_indexed_file_order
+            ON indexed_file(index_id, position);
+        """
+    )
+
+
+def store_index(
+    *,
+    root: Path,
+    model: str,
+    include_hidden: bool,
+    files: Sequence[Path],
+    embeddings: np.ndarray,
+) -> Path:
+    db_path = cache_file(root, model, include_hidden)
+    conn = _connect(db_path)
+    try:
+        _ensure_schema(conn)
+        key = _cache_key(root, include_hidden)
+        generated_at = datetime.now(timezone.utc).isoformat()
+        dimension = int(embeddings.shape[1] if embeddings.size else 0)
+        include_flag = 1 if include_hidden else 0
+
+        with conn:
+            conn.execute(
+                "DELETE FROM index_metadata WHERE cache_key = ? AND model = ?",
+                (key, model),
+            )
+            cursor = conn.execute(
+                """
+                INSERT INTO index_metadata (
+                    cache_key,
+                    root_path,
+                    model,
+                    include_hidden,
+                    dimension,
+                    version,
+                    generated_at
+                ) VALUES (?, ?, ?, ?, ?, ?, ?)
+                """,
+                (key, str(root), model, include_flag, dimension, CACHE_VERSION, generated_at),
+            )
+            index_id = cursor.lastrowid
+
+            for position, file in enumerate(files):
+                stat = file.stat()
+                try:
+                    rel_path = file.relative_to(root)
+                except ValueError:
+                    rel_path = file
+                file_cursor = conn.execute(
+                    """
+                    INSERT INTO indexed_file (
+                        index_id,
+                        rel_path,
+                        abs_path,
+                        size_bytes,
+                        mtime,
+                        position
+                    ) VALUES (?, ?, ?, ?, ?, ?)
+                    """,
+                    (
+                        index_id,
+                        str(rel_path),
+                        str(file),
+                        stat.st_size,
+                        stat.st_mtime,
+                        position,
+                    ),
+                )
+                vector_blob = embeddings[position].astype(np.float32).tobytes()
+                conn.execute(
+                    "INSERT INTO file_embedding (file_id, vector_blob) VALUES (?, ?)",
+                    (file_cursor.lastrowid, vector_blob),
+                )
+
+        return db_path
+    finally:
+        conn.close()
+
+
+def load_index(root: Path, model: str, include_hidden: bool) -> dict:
+    db_path = cache_file(root, model, include_hidden)
+    if not db_path.exists():
+        raise FileNotFoundError(db_path)
+
+    conn = _connect(db_path)
+    try:
+        _ensure_schema(conn)
+        key = _cache_key(root, include_hidden)
+        include_flag = 1 if include_hidden else 0
+        meta = conn.execute(
+            """
+            SELECT id, root_path, model, include_hidden, dimension, version, generated_at
+            FROM index_metadata
+            WHERE cache_key = ? AND model = ? AND include_hidden = ?
+            """,
+            (key, model, include_flag),
+        ).fetchone()
+        if meta is None:
+            raise FileNotFoundError(db_path)
+
+        files = conn.execute(
+            """
+            SELECT f.rel_path, f.abs_path, f.size_bytes, f.mtime, e.vector_blob
+            FROM indexed_file AS f
+            JOIN file_embedding AS e ON e.file_id = f.id
+            WHERE f.index_id = ?
+            ORDER BY f.position ASC
+            """,
+            (meta["id"],),
+        ).fetchall()
+
+        serialized_files = []
+        for row in files:
+            vector = np.frombuffer(row["vector_blob"], dtype=np.float32)
+            serialized_files.append(
+                {
+                    "path": row["rel_path"],
+                    "absolute": row["abs_path"],
+                    "mtime": row["mtime"],
+                    "size": row["size_bytes"],
+                    "embedding": vector.tolist(),
+                }
+            )
+
+        return {
+            "version": meta["version"],
+            "generated_at": meta["generated_at"],
+            "root": meta["root_path"],
+            "model": meta["model"],
+            "include_hidden": bool(meta["include_hidden"]),
+            "dimension": meta["dimension"],
+            "files": serialized_files,
+        }
+    finally:
+        conn.close()
+
+
+def load_index_vectors(root: Path, model: str, include_hidden: bool):
+    data = load_index(root, model, include_hidden)
+    files = data.get("files", [])
+    paths = [root / Path(entry["path"]) for entry in files]
+    embeddings = np.asarray([entry["embedding"] for entry in files], dtype=np.float32)
+    return paths, embeddings, data
+
+
+def clear_index(root: Path, include_hidden: bool, model: str | None = None) -> int:
+    """Remove cached index entries for *root* (optionally filtered by *model*)."""
+    db_path = cache_file(root, model or "_", include_hidden)
+    if not db_path.exists():
+        return 0
+
+    conn = _connect(db_path)
+    try:
+        _ensure_schema(conn)
+        key = _cache_key(root, include_hidden)
+        if model is None:
+            query = "DELETE FROM index_metadata WHERE cache_key = ?"
+            params = (key,)
+        else:
+            query = "DELETE FROM index_metadata WHERE cache_key = ? AND model = ?"
+            params = (key, model)
+        with conn:
+            cursor = conn.execute(query, params)
+            return cursor.rowcount
+    finally:
+        conn.close()
+
+
+def compare_snapshot(
+    root: Path,
+    include_hidden: bool,
+    cached_files: Sequence[dict],
+    current_files: Sequence[Path] | None = None,
+) -> bool:
+    """Return True if the current filesystem matches the cached snapshot."""
+    if current_files is None:
+        current_files = collect_files(root, include_hidden=include_hidden)
+    if len(current_files) != len(cached_files):
+        return False
+    cached_map = {
+        entry["path"]: (entry["mtime"], entry.get("size"))
+        for entry in cached_files
+    }
+    for file in current_files:
+        rel = _relative_path(file, root)
+        data = cached_map.get(rel)
+        if data is None:
+            return False
+        cached_mtime, cached_size = data
+        stat = file.stat()
+        current_mtime = stat.st_mtime
+        current_size = stat.st_size
+        # allow drift due to filesystem precision (approx 0.5s on some platforms)
+        if abs(current_mtime - cached_mtime) > 5e-1:
+            if cached_size is not None and cached_size == current_size:
+                continue
+            return False
+        if cached_size is not None and cached_size != current_size:
+            return False
+    return True
+
+
+def _relative_path(path: Path, root: Path) -> str:
+    try:
+        rel = path.relative_to(root)
+    except ValueError:
+        rel = path
+    return str(rel)
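Taken together, `store_index` and `load_index_vectors` form the cache's public round trip: rows go into the three SQLite tables on write and come back position-ordered with their float32 vectors on read. A minimal sketch of that round trip, assuming the wheel above is installed — note it writes to the real `~/.vexor/index.db`, and the "demo-model" tag and 3-dimensional vectors are made up for illustration:

```python
from pathlib import Path

import numpy as np

from vexor.cache import load_index_vectors, store_index

root = Path.cwd()
files = sorted(p for p in root.iterdir() if p.is_file())[:2]  # any real files
vectors = np.random.rand(len(files), 3).astype(np.float32)    # toy embeddings

db_path = store_index(
    root=root,
    model="demo-model",  # hypothetical model tag; rows are keyed by (root hash, model)
    include_hidden=False,
    files=files,
    embeddings=vectors,
)
paths, embeddings, meta = load_index_vectors(root, "demo-model", include_hidden=False)
print(db_path, meta["dimension"], embeddings.shape)  # ~/.vexor/index.db 3 (2, 3)
```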
vexor/cli.py
ADDED
@@ -0,0 +1,466 @@
+"""Command line interface for Vexor."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Sequence
+import shutil
+import re
+
+import typer
+from rich.console import Console
+from rich.table import Table
+
+from . import __version__
+from .config import (
+    DEFAULT_BATCH_SIZE,
+    DEFAULT_MODEL,
+    load_config,
+    set_api_key,
+    set_batch_size,
+    set_model,
+)
+from .text import Messages, Styles
+from .utils import collect_files, resolve_directory, format_path, ensure_positive
+
+REMOTE_VERSION_URL = "https://raw.githubusercontent.com/scarletkc/vexor/refs/heads/main/vexor/__init__.py"
+PROJECT_URL = "https://github.com/scarletkc/vexor"
+PYPI_URL = "https://pypi.org/project/vexor/"
+
+console = Console()
+app = typer.Typer(
+    help=Messages.APP_HELP,
+    no_args_is_help=True,
+    context_settings={"help_option_names": ["-h", "--help"]},
+)
+
+
+@dataclass(slots=True)
+class DisplayResult:
+    path: Path
+    score: float
+
+
+def _version_callback(value: bool) -> None:
+    if value:
+        console.print(f"Vexor v{__version__}")
+        raise typer.Exit()
+
+
+@app.callback()
+def main(
+    version: bool = typer.Option(
+        False,
+        "--version",
+        "-v",
+        callback=_version_callback,
+        is_eager=True,
+        help="Show version and exit.",
+    )
+) -> None:
+    """Global Typer callback for shared options."""
+    return None
+
+
+@app.command()
+def search(
+    query: str = typer.Argument(..., help=Messages.HELP_QUERY),
+    path: Path = typer.Option(
+        Path.cwd(),
+        "--path",
+        "-p",
+        help=Messages.HELP_SEARCH_PATH,
+    ),
+    top: int = typer.Option(5, "--top", "-k", help=Messages.HELP_SEARCH_TOP),
+    include_hidden: bool = typer.Option(
+        False,
+        "--include-hidden",
+        help=Messages.HELP_INCLUDE_HIDDEN,
+    ),
+) -> None:
+    """Run the semantic search using a cached index."""
+    config = load_config()
+    model_name = config.model or DEFAULT_MODEL
+    batch_size = config.batch_size if config.batch_size is not None else DEFAULT_BATCH_SIZE
+
+    clean_query = query.strip()
+    if not clean_query:
+        console.print(_styled(Messages.ERROR_EMPTY_QUERY, Styles.ERROR))
+        raise typer.Exit(code=1)
+    try:
+        ensure_positive(top, "top")
+    except ValueError as exc:  # pragma: no cover - validated by Typer
+        raise typer.BadParameter(str(exc), param_name="top") from exc
+
+    directory = resolve_directory(path)
+    console.print(_styled(Messages.INFO_SEARCH_RUNNING.format(path=directory), Styles.INFO))
+    try:
+        cached_paths, file_vectors, meta = _load_index(directory, model_name, include_hidden)
+    except FileNotFoundError:
+        console.print(
+            _styled(Messages.ERROR_INDEX_MISSING.format(path=directory), Styles.ERROR)
+        )
+        raise typer.Exit(code=1)
+
+    _warn_if_stale(directory, include_hidden, meta.get("files", []))
+
+    if not cached_paths:
+        console.print(_styled(Messages.INFO_INDEX_EMPTY, Styles.WARNING))
+        raise typer.Exit(code=0)
+
+    searcher = _create_searcher(model_name=model_name, batch_size=batch_size)
+    try:
+        query_vector = searcher.embed_texts([clean_query])[0]
+    except RuntimeError as exc:
+        console.print(_styled(str(exc), Styles.ERROR))
+        raise typer.Exit(code=1)
+
+    from sklearn.metrics.pairwise import cosine_similarity  # local import
+
+    similarities = cosine_similarity(
+        query_vector.reshape(1, -1), file_vectors
+    )[0]
+    scored = [
+        DisplayResult(path=path, score=float(score))
+        for path, score in zip(cached_paths, similarities)
+    ]
+    scored.sort(key=lambda item: item.score, reverse=True)
+    results = scored[:top]
+
+    if not results:
+        console.print(_styled(Messages.INFO_NO_RESULTS, Styles.WARNING))
+        raise typer.Exit(code=0)
+
+    _render_results(results, directory, searcher.device)
+
+
+@app.command()
+def index(
+    path: Path = typer.Option(
+        Path.cwd(),
+        "--path",
+        "-p",
+        help=Messages.HELP_INDEX_PATH,
+    ),
+    include_hidden: bool = typer.Option(
+        False,
+        "--include-hidden",
+        help=Messages.HELP_INDEX_INCLUDE,
+    ),
+    clear: bool = typer.Option(
+        False,
+        "--clear",
+        help=Messages.HELP_INDEX_CLEAR,
+    ),
+) -> None:
+    """Create or refresh the cached index for the given directory."""
+    config = load_config()
+    model_name = config.model or DEFAULT_MODEL
+    batch_size = config.batch_size if config.batch_size is not None else DEFAULT_BATCH_SIZE
+
+    directory = resolve_directory(path)
+    if clear:
+        removed = _clear_index_cache(directory, include_hidden)
+        if removed:
+            plural = "ies" if removed > 1 else "y"
+            console.print(
+                _styled(
+                    Messages.INFO_INDEX_CLEARED.format(
+                        path=directory,
+                        count=removed,
+                        plural=plural,
+                    ),
+                    Styles.SUCCESS,
+                )
+            )
+        else:
+            console.print(
+                _styled(
+                    Messages.INFO_INDEX_CLEAR_NONE.format(path=directory),
+                    Styles.INFO,
+                )
+            )
+        return
+
+    console.print(_styled(Messages.INFO_INDEX_RUNNING.format(path=directory), Styles.INFO))
+    files = collect_files(directory, include_hidden=include_hidden)
+    if not files:
+        console.print(_styled(Messages.INFO_NO_FILES, Styles.WARNING))
+        raise typer.Exit(code=0)
+
+    existing_meta = _load_index_metadata_safe(directory, model_name, include_hidden)
+    if existing_meta:
+        cached_files = existing_meta.get("files", [])
+        if cached_files and _is_cache_current(
+            directory, include_hidden, cached_files, current_files=files
+        ):
+            console.print(
+                _styled(Messages.INFO_INDEX_UP_TO_DATE.format(path=directory), Styles.INFO)
+            )
+            return
+
+    searcher = _create_searcher(model_name=model_name, batch_size=batch_size)
+    file_labels = [_label_for_path(file) for file in files]
+    embeddings = searcher.embed_texts(file_labels)
+
+    cache_path = _store_index(
+        root=directory,
+        model=model_name,
+        include_hidden=include_hidden,
+        files=files,
+        embeddings=embeddings,
+    )
+    console.print(_styled(Messages.INFO_INDEX_SAVED.format(path=cache_path), Styles.SUCCESS))
+
+
+@app.command()
+def config(
+    set_api_key_option: str | None = typer.Option(
+        None,
+        "--set-api-key",
+        help=Messages.HELP_SET_API_KEY,
+    ),
+    clear_api_key: bool = typer.Option(
+        False,
+        "--clear-api-key",
+        help=Messages.HELP_CLEAR_API_KEY,
+    ),
+    set_model_option: str | None = typer.Option(
+        None,
+        "--set-model",
+        help=Messages.HELP_SET_MODEL,
+    ),
+    set_batch_option: int | None = typer.Option(
+        None,
+        "--set-batch-size",
+        help=Messages.HELP_SET_BATCH,
+    ),
+    show: bool = typer.Option(
+        False,
+        "--show",
+        help=Messages.HELP_SHOW_CONFIG,
+    ),
+) -> None:
+    """Manage Vexor configuration stored in ~/.vexor/config.json."""
+    changed = False
+
+    if set_api_key_option is not None:
+        set_api_key(set_api_key_option)
+        console.print(_styled(Messages.INFO_API_SAVED, Styles.SUCCESS))
+        changed = True
+    if clear_api_key:
+        set_api_key(None)
+        console.print(_styled(Messages.INFO_API_CLEARED, Styles.SUCCESS))
+        changed = True
+    if set_model_option is not None:
+        set_model(set_model_option)
+        console.print(
+            _styled(Messages.INFO_MODEL_SET.format(value=set_model_option), Styles.SUCCESS)
+        )
+        changed = True
+    if set_batch_option is not None:
+        if set_batch_option < 0:
+            raise typer.BadParameter(Messages.ERROR_BATCH_NEGATIVE)
+        set_batch_size(set_batch_option)
+        console.print(
+            _styled(Messages.INFO_BATCH_SET.format(value=set_batch_option), Styles.SUCCESS)
+        )
+        changed = True
+
+    if show or not changed:
+        cfg = load_config()
+        console.print(
+            _styled(
+                Messages.INFO_CONFIG_SUMMARY.format(
+                    api="yes" if cfg.api_key else "no",
+                    model=cfg.model or DEFAULT_MODEL,
+                    batch=cfg.batch_size if cfg.batch_size is not None else DEFAULT_BATCH_SIZE,
+                ),
+                Styles.INFO,
+            )
+        )
+
+
+@app.command()
+def doctor() -> None:
+    """Check whether the `vexor` command is available on PATH."""
+    console.print(_styled(Messages.INFO_DOCTOR_CHECKING, Styles.INFO))
+    command_path = shutil.which("vexor")
+    if command_path:
+        console.print(
+            _styled(Messages.INFO_DOCTOR_FOUND.format(path=command_path), Styles.SUCCESS)
+        )
+        return
+    console.print(_styled(Messages.ERROR_DOCTOR_MISSING, Styles.ERROR))
+    raise typer.Exit(code=1)
+
+
+@app.command()
+def update() -> None:
+    """Check whether a newer release is available online."""
+    console.print(_styled(Messages.INFO_UPDATE_CHECKING, Styles.INFO))
+    console.print(_styled(Messages.INFO_UPDATE_CURRENT.format(current=__version__), Styles.INFO))
+    try:
+        latest = _fetch_remote_version()
+    except RuntimeError as exc:
+        console.print(
+            _styled(Messages.ERROR_UPDATE_FETCH.format(reason=str(exc)), Styles.ERROR)
+        )
+        raise typer.Exit(code=1)
+
+    if _version_tuple(latest) > _version_tuple(__version__):
+        console.print(
+            _styled(
+                Messages.INFO_UPDATE_AVAILABLE.format(
+                    latest=latest,
+                    github=PROJECT_URL,
+                    pypi=PYPI_URL,
+                ),
+                Styles.WARNING,
+            )
+        )
+        return
+
+    console.print(
+        _styled(Messages.INFO_UPDATE_UP_TO_DATE.format(latest=latest), Styles.SUCCESS)
+    )
+
+
+def _render_results(results: Sequence[DisplayResult], base: Path, backend: str | None) -> None:
+    console.print(_styled(Messages.TABLE_TITLE, Styles.TITLE))
+    if backend:
+        console.print(_styled(f"{Messages.TABLE_BACKEND_PREFIX}{backend}", Styles.INFO))
+    table = Table(show_header=True, header_style=Styles.TABLE_HEADER)
+    table.add_column(Messages.TABLE_HEADER_INDEX, justify="right")
+    table.add_column(Messages.TABLE_HEADER_SIMILARITY, justify="right")
+    table.add_column(Messages.TABLE_HEADER_PATH, overflow="fold")
+    for idx, result in enumerate(results, start=1):
+        table.add_row(
+            str(idx),
+            f"{result.score:.3f}",
+            format_path(result.path, base),
+        )
+    console.print(table)
+
+
+def _create_searcher(model_name: str, batch_size: int):
+    from .search import VexorSearcher  # Local import keeps CLI startup fast
+
+    return VexorSearcher(model_name=model_name, batch_size=batch_size)
+
+
+def _label_for_path(path: Path) -> str:
+    return path.name.replace("_", " ")
+
+
+def _load_index(root: Path, model: str, include_hidden: bool):
+    from .cache import load_index_vectors  # local import
+
+    return load_index_vectors(root, model, include_hidden)
+
+
+def _load_index_metadata_safe(root: Path, model: str, include_hidden: bool):
+    from .cache import load_index  # local import
+
+    try:
+        return load_index(root, model, include_hidden)
+    except FileNotFoundError:
+        return None
+
+
+def _store_index(**kwargs):
+    from .cache import store_index  # local import
+
+    return store_index(**kwargs)
+
+
+def _clear_index_cache(root: Path, include_hidden: bool, model: str | None = None) -> int:
+    from .cache import clear_index  # local import
+
+    return clear_index(root=root, include_hidden=include_hidden, model=model)
+
+
+def _fetch_remote_version(url: str = REMOTE_VERSION_URL) -> str:
+    from urllib import request, error
+
+    try:
+        with request.urlopen(url, timeout=10) as response:
+            if response.status != 200:
+                raise RuntimeError(f"HTTP {response.status}")
+            text = response.read().decode("utf-8")
+    except error.URLError as exc:  # pragma: no cover - network error
+        raise RuntimeError(str(exc)) from exc
+
+    match = re.search(r"__version__\s*=\s*['\"]([^'\"]+)['\"]", text)
+    if not match:
+        raise RuntimeError("Version string not found")
+    return match.group(1)
+
+
+def _version_tuple(raw: str) -> tuple[int, int, int, int]:
+    raw = raw.strip()
+    release_parts: list[int] = []
+    suffix_number = 0
+
+    for piece in raw.split('.'):
+        match = re.match(r"^(\d+)", piece)
+        if not match:
+            break
+        release_parts.append(int(match.group(1)))
+        remainder = piece[match.end():]
+        if remainder:
+            suffix_match = re.match(r"[A-Za-z]+(\d+)", remainder)
+            if suffix_match:
+                suffix_number = int(suffix_match.group(1))
+            break
+        if len(release_parts) >= 4:
+            break
+
+    while len(release_parts) < 4:
+        release_parts.append(0)
+
+    if suffix_number:
+        release_parts[3] = suffix_number
+
+    return tuple(release_parts[:4])
+
+
+def _is_cache_current(
+    root: Path,
+    include_hidden: bool,
+    cached_files: Sequence[dict],
+    *,
+    current_files: Sequence[Path] | None = None,
+) -> bool:
+    if not cached_files:
+        return False
+    from .cache import compare_snapshot  # local import
+
+    return compare_snapshot(
+        root,
+        include_hidden,
+        cached_files,
+        current_files=current_files,
+    )
+
+
+def _warn_if_stale(root: Path, include_hidden: bool, cached_files: Sequence[dict]) -> None:
+    if not cached_files:
+        return
+    if not _is_cache_current(root, include_hidden, cached_files):
+        console.print(
+            _styled(Messages.WARNING_INDEX_STALE.format(path=root), Styles.WARNING)
+        )
+
+
+def _styled(text: str, style: str) -> str:
+    return f"[{style}]{text}[/{style}]"
+
+
+def run(argv: list[str] | None = None) -> None:
+    """Entry point wrapper allowing optional argument override."""
+    if argv is None:
+        app()
+    else:
+        app(args=list(argv))
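Because every command hangs off the module-level `app`, the CLI can be exercised in-process rather than through a subprocess. A small sketch using Typer's bundled test runner (this assumes `typer.testing.CliRunner`, which ships with Typer; `--version` needs no API key because the eager callback exits before any command runs):

```python
from typer.testing import CliRunner

from vexor.cli import app

runner = CliRunner()
result = runner.invoke(app, ["--version"])  # eager callback prints version and exits
assert result.exit_code == 0
assert "Vexor v" in result.stdout
```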
vexor/config.py
ADDED
@@ -0,0 +1,62 @@
+"""Global configuration management for Vexor."""
+
+from __future__ import annotations
+
+import json
+import os
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Dict
+
+CONFIG_DIR = Path(os.path.expanduser("~")) / ".vexor"
+CONFIG_FILE = CONFIG_DIR / "config.json"
+DEFAULT_MODEL = "gemini-embedding-001"
+DEFAULT_BATCH_SIZE = 0
+ENV_API_KEY = "GOOGLE_GENAI_API_KEY"
+
+
+@dataclass
+class Config:
+    api_key: str | None = None
+    model: str = DEFAULT_MODEL
+    batch_size: int = DEFAULT_BATCH_SIZE
+
+
+def load_config() -> Config:
+    if not CONFIG_FILE.exists():
+        return Config()
+    raw = json.loads(CONFIG_FILE.read_text(encoding="utf-8"))
+    return Config(
+        api_key=raw.get("api_key") or None,
+        model=raw.get("model") or DEFAULT_MODEL,
+        batch_size=int(raw.get("batch_size", DEFAULT_BATCH_SIZE)),
+    )
+
+
+def save_config(config: Config) -> None:
+    CONFIG_DIR.mkdir(parents=True, exist_ok=True)
+    data: Dict[str, Any] = {}
+    if config.api_key:
+        data["api_key"] = config.api_key
+    if config.model:
+        data["model"] = config.model
+    data["batch_size"] = config.batch_size
+    CONFIG_FILE.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
+
+
+def set_api_key(value: str | None) -> None:
+    config = load_config()
+    config.api_key = value
+    save_config(config)
+
+
+def set_model(value: str) -> None:
+    config = load_config()
+    config.model = value
+    save_config(config)
+
+
+def set_batch_size(value: int) -> None:
+    config = load_config()
+    config.batch_size = value
+    save_config(config)
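Each setter is a full load-mutate-save round trip over the same JSON file, so the last writer wins. A minimal sketch of that round trip (this touches the real `~/.vexor/config.json`; the values are arbitrary examples):

```python
from vexor.config import load_config, set_batch_size, set_model

set_model("gemini-embedding-001")  # persisted immediately
set_batch_size(16)                 # separate load/save cycle

cfg = load_config()
print(cfg.model, cfg.batch_size)   # gemini-embedding-001 16
```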
vexor/search.py
ADDED
@@ -0,0 +1,152 @@
+"""Semantic search helpers backed by the Google Gemini embedding API."""
+
+from __future__ import annotations
+
+import os
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Iterator, List, Protocol, Sequence
+
+import numpy as np
+from dotenv import load_dotenv
+from google import genai
+from google.genai import errors as genai_errors
+from sklearn.metrics.pairwise import cosine_similarity
+
+from .config import DEFAULT_MODEL, ENV_API_KEY, load_config
+from .text import Messages
+
+
+@dataclass(slots=True)
+class SearchResult:
+    """Container describing a single semantic search hit."""
+
+    path: Path
+    score: float
+
+
+class EmbeddingBackend(Protocol):
+    """Minimal protocol for components that can embed text batches."""
+
+    def embed(self, texts: Sequence[str]) -> np.ndarray:
+        """Return embeddings for *texts* as a 2D numpy array."""
+        raise NotImplementedError  # pragma: no cover
+
+
+class GeminiEmbeddingBackend:
+    """Embedding backend that calls the Gemini API via google-genai."""
+
+    def __init__(
+        self,
+        *,
+        model_name: str = DEFAULT_MODEL,
+        api_key: str | None = None,
+        chunk_size: int | None = None,
+    ) -> None:
+        load_dotenv()
+        config = load_config()
+        self.model_name = model_name
+        self.chunk_size = chunk_size if chunk_size and chunk_size > 0 else None
+        env_key = os.getenv(ENV_API_KEY)
+        configured_key = getattr(config, "api_key", None)
+        self.api_key = api_key or configured_key or env_key
+        if not self.api_key or self.api_key.strip().lower() == "your_api_key_here":
+            raise RuntimeError(Messages.ERROR_API_KEY_MISSING)
+        self._client = genai.Client(api_key=self.api_key)
+
+    def embed(self, texts: Sequence[str]) -> np.ndarray:
+        if not texts:
+            return np.empty((0, 0), dtype=np.float32)
+        vectors: list[np.ndarray] = []
+        for chunk in _chunk(texts, self.chunk_size):
+            try:
+                response = self._client.models.embed_content(
+                    model=self.model_name,
+                    contents=list(chunk),
+                )
+            except genai_errors.ClientError as exc:
+                raise RuntimeError(_format_genai_error(exc)) from exc
+            embeddings = getattr(response, "embeddings", None)
+            if not embeddings:
+                raise RuntimeError(Messages.ERROR_NO_EMBEDDINGS)
+            for embedding in embeddings:
+                values = getattr(embedding, "values", None) or getattr(
+                    embedding, "value", None
+                )
+                vectors.append(np.asarray(values, dtype=np.float32))
+        return np.vstack(vectors)
+
+
+class VexorSearcher:
+    """Encapsulates embedding generation and similarity computation."""
+
+    def __init__(
+        self,
+        model_name: str = DEFAULT_MODEL,
+        *,
+        backend: EmbeddingBackend | None = None,
+        batch_size: int = 0,
+    ) -> None:
+        self.model_name = model_name
+        self.batch_size = max(batch_size, 0)
+        self._backend = backend or GeminiEmbeddingBackend(
+            model_name=model_name, chunk_size=self.batch_size
+        )
+        self._device = f"{self.model_name} via Gemini API"
+
+    @property
+    def device(self) -> str:
+        """Return a description of the remote backend in use."""
+        return self._device
+
+    def _encode(self, texts: Sequence[str]) -> np.ndarray:
+        embeddings = self._backend.embed(texts)
+        if embeddings.size == 0:
+            return embeddings
+        norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
+        norms[norms == 0] = 1.0
+        return embeddings / norms
+
+    def embed_texts(self, texts: Sequence[str]) -> np.ndarray:
+        """Public helper to encode arbitrary text batches."""
+        return self._encode(texts)
+
+    def search(self, query: str, files: Sequence[Path], top_k: int = 5) -> List[SearchResult]:
+        """Return the *top_k* most similar files for *query*."""
+        clean_query = query.strip()
+        if not clean_query:
+            raise ValueError("Query text must not be empty")
+        if not files:
+            return []
+        file_labels = [self._prepare_text(path) for path in files]
+        file_vectors = self._encode(file_labels)
+        query_vector = self._encode([clean_query])[0]
+        similarities = cosine_similarity(
+            query_vector.reshape(1, -1), file_vectors
+        )[0]
+        scored = [
+            SearchResult(path=path, score=float(score))
+            for path, score in zip(files, similarities)
+        ]
+        scored.sort(key=lambda item: item.score, reverse=True)
+        return scored[:top_k]
+
+    @staticmethod
+    def _prepare_text(path: Path) -> str:
+        """Return the text representation of a file path for embedding."""
+        return path.name.replace("_", " ")
+
+
+def _chunk(items: Sequence[str], size: int | None) -> Iterator[Sequence[str]]:
+    if size is None or size <= 0:
+        yield items
+        return
+    for idx in range(0, len(items), size):
+        yield items[idx : idx + size]
+
+
+def _format_genai_error(exc: genai_errors.ClientError) -> str:
+    message = getattr(exc, "message", None) or str(exc)
+    if "API key" in message:
+        return Messages.ERROR_API_KEY_INVALID
+    return f"{Messages.ERROR_GENAI_PREFIX}{message}"
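Because `VexorSearcher` accepts any object satisfying the `EmbeddingBackend` protocol, the similarity pipeline can be exercised without network access or an API key. A sketch with a deterministic stand-in backend — the SHA-256 hashing scheme below is purely illustrative and not part of the package:

```python
import hashlib
from pathlib import Path
from typing import Sequence

import numpy as np

from vexor.search import VexorSearcher


class HashEmbeddingBackend:
    """Toy backend: derive a fixed 32-dim vector from each text's SHA-256 digest."""

    def embed(self, texts: Sequence[str]) -> np.ndarray:
        rows = [
            np.frombuffer(hashlib.sha256(text.encode()).digest(), dtype=np.uint8)
            for text in texts
        ]
        return np.asarray(rows, dtype=np.float32)  # shape: (len(texts), 32)


searcher = VexorSearcher(backend=HashEmbeddingBackend())  # Gemini client never built
hits = searcher.search("config loader", [Path("config_loader.py"), Path("readme.md")])
for hit in hits:
    print(f"{hit.score:.3f}  {hit.path}")
```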
vexor/text.py
ADDED
@@ -0,0 +1,82 @@
+"""Centralized user-facing text for Vexor CLI."""
+
+from __future__ import annotations
+
+class Styles:
+    ERROR = "red"
+    WARNING = "yellow"
+    SUCCESS = "green"
+    INFO = "dim"
+    TITLE = "bold cyan"
+    TABLE_HEADER = "bold magenta"
+
+
+class Messages:
+    APP_HELP = "Vexor – A vector-powered CLI for semantic search over filenames."
+    HELP_QUERY = "Text used to semantically match file names."
+    HELP_SEARCH_PATH = "Root directory whose cached index will be used."
+    HELP_SEARCH_TOP = "Number of results to display."
+    HELP_INCLUDE_HIDDEN = "Use the index built with hidden files included."
+    HELP_INDEX_PATH = "Root directory to scan recursively for indexing."
+    HELP_INDEX_INCLUDE = "Include hidden files and directories when building the index."
+    HELP_INDEX_CLEAR = "Remove the cached index for the specified path (respecting include-hidden)."
+    HELP_DOCTOR = "Check whether the `vexor` command is available on the current PATH."
+    HELP_UPDATE = "Check if a newer Vexor release is available online."
+    HELP_SET_API_KEY = "Persist an API key in ~/.vexor/config.json."
+    HELP_CLEAR_API_KEY = "Remove the stored API key."
+    HELP_SET_MODEL = "Set the default embedding model."
+    HELP_SET_BATCH = "Set the default batch size (0 = single request)."
+    HELP_SHOW_CONFIG = "Show current configuration."
+
+    ERROR_API_KEY_MISSING = (
+        "Gemini API key is missing or still set to the placeholder. "
+        "Configure it via `vexor config --set-api-key <token>` or an environment variable."
+    )
+    ERROR_API_KEY_INVALID = (
+        "Gemini API key is invalid. Verify the stored token and try again."
+    )
+    ERROR_GENAI_PREFIX = "Gemini API request failed: "
+    ERROR_NO_EMBEDDINGS = "Gemini API returned no embeddings."
+    ERROR_EMPTY_QUERY = "Query text must not be empty."
+    ERROR_BATCH_NEGATIVE = "Batch size must be >= 0"
+
+    INFO_NO_FILES = "No files found in the selected directory."
+    INFO_NO_RESULTS = "No matching files found."
+    ERROR_INDEX_MISSING = (
+        "No cached index found for {path}. Run `vexor index --path \"{path}\"` first."
+    )
+    INFO_INDEX_SAVED = "Index saved to {path}."
+    INFO_INDEX_EMPTY = "Index contains no files."
+    INFO_INDEX_UP_TO_DATE = "Index already matches the current directory; nothing to do."
+    WARNING_INDEX_STALE = "Cached index for {path} appears outdated; run `vexor index --path \"{path}\"` to refresh."
+    INFO_INDEX_RUNNING = "Indexing files under {path}..."
+    INFO_INDEX_CLEARED = "Removed {count} cached index entr{plural} for {path}."
+    INFO_INDEX_CLEAR_NONE = "No cached index found for {path}."
+    INFO_API_SAVED = "API key saved."
+    INFO_API_CLEARED = "API key cleared."
+    INFO_MODEL_SET = "Default model set to {value}."
+    INFO_BATCH_SET = "Default batch size set to {value}."
+    INFO_CONFIG_SUMMARY = (
+        "API key set: {api}\n"
+        "Default model: {model}\n"
+        "Default batch size: {batch}"
+    )
+    INFO_SEARCH_RUNNING = "Searching cached index under {path}..."
+    INFO_DOCTOR_CHECKING = "Checking if `vexor` is on PATH..."
+    INFO_DOCTOR_FOUND = "`vexor` command is available at {path}."
+    ERROR_DOCTOR_MISSING = (
+        "`vexor` command is not on PATH. Install with pip or add the script directory to PATH."
+    )
+    INFO_UPDATE_CHECKING = "Checking latest Vexor version..."
+    INFO_UPDATE_CURRENT = "You are running Vexor v{current}."
+    INFO_UPDATE_AVAILABLE = (
+        "New version available: v{latest}. Visit {github} or {pypi} to download the update."
+    )
+    INFO_UPDATE_UP_TO_DATE = "You already have the latest version (v{latest})."
+    ERROR_UPDATE_FETCH = "Unable to fetch latest version information ({reason})."
+
+    TABLE_TITLE = "Vexor semantic file search results"
+    TABLE_HEADER_INDEX = "#"
+    TABLE_HEADER_SIMILARITY = "Similarity"
+    TABLE_HEADER_PATH = "File path"
+    TABLE_BACKEND_PREFIX = "Backend: "
vexor/utils.py
ADDED
@@ -0,0 +1,50 @@
+"""Utility helpers for filesystem access and path handling."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Iterable, List
+import os
+
+
+def resolve_directory(path: Path | str) -> Path:
+    """Resolve and validate a user supplied directory path."""
+    dir_path = Path(path).expanduser().resolve()
+    if not dir_path.exists():
+        raise FileNotFoundError(f"Directory does not exist: {dir_path}")
+    if not dir_path.is_dir():
+        raise NotADirectoryError(f"Path is not a directory: {dir_path}")
+    return dir_path
+
+
+def collect_files(root: Path | str, include_hidden: bool = False) -> List[Path]:
+    """Recursively collect files under *root*, optionally keeping hidden entries."""
+    directory = resolve_directory(root)
+    files: List[Path] = []
+    for dirpath, dirnames, filenames in os.walk(directory):
+        if not include_hidden:
+            dirnames[:] = [d for d in dirnames if not d.startswith(".")]
+            filenames = [f for f in filenames if not f.startswith(".")]
+        current_dir = Path(dirpath)
+        for filename in filenames:
+            files.append(current_dir / filename)
+    files.sort()
+    return files
+
+
+def format_path(path: Path, base: Path | None = None) -> str:
+    """Return a user friendly representation of *path* relative to *base* when possible."""
+    if base:
+        try:
+            relative = path.relative_to(base)
+            return f"./{relative.as_posix()}"
+        except ValueError:
+            return str(path)
+    return str(path)
+
+
+def ensure_positive(value: int, name: str) -> int:
+    """Validate that *value* is positive."""
+    if value <= 0:
+        raise ValueError(f"{name} must be greater than 0")
+    return value
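A short sketch of how these helpers compose; because hidden directories are pruned from `dirnames` during the walk, nothing beneath a dot-directory is ever visited unless `include_hidden=True`:

```python
from pathlib import Path

from vexor.utils import collect_files, format_path

root = Path.cwd()
for file in collect_files(root)[:5]:      # hidden entries skipped by default
    print(format_path(file, base=root))   # e.g. ./src/config_loader.py
```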
vexor-0.2.0.dist-info/METADATA
ADDED
@@ -0,0 +1,102 @@
+Metadata-Version: 2.4
+Name: vexor
+Version: 0.2.0
+Summary: A vector-powered CLI for semantic search over filenames.
+Project-URL: Repository, https://github.com/scarletkc/vexor
+Author: scarletkc
+License: MIT
+License-File: LICENSE
+Keywords: ai,cli,semantic-search,typer
+Classifier: Development Status :: 3 - Alpha
+Classifier: Environment :: Console
+Classifier: Intended Audience :: End Users/Desktop
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python
+Classifier: Programming Language :: Python :: 3
+Classifier: Topic :: Scientific/Engineering :: Information Analysis
+Classifier: Topic :: System :: Filesystems
+Classifier: Topic :: Text Processing :: Indexing
+Classifier: Topic :: Utilities
+Requires-Python: >=3.9
+Requires-Dist: google-genai>=0.5.0
+Requires-Dist: numpy>=1.23.0
+Requires-Dist: python-dotenv>=1.0.0
+Requires-Dist: rich>=13.0.0
+Requires-Dist: scikit-learn>=1.3.0
+Requires-Dist: typer>=0.9.0
+Provides-Extra: dev
+Requires-Dist: build>=1.2.1; extra == 'dev'
+Requires-Dist: pytest-cov>=4.1; extra == 'dev'
+Requires-Dist: pytest>=7.4; extra == 'dev'
+Requires-Dist: twine>=5.1.1; extra == 'dev'
+Description-Content-Type: text/markdown
+
+<div align="center">
+
+<img src="https://raw.githubusercontent.com/scarletkc/vexor/refs/heads/main/assets/vexor.svg" alt="Vexor" width="50%" height="auto">
+
+# Vexor
+
+[![Python](https://img.shields.io/pypi/pyversions/vexor.svg?logo=python&logoColor=white)](https://www.python.org/downloads/)
+[![PyPI](https://img.shields.io/pypi/v/vexor.svg?logo=pypi&logoColor=white)](https://pypi.org/project/vexor/)
+[![CI](https://img.shields.io/github/actions/workflow/status/scarletkc/vexor/publish.yml?logo=githubactions&logoColor=white)](https://github.com/scarletkc/vexor/actions/workflows/publish.yml)
+[![Codecov](https://img.shields.io/codecov/c/github/scarletkc/vexor?logo=codecov&logoColor=white)](https://codecov.io/github/scarletkc/vexor)
+[![License](https://img.shields.io/github/license/scarletkc/vexor.svg)](https://github.com/scarletkc/vexor/blob/main/LICENSE)
+
+</div>
+
+---
+
+Vexor is a vector-powered CLI that searches file names semantically. It uses Google GenAI's `gemini-embedding-001` model to embed file names and queries, then ranks matches with cosine similarity.
+
+## Install
+Download a standalone build from [releases](https://github.com/scarletkc/vexor/releases) (no Python required), or install with:
+```bash
+pip install vexor  # or use pipx, uv
+```
+The CLI entry point is `vexor`.
+
+## Configure
+Set the Gemini API key once and reuse it everywhere:
+```bash
+vexor config --set-api-key "YOUR_KEY"
+```
+Optional defaults:
+```bash
+vexor config --set-model gemini-embedding-001
+vexor config --set-batch-size 0  # 0 = single request
+```
+Configuration is stored in `~/.vexor/config.json`.
+
+## Workflow
+1. **Index** the project root (includes every subdirectory):
+   ```bash
+   vexor index --path ~/projects/demo --include-hidden
+   ```
+2. **Search** from anywhere, pointing to the same path:
+   ```bash
+   vexor search "api client config" --path ~/projects/demo --top 5
+   ```
+   Output example:
+   ```
+   Vexor semantic file search results
+   ──────────────────────────────────
+   1  0.923  ./src/config_loader.py
+   2  0.871  ./src/utils/config_parse.py
+   3  0.809  ./tests/test_config_loader.py
+   ```
+
+Tips:
+- Keep one index per project root; subdirectories need separate indexes only if you explicitly run `vexor index` on them.
+- Hidden files are included only if both `index` and `search` use `--include-hidden`.
+
+## Commands
+| Command | Description |
+| ------- | ----------- |
+| `vexor index --path PATH [--include-hidden] [--clear]` | Recursively scans `PATH`, embeds file names, and writes a cache under `~/.vexor`. |
+| `vexor search QUERY --path PATH [--top K] [--include-hidden]` | Loads the cached embeddings for `PATH` and ranks matches for `QUERY`. |
+| `vexor doctor` | Checks whether the `vexor` command is available on the current `PATH`. |
+| `vexor update` | Fetches the latest release version and shows links to update via GitHub or PyPI. |
+| `vexor config --set-api-key/--clear-api-key` | Manage the stored Gemini API key. |
+| `vexor config --set-model/--set-batch-size/--show` | Manage default model and batch size. |
vexor-0.2.0.dist-info/RECORD
ADDED
@@ -0,0 +1,13 @@
+vexor/__init__.py,sha256=sxxZci6pouzNpymD1ejnMDepVWWu0RZ7rwM0RKkRg6I,234
+vexor/__main__.py,sha256=ZFzom1wCfP6TPXe3aoDFpNcUgjbCZ7Quy_vfzNsH5Fw,426
+vexor/cache.py,sha256=Y_NoKijKqYU8lq6vfx9W5DDB3kEkFP_mKPykJj0MhV4,9180
+vexor/cli.py,sha256=y_FKVkfAIJ506efaoPhhL_ffz4IAbYkOKSGbcK8zwu4,14175
+vexor/config.py,sha256=euhmbeXxxWn3uULBENFS0YYvNQuX53qjMwVGuejXwF0,1597
+vexor/search.py,sha256=tQG9MuAV3Us4eUl5Tez3sdUp6aolFQJfFREXN3qCnsc,5352
+vexor/text.py,sha256=ELD_0f4AXwQEJmiXvkC1sK6pPMQB3l-Xe_GBIkXd5eA,3890
+vexor/utils.py,sha256=kN8thdTBohRfCk-wO_BZtMpHdsQM6LN4tJBRxyrphpc,1727
+vexor-0.2.0.dist-info/METADATA,sha256=8X2YvrOnUdq6ymyMCeMVeegNgG6zVqCYN36cdwUniZw,4211
+vexor-0.2.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+vexor-0.2.0.dist-info/entry_points.txt,sha256=dvxp6Q1R1d6bozR7TwmpdJ0X_v83MkzsLPagGY_lfr0,40
+vexor-0.2.0.dist-info/licenses/LICENSE,sha256=wP7TAKRll1t9LoYGxWS9NikPM_0hCc00LmlLyvQBsL8,1066
+vexor-0.2.0.dist-info/RECORD,,
vexor-0.2.0.dist-info/licenses/LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2025 ScarletKc
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.