vexor 0.19.0a1__py3-none-any.whl → 0.21.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vexor/__init__.py +4 -2
- vexor/_bundled_skills/vexor-cli/SKILL.md +1 -0
- vexor/api.py +87 -1
- vexor/cache.py +483 -275
- vexor/cli.py +78 -5
- vexor/config.py +240 -2
- vexor/providers/gemini.py +79 -13
- vexor/providers/openai.py +79 -13
- vexor/services/config_service.py +14 -0
- vexor/services/index_service.py +285 -4
- vexor/services/search_service.py +235 -24
- vexor/text.py +14 -0
- {vexor-0.19.0a1.dist-info → vexor-0.21.0.dist-info}/METADATA +42 -30
- vexor-0.21.0.dist-info/RECORD +33 -0
- vexor-0.19.0a1.dist-info/RECORD +0 -33
- {vexor-0.19.0a1.dist-info → vexor-0.21.0.dist-info}/WHEEL +0 -0
- {vexor-0.19.0a1.dist-info → vexor-0.21.0.dist-info}/entry_points.txt +0 -0
- {vexor-0.19.0a1.dist-info → vexor-0.21.0.dist-info}/licenses/LICENSE +0 -0
vexor/providers/openai.py
CHANGED

```diff
@@ -3,6 +3,7 @@
 from __future__ import annotations
 
 from concurrent.futures import ThreadPoolExecutor, as_completed
+import time
 from typing import Iterator, Sequence
 
 import numpy as np
@@ -35,14 +36,19 @@ class OpenAIEmbeddingBackend:
         if base_url:
             client_kwargs["base_url"] = base_url.rstrip("/")
         self._client = OpenAI(**client_kwargs)
+        self._executor: ThreadPoolExecutor | None = None
 
     def embed(self, texts: Sequence[str]) -> np.ndarray:
         if not texts:
             return np.empty((0, 0), dtype=np.float32)
-
-
-
-
+        if self.concurrency > 1:
+            batches = list(_chunk(texts, self.chunk_size))
+            if len(batches) > 1:
+                vectors_by_batch: list[list[np.ndarray] | None] = [None] * len(batches)
+                executor = self._executor
+                if executor is None:
+                    executor = ThreadPoolExecutor(max_workers=self.concurrency)
+                    self._executor = executor
                 future_map = {
                     executor.submit(self._embed_batch, batch): idx
                     for idx, batch in enumerate(batches)
@@ -50,23 +56,34 @@
                 for future in as_completed(future_map):
                     idx = future_map[future]
                     vectors_by_batch[idx] = future.result()
-
+                vectors = [vec for batch in vectors_by_batch if batch for vec in batch]
+            else:
+                vectors = []
+                for batch in batches:
+                    vectors.extend(self._embed_batch(batch))
         else:
             vectors = []
-            for batch in
+            for batch in _chunk(texts, self.chunk_size):
                 vectors.extend(self._embed_batch(batch))
         if not vectors:
             raise RuntimeError(Messages.ERROR_NO_EMBEDDINGS)
         return np.vstack(vectors)
 
     def _embed_batch(self, batch: Sequence[str]) -> list[np.ndarray]:
-
-
-
-
-
-
-
+        attempt = 0
+        while True:
+            try:
+                response = self._client.embeddings.create(
+                    model=self.model_name,
+                    input=list(batch),
+                )
+                break
+            except Exception as exc:  # pragma: no cover - API client variations
+                if _should_retry_openai_error(exc) and attempt < _MAX_RETRIES:
+                    _sleep(_backoff_delay(attempt))
+                    attempt += 1
+                    continue
+                raise RuntimeError(_format_openai_error(exc)) from exc
         data = getattr(response, "data", None) or []
         if not data:
             raise RuntimeError(Messages.ERROR_NO_EMBEDDINGS)
@@ -87,6 +104,55 @@ def _chunk(items: Sequence[str], size: int | None) -> Iterator[Sequence[str]]:
         yield items[idx : idx + size]
 
 
+_RETRYABLE_STATUS_CODES = {408, 429, 500, 502, 503, 504}
+_MAX_RETRIES = 2
+_RETRY_BASE_DELAY = 0.5
+_RETRY_MAX_DELAY = 4.0
+
+
+def _sleep(seconds: float) -> None:
+    time.sleep(seconds)
+
+
+def _backoff_delay(attempt: int) -> float:
+    return min(_RETRY_MAX_DELAY, _RETRY_BASE_DELAY * (2**attempt))
+
+
+def _extract_status_code(exc: Exception) -> int | None:
+    for attr in ("status_code", "status", "http_status"):
+        value = getattr(exc, attr, None)
+        if isinstance(value, int):
+            return value
+    response = getattr(exc, "response", None)
+    if response is not None:
+        value = getattr(response, "status_code", None)
+        if isinstance(value, int):
+            return value
+    return None
+
+
+def _should_retry_openai_error(exc: Exception) -> bool:
+    status = _extract_status_code(exc)
+    if status in _RETRYABLE_STATUS_CODES:
+        return True
+    name = exc.__class__.__name__.lower()
+    if "ratelimit" in name or "timeout" in name or "temporarily" in name:
+        return True
+    message = str(exc).lower()
+    return any(
+        token in message
+        for token in (
+            "rate limit",
+            "timeout",
+            "temporar",
+            "overload",
+            "try again",
+            "too many requests",
+            "service unavailable",
+        )
+    )
+
+
 def _format_openai_error(exc: Exception) -> str:
     message = getattr(exc, "message", None) or str(exc)
     return f"{Messages.ERROR_OPENAI_PREFIX}{message}"
```
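Taken together, these hunks add a reusable worker pool and a retry loop with capped exponential backoff to the OpenAI backend. With `_MAX_RETRIES = 2`, a batch that keeps raising retryable errors is attempted at most three times, sleeping 0.5 s and then 1.0 s between attempts. A minimal standalone sketch of the schedule, with the constants copied from the hunk above and names shortened:

```python
RETRY_BASE_DELAY = 0.5  # seconds; mirrors _RETRY_BASE_DELAY
RETRY_MAX_DELAY = 4.0   # ceiling; mirrors _RETRY_MAX_DELAY
MAX_RETRIES = 2         # mirrors _MAX_RETRIES


def backoff_delay(attempt: int) -> float:
    # Same formula as _backoff_delay: doubles per attempt, capped at the ceiling.
    return min(RETRY_MAX_DELAY, RETRY_BASE_DELAY * (2 ** attempt))


# Delays slept before retry 1 and retry 2 of a failing batch:
print([backoff_delay(attempt) for attempt in range(MAX_RETRIES)])  # [0.5, 1.0]
```

At the shipped settings the 4.0 s ceiling is never reached; it only matters if `_MAX_RETRIES` is raised past 3.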
vexor/services/config_service.py
CHANGED

```diff
@@ -11,6 +11,8 @@ from ..config import (
     set_base_url,
     set_batch_size,
     set_embed_concurrency,
+    set_extract_concurrency,
+    set_extract_backend,
     set_auto_index,
     set_flashrank_model,
     set_local_cuda,
@@ -28,6 +30,8 @@ class ConfigUpdateResult:
     model_set: bool = False
     batch_size_set: bool = False
     embed_concurrency_set: bool = False
+    extract_concurrency_set: bool = False
+    extract_backend_set: bool = False
     provider_set: bool = False
     base_url_set: bool = False
     base_url_cleared: bool = False
@@ -49,6 +53,8 @@ class ConfigUpdateResult:
                 self.model_set,
                 self.batch_size_set,
                 self.embed_concurrency_set,
+                self.extract_concurrency_set,
+                self.extract_backend_set,
                 self.provider_set,
                 self.base_url_set,
                 self.base_url_cleared,
@@ -71,6 +77,8 @@ def apply_config_updates(
     model: str | None = None,
     batch_size: int | None = None,
     embed_concurrency: int | None = None,
+    extract_concurrency: int | None = None,
+    extract_backend: str | None = None,
     provider: str | None = None,
     base_url: str | None = None,
     clear_base_url: bool = False,
@@ -101,6 +109,12 @@
     if embed_concurrency is not None:
         set_embed_concurrency(embed_concurrency)
         result.embed_concurrency_set = True
+    if extract_concurrency is not None:
+        set_extract_concurrency(extract_concurrency)
+        result.extract_concurrency_set = True
+    if extract_backend is not None:
+        set_extract_backend(extract_backend)
+        result.extract_backend_set = True
     if provider is not None:
         set_provider(provider)
         result.provider_set = True
```
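The two new settings ride the same plumbing as `embed_concurrency`: each setter persists the value and the result object records what changed. A hedged usage sketch, assuming the remaining parameters keep their `None` defaults (the values shown are illustrative):

```python
from vexor.services.config_service import apply_config_updates

# Persist the extraction settings introduced in this release.
result = apply_config_updates(
    extract_concurrency=4,      # worker count used during payload extraction
    extract_backend="process",  # "auto", "thread", or "process"; see index_service
)
assert result.extract_concurrency_set
assert result.extract_backend_set
```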
vexor/services/index_service.py
CHANGED

```diff
@@ -2,8 +2,11 @@
 
 from __future__ import annotations
 
+import itertools
 import os
+from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
 from dataclasses import dataclass, field
+from datetime import datetime, timezone
 from enum import Enum
 from pathlib import Path
 from typing import MutableMapping, Sequence
@@ -14,12 +17,18 @@ from .cache_service import load_index_metadata_safe
 from .content_extract_service import TEXT_EXTENSIONS
 from .js_parser import JSTS_EXTENSIONS
 from ..cache import CACHE_VERSION, IndexedChunk, backfill_chunk_lines
-from ..config import
+from ..config import (
+    DEFAULT_EMBED_CONCURRENCY,
+    DEFAULT_EXTRACT_BACKEND,
+    DEFAULT_EXTRACT_CONCURRENCY,
+)
 from ..modes import get_strategy, ModePayload
 
 INCREMENTAL_CHANGE_THRESHOLD = 0.5
 MTIME_TOLERANCE = 5e-1
 MARKDOWN_EXTENSIONS = {".md", ".markdown", ".mdx"}
+_EXTRACT_PROCESS_MIN_FILES = 16
+_CPU_HEAVY_MODES = {"auto", "code", "outline", "full"}
 
 
 class IndexStatus(str, Enum):
@@ -35,6 +44,85 @@ class IndexResult:
     files_indexed: int = 0
 
 
+def _resolve_extract_concurrency(value: int) -> int:
+    return max(int(value or 1), 1)
+
+
+def _resolve_extract_backend(
+    value: str | None,
+    *,
+    mode: str,
+    file_count: int,
+    concurrency: int,
+) -> str:
+    normalized = (value or DEFAULT_EXTRACT_BACKEND).strip().lower()
+    if normalized not in {"auto", "thread", "process"}:
+        normalized = DEFAULT_EXTRACT_BACKEND
+    if normalized == "auto":
+        if (
+            concurrency > 1
+            and file_count >= _EXTRACT_PROCESS_MIN_FILES
+            and mode in _CPU_HEAVY_MODES
+        ):
+            return "process"
+        return "thread"
+    return normalized
+
+
+def _extract_payloads_for_mode(path: Path, mode: str) -> list[ModePayload]:
+    strategy = get_strategy(mode)
+    return strategy.payloads_for_files([path])
+
+
+def _payloads_for_files(
+    strategy,
+    files: Sequence[Path],
+    *,
+    mode: str,
+    extract_concurrency: int,
+    extract_backend: str,
+) -> list[ModePayload]:
+    if not files:
+        return []
+    concurrency = _resolve_extract_concurrency(extract_concurrency)
+    if concurrency <= 1 or len(files) <= 1:
+        return strategy.payloads_for_files(files)
+    max_workers = min(concurrency, len(files))
+
+    def _extract_with_thread_pool() -> list[ModePayload]:
+        def _extract_one(path: Path) -> list[ModePayload]:
+            return strategy.payloads_for_files([path])
+
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            results = executor.map(_extract_one, files)
+            payloads: list[ModePayload] = []
+            for batch in results:
+                payloads.extend(batch)
+        return payloads
+
+    effective_backend = _resolve_extract_backend(
+        extract_backend,
+        mode=mode,
+        file_count=len(files),
+        concurrency=concurrency,
+    )
+    if effective_backend == "process":
+        try:
+            with ProcessPoolExecutor(max_workers=max_workers) as executor:
+                results = executor.map(
+                    _extract_payloads_for_mode,
+                    files,
+                    itertools.repeat(mode),
+                )
+                payloads: list[ModePayload] = []
+                for batch in results:
+                    payloads.extend(batch)
+                return payloads
+        except Exception:
+            return _extract_with_thread_pool()
+    return _extract_with_thread_pool()
+
+
 def build_index(
     directory: Path,
     *,
```
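Under the `auto` setting, `_resolve_extract_backend` escalates to a process pool only when the work looks CPU-bound and is large enough to amortize process startup. A small sketch replicating that decision rule, with the thresholds copied from the hunk above (`"plain"` is a hypothetical non-CPU-heavy mode string):

```python
EXTRACT_PROCESS_MIN_FILES = 16  # mirrors _EXTRACT_PROCESS_MIN_FILES
CPU_HEAVY_MODES = {"auto", "code", "outline", "full"}  # mirrors _CPU_HEAVY_MODES


def choose_backend(mode: str, file_count: int, concurrency: int) -> str:
    # Mirrors the "auto" branch above: processes only pay off for CPU-heavy
    # extraction across many files; threads are cheaper everywhere else.
    if (
        concurrency > 1
        and file_count >= EXTRACT_PROCESS_MIN_FILES
        and mode in CPU_HEAVY_MODES
    ):
        return "process"
    return "thread"


print(choose_backend("code", file_count=200, concurrency=8))   # process
print(choose_backend("plain", file_count=200, concurrency=8))  # thread
print(choose_backend("code", file_count=4, concurrency=8))     # thread
```

Note that `_payloads_for_files` keeps the thread pool as a fallback: the `ProcessPoolExecutor` block is wrapped in a broad `except Exception` that retries the same files on threads.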
```diff
@@ -45,12 +133,15 @@ def build_index(
     model_name: str,
     batch_size: int,
     embed_concurrency: int = DEFAULT_EMBED_CONCURRENCY,
+    extract_concurrency: int = DEFAULT_EXTRACT_CONCURRENCY,
+    extract_backend: str = DEFAULT_EXTRACT_BACKEND,
     provider: str,
     base_url: str | None,
     api_key: str | None,
     local_cuda: bool = False,
     exclude_patterns: Sequence[str] | None = None,
     extensions: Sequence[str] | None = None,
+    no_cache: bool = False,
 ) -> IndexResult:
     """Create or refresh the cached index for *directory*."""
 
@@ -69,6 +160,7 @@ def build_index(
     if not files:
         return IndexResult(status=IndexStatus.EMPTY)
     stat_cache: dict[Path, os.stat_result] = {}
+    extract_concurrency = _resolve_extract_concurrency(extract_concurrency)
 
     existing_meta = load_index_metadata_safe(
         directory,
@@ -109,6 +201,9 @@ def build_index(
             files=files,
             missing_rel_paths=missing_line_files,
             root=directory,
+            extract_concurrency=extract_concurrency,
+            extract_backend=extract_backend,
+            mode=mode,
         )
         cache_path = backfill_chunk_lines(
             root=directory,
@@ -167,7 +262,15 @@ def build_index(
            path for rel, path in files_with_rel if rel in changed_rel_paths
        ]
        changed_payloads = (
-
+            _payloads_for_files(
+                strategy,
+                changed_files,
+                mode=mode,
+                extract_concurrency=extract_concurrency,
+                extract_backend=extract_backend,
+            )
+            if changed_files
+            else []
        )
 
        cache_path = _apply_incremental_update(
@@ -187,6 +290,7 @@ def build_index(
            exclude_patterns=exclude_patterns,
            extensions=extensions,
            stat_cache=stat_cache,
+           no_cache=no_cache,
        )
 
        line_backfill_targets = missing_line_files - changed_rel_paths - removed_rel_paths
@@ -196,6 +300,9 @@ def build_index(
            files=files,
            missing_rel_paths=line_backfill_targets,
            root=directory,
+           extract_concurrency=extract_concurrency,
+           extract_backend=extract_backend,
+           mode=mode,
        )
        cache_path = backfill_chunk_lines(
            root=directory,
@@ -214,12 +321,19 @@ def build_index(
            files_indexed=len(files),
        )
 
-    payloads =
+    payloads = _payloads_for_files(
+        strategy,
+        files,
+        mode=mode,
+        extract_concurrency=extract_concurrency,
+        extract_backend=extract_backend,
+    )
     file_labels = [payload.label for payload in payloads]
     embeddings = _embed_labels_with_cache(
         searcher=searcher,
         model_name=model_name,
         labels=file_labels,
+        no_cache=no_cache,
     )
     entries = _build_index_entries(payloads, embeddings, directory, stat_cache=stat_cache)
 
@@ -241,6 +355,158 @@ def build_index(
     )
 
 
+def build_index_in_memory(
+    directory: Path,
+    *,
+    include_hidden: bool,
+    respect_gitignore: bool = True,
+    mode: str,
+    recursive: bool,
+    model_name: str,
+    batch_size: int,
+    embed_concurrency: int = DEFAULT_EMBED_CONCURRENCY,
+    extract_concurrency: int = DEFAULT_EXTRACT_CONCURRENCY,
+    extract_backend: str = DEFAULT_EXTRACT_BACKEND,
+    provider: str,
+    base_url: str | None,
+    api_key: str | None,
+    local_cuda: bool = False,
+    exclude_patterns: Sequence[str] | None = None,
+    extensions: Sequence[str] | None = None,
+    no_cache: bool = False,
+) -> tuple[list[Path], np.ndarray, dict]:
+    """Build an index in memory without writing to disk."""
+
+    from ..search import VexorSearcher  # local import
+    from ..utils import collect_files  # local import
+
+    files = collect_files(
+        directory,
+        include_hidden=include_hidden,
+        recursive=recursive,
+        extensions=extensions,
+        exclude_patterns=exclude_patterns,
+        respect_gitignore=respect_gitignore,
+    )
+    if not files:
+        empty = np.empty((0, 0), dtype=np.float32)
+        metadata = {
+            "index_id": None,
+            "version": CACHE_VERSION,
+            "generated_at": datetime.now(timezone.utc).isoformat(),
+            "root": str(directory),
+            "model": model_name,
+            "include_hidden": include_hidden,
+            "respect_gitignore": respect_gitignore,
+            "recursive": recursive,
+            "mode": mode,
+            "dimension": 0,
+            "exclude_patterns": tuple(exclude_patterns or ()),
+            "extensions": tuple(extensions or ()),
+            "files": [],
+            "chunks": [],
+        }
+        return [], empty, metadata
+
+    stat_cache: dict[Path, os.stat_result] = {}
+    strategy = get_strategy(mode)
+    searcher = VexorSearcher(
+        model_name=model_name,
+        batch_size=batch_size,
+        embed_concurrency=embed_concurrency,
+        provider=provider,
+        base_url=base_url,
+        api_key=api_key,
+        local_cuda=local_cuda,
+    )
+    payloads = _payloads_for_files(
+        strategy,
+        files,
+        mode=mode,
+        extract_concurrency=extract_concurrency,
+        extract_backend=extract_backend,
+    )
+    if not payloads:
+        empty = np.empty((0, 0), dtype=np.float32)
+        metadata = {
+            "index_id": None,
+            "version": CACHE_VERSION,
+            "generated_at": datetime.now(timezone.utc).isoformat(),
+            "root": str(directory),
+            "model": model_name,
+            "include_hidden": include_hidden,
+            "respect_gitignore": respect_gitignore,
+            "recursive": recursive,
+            "mode": mode,
+            "dimension": 0,
+            "exclude_patterns": tuple(exclude_patterns or ()),
+            "extensions": tuple(extensions or ()),
+            "files": [],
+            "chunks": [],
+        }
+        return [], empty, metadata
+
+    labels = [payload.label for payload in payloads]
+    if no_cache:
+        embeddings = searcher.embed_texts(labels)
+        vectors = np.asarray(embeddings, dtype=np.float32)
+    else:
+        vectors = _embed_labels_with_cache(
+            searcher=searcher,
+            model_name=model_name,
+            labels=labels,
+        )
+    entries = _build_index_entries(
+        payloads,
+        vectors,
+        directory,
+        stat_cache=stat_cache,
+    )
+    paths = [entry.path for entry in entries]
+    file_snapshot: dict[str, dict] = {}
+    chunk_entries: list[dict] = []
+    for entry in entries:
+        rel_path = entry.rel_path
+        chunk_entries.append(
+            {
+                "path": rel_path,
+                "absolute": str(entry.path),
+                "mtime": entry.mtime,
+                "size": entry.size_bytes,
+                "preview": entry.preview,
+                "label_hash": entry.label_hash,
+                "chunk_index": entry.chunk_index,
+                "start_line": entry.start_line,
+                "end_line": entry.end_line,
+            }
+        )
+        if rel_path not in file_snapshot:
+            file_snapshot[rel_path] = {
+                "path": rel_path,
+                "absolute": str(entry.path),
+                "mtime": entry.mtime,
+                "size": entry.size_bytes,
+            }
+
+    metadata = {
+        "index_id": None,
+        "version": CACHE_VERSION,
+        "generated_at": datetime.now(timezone.utc).isoformat(),
+        "root": str(directory),
+        "model": model_name,
+        "include_hidden": include_hidden,
+        "respect_gitignore": respect_gitignore,
+        "recursive": recursive,
+        "mode": mode,
+        "dimension": int(vectors.shape[1]) if vectors.size else 0,
+        "exclude_patterns": tuple(exclude_patterns or ()),
+        "extensions": tuple(extensions or ()),
+        "files": list(file_snapshot.values()),
+        "chunks": chunk_entries,
+    }
+    return paths, vectors, metadata
+
+
 def clear_index_entries(
     directory: Path,
     *,
@@ -367,6 +633,7 @@ def _apply_incremental_update(
     exclude_patterns: Sequence[str] | None,
     extensions: Sequence[str] | None,
     stat_cache: MutableMapping[Path, os.stat_result] | None = None,
+    no_cache: bool = False,
 ) -> Path:
     payloads_to_embed, payloads_to_touch = _split_payloads_by_label(
         changed_payloads,
@@ -387,6 +654,7 @@ def _apply_incremental_update(
         searcher=searcher,
         model_name=model_name,
         labels=labels,
+        no_cache=no_cache,
     )
     changed_entries = _build_index_entries(
         payloads_to_embed,
@@ -424,9 +692,13 @@ def _embed_labels_with_cache(
     searcher,
     model_name: str,
     labels: Sequence[str],
+    no_cache: bool = False,
 ) -> np.ndarray:
     if not labels:
         return np.empty((0, 0), dtype=np.float32)
+    if no_cache:
+        vectors = searcher.embed_texts(labels)
+        return np.asarray(vectors, dtype=np.float32)
     from ..cache import embedding_cache_key, load_embedding_cache, store_embedding_cache
 
     hashes = [embedding_cache_key(label) for label in labels]
@@ -655,6 +927,9 @@ def _build_line_backfill_updates(
     files: Sequence[Path],
     missing_rel_paths: set[str],
     root: Path,
+    extract_concurrency: int,
+    extract_backend: str,
+    mode: str,
 ) -> list[tuple[str, int, int | None, int | None]]:
     if not missing_rel_paths:
         return []
@@ -662,7 +937,13 @@ def _build_line_backfill_updates(
     targets = [files_by_rel[rel] for rel in missing_rel_paths if rel in files_by_rel]
     if not targets:
         return []
-    payloads =
+    payloads = _payloads_for_files(
+        strategy,
+        targets,
+        mode=mode,
+        extract_concurrency=extract_concurrency,
+        extract_backend=extract_backend,
+    )
     return [
         (
             _relative_to_root(payload.file, root),
```
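For reference, the new `build_index_in_memory` entry point added earlier in this file returns `(paths, vectors, metadata)` without touching the on-disk cache. A hedged usage sketch, assuming an `"openai"` provider token and illustrative values throughout:

```python
from pathlib import Path

from vexor.services.index_service import build_index_in_memory

paths, vectors, metadata = build_index_in_memory(
    Path("./docs"),
    include_hidden=False,
    mode="full",                          # one of the strategy modes
    recursive=True,
    model_name="text-embedding-3-small",  # whatever model the provider accepts
    batch_size=64,
    provider="openai",                    # assumption; matches providers/openai.py
    base_url=None,
    api_key="sk-...",                     # placeholder
    no_cache=True,                        # bypass the embedding cache entirely
)
print(len(paths), vectors.shape, metadata["dimension"])
```

Unlike `build_index`, nothing is persisted: the returned `metadata` mirrors the cache schema (`version`, `files`, `chunks`, ...) but leaves `index_id` as `None`.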