vexor 0.19.0a1__py3-none-any.whl → 0.21.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vexor/providers/openai.py CHANGED
@@ -3,6 +3,7 @@
 from __future__ import annotations
 
 from concurrent.futures import ThreadPoolExecutor, as_completed
+import time
 from typing import Iterator, Sequence
 
 import numpy as np
@@ -35,14 +36,19 @@ class OpenAIEmbeddingBackend:
         if base_url:
             client_kwargs["base_url"] = base_url.rstrip("/")
         self._client = OpenAI(**client_kwargs)
+        self._executor: ThreadPoolExecutor | None = None
 
     def embed(self, texts: Sequence[str]) -> np.ndarray:
         if not texts:
            return np.empty((0, 0), dtype=np.float32)
-        batches = list(_chunk(texts, self.chunk_size))
-        if self.concurrency > 1 and len(batches) > 1:
-            vectors_by_batch: list[list[np.ndarray] | None] = [None] * len(batches)
-            with ThreadPoolExecutor(max_workers=min(self.concurrency, len(batches))) as executor:
+        if self.concurrency > 1:
+            batches = list(_chunk(texts, self.chunk_size))
+            if len(batches) > 1:
+                vectors_by_batch: list[list[np.ndarray] | None] = [None] * len(batches)
+                executor = self._executor
+                if executor is None:
+                    executor = ThreadPoolExecutor(max_workers=self.concurrency)
+                    self._executor = executor
                 future_map = {
                     executor.submit(self._embed_batch, batch): idx
                     for idx, batch in enumerate(batches)
@@ -50,23 +56,34 @@ class OpenAIEmbeddingBackend:
                 for future in as_completed(future_map):
                     idx = future_map[future]
                     vectors_by_batch[idx] = future.result()
-            vectors = [vec for batch in vectors_by_batch if batch for vec in batch]
+                vectors = [vec for batch in vectors_by_batch if batch for vec in batch]
+            else:
+                vectors = []
+                for batch in batches:
+                    vectors.extend(self._embed_batch(batch))
         else:
             vectors = []
-            for batch in batches:
+            for batch in _chunk(texts, self.chunk_size):
                 vectors.extend(self._embed_batch(batch))
         if not vectors:
             raise RuntimeError(Messages.ERROR_NO_EMBEDDINGS)
         return np.vstack(vectors)
 
     def _embed_batch(self, batch: Sequence[str]) -> list[np.ndarray]:
-        try:
-            response = self._client.embeddings.create(
-                model=self.model_name,
-                input=list(batch),
-            )
-        except Exception as exc:  # pragma: no cover - API client variations
-            raise RuntimeError(_format_openai_error(exc)) from exc
+        attempt = 0
+        while True:
+            try:
+                response = self._client.embeddings.create(
+                    model=self.model_name,
+                    input=list(batch),
+                )
+                break
+            except Exception as exc:  # pragma: no cover - API client variations
+                if _should_retry_openai_error(exc) and attempt < _MAX_RETRIES:
+                    _sleep(_backoff_delay(attempt))
+                    attempt += 1
+                    continue
+                raise RuntimeError(_format_openai_error(exc)) from exc
         data = getattr(response, "data", None) or []
         if not data:
             raise RuntimeError(Messages.ERROR_NO_EMBEDDINGS)
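The embed() rework above also changes the executor's lifetime: instead of opening a ThreadPoolExecutor in a with-block on every call (tearing the pool down each time), the backend now lazily creates one pool and keeps it on self._executor for reuse across calls. A minimal sketch of that caching pattern, with hypothetical names rather than vexor's actual class:

    from concurrent.futures import ThreadPoolExecutor

    class PooledBackend:
        """Lazily create one thread pool and reuse it across calls."""

        def __init__(self, concurrency: int) -> None:
            self.concurrency = concurrency
            self._executor: ThreadPoolExecutor | None = None

        def _pool(self) -> ThreadPoolExecutor:
            # First call creates the pool; later calls reuse the same
            # workers, avoiding per-request pool startup and teardown.
            if self._executor is None:
                self._executor = ThreadPoolExecutor(max_workers=self.concurrency)
            return self._executor

        def map_batches(self, batches):
            return list(self._pool().map(len, batches))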
@@ -87,6 +104,55 @@ def _chunk(items: Sequence[str], size: int | None) -> Iterator[Sequence[str]]:
         yield items[idx : idx + size]
 
 
+_RETRYABLE_STATUS_CODES = {408, 429, 500, 502, 503, 504}
+_MAX_RETRIES = 2
+_RETRY_BASE_DELAY = 0.5
+_RETRY_MAX_DELAY = 4.0
+
+
+def _sleep(seconds: float) -> None:
+    time.sleep(seconds)
+
+
+def _backoff_delay(attempt: int) -> float:
+    return min(_RETRY_MAX_DELAY, _RETRY_BASE_DELAY * (2**attempt))
+
+
+def _extract_status_code(exc: Exception) -> int | None:
+    for attr in ("status_code", "status", "http_status"):
+        value = getattr(exc, attr, None)
+        if isinstance(value, int):
+            return value
+    response = getattr(exc, "response", None)
+    if response is not None:
+        value = getattr(response, "status_code", None)
+        if isinstance(value, int):
+            return value
+    return None
+
+
+def _should_retry_openai_error(exc: Exception) -> bool:
+    status = _extract_status_code(exc)
+    if status in _RETRYABLE_STATUS_CODES:
+        return True
+    name = exc.__class__.__name__.lower()
+    if "ratelimit" in name or "timeout" in name or "temporarily" in name:
+        return True
+    message = str(exc).lower()
+    return any(
+        token in message
+        for token in (
+            "rate limit",
+            "timeout",
+            "temporar",
+            "overload",
+            "try again",
+            "too many requests",
+            "service unavailable",
+        )
+    )
+
+
 def _format_openai_error(exc: Exception) -> str:
     message = getattr(exc, "message", None) or str(exc)
     return f"{Messages.ERROR_OPENAI_PREFIX}{message}"
@@ -11,6 +11,8 @@ from ..config import (
     set_base_url,
     set_batch_size,
     set_embed_concurrency,
+    set_extract_concurrency,
+    set_extract_backend,
     set_auto_index,
     set_flashrank_model,
     set_local_cuda,
@@ -28,6 +30,8 @@ class ConfigUpdateResult:
     model_set: bool = False
     batch_size_set: bool = False
     embed_concurrency_set: bool = False
+    extract_concurrency_set: bool = False
+    extract_backend_set: bool = False
     provider_set: bool = False
     base_url_set: bool = False
     base_url_cleared: bool = False
@@ -49,6 +53,8 @@ class ConfigUpdateResult:
             self.model_set,
             self.batch_size_set,
             self.embed_concurrency_set,
+            self.extract_concurrency_set,
+            self.extract_backend_set,
             self.provider_set,
             self.base_url_set,
             self.base_url_cleared,
@@ -71,6 +77,8 @@ def apply_config_updates(
     model: str | None = None,
     batch_size: int | None = None,
     embed_concurrency: int | None = None,
+    extract_concurrency: int | None = None,
+    extract_backend: str | None = None,
     provider: str | None = None,
     base_url: str | None = None,
     clear_base_url: bool = False,
@@ -101,6 +109,12 @@
     if embed_concurrency is not None:
         set_embed_concurrency(embed_concurrency)
         result.embed_concurrency_set = True
+    if extract_concurrency is not None:
+        set_extract_concurrency(extract_concurrency)
+        result.extract_concurrency_set = True
+    if extract_backend is not None:
+        set_extract_backend(extract_backend)
+        result.extract_backend_set = True
     if provider is not None:
         set_provider(provider)
         result.provider_set = True
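Callers opt into the new settings the same way as the existing ones; a hedged sketch, assuming all of apply_config_updates' update fields are optional keywords as the hunks suggest:

    result = apply_config_updates(
        extract_concurrency=4,       # persisted via set_extract_concurrency
        extract_backend="process",   # persisted via set_extract_backend
    )
    assert result.extract_concurrency_set and result.extract_backend_set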
@@ -2,8 +2,11 @@
 
 from __future__ import annotations
 
+import itertools
 import os
+from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
 from dataclasses import dataclass, field
+from datetime import datetime, timezone
 from enum import Enum
 from pathlib import Path
 from typing import MutableMapping, Sequence
@@ -14,12 +17,18 @@ from .cache_service import load_index_metadata_safe
 from .content_extract_service import TEXT_EXTENSIONS
 from .js_parser import JSTS_EXTENSIONS
 from ..cache import CACHE_VERSION, IndexedChunk, backfill_chunk_lines
-from ..config import DEFAULT_EMBED_CONCURRENCY
+from ..config import (
+    DEFAULT_EMBED_CONCURRENCY,
+    DEFAULT_EXTRACT_BACKEND,
+    DEFAULT_EXTRACT_CONCURRENCY,
+)
 from ..modes import get_strategy, ModePayload
 
 INCREMENTAL_CHANGE_THRESHOLD = 0.5
 MTIME_TOLERANCE = 5e-1
 MARKDOWN_EXTENSIONS = {".md", ".markdown", ".mdx"}
+_EXTRACT_PROCESS_MIN_FILES = 16
+_CPU_HEAVY_MODES = {"auto", "code", "outline", "full"}
 
 
 class IndexStatus(str, Enum):
@@ -35,6 +44,85 @@ class IndexResult:
     files_indexed: int = 0
 
 
+def _resolve_extract_concurrency(value: int) -> int:
+    return max(int(value or 1), 1)
+
+
+def _resolve_extract_backend(
+    value: str | None,
+    *,
+    mode: str,
+    file_count: int,
+    concurrency: int,
+) -> str:
+    normalized = (value or DEFAULT_EXTRACT_BACKEND).strip().lower()
+    if normalized not in {"auto", "thread", "process"}:
+        normalized = DEFAULT_EXTRACT_BACKEND
+    if normalized == "auto":
+        if (
+            concurrency > 1
+            and file_count >= _EXTRACT_PROCESS_MIN_FILES
+            and mode in _CPU_HEAVY_MODES
+        ):
+            return "process"
+        return "thread"
+    return normalized
+
+
+def _extract_payloads_for_mode(path: Path, mode: str) -> list[ModePayload]:
+    strategy = get_strategy(mode)
+    return strategy.payloads_for_files([path])
+
+
+def _payloads_for_files(
+    strategy,
+    files: Sequence[Path],
+    *,
+    mode: str,
+    extract_concurrency: int,
+    extract_backend: str,
+) -> list[ModePayload]:
+    if not files:
+        return []
+    concurrency = _resolve_extract_concurrency(extract_concurrency)
+    if concurrency <= 1 or len(files) <= 1:
+        return strategy.payloads_for_files(files)
+    max_workers = min(concurrency, len(files))
+
+    def _extract_with_thread_pool() -> list[ModePayload]:
+        def _extract_one(path: Path) -> list[ModePayload]:
+            return strategy.payloads_for_files([path])
+
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            results = executor.map(_extract_one, files)
+            payloads: list[ModePayload] = []
+            for batch in results:
+                payloads.extend(batch)
+        return payloads
+
+    effective_backend = _resolve_extract_backend(
+        extract_backend,
+        mode=mode,
+        file_count=len(files),
+        concurrency=concurrency,
+    )
+    if effective_backend == "process":
+        try:
+            with ProcessPoolExecutor(max_workers=max_workers) as executor:
+                results = executor.map(
+                    _extract_payloads_for_mode,
+                    files,
+                    itertools.repeat(mode),
+                )
+                payloads: list[ModePayload] = []
+                for batch in results:
+                    payloads.extend(batch)
+                return payloads
+        except Exception:
+            return _extract_with_thread_pool()
+    return _extract_with_thread_pool()
+
+
 def build_index(
     directory: Path,
     *,
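With _resolve_extract_backend as defined above, "auto" escalates to a process pool only when all three conditions hold (concurrency above 1, at least _EXTRACT_PROCESS_MIN_FILES files, and a CPU-heavy mode); anything else stays on threads, and an explicit "thread" or "process" bypasses the heuristic. A few worked cases:

    # All three auto conditions hold -> process pool.
    assert _resolve_extract_backend(
        "auto", mode="code", file_count=32, concurrency=4
    ) == "process"

    # Below the 16-file threshold -> thread pool.
    assert _resolve_extract_backend(
        "auto", mode="code", file_count=8, concurrency=4
    ) == "thread"

    # An explicit choice is honored as-is.
    assert _resolve_extract_backend(
        "thread", mode="code", file_count=64, concurrency=8
    ) == "thread"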
@@ -45,12 +133,15 @@ def build_index(
     model_name: str,
     batch_size: int,
     embed_concurrency: int = DEFAULT_EMBED_CONCURRENCY,
+    extract_concurrency: int = DEFAULT_EXTRACT_CONCURRENCY,
+    extract_backend: str = DEFAULT_EXTRACT_BACKEND,
     provider: str,
     base_url: str | None,
     api_key: str | None,
     local_cuda: bool = False,
     exclude_patterns: Sequence[str] | None = None,
     extensions: Sequence[str] | None = None,
+    no_cache: bool = False,
 ) -> IndexResult:
     """Create or refresh the cached index for *directory*."""
 
@@ -69,6 +160,7 @@
     if not files:
         return IndexResult(status=IndexStatus.EMPTY)
     stat_cache: dict[Path, os.stat_result] = {}
+    extract_concurrency = _resolve_extract_concurrency(extract_concurrency)
 
     existing_meta = load_index_metadata_safe(
         directory,
@@ -109,6 +201,9 @@
             files=files,
             missing_rel_paths=missing_line_files,
             root=directory,
+            extract_concurrency=extract_concurrency,
+            extract_backend=extract_backend,
+            mode=mode,
         )
         cache_path = backfill_chunk_lines(
             root=directory,
@@ -167,7 +262,15 @@
             path for rel, path in files_with_rel if rel in changed_rel_paths
         ]
         changed_payloads = (
-            strategy.payloads_for_files(changed_files) if changed_files else []
+            _payloads_for_files(
+                strategy,
+                changed_files,
+                mode=mode,
+                extract_concurrency=extract_concurrency,
+                extract_backend=extract_backend,
+            )
+            if changed_files
+            else []
         )
 
         cache_path = _apply_incremental_update(
@@ -187,6 +290,7 @@
             exclude_patterns=exclude_patterns,
             extensions=extensions,
             stat_cache=stat_cache,
+            no_cache=no_cache,
         )
 
         line_backfill_targets = missing_line_files - changed_rel_paths - removed_rel_paths
@@ -196,6 +300,9 @@
             files=files,
             missing_rel_paths=line_backfill_targets,
             root=directory,
+            extract_concurrency=extract_concurrency,
+            extract_backend=extract_backend,
+            mode=mode,
         )
         cache_path = backfill_chunk_lines(
             root=directory,
@@ -214,12 +321,19 @@
             files_indexed=len(files),
         )
 
-    payloads = strategy.payloads_for_files(files)
+    payloads = _payloads_for_files(
+        strategy,
+        files,
+        mode=mode,
+        extract_concurrency=extract_concurrency,
+        extract_backend=extract_backend,
+    )
     file_labels = [payload.label for payload in payloads]
     embeddings = _embed_labels_with_cache(
         searcher=searcher,
         model_name=model_name,
         labels=file_labels,
+        no_cache=no_cache,
     )
     entries = _build_index_entries(payloads, embeddings, directory, stat_cache=stat_cache)
@@ -241,6 +355,158 @@
     )
 
 
+def build_index_in_memory(
+    directory: Path,
+    *,
+    include_hidden: bool,
+    respect_gitignore: bool = True,
+    mode: str,
+    recursive: bool,
+    model_name: str,
+    batch_size: int,
+    embed_concurrency: int = DEFAULT_EMBED_CONCURRENCY,
+    extract_concurrency: int = DEFAULT_EXTRACT_CONCURRENCY,
+    extract_backend: str = DEFAULT_EXTRACT_BACKEND,
+    provider: str,
+    base_url: str | None,
+    api_key: str | None,
+    local_cuda: bool = False,
+    exclude_patterns: Sequence[str] | None = None,
+    extensions: Sequence[str] | None = None,
+    no_cache: bool = False,
+) -> tuple[list[Path], np.ndarray, dict]:
+    """Build an index in memory without writing to disk."""
+
+    from ..search import VexorSearcher  # local import
+    from ..utils import collect_files  # local import
+
+    files = collect_files(
+        directory,
+        include_hidden=include_hidden,
+        recursive=recursive,
+        extensions=extensions,
+        exclude_patterns=exclude_patterns,
+        respect_gitignore=respect_gitignore,
+    )
+    if not files:
+        empty = np.empty((0, 0), dtype=np.float32)
+        metadata = {
+            "index_id": None,
+            "version": CACHE_VERSION,
+            "generated_at": datetime.now(timezone.utc).isoformat(),
+            "root": str(directory),
+            "model": model_name,
+            "include_hidden": include_hidden,
+            "respect_gitignore": respect_gitignore,
+            "recursive": recursive,
+            "mode": mode,
+            "dimension": 0,
+            "exclude_patterns": tuple(exclude_patterns or ()),
+            "extensions": tuple(extensions or ()),
+            "files": [],
+            "chunks": [],
+        }
+        return [], empty, metadata
+
+    stat_cache: dict[Path, os.stat_result] = {}
+    strategy = get_strategy(mode)
+    searcher = VexorSearcher(
+        model_name=model_name,
+        batch_size=batch_size,
+        embed_concurrency=embed_concurrency,
+        provider=provider,
+        base_url=base_url,
+        api_key=api_key,
+        local_cuda=local_cuda,
+    )
+    payloads = _payloads_for_files(
+        strategy,
+        files,
+        mode=mode,
+        extract_concurrency=extract_concurrency,
+        extract_backend=extract_backend,
+    )
+    if not payloads:
+        empty = np.empty((0, 0), dtype=np.float32)
+        metadata = {
+            "index_id": None,
+            "version": CACHE_VERSION,
+            "generated_at": datetime.now(timezone.utc).isoformat(),
+            "root": str(directory),
+            "model": model_name,
+            "include_hidden": include_hidden,
+            "respect_gitignore": respect_gitignore,
+            "recursive": recursive,
+            "mode": mode,
+            "dimension": 0,
+            "exclude_patterns": tuple(exclude_patterns or ()),
+            "extensions": tuple(extensions or ()),
+            "files": [],
+            "chunks": [],
+        }
+        return [], empty, metadata
+
+    labels = [payload.label for payload in payloads]
+    if no_cache:
+        embeddings = searcher.embed_texts(labels)
+        vectors = np.asarray(embeddings, dtype=np.float32)
+    else:
+        vectors = _embed_labels_with_cache(
+            searcher=searcher,
+            model_name=model_name,
+            labels=labels,
+        )
+    entries = _build_index_entries(
+        payloads,
+        vectors,
+        directory,
+        stat_cache=stat_cache,
+    )
+    paths = [entry.path for entry in entries]
+    file_snapshot: dict[str, dict] = {}
+    chunk_entries: list[dict] = []
+    for entry in entries:
+        rel_path = entry.rel_path
+        chunk_entries.append(
+            {
+                "path": rel_path,
+                "absolute": str(entry.path),
+                "mtime": entry.mtime,
+                "size": entry.size_bytes,
+                "preview": entry.preview,
+                "label_hash": entry.label_hash,
+                "chunk_index": entry.chunk_index,
+                "start_line": entry.start_line,
+                "end_line": entry.end_line,
+            }
+        )
+        if rel_path not in file_snapshot:
+            file_snapshot[rel_path] = {
+                "path": rel_path,
+                "absolute": str(entry.path),
+                "mtime": entry.mtime,
+                "size": entry.size_bytes,
+            }
+
+    metadata = {
+        "index_id": None,
+        "version": CACHE_VERSION,
+        "generated_at": datetime.now(timezone.utc).isoformat(),
+        "root": str(directory),
+        "model": model_name,
+        "include_hidden": include_hidden,
+        "respect_gitignore": respect_gitignore,
+        "recursive": recursive,
+        "mode": mode,
+        "dimension": int(vectors.shape[1]) if vectors.size else 0,
+        "exclude_patterns": tuple(exclude_patterns or ()),
+        "extensions": tuple(extensions or ()),
+        "files": list(file_snapshot.values()),
+        "chunks": chunk_entries,
+    }
+    return paths, vectors, metadata
+
+
 def clear_index_entries(
     directory: Path,
     *,
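build_index_in_memory mirrors build_index's options but returns the collected paths, the embedding matrix, and the metadata dict directly instead of persisting a cache file, which suits one-off searches. A hedged usage sketch; the argument values (directory, model name, batch size) are illustrative, not defaults:

    paths, vectors, metadata = build_index_in_memory(
        Path("./src"),
        include_hidden=False,
        mode="code",
        recursive=True,
        model_name="text-embedding-3-small",
        batch_size=64,
        provider="openai",
        base_url=None,
        api_key=None,
        no_cache=True,  # skip the on-disk embedding cache entirely
    )
    print(len(paths), vectors.shape, metadata["dimension"])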
@@ -367,6 +633,7 @@ def _apply_incremental_update(
     exclude_patterns: Sequence[str] | None,
     extensions: Sequence[str] | None,
     stat_cache: MutableMapping[Path, os.stat_result] | None = None,
+    no_cache: bool = False,
 ) -> Path:
     payloads_to_embed, payloads_to_touch = _split_payloads_by_label(
         changed_payloads,
@@ -387,6 +654,7 @@
         searcher=searcher,
         model_name=model_name,
         labels=labels,
+        no_cache=no_cache,
     )
     changed_entries = _build_index_entries(
         payloads_to_embed,
@@ -424,9 +692,13 @@ def _embed_labels_with_cache(
     searcher,
     model_name: str,
     labels: Sequence[str],
+    no_cache: bool = False,
 ) -> np.ndarray:
     if not labels:
         return np.empty((0, 0), dtype=np.float32)
+    if no_cache:
+        vectors = searcher.embed_texts(labels)
+        return np.asarray(vectors, dtype=np.float32)
     from ..cache import embedding_cache_key, load_embedding_cache, store_embedding_cache
 
     hashes = [embedding_cache_key(label) for label in labels]
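The no_cache short-circuit above returns embeddings straight from searcher.embed_texts without ever importing or touching the embedding cache. A minimal sketch with a stub searcher (StubSearcher is hypothetical, for illustration only):

    import numpy as np

    class StubSearcher:
        def embed_texts(self, labels):
            # Pretend every label embeds to a 3-dimensional vector.
            return np.ones((len(labels), 3), dtype=np.float32)

    vectors = _embed_labels_with_cache(
        searcher=StubSearcher(),
        model_name="stub-model",
        labels=["a", "b"],
        no_cache=True,  # bypasses load_embedding_cache / store_embedding_cache
    )
    assert vectors.shape == (2, 3)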
@@ -655,6 +927,9 @@ def _build_line_backfill_updates(
     files: Sequence[Path],
     missing_rel_paths: set[str],
     root: Path,
+    extract_concurrency: int,
+    extract_backend: str,
+    mode: str,
 ) -> list[tuple[str, int, int | None, int | None]]:
     if not missing_rel_paths:
         return []
@@ -662,7 +937,13 @@
     targets = [files_by_rel[rel] for rel in missing_rel_paths if rel in files_by_rel]
     if not targets:
         return []
-    payloads = strategy.payloads_for_files(targets)
+    payloads = _payloads_for_files(
+        strategy,
+        targets,
+        mode=mode,
+        extract_concurrency=extract_concurrency,
+        extract_backend=extract_backend,
+    )
     return [
         (
             _relative_to_root(payload.file, root),