vexor-0.2.0-py3-none-any.whl → vexor-0.5.0-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
@@ -0,0 +1,188 @@
+ """Helpers to extract head snippets from various file types."""
+
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+ from pathlib import Path
+ from typing import Dict, Protocol
+
+ from charset_normalizer import from_path
+ from docx import Document
+ from pypdf import PdfReader
+
+ HEAD_CHAR_LIMIT = 1000
+
+
+ class HeadExtractor(Protocol):
+     """Protocol describing a file head extractor."""
+
+     def __call__(self, path: Path, char_limit: int = HEAD_CHAR_LIMIT) -> str | None:
+         ...
+
+
+ @dataclass(frozen=True)
+ class ExtractorEntry:
+     extensions: tuple[str, ...]
+     extractor: HeadExtractor
+
+
+ _registry: Dict[str, HeadExtractor] = {}
+
+
+ def register_extractor(entry: ExtractorEntry) -> None:
+     for ext in entry.extensions:
+         _registry[ext.lower()] = entry.extractor
+
+
+ def extract_head(path: Path, char_limit: int = HEAD_CHAR_LIMIT) -> str | None:
+     """Return a text snippet representing the head of *path*."""
+
+     extractor = _registry.get(path.suffix.lower())
+     if extractor is None:
+         return None
+     return extractor(path, char_limit)
+
+
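The registry maps lowercase file extensions to extractor callables, and extract_head is the single dispatch point. A minimal sketch of plugging in a custom extractor; the .nfo extension and my_nfo_extractor are hypothetical, not part of the package:

    from pathlib import Path

    def my_nfo_extractor(path: Path, char_limit: int = HEAD_CHAR_LIMIT) -> str | None:
        # Hypothetical extractor: treat .nfo files as plain text, decoding leniently.
        try:
            return path.read_text(errors="replace")[:char_limit]
        except OSError:
            return None

    register_extractor(ExtractorEntry(extensions=(".NFO",), extractor=my_nfo_extractor))

    extract_head(Path("release.nfo"))  # suffixes are lowercased on registration and lookup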
+ # Built-in extractors --------------------------------------------------------
+
+ def _read_text_head(path: Path, char_limit: int = HEAD_CHAR_LIMIT) -> str | None:
+     """Return the first *char_limit* characters of a text-like file."""
+
+     try:
+         result = from_path(path)
+     except Exception:
+         return None
+     if result is None or not len(result):
+         return None
+     best = result.best()
+     if best is None:
+         return None
+     text = str(best)
+     if not text:
+         return None
+     snippet = text[:char_limit]
+     return _cleanup_snippet(snippet)
+
+
+ def _pdf_extractor(path: Path, char_limit: int = HEAD_CHAR_LIMIT) -> str | None:
+     try:
+         reader = PdfReader(str(path))
+     except Exception:
+         return None
+     buffer: list[str] = []
+     total_chars = 0
+     for page in reader.pages:
+         try:
+             text = page.extract_text() or ""
+         except Exception:
+             text = ""
+         text = text.strip()
+         if not text:
+             continue
+         buffer.append(text)
+         total_chars += len(text)
+         if total_chars >= char_limit:
+             break
+     combined = "\n".join(buffer)
+     if not combined:
+         return None
+     cleaned = _cleanup_snippet(combined)
+     if not cleaned:
+         return None
+     return cleaned[:char_limit]
+
+
+ def _docx_extractor(path: Path, char_limit: int = HEAD_CHAR_LIMIT) -> str | None:
+     try:
+         document = Document(str(path))
+     except Exception:
+         return None
+     buffer: list[str] = []
+     total_chars = 0
+     for paragraph in document.paragraphs:
+         text = paragraph.text.strip()
+         if not text:
+             continue
+         buffer.append(text)
+         total_chars += len(text)
+         if total_chars >= char_limit:
+             break
+     combined = "\n".join(buffer)
+     if not combined:
+         return None
+     cleaned = _cleanup_snippet(combined)
+     if not cleaned:
+         return None
+     return cleaned[:char_limit]
+
+
+ def _cleanup_snippet(snippet: str) -> str | None:
+     lines = [line.strip() for line in snippet.splitlines() if line.strip()]
+     joined = " ".join(lines)
+     return joined or None
+
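_cleanup_snippet collapses a multi-line snippet onto one line so previews stay compact; a couple of worked cases:

    _cleanup_snippet("Title\n\n  first line  \nsecond line\n")  # -> "Title first line second line"
    _cleanup_snippet("   \n\n")                                 # -> None (nothing survives stripping)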
+
+ def _unimplemented_extractor(path: Path, char_limit: int = HEAD_CHAR_LIMIT) -> str | None:
+     return None
+
+
+ register_extractor(
+     ExtractorEntry(
+         extensions=(
+             ".txt", ".md", ".py", ".js", ".ts", ".json", ".yaml",
+             ".yml", ".html", ".htm", ".toml", ".csv", ".log", ".ini",
+             ".cfg", ".rst", ".tex", ".xml", ".sh", ".bat", ".go",
+             ".java", ".c", ".cpp", ".h", ".hpp", ".rb", ".php",
+             ".swift", ".rs", ".kt", ".dart", ".scala", ".pl", ".r",
+             ".jl", ".hs", ".lua", ".vb", ".ps1", ".bash",
+         ),
+         extractor=_read_text_head,
+     )
+ )
+
+ register_extractor(
+     ExtractorEntry((".pdf",), _pdf_extractor)
+ )
+
+ register_extractor(
+     ExtractorEntry((".docx",), _docx_extractor)
+ )
+
+ register_extractor(
+     ExtractorEntry((".pptx",), _unimplemented_extractor)
+ )
@@ -0,0 +1,260 @@
+ """Logic helpers for the `vexor index` command."""
+
+ from __future__ import annotations
+
+ from dataclasses import dataclass, field
+ from enum import Enum
+ from pathlib import Path
+
+ from .cache_service import load_index_metadata_safe
+ from ..modes import get_strategy
+
+ INCREMENTAL_CHANGE_THRESHOLD = 0.5
+ MTIME_TOLERANCE = 0.5
+
+
+ class IndexStatus(str, Enum):
+     EMPTY = "empty"
+     UP_TO_DATE = "up_to_date"
+     STORED = "stored"
+
+
+ @dataclass(slots=True)
+ class IndexResult:
+     status: IndexStatus
+     cache_path: Path | None = None
+     files_indexed: int = 0
+
+
+ def build_index(
+     directory: Path,
+     *,
+     include_hidden: bool,
+     mode: str,
+     recursive: bool,
+     model_name: str,
+     batch_size: int,
+     provider: str,
+     base_url: str | None,
+     api_key: str | None,
+ ) -> IndexResult:
+     """Create or refresh the cached index for *directory*."""
+
+     from ..search import VexorSearcher  # local import
+     from ..utils import collect_files  # local import
+     from ..cache import apply_index_updates, store_index  # local import
+
+     files = collect_files(directory, include_hidden=include_hidden, recursive=recursive)
+     if not files:
+         return IndexResult(status=IndexStatus.EMPTY)
+
+     existing_meta = load_index_metadata_safe(directory, model_name, include_hidden, mode, recursive)
+     cached_files = existing_meta.get("files", []) if existing_meta else []
+
+     strategy = get_strategy(mode)
+     searcher = VexorSearcher(
+         model_name=model_name,
+         batch_size=batch_size,
+         provider=provider,
+         base_url=base_url,
+         api_key=api_key,
+     )
+
+     if cached_files:
+         snapshot = _snapshot_current_files(files, directory)
+         diff = _diff_cached_files(snapshot, cached_files)
+         if diff.is_noop:
+             return IndexResult(status=IndexStatus.UP_TO_DATE, files_indexed=len(files))
+
+         change_ratio = diff.change_ratio(len(snapshot), len(cached_files))
+         if change_ratio <= INCREMENTAL_CHANGE_THRESHOLD:
+             cache_path = _apply_incremental_update(
+                 directory=directory,
+                 include_hidden=include_hidden,
+                 recursive=recursive,
+                 mode=mode,
+                 model_name=model_name,
+                 files=files,
+                 diff=diff,
+                 searcher=searcher,
+                 strategy=strategy,
+                 apply_fn=apply_index_updates,
+             )
+             return IndexResult(
+                 status=IndexStatus.STORED,
+                 cache_path=cache_path,
+                 files_indexed=len(files),
+             )
+
+     payloads = strategy.payloads_for_files(files)
+     file_labels = [payload.label for payload in payloads]
+     previews = [payload.preview or "" for payload in payloads]
+     embeddings = searcher.embed_texts(file_labels)
+
+     cache_path = store_index(
+         root=directory,
+         model=model_name,
+         include_hidden=include_hidden,
+         mode=mode,
+         recursive=recursive,
+         files=files,
+         previews=previews,
+         embeddings=embeddings,
+     )
+     return IndexResult(
+         status=IndexStatus.STORED,
+         cache_path=cache_path,
+         files_indexed=len(files),
+     )
+
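For orientation, a sketch of how a caller such as the CLI layer might invoke build_index. Every argument value below is illustrative; the mode, model, and provider strings are placeholders, not documented defaults:

    from pathlib import Path

    result = build_index(
        Path("./docs"),
        include_hidden=False,
        mode="name",                        # assumption: a strategy name accepted by get_strategy
        recursive=True,
        model_name="some-embedding-model",  # placeholder
        batch_size=32,
        provider="some-provider",           # placeholder
        base_url=None,
        api_key=None,
    )
    if result.status is IndexStatus.STORED:
        print(f"indexed {result.files_indexed} files -> {result.cache_path}")
    elif result.status is IndexStatus.UP_TO_DATE:
        print("index already current")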
+
+ def clear_index_entries(
+     directory: Path,
+     *,
+     include_hidden: bool,
+     mode: str,
+     recursive: bool,
+     model: str | None = None,
+ ) -> int:
+     """Remove cached entries for *directory* and return number removed."""
+
+     from ..cache import clear_index as clear_index_cache  # local import
+
+     return clear_index_cache(
+         root=directory,
+         include_hidden=include_hidden,
+         mode=mode,
+         recursive=recursive,
+         model=model,
+     )
+
+
+ @dataclass(slots=True)
+ class SnapshotEntry:
+     path: Path
+     rel_path: str
+     mtime: float
+     size: int
+
+
+ @dataclass(slots=True)
+ class FileDiff:
+     added: list[Path] = field(default_factory=list)
+     modified: list[Path] = field(default_factory=list)
+     removed: list[str] = field(default_factory=list)
+
+     @property
+     def is_noop(self) -> bool:
+         return not (self.added or self.modified or self.removed)
+
+     def change_ratio(self, current_count: int, cached_count: int) -> float:
+         denom = max(current_count, cached_count, 1)
+         change_count = len(self.added) + len(self.modified) + len(self.removed)
+         return change_count / denom
+
+     def changed_paths(self) -> list[Path]:
+         return self.added + self.modified
+
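To make the incremental-versus-rebuild decision concrete: with 8 cached entries, 10 current files, 2 additions, and 1 modification, the ratio is 3 / max(10, 8, 1) = 0.3, which is under INCREMENTAL_CHANGE_THRESHOLD (0.5), so only the changed files are re-embedded. A minimal check with hypothetical paths:

    from pathlib import Path

    diff = FileDiff(added=[Path("a.txt"), Path("b.txt")], modified=[Path("c.txt")])
    diff.change_ratio(current_count=10, cached_count=8)  # -> 0.3, under the 0.5 threshold
    diff.is_noop                                         # -> False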
+
+ def _snapshot_current_files(files: list[Path], root: Path) -> dict[str, SnapshotEntry]:
+     snapshot: dict[str, SnapshotEntry] = {}
+     for path in files:
+         rel = _relative_to_root(path, root)
+         stat = path.stat()
+         snapshot[rel] = SnapshotEntry(
+             path=path,
+             rel_path=rel,
+             mtime=stat.st_mtime,
+             size=stat.st_size,
+         )
+     return snapshot
+
+
+ def _diff_cached_files(
+     current: dict[str, SnapshotEntry],
+     cached_files: list[dict],
+ ) -> FileDiff:
+     cached_map = {entry["path"]: entry for entry in cached_files}
+     diff = FileDiff()
+
+     for rel_path, entry in current.items():
+         cached_entry = cached_map.get(rel_path)
+         if cached_entry is None:
+             diff.added.append(entry.path)
+         elif _has_entry_changed(entry, cached_entry):
+             diff.modified.append(entry.path)
+
+     for rel_path in cached_map:
+         if rel_path not in current:
+             diff.removed.append(rel_path)
+
+     return diff
+
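A small sketch of the diff in action, using hand-built snapshot and cache entries (all values illustrative):

    from pathlib import Path

    current = {
        "kept.txt": SnapshotEntry(path=Path("kept.txt"), rel_path="kept.txt", mtime=100.0, size=10),
        "new.txt": SnapshotEntry(path=Path("new.txt"), rel_path="new.txt", mtime=100.0, size=5),
    }
    cached = [
        {"path": "kept.txt", "mtime": 100.0, "size": 10},
        {"path": "gone.txt", "mtime": 90.0, "size": 7},
    ]
    diff = _diff_cached_files(current, cached)
    # diff.added == [Path("new.txt")], diff.removed == ["gone.txt"], diff.modified == []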
+
+ def _has_entry_changed(entry: SnapshotEntry, cached_entry: dict) -> bool:
+     cached_mtime = cached_entry.get("mtime")
+     cached_size = cached_entry.get("size")
+     if cached_mtime is None:
+         return True
+     if abs(entry.mtime - cached_mtime) > MTIME_TOLERANCE:
+         if cached_size is not None and cached_size == entry.size:
+             return False
+         return True
+     if cached_size is not None and cached_size != entry.size:
+         return True
+     return False
+
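MTIME_TOLERANCE absorbs sub-second timestamp jitter, and an unchanged size overrides a larger mtime drift. Worked cases against a hypothetical entry:

    from pathlib import Path

    entry = SnapshotEntry(path=Path("a.txt"), rel_path="a.txt", mtime=100.0, size=10)
    _has_entry_changed(entry, {"mtime": 100.3, "size": 10})  # False: within tolerance
    _has_entry_changed(entry, {"mtime": 102.0, "size": 10})  # False: mtime drifted but size matches
    _has_entry_changed(entry, {"mtime": 102.0, "size": 11})  # True: mtime and size both changed
    _has_entry_changed(entry, {"size": 10})                  # True: missing mtime forces re-embed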
208
+ def _apply_incremental_update(
209
+ *,
210
+ directory: Path,
211
+ include_hidden: bool,
212
+ mode: str,
213
+ recursive: bool,
214
+ model_name: str,
215
+ files: list[Path],
216
+ diff: FileDiff,
217
+ searcher,
218
+ strategy,
219
+ apply_fn,
220
+ ) -> Path:
221
+ changed_set = set(diff.changed_paths())
222
+ if changed_set:
223
+ targets = [path for path in files if path in changed_set]
224
+ payloads = strategy.payloads_for_files(targets)
225
+ labels = [payload.label for payload in payloads]
226
+ previews = {
227
+ _relative_to_root(path, directory): (payload.preview or "")
228
+ for path, payload in zip(targets, payloads)
229
+ }
230
+ embeddings = searcher.embed_texts(labels)
231
+ embedding_map = {
232
+ _relative_to_root(path, directory): embeddings[idx]
233
+ for idx, path in enumerate(targets)
234
+ }
235
+ else:
236
+ targets = []
237
+ embedding_map = {}
238
+ previews = {}
239
+
240
+ cache_path = apply_fn(
241
+ root=directory,
242
+ model=model_name,
243
+ include_hidden=include_hidden,
244
+ mode=mode,
245
+ recursive=recursive,
246
+ current_files=files,
247
+ changed_files=targets,
248
+ removed_rel_paths=diff.removed,
249
+ embeddings=embedding_map,
250
+ previews=previews,
251
+ )
252
+ return cache_path
253
+
254
+
255
+ def _relative_to_root(path: Path, root: Path) -> str:
256
+ try:
257
+ rel = path.relative_to(root)
258
+ except ValueError:
259
+ rel = path
260
+ return str(rel)
@@ -0,0 +1,95 @@
+ """Logic helpers for the `vexor search` command."""
+
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+ from pathlib import Path
+ from typing import TYPE_CHECKING, Sequence
+
+ from .cache_service import is_cache_current
+
+ if TYPE_CHECKING:  # resolve the SearchResult annotation for type checkers only
+     from ..search import SearchResult
+
+
+ @dataclass(slots=True)
+ class SearchRequest:
+     query: str
+     directory: Path
+     include_hidden: bool
+     mode: str
+     recursive: bool
+     top_k: int
+     model_name: str
+     batch_size: int
+     provider: str
+     base_url: str | None
+     api_key: str | None
+
+
+ @dataclass(slots=True)
+ class SearchResponse:
+     base_path: Path
+     backend: str | None
+     results: Sequence[SearchResult]
+     is_stale: bool
+     index_empty: bool
+
+
+ def perform_search(request: SearchRequest) -> SearchResponse:
+     """Execute the semantic search flow and return ranked results."""
+
+     from sklearn.metrics.pairwise import cosine_similarity  # local import
+     from ..cache import load_index_vectors  # local import
+     from ..search import SearchResult, VexorSearcher  # local import
+
+     paths, file_vectors, metadata = load_index_vectors(
+         request.directory,
+         request.model_name,
+         request.include_hidden,
+         request.mode,
+         request.recursive,
+     )
+     cached_files = metadata.get("files", [])
+     stale = bool(cached_files) and not is_cache_current(
+         request.directory,
+         request.include_hidden,
+         cached_files,
+         recursive=request.recursive,
+     )
+     preview_lookup = {
+         path: entry.get("preview")
+         for path, entry in zip(paths, cached_files)
+     }
+
+     if not len(paths):
+         return SearchResponse(
+             base_path=request.directory,
+             backend=None,
+             results=[],
+             is_stale=stale,
+             index_empty=True,
+         )
+
+     searcher = VexorSearcher(
+         model_name=request.model_name,
+         batch_size=request.batch_size,
+         provider=request.provider,
+         base_url=request.base_url,
+         api_key=request.api_key,
+     )
+     query_vector = searcher.embed_texts([request.query])[0]
+     similarities = cosine_similarity(
+         query_vector.reshape(1, -1),
+         file_vectors,
+     )[0]
+     scored = [
+         SearchResult(path=path, score=float(score), preview=preview_lookup.get(path))
+         for path, score in zip(paths, similarities)
+     ]
+     scored.sort(key=lambda item: item.score, reverse=True)
+     results = scored[: request.top_k]
+     return SearchResponse(
+         base_path=request.directory,
+         backend=searcher.device,
+         results=results,
+         is_stale=stale,
+         index_empty=False,
+     )
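A sketch of driving perform_search directly; the CLI normally builds the request, and the mode, model, and provider values below are placeholders:

    from pathlib import Path

    request = SearchRequest(
        query="quarterly budget spreadsheet",
        directory=Path("."),
        include_hidden=False,
        mode="name",                        # assumption: a valid strategy name
        recursive=True,
        top_k=5,
        model_name="some-embedding-model",  # placeholder
        batch_size=32,
        provider="some-provider",           # placeholder
        base_url=None,
        api_key=None,
    )
    response = perform_search(request)
    if response.is_stale:
        print("warning: index is stale; re-run `vexor index`")
    for item in response.results:
        print(f"{item.score:.3f}  {item.path}")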
@@ -0,0 +1,81 @@
+ """Logic helpers for diagnostics, editors, and update checks."""
+
+ from __future__ import annotations
+
+ import os
+ import re
+ import shlex
+ import shutil
+ from typing import Optional, Sequence
+ from urllib import error, request
+
+ EDITOR_FALLBACKS = ("nano", "vi", "notepad", "notepad.exe")
+
+
+ def version_tuple(raw: str) -> tuple[int, int, int, int]:
+     """Parse a version string into a comparable tuple."""
+
+     raw = raw.strip()
+     release_parts: list[int] = []
+     suffix_number = 0
+
+     for piece in raw.split("."):
+         match = re.match(r"^(\d+)", piece)
+         if not match:
+             break
+         release_parts.append(int(match.group(1)))
+         remainder = piece[match.end():]
+         if remainder:
+             suffix_match = re.match(r"[A-Za-z]+(\d+)", remainder)
+             if suffix_match:
+                 suffix_number = int(suffix_match.group(1))
+             break
+         if len(release_parts) >= 4:
+             break
+
+     while len(release_parts) < 4:
+         release_parts.append(0)
+
+     if suffix_number:
+         release_parts[3] = suffix_number
+
+     return tuple(release_parts[:4])
+
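The tuple form makes versions directly comparable with the ordinary comparison operators; worked cases:

    version_tuple("0.5.0")      # -> (0, 5, 0, 0)
    version_tuple("1.2.3.4")    # -> (1, 2, 3, 4)
    version_tuple("1.2.3rc4")   # -> (1, 2, 3, 4)  suffix digits land in the fourth slot
    version_tuple("0.5.0") > version_tuple("0.2.0")  # -> True

Note that because suffix digits fill the fourth slot, "1.2.3rc4" compares equal to "1.2.3.4" and above the bare "1.2.3".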
+
+ def fetch_remote_version(url: str, *, timeout: float = 10.0) -> str:
+     """Fetch the latest version string from *url*."""
+
+     try:
+         with request.urlopen(url, timeout=timeout) as response:
+             if response.status != 200:
+                 raise RuntimeError(f"HTTP {response.status}")
+             text = response.read().decode("utf-8")
+     except error.URLError as exc:  # pragma: no cover - network error
+         raise RuntimeError(str(exc)) from exc
+
+     match = re.search(r"__version__\s*=\s*['\"]([^'\"]+)['\"]", text)
+     if not match:
+         raise RuntimeError("Version string not found")
+     return match.group(1)
+
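Together these two helpers support a simple update check, under the assumption that the URL points at a source file containing a __version__ = "..." assignment; the URL and current version below are placeholders:

    CURRENT_VERSION = "0.5.0"  # illustrative

    try:
        remote = fetch_remote_version("https://example.invalid/vexor/__init__.py")
    except RuntimeError as exc:
        print(f"update check failed: {exc}")
    else:
        if version_tuple(remote) > version_tuple(CURRENT_VERSION):
            print(f"newer version available: {remote}")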
+
+ def find_command_on_path(command: str) -> Optional[str]:
+     """Return the resolved path for *command* if present on PATH."""
+
+     return shutil.which(command)
+
+
+ def resolve_editor_command() -> Optional[Sequence[str]]:
+     """Return the preferred editor command as a tokenized sequence."""
+
+     for env_var in ("VISUAL", "EDITOR"):
+         value = os.environ.get(env_var)
+         if value:
+             return tuple(shlex.split(value))
+
+     for candidate in EDITOR_FALLBACKS:
+         path = shutil.which(candidate)
+         if path:
+             return (path,)
+
+     return None
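A sketch of how a caller might launch the resolved editor; the subprocess usage and file name are assumptions about the caller, not shown in this module:

    import subprocess

    command = resolve_editor_command()
    if command is None:
        raise SystemExit("No editor found; set $VISUAL or $EDITOR.")
    subprocess.run([*command, "notes.txt"], check=False)  # append the file to edit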