vexor-0.2.0-py3-none-any.whl → vexor-0.5.0-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
@@ -0,0 +1,188 @@
+ """Helpers to extract head snippets from various file types."""
+
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+ from pathlib import Path
+ from typing import Dict, Protocol
+
+ from charset_normalizer import from_path
+ from docx import Document
+ from pypdf import PdfReader
+
+ HEAD_CHAR_LIMIT = 1000
+
+
+ class HeadExtractor(Protocol):
+     """Protocol describing a file head extractor."""
+
+     def __call__(self, path: Path, char_limit: int = HEAD_CHAR_LIMIT) -> str | None:
+         ...
+
+
+ @dataclass(frozen=True)
+ class ExtractorEntry:
+     extensions: tuple[str, ...]
+     extractor: HeadExtractor
+
+
+ _registry: Dict[str, HeadExtractor] = {}
+
+
+ def register_extractor(entry: ExtractorEntry) -> None:
+     for ext in entry.extensions:
+         _registry[ext.lower()] = entry.extractor
+
+
+ def extract_head(path: Path, char_limit: int = HEAD_CHAR_LIMIT) -> str | None:
+     """Return a text snippet representing the head of *path*."""
+
+     extractor = _registry.get(path.suffix.lower())
+     if extractor is None:
+         return None
+     return extractor(path, char_limit)
+
+
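The registry maps lowercase file extensions to extractor callables, and extract_head is the single dispatch point. A minimal sketch of plugging in a custom extractor; the .nfo extension and my_nfo_extractor are hypothetical, not part of the package:

    from pathlib import Path

    def my_nfo_extractor(path: Path, char_limit: int = HEAD_CHAR_LIMIT) -> str | None:
        # Hypothetical extractor: treat .nfo files as plain text, decoding leniently.
        try:
            return path.read_text(errors="replace")[:char_limit]
        except OSError:
            return None

    register_extractor(ExtractorEntry(extensions=(".NFO",), extractor=my_nfo_extractor))

    extract_head(Path("release.nfo"))  # suffixes are lowercased on registration and lookup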
+ # Built-in extractors --------------------------------------------------------
+
+ def _read_text_head(path: Path, char_limit: int = HEAD_CHAR_LIMIT) -> str | None:
+     """Return the first *char_limit* characters of a text-like file."""
+
+     try:
+         result = from_path(path)
+     except Exception:
+         return None
+     if result is None or not len(result):
+         return None
+     best = result.best()
+     if best is None:
+         return None
+     text = str(best)
+     if not text:
+         return None
+     snippet = text[:char_limit]
+     return _cleanup_snippet(snippet)
+
+
+ def _pdf_extractor(path: Path, char_limit: int = HEAD_CHAR_LIMIT) -> str | None:
+     try:
+         reader = PdfReader(str(path))
+     except Exception:
+         return None
+     buffer: list[str] = []
+     total_chars = 0
+     for page in reader.pages:
+         try:
+             text = page.extract_text() or ""
+         except Exception:
+             text = ""
+         text = text.strip()
+         if not text:
+             continue
+         buffer.append(text)
+         total_chars += len(text)
+         if total_chars >= char_limit:
+             break
+     combined = "\n".join(buffer)
+     if not combined:
+         return None
+     cleaned = _cleanup_snippet(combined)
+     if not cleaned:
+         return None
+     return cleaned[:char_limit]
+
+
+ def _docx_extractor(path: Path, char_limit: int = HEAD_CHAR_LIMIT) -> str | None:
+     try:
+         document = Document(str(path))
+     except Exception:
+         return None
+     buffer: list[str] = []
+     total_chars = 0
+     for paragraph in document.paragraphs:
+         text = paragraph.text.strip()
+         if not text:
+             continue
+         buffer.append(text)
+         total_chars += len(text)
+         if total_chars >= char_limit:
+             break
+     combined = "\n".join(buffer)
+     if not combined:
+         return None
+     cleaned = _cleanup_snippet(combined)
+     if not cleaned:
+         return None
+     return cleaned[:char_limit]
+
+
+ def _cleanup_snippet(snippet: str) -> str | None:
+     lines = [line.strip() for line in snippet.splitlines() if line.strip()]
+     joined = " ".join(lines)
+     return joined or None
+
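_cleanup_snippet collapses a multi-line snippet onto one line so previews stay compact; a couple of worked cases:

    _cleanup_snippet("Title\n\n  first line  \nsecond line\n")  # -> "Title first line second line"
    _cleanup_snippet("   \n\n")                                 # -> None (nothing survives stripping)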
+
+ def _unimplemented_extractor(path: Path, char_limit: int = HEAD_CHAR_LIMIT) -> str | None:
+     return None
+
+
+ register_extractor(
+     ExtractorEntry(
+         extensions=(
+             ".txt", ".md", ".py", ".js", ".ts", ".json", ".yaml",
+             ".yml", ".html", ".htm", ".toml", ".csv", ".log", ".ini",
+             ".cfg", ".rst", ".tex", ".xml", ".sh", ".bat", ".go",
+             ".java", ".c", ".cpp", ".h", ".hpp", ".rb", ".php",
+             ".swift", ".rs", ".kt", ".dart", ".scala", ".pl", ".r",
+             ".jl", ".hs", ".lua", ".vb", ".ps1", ".bash",
+         ),
+         extractor=_read_text_head,
+     )
+ )
+
+ register_extractor(
+     ExtractorEntry((".pdf",), _pdf_extractor)
+ )
+
+ register_extractor(
+     ExtractorEntry((".docx",), _docx_extractor)
+ )
+
+ register_extractor(
+     ExtractorEntry((".pptx",), _unimplemented_extractor)
+ )
@@ -0,0 +1,260 @@
+ """Logic helpers for the `vexor index` command."""
+
+ from __future__ import annotations
+
+ from dataclasses import dataclass, field
+ from enum import Enum
+ from pathlib import Path
+
+ from .cache_service import load_index_metadata_safe
+ from ..modes import get_strategy
+
+ INCREMENTAL_CHANGE_THRESHOLD = 0.5
+ MTIME_TOLERANCE = 0.5
+
+
+ class IndexStatus(str, Enum):
+     EMPTY = "empty"
+     UP_TO_DATE = "up_to_date"
+     STORED = "stored"
+
+
+ @dataclass(slots=True)
+ class IndexResult:
+     status: IndexStatus
+     cache_path: Path | None = None
+     files_indexed: int = 0
+
+
+ def build_index(
+     directory: Path,
+     *,
+     include_hidden: bool,
+     mode: str,
+     recursive: bool,
+     model_name: str,
+     batch_size: int,
+     provider: str,
+     base_url: str | None,
+     api_key: str | None,
+ ) -> IndexResult:
+     """Create or refresh the cached index for *directory*."""
+
+     from ..search import VexorSearcher  # local import
+     from ..utils import collect_files  # local import
+     from ..cache import apply_index_updates, store_index  # local import
+
+     files = collect_files(directory, include_hidden=include_hidden, recursive=recursive)
+     if not files:
+         return IndexResult(status=IndexStatus.EMPTY)
+
+     existing_meta = load_index_metadata_safe(directory, model_name, include_hidden, mode, recursive)
+     cached_files = existing_meta.get("files", []) if existing_meta else []
+
+     strategy = get_strategy(mode)
+     searcher = VexorSearcher(
+         model_name=model_name,
+         batch_size=batch_size,
+         provider=provider,
+         base_url=base_url,
+         api_key=api_key,
+     )
+
+     if cached_files:
+         snapshot = _snapshot_current_files(files, directory)
+         diff = _diff_cached_files(snapshot, cached_files)
+         if diff.is_noop:
+             return IndexResult(status=IndexStatus.UP_TO_DATE, files_indexed=len(files))
+
+         change_ratio = diff.change_ratio(len(snapshot), len(cached_files))
+         if change_ratio <= INCREMENTAL_CHANGE_THRESHOLD:
+             cache_path = _apply_incremental_update(
+                 directory=directory,
+                 include_hidden=include_hidden,
+                 recursive=recursive,
+                 mode=mode,
+                 model_name=model_name,
+                 files=files,
+                 diff=diff,
+                 searcher=searcher,
+                 strategy=strategy,
+                 apply_fn=apply_index_updates,
+             )
+             return IndexResult(
+                 status=IndexStatus.STORED,
+                 cache_path=cache_path,
+                 files_indexed=len(files),
+             )
+
+     payloads = strategy.payloads_for_files(files)
+     file_labels = [payload.label for payload in payloads]
+     previews = [payload.preview or "" for payload in payloads]
+     embeddings = searcher.embed_texts(file_labels)
+
+     cache_path = store_index(
+         root=directory,
+         model=model_name,
+         include_hidden=include_hidden,
+         mode=mode,
+         recursive=recursive,
+         files=files,
+         previews=previews,
+         embeddings=embeddings,
+     )
+     return IndexResult(
+         status=IndexStatus.STORED,
+         cache_path=cache_path,
+         files_indexed=len(files),
+     )
+
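For orientation, a sketch of how a caller such as the CLI layer might invoke build_index. Every argument value below is illustrative; the mode, model, and provider strings are placeholders, not documented defaults:

    from pathlib import Path

    result = build_index(
        Path("./docs"),
        include_hidden=False,
        mode="name",                        # assumption: a strategy name accepted by get_strategy
        recursive=True,
        model_name="some-embedding-model",  # placeholder
        batch_size=32,
        provider="some-provider",           # placeholder
        base_url=None,
        api_key=None,
    )
    if result.status is IndexStatus.STORED:
        print(f"indexed {result.files_indexed} files -> {result.cache_path}")
    elif result.status is IndexStatus.UP_TO_DATE:
        print("index already current")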
+
+ def clear_index_entries(
+     directory: Path,
+     *,
+     include_hidden: bool,
+     mode: str,
+     recursive: bool,
+     model: str | None = None,
+ ) -> int:
+     """Remove cached entries for *directory* and return number removed."""
+
+     from ..cache import clear_index as clear_index_cache  # local import
+
+     return clear_index_cache(
+         root=directory,
+         include_hidden=include_hidden,
+         mode=mode,
+         recursive=recursive,
+         model=model,
+     )
+
+
+ @dataclass(slots=True)
+ class SnapshotEntry:
+     path: Path
+     rel_path: str
+     mtime: float
+     size: int
+
+
+ @dataclass(slots=True)
+ class FileDiff:
+     added: list[Path] = field(default_factory=list)
+     modified: list[Path] = field(default_factory=list)
+     removed: list[str] = field(default_factory=list)
+
+     @property
+     def is_noop(self) -> bool:
+         return not (self.added or self.modified or self.removed)
+
+     def change_ratio(self, current_count: int, cached_count: int) -> float:
+         denom = max(current_count, cached_count, 1)
+         change_count = len(self.added) + len(self.modified) + len(self.removed)
+         return change_count / denom
+
+     def changed_paths(self) -> list[Path]:
+         return self.added + self.modified
+
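To make the incremental-versus-rebuild decision concrete: with 8 cached entries, 10 current files, 2 additions, and 1 modification, the ratio is 3 / max(10, 8, 1) = 0.3, which is under INCREMENTAL_CHANGE_THRESHOLD (0.5), so only the changed files are re-embedded. A minimal check with hypothetical paths:

    from pathlib import Path

    diff = FileDiff(added=[Path("a.txt"), Path("b.txt")], modified=[Path("c.txt")])
    diff.change_ratio(current_count=10, cached_count=8)  # -> 0.3, under the 0.5 threshold
    diff.is_noop                                         # -> False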
+
+ def _snapshot_current_files(files: list[Path], root: Path) -> dict[str, SnapshotEntry]:
+     snapshot: dict[str, SnapshotEntry] = {}
+     for path in files:
+         rel = _relative_to_root(path, root)
+         stat = path.stat()
+         snapshot[rel] = SnapshotEntry(
+             path=path,
+             rel_path=rel,
+             mtime=stat.st_mtime,
+             size=stat.st_size,
+         )
+     return snapshot
+
+
+ def _diff_cached_files(
+     current: dict[str, SnapshotEntry],
+     cached_files: list[dict],
+ ) -> FileDiff:
+     cached_map = {entry["path"]: entry for entry in cached_files}
+     diff = FileDiff()
+
+     for rel_path, entry in current.items():
+         cached_entry = cached_map.get(rel_path)
+         if cached_entry is None:
+             diff.added.append(entry.path)
+         elif _has_entry_changed(entry, cached_entry):
+             diff.modified.append(entry.path)
+
+     for rel_path in cached_map:
+         if rel_path not in current:
+             diff.removed.append(rel_path)
+
+     return diff
+
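A small sketch of the diff in action, using hand-built snapshot and cache entries (all values illustrative):

    from pathlib import Path

    current = {
        "kept.txt": SnapshotEntry(path=Path("kept.txt"), rel_path="kept.txt", mtime=100.0, size=10),
        "new.txt": SnapshotEntry(path=Path("new.txt"), rel_path="new.txt", mtime=100.0, size=5),
    }
    cached = [
        {"path": "kept.txt", "mtime": 100.0, "size": 10},
        {"path": "gone.txt", "mtime": 90.0, "size": 7},
    ]
    diff = _diff_cached_files(current, cached)
    # diff.added == [Path("new.txt")], diff.removed == ["gone.txt"], diff.modified == []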
+
+ def _has_entry_changed(entry: SnapshotEntry, cached_entry: dict) -> bool:
+     cached_mtime = cached_entry.get("mtime")
+     cached_size = cached_entry.get("size")
+     if cached_mtime is None:
+         return True
+     if abs(entry.mtime - cached_mtime) > MTIME_TOLERANCE:
+         if cached_size is not None and cached_size == entry.size:
+             return False
+         return True
+     if cached_size is not None and cached_size != entry.size:
+         return True
+     return False
+
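MTIME_TOLERANCE absorbs sub-second timestamp jitter, and an unchanged size overrides a larger mtime drift. Worked cases against a hypothetical entry:

    from pathlib import Path

    entry = SnapshotEntry(path=Path("a.txt"), rel_path="a.txt", mtime=100.0, size=10)
    _has_entry_changed(entry, {"mtime": 100.3, "size": 10})  # False: within tolerance
    _has_entry_changed(entry, {"mtime": 102.0, "size": 10})  # False: mtime drifted but size matches
    _has_entry_changed(entry, {"mtime": 102.0, "size": 11})  # True: mtime and size both changed
    _has_entry_changed(entry, {"size": 10})                  # True: missing mtime forces re-embed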
208
+ def _apply_incremental_update(
209
+ *,
210
+ directory: Path,
211
+ include_hidden: bool,
212
+ mode: str,
213
+ recursive: bool,
214
+ model_name: str,
215
+ files: list[Path],
216
+ diff: FileDiff,
217
+ searcher,
218
+ strategy,
219
+ apply_fn,
220
+ ) -> Path:
221
+ changed_set = set(diff.changed_paths())
222
+ if changed_set:
223
+ targets = [path for path in files if path in changed_set]
224
+ payloads = strategy.payloads_for_files(targets)
225
+ labels = [payload.label for payload in payloads]
226
+ previews = {
227
+ _relative_to_root(path, directory): (payload.preview or "")
228
+ for path, payload in zip(targets, payloads)
229
+ }
230
+ embeddings = searcher.embed_texts(labels)
231
+ embedding_map = {
232
+ _relative_to_root(path, directory): embeddings[idx]
233
+ for idx, path in enumerate(targets)
234
+ }
235
+ else:
236
+ targets = []
237
+ embedding_map = {}
238
+ previews = {}
239
+
240
+ cache_path = apply_fn(
241
+ root=directory,
242
+ model=model_name,
243
+ include_hidden=include_hidden,
244
+ mode=mode,
245
+ recursive=recursive,
246
+ current_files=files,
247
+ changed_files=targets,
248
+ removed_rel_paths=diff.removed,
249
+ embeddings=embedding_map,
250
+ previews=previews,
251
+ )
252
+ return cache_path
253
+
254
+
255
+ def _relative_to_root(path: Path, root: Path) -> str:
256
+ try:
257
+ rel = path.relative_to(root)
258
+ except ValueError:
259
+ rel = path
260
+ return str(rel)
@@ -0,0 +1,95 @@
+ """Logic helpers for the `vexor search` command."""
+
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+ from pathlib import Path
+ from typing import TYPE_CHECKING, Sequence
+
+ from .cache_service import is_cache_current
+
+ if TYPE_CHECKING:  # resolve the SearchResult annotation for type checkers only
+     from ..search import SearchResult
+
+
+ @dataclass(slots=True)
+ class SearchRequest:
+     query: str
+     directory: Path
+     include_hidden: bool
+     mode: str
+     recursive: bool
+     top_k: int
+     model_name: str
+     batch_size: int
+     provider: str
+     base_url: str | None
+     api_key: str | None
+
+
+ @dataclass(slots=True)
+ class SearchResponse:
+     base_path: Path
+     backend: str | None
+     results: Sequence[SearchResult]
+     is_stale: bool
+     index_empty: bool
+
+
+ def perform_search(request: SearchRequest) -> SearchResponse:
+     """Execute the semantic search flow and return ranked results."""
+
+     from sklearn.metrics.pairwise import cosine_similarity  # local import
+     from ..cache import load_index_vectors  # local import
+     from ..search import SearchResult, VexorSearcher  # local import
+
+     paths, file_vectors, metadata = load_index_vectors(
+         request.directory,
+         request.model_name,
+         request.include_hidden,
+         request.mode,
+         request.recursive,
+     )
+     cached_files = metadata.get("files", [])
+     stale = bool(cached_files) and not is_cache_current(
+         request.directory,
+         request.include_hidden,
+         cached_files,
+         recursive=request.recursive,
+     )
+     preview_lookup = {
+         path: entry.get("preview")
+         for path, entry in zip(paths, cached_files)
+     }
+
+     if not len(paths):
+         return SearchResponse(
+             base_path=request.directory,
+             backend=None,
+             results=[],
+             is_stale=stale,
+             index_empty=True,
+         )
+
+     searcher = VexorSearcher(
+         model_name=request.model_name,
+         batch_size=request.batch_size,
+         provider=request.provider,
+         base_url=request.base_url,
+         api_key=request.api_key,
+     )
+     query_vector = searcher.embed_texts([request.query])[0]
+     similarities = cosine_similarity(
+         query_vector.reshape(1, -1),
+         file_vectors,
+     )[0]
+     scored = [
+         SearchResult(path=path, score=float(score), preview=preview_lookup.get(path))
+         for path, score in zip(paths, similarities)
+     ]
+     scored.sort(key=lambda item: item.score, reverse=True)
+     results = scored[: request.top_k]
+     return SearchResponse(
+         base_path=request.directory,
+         backend=searcher.device,
+         results=results,
+         is_stale=stale,
+         index_empty=False,
+     )
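A sketch of driving perform_search directly; the CLI normally builds the request, and the mode, model, and provider values below are placeholders:

    from pathlib import Path

    request = SearchRequest(
        query="quarterly budget spreadsheet",
        directory=Path("."),
        include_hidden=False,
        mode="name",                        # assumption: a valid strategy name
        recursive=True,
        top_k=5,
        model_name="some-embedding-model",  # placeholder
        batch_size=32,
        provider="some-provider",           # placeholder
        base_url=None,
        api_key=None,
    )
    response = perform_search(request)
    if response.is_stale:
        print("warning: index is stale; re-run `vexor index`")
    for item in response.results:
        print(f"{item.score:.3f}  {item.path}")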
@@ -0,0 +1,81 @@
+ """Logic helpers for diagnostics, editors, and update checks."""
+
+ from __future__ import annotations
+
+ import os
+ import re
+ import shlex
+ import shutil
+ from typing import Optional, Sequence
+ from urllib import error, request
+
+ EDITOR_FALLBACKS = ("nano", "vi", "notepad", "notepad.exe")
+
+
+ def version_tuple(raw: str) -> tuple[int, int, int, int]:
+     """Parse a version string into a comparable tuple."""
+
+     raw = raw.strip()
+     release_parts: list[int] = []
+     suffix_number = 0
+
+     for piece in raw.split("."):
+         match = re.match(r"^(\d+)", piece)
+         if not match:
+             break
+         release_parts.append(int(match.group(1)))
+         remainder = piece[match.end():]
+         if remainder:
+             suffix_match = re.match(r"[A-Za-z]+(\d+)", remainder)
+             if suffix_match:
+                 suffix_number = int(suffix_match.group(1))
+             break
+         if len(release_parts) >= 4:
+             break
+
+     while len(release_parts) < 4:
+         release_parts.append(0)
+
+     if suffix_number:
+         release_parts[3] = suffix_number
+
+     return tuple(release_parts[:4])
+
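The tuple form makes versions directly comparable with the ordinary comparison operators; worked cases:

    version_tuple("0.5.0")      # -> (0, 5, 0, 0)
    version_tuple("1.2.3.4")    # -> (1, 2, 3, 4)
    version_tuple("1.2.3rc4")   # -> (1, 2, 3, 4)  suffix digits land in the fourth slot
    version_tuple("0.5.0") > version_tuple("0.2.0")  # -> True

Note that because suffix digits fill the fourth slot, "1.2.3rc4" compares equal to "1.2.3.4" and above the bare "1.2.3".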
+
+ def fetch_remote_version(url: str, *, timeout: float = 10.0) -> str:
+     """Fetch the latest version string from *url*."""
+
+     try:
+         with request.urlopen(url, timeout=timeout) as response:
+             if response.status != 200:
+                 raise RuntimeError(f"HTTP {response.status}")
+             text = response.read().decode("utf-8")
+     except error.URLError as exc:  # pragma: no cover - network error
+         raise RuntimeError(str(exc)) from exc
+
+     match = re.search(r"__version__\s*=\s*['\"]([^'\"]+)['\"]", text)
+     if not match:
+         raise RuntimeError("Version string not found")
+     return match.group(1)
+
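Together these two helpers support a simple update check, under the assumption that the URL points at a source file containing a __version__ = "..." assignment; the URL and current version below are placeholders:

    CURRENT_VERSION = "0.5.0"  # illustrative

    try:
        remote = fetch_remote_version("https://example.invalid/vexor/__init__.py")
    except RuntimeError as exc:
        print(f"update check failed: {exc}")
    else:
        if version_tuple(remote) > version_tuple(CURRENT_VERSION):
            print(f"newer version available: {remote}")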
+
+ def find_command_on_path(command: str) -> Optional[str]:
+     """Return the resolved path for *command* if present on PATH."""
+
+     return shutil.which(command)
+
+
+ def resolve_editor_command() -> Optional[Sequence[str]]:
+     """Return the preferred editor command as a tokenized sequence."""
+
+     for env_var in ("VISUAL", "EDITOR"):
+         value = os.environ.get(env_var)
+         if value:
+             return tuple(shlex.split(value))
+
+     for candidate in EDITOR_FALLBACKS:
+         path = shutil.which(candidate)
+         if path:
+             return (path,)
+
+     return None
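A sketch of how a caller might launch the resolved editor; the subprocess usage and file name are assumptions about the caller, not shown in this module:

    import subprocess

    command = resolve_editor_command()
    if command is None:
        raise SystemExit("No editor found; set $VISUAL or $EDITOR.")
    subprocess.run([*command, "notes.txt"], check=False)  # append the file to edit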