PyPI - vortexa - Versions diffs - 0.1.0__tar.gz → 0.1.2__tar.gz - Mend

vortexa 0.1.0 (tar.gz) → 0.1.2 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31) hide show

{vortexa-0.1.0 → vortexa-0.1.2}/PKG-INFO +8 -5
{vortexa-0.1.0 → vortexa-0.1.2}/README.md +6 -4
{vortexa-0.1.0 → vortexa-0.1.2}/pyproject.toml +3 -1
vortexa-0.1.2/src/vortexa/__init__.py +0 -0
vortexa-0.1.2/src/vortexa/core/__init__.py +0 -0
vortexa-0.1.2/src/vortexa/core/chunking.py +229 -0
vortexa-0.1.2/src/vortexa/core/embedding.py +180 -0
vortexa-0.1.2/src/vortexa/core/indexer.py +456 -0
vortexa-0.1.2/src/vortexa/core/language.py +151 -0
vortexa-0.1.2/src/vortexa/core/lf4_model.py +168 -0
vortexa-0.1.2/src/vortexa/core/types.py +98 -0
vortexa-0.1.2/src/vortexa/interfaces/__init__.py +0 -0
vortexa-0.1.2/src/vortexa/interfaces/mcp_server.py +102 -0
vortexa-0.1.2/src/vortexa/interfaces/watcher.py +138 -0
vortexa-0.1.2/src/vortexa/search/__init__.py +0 -0
vortexa-0.1.2/src/vortexa/search/ranking.py +389 -0
vortexa-0.1.2/src/vortexa/search/search.py +165 -0
vortexa-0.1.2/src/vortexa/search/tokens.py +66 -0
vortexa-0.1.2/src/vortexa/storage/__init__.py +0 -0
vortexa-0.1.2/src/vortexa/storage/bm25.py +147 -0
vortexa-0.1.2/src/vortexa/storage/vector_store.py +193 -0
vortexa-0.1.2/src/vortexa/storage/walker.py +129 -0
{vortexa-0.1.0 → vortexa-0.1.2/src}/vortexa.egg-info/PKG-INFO +8 -5
vortexa-0.1.2/src/vortexa.egg-info/SOURCES.txt +27 -0
{vortexa-0.1.0 → vortexa-0.1.2/src}/vortexa.egg-info/requires.txt +1 -0
vortexa-0.1.2/src/vortexa.egg-info/top_level.txt +1 -0
vortexa-0.1.0/vortexa.egg-info/SOURCES.txt +0 -8
vortexa-0.1.0/vortexa.egg-info/top_level.txt +0 -1
{vortexa-0.1.0 → vortexa-0.1.2}/setup.cfg +0 -0
{vortexa-0.1.0 → vortexa-0.1.2/src}/vortexa.egg-info/dependency_links.txt +0 -0
{vortexa-0.1.0 → vortexa-0.1.2/src}/vortexa.egg-info/entry_points.txt +0 -0

{vortexa-0.1.0 → vortexa-0.1.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: vortexa
-Version: 0.1.0
+Version: 0.1.2
 Summary: Codebase indexing and semantic search engine
 Author-email: VortexAI <koulabhay25@gmail.com>
 License-Expression: Apache-2.0
@@ -22,6 +22,7 @@ Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 Requires-Dist: numpy>=1.24.0
 Requires-Dist: lmdb>=1.4.0
+Requires-Dist: bm25s>=0.2.0
 Requires-Dist: pathspec>=0.12.0
 Requires-Dist: huggingface-hub>=0.20.0
 Requires-Dist: tokenizers>=0.19.0
@@ -43,7 +44,8 @@ _Dense + sparse hybrid retrieval · AST-aware chunking · LMDB persistence · MC
 [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](LICENSE)
 [![Python](https://img.shields.io/badge/python-3.10+-brightgreen)](#)
-[![PyPI version](https://img.shields.io/badge/pypi-v0.1.0-orange)](#)
+[![PyPI version](https://img.shields.io/pypi/v/vortexa)](https://pypi.org/project/vortexa/)
+[![PyPI downloads](https://img.shields.io/pypi/dm/vortexa)](https://pypi.org/project/vortexa/)
 </div>
@@ -149,8 +151,8 @@ pip install vortexa
 # Full (Model2Vec embeddings + tree-sitter AST chunking)
 pip install "vortexa[full]"
-# With MCP server support
-pip install "vortexa[full]" fastmcp
+# With MCP server support (adds `vortexa` CLI command)
+pip install "vortexa[mcp]"
 ```
 ### Index a codebase
@@ -289,7 +291,7 @@ vortexa ships with a built-in **MCP (Model Context Protocol) server** that expos
 python -m vortexa.interfaces.mcp_server
 # Or via the installed entry point
-vortexa-mcp
+vortexa
 ```
 On startup it indexes the current working directory and prints stats to stderr:
@@ -465,6 +467,7 @@ graph TD
 |---------|----------|----------|
 | `numpy` | Yes | Vector operations, embedding inference |
 | `lmdb` | Yes | Persistent vector and chunk metadata storage |
+| `bm25s` | Yes | Fast BM25 keyword index and persistence |
 | `pathspec` | Yes | `.gitignore` pattern matching in file walker |
 | `model2vec` | Optional | Alternative static embeddings |
 | `huggingface-hub` | Yes (default model) | Loading `VTXAI/Vortex-Embed-4.7M` |

{vortexa-0.1.0 → vortexa-0.1.2}/README.md RENAMED Viewed

@@ -8,7 +8,8 @@ _Dense + sparse hybrid retrieval · AST-aware chunking · LMDB persistence · MC
 [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](LICENSE)
 [![Python](https://img.shields.io/badge/python-3.10+-brightgreen)](#)
-[![PyPI version](https://img.shields.io/badge/pypi-v0.1.0-orange)](#)
+[![PyPI version](https://img.shields.io/pypi/v/vortexa)](https://pypi.org/project/vortexa/)
+[![PyPI downloads](https://img.shields.io/pypi/dm/vortexa)](https://pypi.org/project/vortexa/)
 </div>
@@ -114,8 +115,8 @@ pip install vortexa
 # Full (Model2Vec embeddings + tree-sitter AST chunking)
 pip install "vortexa[full]"
-# With MCP server support
-pip install "vortexa[full]" fastmcp
+# With MCP server support (adds `vortexa` CLI command)
+pip install "vortexa[mcp]"
 ```
 ### Index a codebase
@@ -254,7 +255,7 @@ vortexa ships with a built-in **MCP (Model Context Protocol) server** that expos
 python -m vortexa.interfaces.mcp_server
 # Or via the installed entry point
-vortexa-mcp
+vortexa
 ```
 On startup it indexes the current working directory and prints stats to stderr:
@@ -430,6 +431,7 @@ graph TD
 |---------|----------|----------|
 | `numpy` | Yes | Vector operations, embedding inference |
 | `lmdb` | Yes | Persistent vector and chunk metadata storage |
+| `bm25s` | Yes | Fast BM25 keyword index and persistence |
 | `pathspec` | Yes | `.gitignore` pattern matching in file walker |
 | `model2vec` | Optional | Alternative static embeddings |
 | `huggingface-hub` | Yes (default model) | Loading `VTXAI/Vortex-Embed-4.7M` |

{vortexa-0.1.0 → vortexa-0.1.2}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "vortexa"
-version = "0.1.0"
+version = "0.1.2"
 description = "Codebase indexing and semantic search engine"
 readme = "README.md"
 requires-python = ">=3.10"
@@ -29,6 +29,7 @@ classifiers = [
 dependencies = [
     "numpy>=1.24.0",
     "lmdb>=1.4.0",
+    "bm25s>=0.2.0",
     "pathspec>=0.12.0",
     "huggingface-hub>=0.20.0",
     "tokenizers>=0.19.0",
@@ -54,6 +55,7 @@ Repository = "https://github.com/OEvortex/vortexa"
 Issues = "https://github.com/OEvortex/vortexa/issues"
 [tool.setuptools.packages.find]
+where = ["src"]
 include = ["vortexa*"]
 [tool.ruff]

vortexa-0.1.2/src/vortexa/__init__.py ADDED Viewed

File without changes

vortexa-0.1.2/src/vortexa/core/__init__.py ADDED Viewed

File without changes

vortexa-0.1.2/src/vortexa/core/chunking.py ADDED Viewed

@@ -0,0 +1,229 @@
+"""Code-aware chunking using tree-sitter with line-based fallback.
+Splits source code into chunks respecting AST boundaries (functions, classes, etc.)
+when tree-sitter supports the language, otherwise falls back to line-based splitting.
+Supports configurable chunk_size, min_chunk_size, and chunk_overlap
+(inspired by cocoindex's RecursiveSplitter).
+"""
+from __future__ import annotations
+import hashlib
+import logging
+from dataclasses import dataclass
+from functools import lru_cache
+from vortexa.core.types import Chunk, ChunkConfig, Lineage
+logger = logging.getLogger(__name__)
+@dataclass
+class ChunkBoundary:
+    """The output of the internal chunking algorithm."""
+    # Offset where the chunk begins. The unit depends on the producer: byte
+    # offsets on the tree-sitter path, character offsets from chunk_lines.
+    start: int
+    # Offset just past the end of the chunk (exclusive), in the same unit as start.
+    end: int
+@lru_cache(maxsize=64)
+def _get_parser(language: str):
+    """Get a tree-sitter parser for the given language. Returns None if unavailable.
+    Results are memoized per language name by ``lru_cache``, including the
+    ``None`` returned after a failure, so a failed lookup is not retried
+    while its entry stays in the cache.
+    """
+    try:
+        # Imported inside the function so that a missing
+        # tree_sitter_language_pack is handled by the except clause below
+        # rather than failing at module import time.
+        from tree_sitter_language_pack import get_parser as _get_ts_parser
+        return _get_ts_parser(language)  # type: ignore
+    except Exception:
+        # Any failure (import error or parser lookup error) means "unavailable".
+        return None
+def is_supported_language(language: str) -> bool:
+    """Check if tree-sitter supports the given language.
+    True when ``_get_parser`` yields a parser for ``language``, False when it
+    returns None.
+    """
+    return _get_parser(language) is not None
+def _merge_adjacent_chunks(
+    chunks: list[ChunkBoundary],
+    desired_length: int,
+    overlap: int = 0,
+) -> list[ChunkBoundary]:
+    """Merge adjacent chunks up to the desired length, with optional overlap.
+    Input boundaries are expected in source order and non-overlapping.
+    Consecutive boundaries are packed greedily until adding the next one
+    would exceed ``desired_length``.
+    When overlap > 0, each merged chunk (after the first) is extended backwards
+    to re-include the trailing input boundaries of the previous merged chunk
+    that start within ``overlap`` units of its end. The overlap is snapped to
+    input boundaries (never a mid-boundary cut), so every emitted offset is an
+    offset that already appeared in ``chunks``. A merged chunk may therefore
+    exceed ``desired_length`` by up to ``overlap``.
+    :param chunks: Boundaries to merge, in source order.
+    :param desired_length: Target maximum length of a merged chunk.
+    :param overlap: Maximum amount of the previous chunk to repeat (0 disables).
+    :return: Merged boundaries; empty when ``chunks`` is empty.
+    """
+    if not chunks:
+        return []
+    merged: list[ChunkBoundary] = []
+    current_start = chunks[0].start
+    current_end = chunks[0].end
+    current_length = current_end - current_start
+    for position, group in enumerate(chunks[1:], start=1):
+        start, end = group.start, group.end
+        length = end - start
+        if current_length + length <= desired_length:
+            current_end = end
+            current_length += length
+            continue
+        merged.append(ChunkBoundary(start=current_start, end=current_end))
+        next_start = start
+        if overlap > 0:
+            # Walk back over the boundaries just flushed and pull in those that
+            # start inside the overlap window. Stopping before current_start
+            # guarantees the new chunk never swallows the previous one whole.
+            earliest = current_end - overlap
+            back = position - 1
+            while (
+                back >= 0
+                and chunks[back].start > current_start
+                and chunks[back].start >= earliest
+            ):
+                next_start = chunks[back].start
+                back -= 1
+        current_start = next_start
+        current_end = end
+        current_length = current_end - current_start
+    merged.append(ChunkBoundary(start=current_start, end=current_end))
+    return merged
+def _merge_node_inner(node, desired_length: int) -> list[ChunkBoundary]:
+    """Recursively merge and split AST nodes into chunks.
+    Walks ``node.children`` in order, greedily packing consecutive children
+    into one boundary while their summed byte lengths stay within
+    ``desired_length``. A child that is itself longer than ``desired_length``
+    is split by recursing into its own children.
+    :param node: AST node exposing ``children``, ``start_byte`` and ``end_byte``.
+    :param desired_length: Target maximum chunk length in bytes.
+    :return: Boundaries in byte offsets, in source order.
+    """
+    if not node.children:
+        # A leaf cannot be split further, so it is emitted whole even when it
+        # is longer than desired_length.
+        return [ChunkBoundary(node.start_byte, node.end_byte)]
+    groups: list[ChunkBoundary] = []
+    children = node.children
+    index = 0
+    while index < len(children):
+        # Start a new group at the next unconsumed child.
+        child = children[index]
+        start = child.start_byte
+        end = child.end_byte
+        length = child.end_byte - child.start_byte
+        index += 1
+        # If this single chunk is longer than desired, recurse into it
+        if length > desired_length:
+            groups.extend(_merge_node_inner(child, desired_length))
+            continue
+        # Absorb following siblings while the group still fits. `length` sums
+        # the children's own sizes, so any gap between siblings is covered by
+        # the emitted boundary but not counted against desired_length.
+        while index < len(children):
+            child = children[index]
+            child_length = child.end_byte - child.start_byte
+            if length + child_length > desired_length:
+                break
+            end = child.end_byte
+            length += child_length
+            index += 1
+        groups.append(ChunkBoundary(start, end))
+    return groups
+def _merge_node(node, desired_length: int, overlap: int = 0) -> list[ChunkBoundary]:
+    """Recursively turn AST nodes into chunks, then merge adjacent chunks.
+    :param node: Root AST node to chunk.
+    :param desired_length: Target maximum chunk length in bytes.
+    :param overlap: Overlap forwarded to ``_merge_adjacent_chunks``.
+    :return: Merged boundaries in byte offsets.
+    """
+    # First pass splits/packs along the AST; second pass coalesces the
+    # resulting small neighbours up to desired_length.
+    raw_chunks = _merge_node_inner(node, desired_length)
+    return _merge_adjacent_chunks(raw_chunks, desired_length, overlap)
+def chunk_lines(text: str, desired_length: int, overlap: int = 0) -> list[ChunkBoundary]:
+    """Chunk source code by line boundaries with optional overlap.
+    :param text: Text to split.
+    :param desired_length: Target maximum chunk length in characters.
+    :param overlap: Overlap forwarded to ``_merge_adjacent_chunks``.
+    :return: Boundaries in character offsets; empty for blank or
+        whitespace-only text.
+    """
+    if not text.strip():
+        return []
+    lines_as_groups: list[ChunkBoundary] = []
+    index = 0
+    # keepends=True keeps each line's terminator inside its boundary, so the
+    # per-line boundaries are contiguous and cover the whole text.
+    for line in text.splitlines(keepends=True):
+        lines_as_groups.append(ChunkBoundary(start=index, end=index + len(line)))
+        index += len(line)
+    return _merge_adjacent_chunks(lines_as_groups, desired_length, overlap)
+def chunk_source(
+    source: str,
+    file_path: str,
+    language: str | None,
+    config: ChunkConfig | None = None,
+) -> list[Chunk]:
+    """Chunk source code into indexable units with lineage tracking.
+    Uses tree-sitter for AST-aware chunking when the language is supported,
+    falls back to line-based chunking otherwise.
+    :param source: Source code text.
+    :param file_path: Relative file path for the chunk metadata.
+    :param language: Detected programming language (or None).
+    :param config: Chunking configuration (chunk_size, overlap, etc.).
+        Defaults to ``ChunkConfig()`` when omitted.
+    :return: List of Chunk objects with lineage and chunk_hash. Empty when
+        ``source`` is blank; whitespace-only chunks are skipped.
+    """
+    if not source.strip():
+        return []
+    if config is None:
+        config = ChunkConfig()
+    chunk_boundaries = None
+    if language is not None and is_supported_language(language):
+        parser = _get_parser(language)
+        if parser is not None:
+            try:
+                as_bytes = source.encode("utf-8")
+                root = parser.parse(as_bytes).root_node
+                chunk_boundaries = _merge_node(root, config.chunk_size, config.chunk_overlap)
+                # Convert byte offsets to char offsets
+                char_boundaries = []
+                for boundary in chunk_boundaries:
+                    start_char = len(as_bytes[: boundary.start].decode("utf-8"))
+                    end_char = len(as_bytes[: boundary.end].decode("utf-8"))
+                    char_boundaries.append(ChunkBoundary(start=start_char, end=end_char))
+                chunk_boundaries = char_boundaries
+            except Exception:
+                # Best-effort: any parser or offset-conversion failure degrades
+                # to line-based chunking; keep the traceback for debugging.
+                logger.debug(
+                    "Tree-sitter chunking failed for %s, falling back",
+                    file_path,
+                    exc_info=True,
+                )
+                chunk_boundaries = None
+    if chunk_boundaries is None:
+        chunk_boundaries = chunk_lines(source, config.chunk_size, config.chunk_overlap)
+    # Compute source hash for memoization
+    source_hash = hashlib.sha256(source.encode("utf-8")).hexdigest()[:16]
+    chunks: list[Chunk] = []
+    for boundary in chunk_boundaries:
+        # Index of the last character included in the chunk (boundary.end is
+        # exclusive); clamped so an empty boundary still yields a valid index.
+        end_index = max(boundary.end - 1, boundary.start)
+        text = source[boundary.start : end_index + 1]
+        if not text.strip():
+            continue
+        # 1-based line numbers of the first and last characters of the chunk.
+        start_line = source[: boundary.start].count("\n") + 1
+        end_line = source[:end_index].count("\n") + 1
+        # Compute chunk-specific hash for memoization
+        chunk_hash = hashlib.sha256(
+            f"{file_path}:{source_hash}:{boundary.start}:{boundary.end}".encode()
+        ).hexdigest()[:16]
+        # Compute byte offsets for lineage
+        byte_start = len(source[: boundary.start].encode("utf-8"))
+        # NOTE(review): this is the byte offset at which the chunk's last
+        # character starts, not the offset just past it; confirm which
+        # convention Lineage.byte_end is meant to follow.
+        byte_end = len(source[:end_index].encode("utf-8"))
+        chunks.append(
+            Chunk(
+                content=text,
+                file_path=file_path,
+                start_line=start_line,
+                end_line=end_line,
+                language=language,
+                lineage=Lineage(
+                    source_path=file_path,
+                    start_line=start_line,
+                    end_line=end_line,
+                    byte_start=byte_start,
+                    byte_end=byte_end,
+                ),
+                chunk_hash=chunk_hash,
+            )
+        )
+    return chunks

vortexa-0.1.2/src/vortexa/core/embedding.py ADDED Viewed

@@ -0,0 +1,180 @@
+"""Embedding model abstraction for the codebase indexer.
+Provides lazy-loading, thread-safe embedders with memoization support.
+Inspired by cocoindex's SentenceTransformerEmbedder pattern.
+"""
+from __future__ import annotations
+import logging
+import threading
+from typing import Protocol, runtime_checkable
+import numpy as np
+import numpy.typing as npt
+logger = logging.getLogger(__name__)
+@runtime_checkable
+class Embedder(Protocol):
+    """Protocol for embedding models used by the indexer.
+    Structural interface: any object providing ``dim``, ``embed``,
+    ``embed_batch`` and ``memo_key`` satisfies it. Because it is
+    ``runtime_checkable``, ``isinstance`` checks only verify that these
+    members exist, not their signatures.
+    """
+    @property
+    def dim(self) -> int:
+        """Embedding dimensionality."""
+        ...
+    def embed(self, text: str) -> npt.NDArray[np.float32]:
+        """Embed a single text string."""
+        ...
+    def embed_batch(self, texts: list[str]) -> npt.NDArray[np.float32]:
+        """Embed a batch of text strings."""
+        ...
+    @property
+    def memo_key(self) -> tuple:
+        """Identity key for memoization cache invalidation."""
+        ...
+class Model2VecEmbedder:
+    """Thread-safe, lazy-loading embedder wrapping model2vec.StaticModel.
+    The model is loaded on first use and cached. Thread-safe via a lock.
+    Memo key includes the model ID for cache invalidation.
+    """
+    def __init__(self, model_id: str = "AI4free/JARVIS-tool-search-v1") -> None:
+        # Only records configuration; the model itself is loaded on first use.
+        self._model_id = model_id
+        self._model = None
+        self._lock = threading.Lock()
+    @property
+    def dim(self) -> int:
+        """Embedding dimensionality; triggers model loading if needed."""
+        self._ensure_loaded()
+        assert self._model is not None
+        return self._model.dim
+    def _ensure_loaded(self) -> None:
+        """Load the model once, guarding the load with ``self._lock``."""
+        if self._model is None:
+            with self._lock:
+                if self._model is None:  # Double-checked locking
+                    # Imported lazily so model2vec is only required when this
+                    # embedder is actually used.
+                    from model2vec import StaticModel
+                    logger.info("Loading embedding model: %s", self._model_id)
+                    self._model = StaticModel.from_pretrained(self._model_id)
+    def embed(self, text: str) -> npt.NDArray[np.float32]:
+        """Embed a single text string."""
+        self._ensure_loaded()
+        assert self._model is not None
+        # Encodes a one-element batch and returns its only row.
+        # NOTE(review): unlike embed_batch, the result is not cast to float32
+        # here; confirm the model already returns float32.
+        return self._model.encode([text])[0]
+    def embed_batch(self, texts: list[str]) -> npt.NDArray[np.float32]:
+        """Embed a batch of text strings."""
+        if not texts:
+            # Returned without loading the model, so the second dimension is
+            # 0 rather than ``dim``.
+            return np.empty((0, 0), dtype=np.float32)
+        self._ensure_loaded()
+        assert self._model is not None
+        result = self._model.encode(texts)
+        return np.array(result, dtype=np.float32)
+    @property
+    def memo_key(self) -> tuple:
+        """Identity key: (class, model_id)."""
+        return ("Model2VecEmbedder", self._model_id)
+class SentenceTransformerEmbedder:
+    """Thread-safe embedder wrapping sentence-transformers.
+    Supports any sentence-transformers model with lazy loading.
+    Memo key includes model name and device for cache invalidation.
+    """
+    def __init__(self, model_name: str = "all-MiniLM-L6-v2", device: str | None = None) -> None:
+        # Only records configuration; the model itself is loaded on first use.
+        self._model_name = model_name
+        self._device = device
+        self._model = None
+        self._lock = threading.Lock()
+    @property
+    def dim(self) -> int:
+        """Embedding dimensionality; triggers model loading if needed."""
+        self._ensure_loaded()
+        assert self._model is not None
+        # NOTE(review): confirm get_embedding_dimension() exists in every
+        # supported sentence-transformers version; older releases expose
+        # get_sentence_embedding_dimension().
+        dim = self._model.get_embedding_dimension()
+        assert dim is not None
+        return dim
+    def _ensure_loaded(self) -> None:
+        """Load the model once, guarding the load with ``self._lock``."""
+        if self._model is None:
+            with self._lock:
+                if self._model is None:
+                    # Imported lazily so sentence-transformers is only
+                    # required when this embedder is actually used.
+                    from sentence_transformers import SentenceTransformer
+                    logger.info("Loading sentence-transformers model: %s", self._model_name)
+                    self._model = SentenceTransformer(self._model_name, device=self._device)
+    def embed(self, text: str) -> npt.NDArray[np.float32]:
+        """Embed a single text string."""
+        self._ensure_loaded()
+        assert self._model is not None
+        # Requests a NumPy result with normalized embeddings.
+        return self._model.encode(text, convert_to_numpy=True, normalize_embeddings=True)
+    def embed_batch(self, texts: list[str]) -> npt.NDArray[np.float32]:
+        """Embed a batch of text strings."""
+        if not texts:
+            # Returned without loading the model, so the second dimension is
+            # 0 rather than ``dim``.
+            return np.empty((0, 0), dtype=np.float32)
+        self._ensure_loaded()
+        assert self._model is not None
+        return self._model.encode(texts, convert_to_numpy=True, normalize_embeddings=True, show_progress_bar=False)
+    @property
+    def memo_key(self) -> tuple:
+        """Identity key: (class, model_name, device)."""
+        return ("SentenceTransformerEmbedder", self._model_name, self._device)
+class LF4Embedder:
+    """Thread-safe, lazy-loading embedder wrapping LF4StaticEmbedding (4-bit quantized).
+    Uses the VTXAI/Vortex-Embed-4.7M model by default — a 4-bit static embedding
+    model with ~3.5 MB footprint. Loads on first use, cached thereafter.
+    """
+    def __init__(self, model_id: str = "VTXAI/Vortex-Embed-4.7M") -> None:
+        # Only records configuration; the model itself is loaded on first use.
+        self._model_id = model_id
+        self._model = None
+        self._lock = threading.Lock()
+    @property
+    def dim(self) -> int:
+        """Embedding dimensionality; triggers model loading if needed."""
+        self._ensure_loaded()
+        assert self._model is not None
+        return self._model.dim
+    def _ensure_loaded(self) -> None:
+        """Load the model once, guarding the load with ``self._lock``."""
+        if self._model is None:
+            with self._lock:
+                if self._model is None:
+                    logger.info("Loading LF4 embedding model: %s", self._model_id)
+                    # Imported lazily, only when the model is first needed.
+                    from vortexa.core.lf4_model import LF4StaticEmbedding
+                    self._model = LF4StaticEmbedding.from_pretrained(self._model_id)
+    def embed(self, text: str) -> npt.NDArray[np.float32]:
+        """Embed a single text string."""
+        self._ensure_loaded()
+        assert self._model is not None
+        # Encodes a one-element batch and returns its only row.
+        return self._model.encode([text])[0]
+    def embed_batch(self, texts: list[str]) -> npt.NDArray[np.float32]:
+        """Embed a batch of text strings."""
+        if not texts:
+            # Returned without loading the model, so the second dimension is
+            # 0 rather than ``dim``.
+            return np.empty((0, 0), dtype=np.float32)
+        self._ensure_loaded()
+        assert self._model is not None
+        return self._model.encode(texts)
+    @property
+    def memo_key(self) -> tuple:
+        """Identity key: (class, model_id)."""
+        return ("LF4Embedder", self._model_id)

vortexa 0.1.0__tar.gz → 0.1.2__tar.gz

vortexa 0.1.0 (tar.gz) → 0.1.2 (tar.gz)