vortexa 0.1.0__tar.gz → 0.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31) hide show
  1. {vortexa-0.1.0 → vortexa-0.1.2}/PKG-INFO +8 -5
  2. {vortexa-0.1.0 → vortexa-0.1.2}/README.md +6 -4
  3. {vortexa-0.1.0 → vortexa-0.1.2}/pyproject.toml +3 -1
  4. vortexa-0.1.2/src/vortexa/__init__.py +0 -0
  5. vortexa-0.1.2/src/vortexa/core/__init__.py +0 -0
  6. vortexa-0.1.2/src/vortexa/core/chunking.py +229 -0
  7. vortexa-0.1.2/src/vortexa/core/embedding.py +180 -0
  8. vortexa-0.1.2/src/vortexa/core/indexer.py +456 -0
  9. vortexa-0.1.2/src/vortexa/core/language.py +151 -0
  10. vortexa-0.1.2/src/vortexa/core/lf4_model.py +168 -0
  11. vortexa-0.1.2/src/vortexa/core/types.py +98 -0
  12. vortexa-0.1.2/src/vortexa/interfaces/__init__.py +0 -0
  13. vortexa-0.1.2/src/vortexa/interfaces/mcp_server.py +102 -0
  14. vortexa-0.1.2/src/vortexa/interfaces/watcher.py +138 -0
  15. vortexa-0.1.2/src/vortexa/search/__init__.py +0 -0
  16. vortexa-0.1.2/src/vortexa/search/ranking.py +389 -0
  17. vortexa-0.1.2/src/vortexa/search/search.py +165 -0
  18. vortexa-0.1.2/src/vortexa/search/tokens.py +66 -0
  19. vortexa-0.1.2/src/vortexa/storage/__init__.py +0 -0
  20. vortexa-0.1.2/src/vortexa/storage/bm25.py +147 -0
  21. vortexa-0.1.2/src/vortexa/storage/vector_store.py +193 -0
  22. vortexa-0.1.2/src/vortexa/storage/walker.py +129 -0
  23. {vortexa-0.1.0 → vortexa-0.1.2/src}/vortexa.egg-info/PKG-INFO +8 -5
  24. vortexa-0.1.2/src/vortexa.egg-info/SOURCES.txt +27 -0
  25. {vortexa-0.1.0 → vortexa-0.1.2/src}/vortexa.egg-info/requires.txt +1 -0
  26. vortexa-0.1.2/src/vortexa.egg-info/top_level.txt +1 -0
  27. vortexa-0.1.0/vortexa.egg-info/SOURCES.txt +0 -8
  28. vortexa-0.1.0/vortexa.egg-info/top_level.txt +0 -1
  29. {vortexa-0.1.0 → vortexa-0.1.2}/setup.cfg +0 -0
  30. {vortexa-0.1.0 → vortexa-0.1.2/src}/vortexa.egg-info/dependency_links.txt +0 -0
  31. {vortexa-0.1.0 → vortexa-0.1.2/src}/vortexa.egg-info/entry_points.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: vortexa
3
- Version: 0.1.0
3
+ Version: 0.1.2
4
4
  Summary: Codebase indexing and semantic search engine
5
5
  Author-email: VortexAI <koulabhay25@gmail.com>
6
6
  License-Expression: Apache-2.0
@@ -22,6 +22,7 @@ Requires-Python: >=3.10
22
22
  Description-Content-Type: text/markdown
23
23
  Requires-Dist: numpy>=1.24.0
24
24
  Requires-Dist: lmdb>=1.4.0
25
+ Requires-Dist: bm25s>=0.2.0
25
26
  Requires-Dist: pathspec>=0.12.0
26
27
  Requires-Dist: huggingface-hub>=0.20.0
27
28
  Requires-Dist: tokenizers>=0.19.0
@@ -43,7 +44,8 @@ _Dense + sparse hybrid retrieval · AST-aware chunking · LMDB persistence · MC
43
44
 
44
45
  [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](LICENSE)
45
46
  [![Python](https://img.shields.io/badge/python-3.10+-brightgreen)](#)
46
- [![PyPI version](https://img.shields.io/badge/pypi-v0.1.0-orange)](#)
47
+ [![PyPI version](https://img.shields.io/pypi/v/vortexa)](https://pypi.org/project/vortexa/)
48
+ [![PyPI downloads](https://img.shields.io/pypi/dm/vortexa)](https://pypi.org/project/vortexa/)
47
49
 
48
50
  </div>
49
51
 
@@ -149,8 +151,8 @@ pip install vortexa
149
151
  # Full (Model2Vec embeddings + tree-sitter AST chunking)
150
152
  pip install "vortexa[full]"
151
153
 
152
- # With MCP server support
153
- pip install "vortexa[full]" fastmcp
154
+ # With MCP server support (adds `vortexa` CLI command)
155
+ pip install "vortexa[mcp]"
154
156
  ```
155
157
 
156
158
  ### Index a codebase
@@ -289,7 +291,7 @@ vortexa ships with a built-in **MCP (Model Context Protocol) server** that expos
289
291
  python -m vortexa.interfaces.mcp_server
290
292
 
291
293
  # Or via the installed entry point
292
- vortexa-mcp
294
+ vortexa
293
295
  ```
294
296
 
295
297
  On startup it indexes the current working directory and prints stats to stderr:
@@ -465,6 +467,7 @@ graph TD
465
467
  |---------|----------|----------|
466
468
  | `numpy` | Yes | Vector operations, embedding inference |
467
469
  | `lmdb` | Yes | Persistent vector and chunk metadata storage |
470
+ | `bm25s` | Yes | Fast BM25 keyword index and persistence |
468
471
  | `pathspec` | Yes | `.gitignore` pattern matching in file walker |
469
472
  | `model2vec` | Optional | Alternative static embeddings |
470
473
  | `huggingface-hub` | Yes (default model) | Loading `VTXAI/Vortex-Embed-4.7M` |
@@ -8,7 +8,8 @@ _Dense + sparse hybrid retrieval · AST-aware chunking · LMDB persistence · MC
8
8
 
9
9
  [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](LICENSE)
10
10
  [![Python](https://img.shields.io/badge/python-3.10+-brightgreen)](#)
11
- [![PyPI version](https://img.shields.io/badge/pypi-v0.1.0-orange)](#)
11
+ [![PyPI version](https://img.shields.io/pypi/v/vortexa)](https://pypi.org/project/vortexa/)
12
+ [![PyPI downloads](https://img.shields.io/pypi/dm/vortexa)](https://pypi.org/project/vortexa/)
12
13
 
13
14
  </div>
14
15
 
@@ -114,8 +115,8 @@ pip install vortexa
114
115
  # Full (Model2Vec embeddings + tree-sitter AST chunking)
115
116
  pip install "vortexa[full]"
116
117
 
117
- # With MCP server support
118
- pip install "vortexa[full]" fastmcp
118
+ # With MCP server support (adds `vortexa` CLI command)
119
+ pip install "vortexa[mcp]"
119
120
  ```
120
121
 
121
122
  ### Index a codebase
@@ -254,7 +255,7 @@ vortexa ships with a built-in **MCP (Model Context Protocol) server** that expos
254
255
  python -m vortexa.interfaces.mcp_server
255
256
 
256
257
  # Or via the installed entry point
257
- vortexa-mcp
258
+ vortexa
258
259
  ```
259
260
 
260
261
  On startup it indexes the current working directory and prints stats to stderr:
@@ -430,6 +431,7 @@ graph TD
430
431
  |---------|----------|----------|
431
432
  | `numpy` | Yes | Vector operations, embedding inference |
432
433
  | `lmdb` | Yes | Persistent vector and chunk metadata storage |
434
+ | `bm25s` | Yes | Fast BM25 keyword index and persistence |
433
435
  | `pathspec` | Yes | `.gitignore` pattern matching in file walker |
434
436
  | `model2vec` | Optional | Alternative static embeddings |
435
437
  | `huggingface-hub` | Yes (default model) | Loading `VTXAI/Vortex-Embed-4.7M` |
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "vortexa"
7
- version = "0.1.0"
7
+ version = "0.1.2"
8
8
  description = "Codebase indexing and semantic search engine"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.10"
@@ -29,6 +29,7 @@ classifiers = [
29
29
  dependencies = [
30
30
  "numpy>=1.24.0",
31
31
  "lmdb>=1.4.0",
32
+ "bm25s>=0.2.0",
32
33
  "pathspec>=0.12.0",
33
34
  "huggingface-hub>=0.20.0",
34
35
  "tokenizers>=0.19.0",
@@ -54,6 +55,7 @@ Repository = "https://github.com/OEvortex/vortexa"
54
55
  Issues = "https://github.com/OEvortex/vortexa/issues"
55
56
 
56
57
  [tool.setuptools.packages.find]
58
+ where = ["src"]
57
59
  include = ["vortexa*"]
58
60
 
59
61
  [tool.ruff]
File without changes
File without changes
@@ -0,0 +1,229 @@
1
+ """Code-aware chunking using tree-sitter with line-based fallback.
2
+
3
+ Splits source code into chunks respecting AST boundaries (functions, classes, etc.)
4
+ when tree-sitter supports the language, otherwise falls back to line-based splitting.
5
+
6
+ Supports configurable chunk_size, min_chunk_size, and chunk_overlap
7
+ (inspired by cocoindex's RecursiveSplitter).
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import hashlib
13
+ import logging
14
+ from dataclasses import dataclass
15
+ from functools import lru_cache
16
+
17
+ from vortexa.core.types import Chunk, ChunkConfig, Lineage
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
@dataclass
class ChunkBoundary:
    """Half-open ``[start, end)`` span emitted by the chunking helpers.

    The unit depends on the producer: the tree-sitter path fills it with byte
    offsets, the line-based path with character offsets.
    """

    # Offset of the first unit covered by the span.
    start: int
    # Offset one past the last unit covered by the span.
    end: int
28
+
29
+
30
+ @lru_cache(maxsize=64)
31
+ def _get_parser(language: str):
32
+ """Get a tree-sitter parser for the given language. Returns None if unavailable."""
33
+ try:
34
+ from tree_sitter_language_pack import get_parser as _get_ts_parser
35
+
36
+ return _get_ts_parser(language) # type: ignore
37
+ except Exception:
38
+ return None
39
+
40
+
41
def is_supported_language(language: str) -> bool:
    """Report whether a tree-sitter parser can be obtained for ``language``."""
    parser = _get_parser(language)
    return parser is not None
44
+
45
+
46
+ def _merge_adjacent_chunks(
47
+ chunks: list[ChunkBoundary],
48
+ desired_length: int,
49
+ overlap: int = 0,
50
+ ) -> list[ChunkBoundary]:
51
+ """Merge adjacent chunks up to the desired length, with optional overlap.
52
+
53
+ When overlap > 0, each chunk (after the first) starts `overlap` bytes
54
+ before the end of the previous chunk, creating overlapping regions.
55
+ """
56
+ if not chunks:
57
+ return []
58
+
59
+ merged: list[ChunkBoundary] = []
60
+ current_start = chunks[0].start
61
+ current_end = chunks[0].end
62
+ current_length = current_end - current_start
63
+
64
+ for group in chunks[1:]:
65
+ start, end = group.start, group.end
66
+ length = end - start
67
+
68
+ if current_length + length > desired_length:
69
+ merged.append(ChunkBoundary(start=current_start, end=current_end))
70
+ # Apply overlap: start the next chunk overlap bytes before current end
71
+ if overlap > 0:
72
+ current_start = max(current_end - overlap, start)
73
+ else:
74
+ current_start = start
75
+ current_end = end
76
+ current_length = current_end - current_start
77
+ continue
78
+
79
+ current_end = end
80
+ current_length += length
81
+
82
+ merged.append(ChunkBoundary(start=current_start, end=current_end))
83
+ return merged
84
+
85
+
86
+ def _merge_node_inner(node, desired_length: int) -> list[ChunkBoundary]:
87
+ """Recursively merge and split AST nodes into chunks."""
88
+ if not node.children:
89
+ return [ChunkBoundary(node.start_byte, node.end_byte)]
90
+
91
+ groups: list[ChunkBoundary] = []
92
+ children = node.children
93
+ index = 0
94
+
95
+ while index < len(children):
96
+ child = children[index]
97
+ start = child.start_byte
98
+ end = child.end_byte
99
+ length = child.end_byte - child.start_byte
100
+
101
+ index += 1
102
+
103
+ # If this single chunk is longer than desired, recurse into it
104
+ if length > desired_length:
105
+ groups.extend(_merge_node_inner(child, desired_length))
106
+ continue
107
+
108
+ while index < len(children):
109
+ child = children[index]
110
+ child_length = child.end_byte - child.start_byte
111
+
112
+ if length + child_length > desired_length:
113
+ break
114
+
115
+ end = child.end_byte
116
+ length += child_length
117
+ index += 1
118
+
119
+ groups.append(ChunkBoundary(start, end))
120
+
121
+ return groups
122
+
123
+
124
def _merge_node(node, desired_length: int, overlap: int = 0) -> list[ChunkBoundary]:
    """Chunk an AST node: split it into raw groups, then merge neighbours.

    The raw groups come from ``_merge_node_inner``; ``_merge_adjacent_chunks``
    then combines them using the same ``desired_length`` and the given
    ``overlap``.
    """
    return _merge_adjacent_chunks(
        _merge_node_inner(node, desired_length),
        desired_length,
        overlap,
    )
128
+
129
+
130
def chunk_lines(text: str, desired_length: int, overlap: int = 0) -> list[ChunkBoundary]:
    """Chunk ``text`` on line boundaries, with optional overlap.

    Each line (terminator included) becomes one character-offset boundary and
    the boundaries are merged by ``_merge_adjacent_chunks``. Text that is empty
    or whitespace-only yields no chunks.
    """
    if text.strip() == "":
        return []

    line_spans: list[ChunkBoundary] = []
    cursor = 0
    for line in text.splitlines(keepends=True):
        stop = cursor + len(line)
        line_spans.append(ChunkBoundary(start=cursor, end=stop))
        cursor = stop

    return _merge_adjacent_chunks(line_spans, desired_length, overlap)
141
+
142
+
143
def chunk_source(
    source: str,
    file_path: str,
    language: str | None,
    config: ChunkConfig | None = None,
) -> list[Chunk]:
    """Chunk source code into indexable units with lineage tracking.

    Uses tree-sitter for AST-aware chunking when a parser is available for the
    language and falls back to line-based chunking otherwise (including when
    the tree-sitter path raises).

    :param source: Source code text.
    :param file_path: Relative file path for the chunk metadata.
    :param language: Detected programming language (or None).
    :param config: Chunking configuration (chunk_size, overlap, etc.);
        a default ``ChunkConfig()`` is used when omitted.
    :return: List of Chunk objects with lineage and chunk_hash. Empty when
        ``source`` is empty or whitespace-only; whitespace-only chunks are
        skipped.
    """
    if not source.strip():
        return []

    if config is None:
        config = ChunkConfig()

    chunk_boundaries = None

    # _get_parser already returns None for unsupported languages, so a single
    # lookup covers both "is it supported" and "give me the parser".
    parser = _get_parser(language) if language is not None else None
    if parser is not None:
        try:
            as_bytes = source.encode("utf-8")
            root = parser.parse(as_bytes).root_node
            byte_boundaries = _merge_node(root, config.chunk_size, config.chunk_overlap)
            # Tree-sitter reports byte offsets; the rest of this function
            # slices `source` by character. errors="ignore" drops a truncated
            # trailing sequence, so an offset that falls inside a multi-byte
            # character maps to the start of that character instead of raising.
            chunk_boundaries = [
                ChunkBoundary(
                    start=len(as_bytes[: boundary.start].decode("utf-8", errors="ignore")),
                    end=len(as_bytes[: boundary.end].decode("utf-8", errors="ignore")),
                )
                for boundary in byte_boundaries
            ]
        except Exception:
            # Best effort: keep the reason in the debug log, then fall back.
            logger.debug(
                "Tree-sitter chunking failed for %s, falling back",
                file_path,
                exc_info=True,
            )
            chunk_boundaries = None

    if chunk_boundaries is None:
        chunk_boundaries = chunk_lines(source, config.chunk_size, config.chunk_overlap)

    # Compute source hash for memoization
    source_hash = hashlib.sha256(source.encode("utf-8")).hexdigest()[:16]

    chunks: list[Chunk] = []
    for boundary in chunk_boundaries:
        # Index of the chunk's last character (clamped for empty boundaries).
        end_index = max(boundary.end - 1, boundary.start)
        text = source[boundary.start : end_index + 1]
        if not text.strip():
            continue

        # Slice each prefix once and reuse it for both line and byte offsets.
        before_chunk = source[: boundary.start]
        before_last_char = source[:end_index]

        start_line = before_chunk.count("\n") + 1
        # Line that contains the chunk's last character.
        end_line = before_last_char.count("\n") + 1

        # Compute chunk-specific hash for memoization
        chunk_hash = hashlib.sha256(
            f"{file_path}:{source_hash}:{boundary.start}:{boundary.end}".encode()
        ).hexdigest()[:16]

        # Compute byte offsets for lineage
        byte_start = len(before_chunk.encode("utf-8"))
        # NOTE(review): this is the byte offset at which the chunk's LAST
        # character starts, not one past the end of the chunk. Left unchanged;
        # confirm this matches what Lineage consumers expect.
        byte_end = len(before_last_char.encode("utf-8"))

        chunks.append(
            Chunk(
                content=text,
                file_path=file_path,
                start_line=start_line,
                end_line=end_line,
                language=language,
                lineage=Lineage(
                    source_path=file_path,
                    start_line=start_line,
                    end_line=end_line,
                    byte_start=byte_start,
                    byte_end=byte_end,
                ),
                chunk_hash=chunk_hash,
            )
        )
    return chunks
@@ -0,0 +1,180 @@
1
+ """Embedding model abstraction for the codebase indexer.
2
+
3
+ Provides lazy-loading, thread-safe embedders with memoization support.
4
+ Inspired by cocoindex's SentenceTransformerEmbedder pattern.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import logging
10
+ import threading
11
+ from typing import Protocol, runtime_checkable
12
+
13
+ import numpy as np
14
+ import numpy.typing as npt
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
@runtime_checkable
class Embedder(Protocol):
    """Structural interface every embedding backend of the indexer satisfies.

    Being ``runtime_checkable``, ``isinstance(obj, Embedder)`` only verifies
    that the four members below exist on ``obj``.
    """

    @property
    def dim(self) -> int:
        """Number of components in each embedding vector."""
        ...

    def embed(self, text: str) -> npt.NDArray[np.float32]:
        """Return the embedding of one text string."""
        ...

    def embed_batch(self, texts: list[str]) -> npt.NDArray[np.float32]:
        """Return the embeddings of several text strings."""
        ...

    @property
    def memo_key(self) -> tuple:
        """Tuple identifying this embedder, used to invalidate memoization caches."""
        ...
40
+
41
+
42
class Model2VecEmbedder:
    """Thread-safe, lazy-loading embedder wrapping model2vec.StaticModel.

    The model is loaded on first use and cached on the instance; loading is
    serialized with a lock. Both ``embed`` and ``embed_batch`` return float32
    arrays. The memo key includes the model ID for cache invalidation.
    """

    def __init__(self, model_id: str = "AI4free/JARVIS-tool-search-v1") -> None:
        """Store the model ID; nothing is loaded until first use."""
        self._model_id = model_id
        self._model = None  # set by _ensure_loaded() on first use
        self._lock = threading.Lock()

    @property
    def dim(self) -> int:
        """Embedding dimensionality reported by the model (loads it if needed)."""
        self._ensure_loaded()
        assert self._model is not None
        return self._model.dim

    def _ensure_loaded(self) -> None:
        """Load the model once; later calls return immediately."""
        if self._model is None:
            with self._lock:
                if self._model is None:  # Double-checked locking
                    # Imported here so model2vec is only required when this
                    # embedder is actually used.
                    from model2vec import StaticModel

                    logger.info("Loading embedding model: %s", self._model_id)
                    self._model = StaticModel.from_pretrained(self._model_id)

    def embed(self, text: str) -> npt.NDArray[np.float32]:
        """Embed a single text string as a float32 vector."""
        self._ensure_loaded()
        assert self._model is not None
        # Coerce like embed_batch does so both entry points honour the
        # annotated float32 dtype; asarray avoids a copy when the row already
        # is float32.
        return np.asarray(self._model.encode([text])[0], dtype=np.float32)

    def embed_batch(self, texts: list[str]) -> npt.NDArray[np.float32]:
        """Embed a batch of text strings as a float32 array.

        An empty batch returns an empty ``(0, 0)`` array without loading the
        model.
        """
        if not texts:
            return np.empty((0, 0), dtype=np.float32)
        self._ensure_loaded()
        assert self._model is not None
        return np.array(self._model.encode(texts), dtype=np.float32)

    @property
    def memo_key(self) -> tuple:
        """Identity key: (class, model_id)."""
        return ("Model2VecEmbedder", self._model_id)
87
+
88
+
89
class SentenceTransformerEmbedder:
    """Thread-safe embedder wrapping sentence-transformers.

    The model is created on first use and cached on the instance; loading is
    serialized with a lock. Embeddings are requested as normalized numpy
    arrays. The memo key includes model name and device for cache
    invalidation.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2", device: str | None = None) -> None:
        """Store the model name and device; nothing is loaded until first use."""
        self._model_name = model_name
        self._device = device
        self._model = None  # set by _ensure_loaded() on first use
        self._lock = threading.Lock()

    @property
    def dim(self) -> int:
        """Embedding dimensionality reported by the model (loads it if needed).

        :raises RuntimeError: If the model does not report a dimension.
        """
        self._ensure_loaded()
        model = self._model
        assert model is not None
        # Prefer the accessor this module was written against and fall back to
        # get_sentence_embedding_dimension(), so the property also works on a
        # model object that only offers the longer name.
        # NOTE(review): presumably that is the case for older
        # sentence-transformers releases; verify against the supported range.
        getter = getattr(model, "get_embedding_dimension", None)
        if getter is None:
            getter = model.get_sentence_embedding_dimension
        dim = getter()
        if dim is None:
            # Explicit check instead of an assert, which `python -O` strips.
            raise RuntimeError(
                f"Model {self._model_name!r} did not report an embedding dimension"
            )
        return dim

    def _ensure_loaded(self) -> None:
        """Create the model once; later calls return immediately."""
        if self._model is None:
            with self._lock:
                if self._model is None:
                    # Imported here so sentence-transformers is only required
                    # when this embedder is actually used.
                    from sentence_transformers import SentenceTransformer

                    logger.info("Loading sentence-transformers model: %s", self._model_name)
                    self._model = SentenceTransformer(self._model_name, device=self._device)

    def embed(self, text: str) -> npt.NDArray[np.float32]:
        """Embed a single text string."""
        self._ensure_loaded()
        assert self._model is not None
        return self._model.encode(text, convert_to_numpy=True, normalize_embeddings=True)

    def embed_batch(self, texts: list[str]) -> npt.NDArray[np.float32]:
        """Embed a batch of text strings.

        An empty batch returns an empty ``(0, 0)`` array without loading the
        model.
        """
        if not texts:
            return np.empty((0, 0), dtype=np.float32)
        self._ensure_loaded()
        assert self._model is not None
        return self._model.encode(
            texts,
            convert_to_numpy=True,
            normalize_embeddings=True,
            show_progress_bar=False,
        )

    @property
    def memo_key(self) -> tuple:
        """Identity key: (class, model_name, device)."""
        return ("SentenceTransformerEmbedder", self._model_name, self._device)
136
+
137
+
138
class LF4Embedder:
    """Thread-safe, lazy-loading embedder wrapping LF4StaticEmbedding (4-bit quantized).

    Defaults to the VTXAI/Vortex-Embed-4.7M model, a 4-bit static embedding
    model with a footprint of about 3.5 MB. The model is loaded the first time
    it is needed and reused afterwards.
    """

    def __init__(self, model_id: str = "VTXAI/Vortex-Embed-4.7M") -> None:
        """Store the model ID; nothing is loaded until first use."""
        self._lock = threading.Lock()
        self._model = None  # set by _ensure_loaded() on first use
        self._model_id = model_id

    @property
    def dim(self) -> int:
        """Embedding dimensionality reported by the model (loads it if needed)."""
        self._ensure_loaded()
        model = self._model
        assert model is not None
        return model.dim

    def _ensure_loaded(self) -> None:
        """Load the model once; later calls return immediately."""
        if self._model is not None:
            return
        with self._lock:
            # Re-check under the lock: another thread may have loaded it.
            if self._model is not None:
                return
            logger.info("Loading LF4 embedding model: %s", self._model_id)
            from vortexa.core.lf4_model import LF4StaticEmbedding

            self._model = LF4StaticEmbedding.from_pretrained(self._model_id)

    def embed(self, text: str) -> npt.NDArray[np.float32]:
        """Embed a single text string."""
        self._ensure_loaded()
        model = self._model
        assert model is not None
        encoded = model.encode([text])
        return encoded[0]

    def embed_batch(self, texts: list[str]) -> npt.NDArray[np.float32]:
        """Embed a batch of text strings.

        An empty batch returns an empty ``(0, 0)`` array without loading the
        model.
        """
        if not texts:
            return np.empty((0, 0), dtype=np.float32)
        self._ensure_loaded()
        model = self._model
        assert model is not None
        return model.encode(texts)

    @property
    def memo_key(self) -> tuple:
        """Identity key: (class, model_id)."""
        return ("LF4Embedder", self._model_id)