vortexa 0.1.0__tar.gz → 0.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {vortexa-0.1.0 → vortexa-0.1.2}/PKG-INFO +8 -5
- {vortexa-0.1.0 → vortexa-0.1.2}/README.md +6 -4
- {vortexa-0.1.0 → vortexa-0.1.2}/pyproject.toml +3 -1
- vortexa-0.1.2/src/vortexa/__init__.py +0 -0
- vortexa-0.1.2/src/vortexa/core/__init__.py +0 -0
- vortexa-0.1.2/src/vortexa/core/chunking.py +229 -0
- vortexa-0.1.2/src/vortexa/core/embedding.py +180 -0
- vortexa-0.1.2/src/vortexa/core/indexer.py +456 -0
- vortexa-0.1.2/src/vortexa/core/language.py +151 -0
- vortexa-0.1.2/src/vortexa/core/lf4_model.py +168 -0
- vortexa-0.1.2/src/vortexa/core/types.py +98 -0
- vortexa-0.1.2/src/vortexa/interfaces/__init__.py +0 -0
- vortexa-0.1.2/src/vortexa/interfaces/mcp_server.py +102 -0
- vortexa-0.1.2/src/vortexa/interfaces/watcher.py +138 -0
- vortexa-0.1.2/src/vortexa/search/__init__.py +0 -0
- vortexa-0.1.2/src/vortexa/search/ranking.py +389 -0
- vortexa-0.1.2/src/vortexa/search/search.py +165 -0
- vortexa-0.1.2/src/vortexa/search/tokens.py +66 -0
- vortexa-0.1.2/src/vortexa/storage/__init__.py +0 -0
- vortexa-0.1.2/src/vortexa/storage/bm25.py +147 -0
- vortexa-0.1.2/src/vortexa/storage/vector_store.py +193 -0
- vortexa-0.1.2/src/vortexa/storage/walker.py +129 -0
- {vortexa-0.1.0 → vortexa-0.1.2/src}/vortexa.egg-info/PKG-INFO +8 -5
- vortexa-0.1.2/src/vortexa.egg-info/SOURCES.txt +27 -0
- {vortexa-0.1.0 → vortexa-0.1.2/src}/vortexa.egg-info/requires.txt +1 -0
- vortexa-0.1.2/src/vortexa.egg-info/top_level.txt +1 -0
- vortexa-0.1.0/vortexa.egg-info/SOURCES.txt +0 -8
- vortexa-0.1.0/vortexa.egg-info/top_level.txt +0 -1
- {vortexa-0.1.0 → vortexa-0.1.2}/setup.cfg +0 -0
- {vortexa-0.1.0 → vortexa-0.1.2/src}/vortexa.egg-info/dependency_links.txt +0 -0
- {vortexa-0.1.0 → vortexa-0.1.2/src}/vortexa.egg-info/entry_points.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: vortexa
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.1.2
|
|
4
4
|
Summary: Codebase indexing and semantic search engine
|
|
5
5
|
Author-email: VortexAI <koulabhay25@gmail.com>
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -22,6 +22,7 @@ Requires-Python: >=3.10
|
|
|
22
22
|
Description-Content-Type: text/markdown
|
|
23
23
|
Requires-Dist: numpy>=1.24.0
|
|
24
24
|
Requires-Dist: lmdb>=1.4.0
|
|
25
|
+
Requires-Dist: bm25s>=0.2.0
|
|
25
26
|
Requires-Dist: pathspec>=0.12.0
|
|
26
27
|
Requires-Dist: huggingface-hub>=0.20.0
|
|
27
28
|
Requires-Dist: tokenizers>=0.19.0
|
|
@@ -43,7 +44,8 @@ _Dense + sparse hybrid retrieval · AST-aware chunking · LMDB persistence · MC
|
|
|
43
44
|
|
|
44
45
|
[](LICENSE)
|
|
45
46
|
[](#)
|
|
46
|
-
[](https://pypi.org/project/vortexa/)
|
|
48
|
+
[](https://pypi.org/project/vortexa/)
|
|
47
49
|
|
|
48
50
|
</div>
|
|
49
51
|
|
|
@@ -149,8 +151,8 @@ pip install vortexa
|
|
|
149
151
|
# Full (Model2Vec embeddings + tree-sitter AST chunking)
|
|
150
152
|
pip install "vortexa[full]"
|
|
151
153
|
|
|
152
|
-
# With MCP server support
|
|
153
|
-
pip install "vortexa[
|
|
154
|
+
# With MCP server support (adds `vortexa` CLI command)
|
|
155
|
+
pip install "vortexa[mcp]"
|
|
154
156
|
```
|
|
155
157
|
|
|
156
158
|
### Index a codebase
|
|
@@ -289,7 +291,7 @@ vortexa ships with a built-in **MCP (Model Context Protocol) server** that expos
|
|
|
289
291
|
python -m vortexa.interfaces.mcp_server
|
|
290
292
|
|
|
291
293
|
# Or via the installed entry point
|
|
292
|
-
vortexa
|
|
294
|
+
vortexa
|
|
293
295
|
```
|
|
294
296
|
|
|
295
297
|
On startup it indexes the current working directory and prints stats to stderr:
|
|
@@ -465,6 +467,7 @@ graph TD
|
|
|
465
467
|
|---------|----------|----------|
|
|
466
468
|
| `numpy` | Yes | Vector operations, embedding inference |
|
|
467
469
|
| `lmdb` | Yes | Persistent vector and chunk metadata storage |
|
|
470
|
+
| `bm25s` | Yes | Fast BM25 keyword index and persistence |
|
|
468
471
|
| `pathspec` | Yes | `.gitignore` pattern matching in file walker |
|
|
469
472
|
| `model2vec` | Optional | Alternative static embeddings |
|
|
470
473
|
| `huggingface-hub` | Yes (default model) | Loading `VTXAI/Vortex-Embed-4.7M` |
|
|
@@ -8,7 +8,8 @@ _Dense + sparse hybrid retrieval · AST-aware chunking · LMDB persistence · MC
|
|
|
8
8
|
|
|
9
9
|
[](LICENSE)
|
|
10
10
|
[](#)
|
|
11
|
-
[](https://pypi.org/project/vortexa/)
|
|
12
|
+
[](https://pypi.org/project/vortexa/)
|
|
12
13
|
|
|
13
14
|
</div>
|
|
14
15
|
|
|
@@ -114,8 +115,8 @@ pip install vortexa
|
|
|
114
115
|
# Full (Model2Vec embeddings + tree-sitter AST chunking)
|
|
115
116
|
pip install "vortexa[full]"
|
|
116
117
|
|
|
117
|
-
# With MCP server support
|
|
118
|
-
pip install "vortexa[
|
|
118
|
+
# With MCP server support (adds `vortexa` CLI command)
|
|
119
|
+
pip install "vortexa[mcp]"
|
|
119
120
|
```
|
|
120
121
|
|
|
121
122
|
### Index a codebase
|
|
@@ -254,7 +255,7 @@ vortexa ships with a built-in **MCP (Model Context Protocol) server** that expos
|
|
|
254
255
|
python -m vortexa.interfaces.mcp_server
|
|
255
256
|
|
|
256
257
|
# Or via the installed entry point
|
|
257
|
-
vortexa
|
|
258
|
+
vortexa
|
|
258
259
|
```
|
|
259
260
|
|
|
260
261
|
On startup it indexes the current working directory and prints stats to stderr:
|
|
@@ -430,6 +431,7 @@ graph TD
|
|
|
430
431
|
|---------|----------|----------|
|
|
431
432
|
| `numpy` | Yes | Vector operations, embedding inference |
|
|
432
433
|
| `lmdb` | Yes | Persistent vector and chunk metadata storage |
|
|
434
|
+
| `bm25s` | Yes | Fast BM25 keyword index and persistence |
|
|
433
435
|
| `pathspec` | Yes | `.gitignore` pattern matching in file walker |
|
|
434
436
|
| `model2vec` | Optional | Alternative static embeddings |
|
|
435
437
|
| `huggingface-hub` | Yes (default model) | Loading `VTXAI/Vortex-Embed-4.7M` |
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "vortexa"
|
|
7
|
-
version = "0.1.0"
|
|
7
|
+
version = "0.1.2"
|
|
8
8
|
description = "Codebase indexing and semantic search engine"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.10"
|
|
@@ -29,6 +29,7 @@ classifiers = [
|
|
|
29
29
|
dependencies = [
|
|
30
30
|
"numpy>=1.24.0",
|
|
31
31
|
"lmdb>=1.4.0",
|
|
32
|
+
"bm25s>=0.2.0",
|
|
32
33
|
"pathspec>=0.12.0",
|
|
33
34
|
"huggingface-hub>=0.20.0",
|
|
34
35
|
"tokenizers>=0.19.0",
|
|
@@ -54,6 +55,7 @@ Repository = "https://github.com/OEvortex/vortexa"
|
|
|
54
55
|
Issues = "https://github.com/OEvortex/vortexa/issues"
|
|
55
56
|
|
|
56
57
|
[tool.setuptools.packages.find]
|
|
58
|
+
where = ["src"]
|
|
57
59
|
include = ["vortexa*"]
|
|
58
60
|
|
|
59
61
|
[tool.ruff]
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,229 @@
|
|
|
1
|
+
"""Code-aware chunking using tree-sitter with line-based fallback.
|
|
2
|
+
|
|
3
|
+
Splits source code into chunks respecting AST boundaries (functions, classes, etc.)
|
|
4
|
+
when tree-sitter supports the language, otherwise falls back to line-based splitting.
|
|
5
|
+
|
|
6
|
+
Supports configurable chunk_size, min_chunk_size, and chunk_overlap
|
|
7
|
+
(inspired by cocoindex's RecursiveSplitter).
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import hashlib
|
|
13
|
+
import logging
|
|
14
|
+
from dataclasses import dataclass
|
|
15
|
+
from functools import lru_cache
|
|
16
|
+
|
|
17
|
+
from vortexa.core.types import Chunk, ChunkConfig, Lineage
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass
class ChunkBoundary:
    """Half-open ``[start, end)`` offset span emitted by the chunking helpers.

    The AST helpers fill it with byte offsets taken from tree-sitter nodes,
    while the line-based helper fills it with character offsets.
    """

    # Offset of the first unit that belongs to the span.
    start: int
    # Offset one past the last unit that belongs to the span.
    end: int
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@lru_cache(maxsize=64)
|
|
31
|
+
def _get_parser(language: str):
|
|
32
|
+
"""Get a tree-sitter parser for the given language. Returns None if unavailable."""
|
|
33
|
+
try:
|
|
34
|
+
from tree_sitter_language_pack import get_parser as _get_ts_parser
|
|
35
|
+
|
|
36
|
+
return _get_ts_parser(language) # type: ignore
|
|
37
|
+
except Exception:
|
|
38
|
+
return None
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def is_supported_language(language: str) -> bool:
    """Report whether a tree-sitter parser can be obtained for ``language``."""
    parser = _get_parser(language)
    return parser is not None
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _merge_adjacent_chunks(
    chunks: list[ChunkBoundary],
    desired_length: int,
    overlap: int = 0,
) -> list[ChunkBoundary]:
    """Greedily merge ordered, adjacent groups into chunks of about ``desired_length``.

    Groups are appended to the chunk under construction until adding the next
    one would push the accumulated length past ``desired_length``; the chunk is
    then emitted and a new one is started.

    When ``overlap > 0`` the new chunk re-includes the trailing groups of the
    chunk just emitted whose start lies within the last ``overlap`` units of
    it. The overlap is snapped to group boundaries, so a group (a line or an
    AST node) is never cut in half, and the first group of the emitted chunk
    is never carried over, so a chunk is never a superset of its predecessor.
    A chunk that begins with carried-over groups may therefore be longer than
    ``desired_length`` by up to ``overlap``.

    :param chunks: Ordered, non-overlapping spans to merge.
    :param desired_length: Target maximum length of a merged chunk.
    :param overlap: Maximum amount of the previous chunk to repeat at the
        start of the next one; ``0`` disables overlap.
    :return: The merged spans, in order. Empty when ``chunks`` is empty.
    """
    if not chunks:
        return []

    merged: list[ChunkBoundary] = []
    current_start = chunks[0].start
    current_end = chunks[0].end
    current_length = current_end - current_start
    # Start offsets of the groups that make up the chunk being built; needed
    # to pick a group-aligned overlap start when the chunk is flushed.
    member_starts: list[int] = [current_start]

    for group in chunks[1:]:
        start, end = group.start, group.end
        length = end - start

        if current_length + length <= desired_length:
            current_end = end
            current_length += length
            member_starts.append(start)
            continue

        merged.append(ChunkBoundary(start=current_start, end=current_end))

        # The previous expression, max(current_end - overlap, start), always
        # evaluated to `start` for ordered input, so overlap was never applied.
        carried: list[int] = []
        if overlap > 0:
            earliest_allowed = current_end - overlap
            # Walk the flushed chunk backwards, skipping its first group so
            # the next chunk cannot swallow the whole previous one.
            for candidate in reversed(member_starts[1:]):
                if candidate < earliest_allowed:
                    break
                carried.append(candidate)
            carried.reverse()

        current_start = carried[0] if carried else start
        current_end = end
        current_length = current_end - current_start
        member_starts = [*carried, start]

    merged.append(ChunkBoundary(start=current_start, end=current_end))
    return merged
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _merge_node_inner(node, desired_length: int) -> list[ChunkBoundary]:
    """Turn the children of an AST node into byte spans no longer than desired.

    A node without children is returned as a single span covering the node.
    Otherwise the children are scanned in order: consecutive children are
    packed into one span while the sum of their byte lengths stays within
    ``desired_length``, and a child that is on its own longer than
    ``desired_length`` is expanded by recursing into it.
    """
    kids = node.children
    if not kids:
        return [ChunkBoundary(node.start_byte, node.end_byte)]

    spans: list[ChunkBoundary] = []
    run_open = False
    run_start = run_end = run_size = 0

    for kid in kids:
        kid_size = kid.end_byte - kid.start_byte

        if run_open:
            if run_size + kid_size <= desired_length:
                # Still fits: extend the run that is currently open.
                run_end = kid.end_byte
                run_size += kid_size
                continue
            # Does not fit: close the run, then treat this child as a fresh start.
            spans.append(ChunkBoundary(run_start, run_end))
            run_open = False

        if kid_size > desired_length:
            spans.extend(_merge_node_inner(kid, desired_length))
        else:
            run_start = kid.start_byte
            run_end = kid.end_byte
            run_size = kid_size
            run_open = True

    if run_open:
        spans.append(ChunkBoundary(run_start, run_end))

    return spans
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def _merge_node(node, desired_length: int, overlap: int = 0) -> list[ChunkBoundary]:
    """Split an AST node into raw spans, then merge neighbouring spans.

    The node is first broken down by ``_merge_node_inner``; the resulting
    spans are then combined by ``_merge_adjacent_chunks`` using the same
    ``desired_length`` and the given ``overlap``.
    """
    return _merge_adjacent_chunks(
        _merge_node_inner(node, desired_length),
        desired_length,
        overlap,
    )
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def chunk_lines(text: str, desired_length: int, overlap: int = 0) -> list[ChunkBoundary]:
    """Split ``text`` into chunks whose edges fall on line boundaries.

    Every line (line terminator included) becomes one character-offset span,
    and the spans are then merged by ``_merge_adjacent_chunks``. Text that is
    empty or whitespace-only yields no chunks.
    """
    if text.strip() == "":
        return []

    per_line: list[ChunkBoundary] = []
    cursor = 0
    for piece in text.splitlines(keepends=True):
        stop = cursor + len(piece)
        per_line.append(ChunkBoundary(start=cursor, end=stop))
        cursor = stop

    return _merge_adjacent_chunks(per_line, desired_length, overlap)
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def chunk_source(
    source: str,
    file_path: str,
    language: str | None,
    config: ChunkConfig | None = None,
) -> list[Chunk]:
    """Chunk source code into indexable units with lineage tracking.

    Uses tree-sitter for AST-aware chunking when a parser is available for
    ``language``; falls back to line-based chunking when no parser is
    available or when the tree-sitter path raises.

    :param source: Source code text.
    :param file_path: Relative file path for the chunk metadata.
    :param language: Detected programming language (or None).
    :param config: Chunking configuration; a default ``ChunkConfig()`` is used
        when omitted. Only ``chunk_size`` and ``chunk_overlap`` are read here.
    :return: List of Chunk objects with lineage and chunk_hash. Empty when
        ``source`` is empty or whitespace-only; whitespace-only chunks are
        skipped.
    """
    if not source.strip():
        return []

    if config is None:
        config = ChunkConfig()

    chunk_boundaries: list[ChunkBoundary] | None = None

    # One cached lookup is enough: a non-None parser is exactly what
    # is_supported_language() reports as "supported".
    parser = _get_parser(language) if language is not None else None
    if parser is not None:
        try:
            as_bytes = source.encode("utf-8")
            root = parser.parse(as_bytes).root_node
            byte_boundaries = _merge_node(root, config.chunk_size, config.chunk_overlap)
            # tree-sitter reports byte offsets; the slicing below works on
            # characters, so convert each offset by decoding the byte prefix.
            chunk_boundaries = [
                ChunkBoundary(
                    start=len(as_bytes[: boundary.start].decode("utf-8")),
                    end=len(as_bytes[: boundary.end].decode("utf-8")),
                )
                for boundary in byte_boundaries
            ]
        except Exception:
            # Best effort: any failure here degrades to line-based chunking.
            # The traceback is attached so the cause is visible at DEBUG level.
            logger.debug(
                "Tree-sitter chunking failed for %s, falling back",
                file_path,
                exc_info=True,
            )
            chunk_boundaries = None

    if chunk_boundaries is None:
        chunk_boundaries = chunk_lines(source, config.chunk_size, config.chunk_overlap)

    # Short digest of the whole file; mixed into every chunk hash so that any
    # change to the file changes the hashes of all its chunks.
    source_hash = hashlib.sha256(source.encode("utf-8")).hexdigest()[:16]

    chunks: list[Chunk] = []
    for boundary in chunk_boundaries:
        # Index of the last character of the chunk (clamped for empty spans).
        end_index = max(boundary.end - 1, boundary.start)
        text = source[boundary.start : end_index + 1]
        if not text.strip():
            continue

        # 1-based line numbers of the first and the last character.
        start_line = source[: boundary.start].count("\n") + 1
        end_line = source[:end_index].count("\n") + 1

        # Chunk identity: file path + file digest + character span.
        chunk_hash = hashlib.sha256(
            f"{file_path}:{source_hash}:{boundary.start}:{boundary.end}".encode()
        ).hexdigest()[:16]

        # UTF-8 byte offsets for lineage.
        # NOTE(review): byte_end is the offset at which the chunk's last
        # character starts, not one past it — confirm this matches what
        # Lineage consumers expect.
        byte_start = len(source[: boundary.start].encode("utf-8"))
        byte_end = len(source[:end_index].encode("utf-8"))

        chunks.append(
            Chunk(
                content=text,
                file_path=file_path,
                start_line=start_line,
                end_line=end_line,
                language=language,
                lineage=Lineage(
                    source_path=file_path,
                    start_line=start_line,
                    end_line=end_line,
                    byte_start=byte_start,
                    byte_end=byte_end,
                ),
                chunk_hash=chunk_hash,
            )
        )
    return chunks
|
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
"""Embedding model abstraction for the codebase indexer.
|
|
2
|
+
|
|
3
|
+
Provides lazy-loading, thread-safe embedders with memoization support.
|
|
4
|
+
Inspired by cocoindex's SentenceTransformerEmbedder pattern.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import logging
|
|
10
|
+
import threading
|
|
11
|
+
from typing import Protocol, runtime_checkable
|
|
12
|
+
|
|
13
|
+
import numpy as np
|
|
14
|
+
import numpy.typing as npt
|
|
15
|
+
|
|
16
|
+
logger = logging.getLogger(__name__)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@runtime_checkable
class Embedder(Protocol):
    """Structural interface every embedding backend of the indexer provides.

    Being ``runtime_checkable``, ``isinstance(obj, Embedder)`` verifies that
    the members below are present on ``obj``.
    """

    @property
    def dim(self) -> int:
        """Number of components in each embedding vector."""
        ...

    def embed(self, text: str) -> npt.NDArray[np.float32]:
        """Return the embedding of one text string."""
        ...

    def embed_batch(self, texts: list[str]) -> npt.NDArray[np.float32]:
        """Return the embeddings of several text strings."""
        ...

    @property
    def memo_key(self) -> tuple:
        """Tuple identifying the embedder, used to invalidate memoised results."""
        ...
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class Model2VecEmbedder:
    """Lazily loaded embedder backed by ``model2vec.StaticModel``.

    Nothing is loaded at construction time. The model is fetched the first
    time ``dim``, ``embed`` or a non-empty ``embed_batch`` is used, and the
    load itself is guarded by a lock so concurrent first uses load it once.
    """

    def __init__(self, model_id: str = "AI4free/JARVIS-tool-search-v1") -> None:
        self._lock = threading.Lock()
        self._model = None
        self._model_id = model_id

    def _ensure_loaded(self) -> None:
        """Load the model if it has not been loaded yet."""
        if self._model is not None:
            return
        with self._lock:
            # Another thread may have finished loading while we waited.
            if self._model is not None:
                return
            from model2vec import StaticModel

            logger.info("Loading embedding model: %s", self._model_id)
            self._model = StaticModel.from_pretrained(self._model_id)

    @property
    def dim(self) -> int:
        """Embedding dimensionality as reported by the loaded model."""
        self._ensure_loaded()
        assert self._model is not None
        return self._model.dim

    @property
    def memo_key(self) -> tuple:
        """Memoisation identity: the class name paired with the model id."""
        return ("Model2VecEmbedder", self._model_id)

    def embed(self, text: str) -> npt.NDArray[np.float32]:
        """Embed one string by encoding a single-item batch."""
        self._ensure_loaded()
        assert self._model is not None
        encoded = self._model.encode([text])
        return encoded[0]

    def embed_batch(self, texts: list[str]) -> npt.NDArray[np.float32]:
        """Embed several strings; the result is converted to ``float32``.

        An empty input returns an empty ``(0, 0)`` array without loading the
        model.
        """
        if not texts:
            return np.empty((0, 0), dtype=np.float32)
        self._ensure_loaded()
        assert self._model is not None
        return np.array(self._model.encode(texts), dtype=np.float32)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
class SentenceTransformerEmbedder:
|
|
90
|
+
"""Thread-safe embedder wrapping sentence-transformers.
|
|
91
|
+
|
|
92
|
+
Supports any sentence-transformers model with lazy loading.
|
|
93
|
+
Memo key includes model name and device for cache invalidation.
|
|
94
|
+
"""
|
|
95
|
+
|
|
96
|
+
def __init__(self, model_name: str = "all-MiniLM-L6-v2", device: str | None = None) -> None:
|
|
97
|
+
self._model_name = model_name
|
|
98
|
+
self._device = device
|
|
99
|
+
self._model = None
|
|
100
|
+
self._lock = threading.Lock()
|
|
101
|
+
|
|
102
|
+
@property
|
|
103
|
+
def dim(self) -> int:
|
|
104
|
+
self._ensure_loaded()
|
|
105
|
+
assert self._model is not None
|
|
106
|
+
dim = self._model.get_embedding_dimension()
|
|
107
|
+
assert dim is not None
|
|
108
|
+
return dim
|
|
109
|
+
|
|
110
|
+
def _ensure_loaded(self) -> None:
|
|
111
|
+
if self._model is None:
|
|
112
|
+
with self._lock:
|
|
113
|
+
if self._model is None:
|
|
114
|
+
from sentence_transformers import SentenceTransformer
|
|
115
|
+
logger.info("Loading sentence-transformers model: %s", self._model_name)
|
|
116
|
+
self._model = SentenceTransformer(self._model_name, device=self._device)
|
|
117
|
+
|
|
118
|
+
def embed(self, text: str) -> npt.NDArray[np.float32]:
|
|
119
|
+
"""Embed a single text string."""
|
|
120
|
+
self._ensure_loaded()
|
|
121
|
+
assert self._model is not None
|
|
122
|
+
return self._model.encode(text, convert_to_numpy=True, normalize_embeddings=True)
|
|
123
|
+
|
|
124
|
+
def embed_batch(self, texts: list[str]) -> npt.NDArray[np.float32]:
|
|
125
|
+
"""Embed a batch of text strings."""
|
|
126
|
+
if not texts:
|
|
127
|
+
return np.empty((0, 0), dtype=np.float32)
|
|
128
|
+
self._ensure_loaded()
|
|
129
|
+
assert self._model is not None
|
|
130
|
+
return self._model.encode(texts, convert_to_numpy=True, normalize_embeddings=True, show_progress_bar=False)
|
|
131
|
+
|
|
132
|
+
@property
|
|
133
|
+
def memo_key(self) -> tuple:
|
|
134
|
+
"""Identity key: (class, model_name, device)."""
|
|
135
|
+
return ("SentenceTransformerEmbedder", self._model_name, self._device)
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
class LF4Embedder:
    """Lazily loaded embedder backed by ``LF4StaticEmbedding``.

    Defaults to the ``VTXAI/Vortex-Embed-4.7M`` model. Nothing is loaded at
    construction time; the model is fetched on first use and kept afterwards,
    with the load guarded by a lock so concurrent first uses load it once.
    """

    def __init__(self, model_id: str = "VTXAI/Vortex-Embed-4.7M") -> None:
        self._lock = threading.Lock()
        self._model = None
        self._model_id = model_id

    def _ensure_loaded(self) -> None:
        """Load the model if it has not been loaded yet."""
        if self._model is not None:
            return
        with self._lock:
            # Another thread may have finished loading while we waited.
            if self._model is not None:
                return
            logger.info("Loading LF4 embedding model: %s", self._model_id)
            from vortexa.core.lf4_model import LF4StaticEmbedding

            self._model = LF4StaticEmbedding.from_pretrained(self._model_id)

    @property
    def dim(self) -> int:
        """Embedding dimensionality as reported by the loaded model."""
        self._ensure_loaded()
        assert self._model is not None
        return self._model.dim

    @property
    def memo_key(self) -> tuple:
        """Memoisation identity: the class name paired with the model id."""
        return ("LF4Embedder", self._model_id)

    def embed(self, text: str) -> npt.NDArray[np.float32]:
        """Embed one string by encoding a single-item batch."""
        self._ensure_loaded()
        assert self._model is not None
        encoded = self._model.encode([text])
        return encoded[0]

    def embed_batch(self, texts: list[str]) -> npt.NDArray[np.float32]:
        """Embed several strings and return the model's output unchanged.

        An empty input returns an empty ``(0, 0)`` float32 array without
        loading the model.
        """
        if not texts:
            return np.empty((0, 0), dtype=np.float32)
        self._ensure_loaded()
        assert self._model is not None
        return self._model.encode(texts)
|