flexvec 0.0.1__tar.gz → 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flexvec-0.1.0/LICENSE +21 -0
- flexvec-0.1.0/PKG-INFO +14 -0
- flexvec-0.1.0/README.md +174 -0
- flexvec-0.1.0/flexvec/__init__.py +7 -0
- flexvec-0.1.0/flexvec/__main__.py +8 -0
- flexvec-0.1.0/flexvec/embed.py +31 -0
- flexvec-0.1.0/flexvec/execute.py +20 -0
- flexvec-0.1.0/flexvec/keyword.py +146 -0
- flexvec-0.1.0/flexvec/onnx/__init__.py +2 -0
- flexvec-0.1.0/flexvec/onnx/embed.py +265 -0
- flexvec-0.1.0/flexvec/onnx/fetch.py +118 -0
- flexvec-0.1.0/flexvec/onnx/nomic_embed.py +117 -0
- flexvec-0.1.0/flexvec/onnx/special_tokens_map.json +37 -0
- flexvec-0.1.0/flexvec/onnx/tokenizer.json +30672 -0
- flexvec-0.1.0/flexvec/onnx/tokenizer_config.json +55 -0
- flexvec-0.1.0/flexvec/onnx/vocab.txt +30522 -0
- flexvec-0.1.0/flexvec/vec_ops.py +758 -0
- flexvec-0.1.0/flexvec.egg-info/PKG-INFO +14 -0
- flexvec-0.1.0/flexvec.egg-info/SOURCES.txt +24 -0
- flexvec-0.1.0/flexvec.egg-info/requires.txt +8 -0
- flexvec-0.1.0/pyproject.toml +25 -0
- flexvec-0.1.0/tests/test_keyword.py +179 -0
- flexvec-0.1.0/tests/test_vec_ops.py +981 -0
- flexvec-0.0.1/PKG-INFO +0 -5
- flexvec-0.0.1/flexvec/__init__.py +0 -1
- flexvec-0.0.1/flexvec.egg-info/PKG-INFO +0 -5
- flexvec-0.0.1/flexvec.egg-info/SOURCES.txt +0 -6
- flexvec-0.0.1/pyproject.toml +0 -9
- {flexvec-0.0.1 → flexvec-0.1.0}/flexvec.egg-info/dependency_links.txt +0 -0
- {flexvec-0.0.1 → flexvec-0.1.0}/flexvec.egg-info/top_level.txt +0 -0
- {flexvec-0.0.1 → flexvec-0.1.0}/setup.cfg +0 -0
flexvec-0.1.0/LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2025 Damian Delmas

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
flexvec-0.1.0/PKG-INFO
ADDED
@@ -0,0 +1,14 @@
Metadata-Version: 2.4
Name: flexvec
Version: 0.1.0
Summary: numpy-backed semantic search for any SQLite database
License: MIT
Requires-Python: >=3.10
License-File: LICENSE
Requires-Dist: numpy
Provides-Extra: embed
Requires-Dist: onnxruntime; extra == "embed"
Requires-Dist: tokenizers; extra == "embed"
Provides-Extra: graph
Requires-Dist: networkx; extra == "graph"
Dynamic: license-file
flexvec-0.1.0/README.md
ADDED
@@ -0,0 +1,174 @@
# flexvec

Semantic similarity has a differentiation problem. You want chunks about `authentication` but not `OAuth` — the embeddings are too similar. A vector database can't help. You get `.query()` and metadata filters. The embeddings are abstracted away. You can't touch them.

flexvec keeps embeddings as numpy arrays you can operate on. Subtract a concept. Decay by recency. Spread across subtopics. Project a direction through embedding space. These aren't metadata filters — they're operations on the vectors themselves.

```bash
pip install flexvec
```

---

## Three-Phase Pipeline

Every query runs three steps. SQL narrows the candidates. NumPy scores them. SQL composes the results.

```
SQL pre-filter → NumPy vector operations → SQL compose
```

This solves the pre-filter problem that sqlite-vec and sqlite-vss never solved. They run vector search first, filter second. flexvec filters first, searches second. If your WHERE returns 500 rows, NumPy operates on 500 vectors — not your whole table.

```python
import sqlite3
from flexvec import VectorCache, register_vec_ops, execute, get_embed_fn

db = sqlite3.connect("my.db")

# Load vectors into memory (once)
cache = VectorCache()
cache.load_from_db(db, "chunks", "embedding", "id")

# Default: bundled Nomic ONNX model (pip install flexvec[embed])
embed_fn = get_embed_fn()

# Register vec_ops as a SQL function
register_vec_ops(db, {"chunks": cache}, embed_fn)

# Query — three phases in one statement
rows = execute(db, """
    SELECT v.id, v.score, c.content
    FROM vec_ops('chunks', 'authentication patterns',
                 'diverse unlike:oauth recent:7',
                 'SELECT id FROM chunks WHERE created_at > 1709000000') v
    JOIN chunks c ON v.id = c.id
    ORDER BY v.score DESC
    LIMIT 10
""")
```

---

## Modulation Tokens

Tokens reshape the scoring landscape before candidate selection. They compose freely. `diverse unlike:oauth recent:7` is three operations in one pass.

| token | what it does |
|---|---|
| `diverse` | MMR — spread results across subtopics |
| `recent:N` | temporal decay with N-day half-life |
| `unlike:TEXT` | contrastive — suppress similarity to a concept in embedding space |
| `like:id1,id2` | centroid — search from the mean of example chunks |
| `from:T to:T` | trajectory — a direction vector through embedding space |
| `local_communities` | per-query Louvain clustering, adds `_community` column |
| `limit:N` | candidate pool size (default 500) |

`unlike:TEXT` isn't a metadata filter. It embeds TEXT separately and penalizes similarity to it in the vector space. The dominant signal gets suppressed — what surfaces are the edges that a normal query drowns out.

`from:T to:T` isn't "search for X then Y." It computes the delta between two concepts and uses that direction as a lens. Content along the conceptual arc surfaces.

---
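To make the math behind `unlike:` and `from:/to:` concrete, here is a minimal numpy sketch of both operations over row-normalized vectors. This is an editor's illustration of the general technique, not flexvec's actual scoring code; the penalty weight `lam` and all names are invented for the example.

```python
import numpy as np

def contrastive_scores(docs: np.ndarray, q: np.ndarray, u: np.ndarray,
                       lam: float = 0.5) -> np.ndarray:
    """docs: (n, d) normalized embeddings; q: query vector; u: 'unlike' vector."""
    # Plain cosine similarity, minus a penalty for resembling the unlike concept.
    return docs @ q - lam * (docs @ u)

def trajectory_scores(docs: np.ndarray, v_from: np.ndarray, v_to: np.ndarray) -> np.ndarray:
    # The delta between two concepts becomes a direction; score along it.
    direction = v_to - v_from
    direction = direction / np.linalg.norm(direction)
    return docs @ direction
```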

## FTS5 Keyword Search

`keyword()` is the FTS5 peer to `vec_ops`. Same materializer pattern. Both compose through SQL.

```python
# Keyword search
rows = execute(db, """
    SELECT k.id, k.rank, k.snippet
    FROM keyword('authentication') k
    ORDER BY k.rank DESC
""")

# Hybrid: only chunks matching BOTH keyword AND semantic
rows = execute(db, """
    SELECT k.id, k.rank, v.score
    FROM keyword('auth') k
    JOIN vec_ops('chunks', 'authentication patterns') v ON k.id = v.id
    ORDER BY k.rank + v.score DESC
    LIMIT 10
""")
```

---

## Bring Your Own Embeddings

flexvec doesn't care how vectors got into your table. Any BLOB column of float32 arrays works.

```python
# OpenAI
embed_fn = lambda text: openai_client.embeddings.create(
    input=text, model="text-embedding-3-small").data[0].embedding

# sentence-transformers
embed_fn = lambda text: st_model.encode([text])

# Any table layout
cache.load_from_db(db, "documents", "ada_embedding", "doc_id")
cache.load_from_db(db, "paragraphs", "embedding", "id")
```

---
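The write path is symmetric to the `load_from_db` calls above. A sketch of storing float32 vectors as BLOBs, assuming the `chunks` schema from the Schema section below and an `embed_fn` as defined earlier; reading back is `np.frombuffer(blob, dtype=np.float32)`.

```python
import numpy as np

vec = np.asarray(embed_fn("some chunk text"), dtype=np.float32).ravel()
db.execute(
    "INSERT INTO chunks (id, content, embedding) VALUES (?, ?, ?)",
    ("chunk-1", "some chunk text", vec.tobytes()),  # raw float32 bytes
)
db.commit()
```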

## Performance

All vectors loaded once into a numpy matrix. Queries are a single matmul. Benchmarked on Apple M-series, 768 dimensions.

| corpus size | query time |
|---|---|
| 1K vectors | 0.1ms |
| 10K vectors | 0.5ms |
| 100K vectors | 5ms |
| 367K vectors | 12ms |

Memory: ~30MB per 10K vectors at 768 dimensions.

---
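As a sketch of what "a single matmul" means here (illustrative, not flexvec's internals): with row-normalized vectors, cosine scoring of the whole corpus is one matrix-vector product.

```python
import numpy as np

n, d = 10_000, 768
matrix = np.random.rand(n, d).astype(np.float32)          # stand-in for the cached vectors
matrix /= np.linalg.norm(matrix, axis=1, keepdims=True)   # row-normalize once at load
q = matrix[0]                                             # a normalized query vector

scores = matrix @ q                      # (n, d) @ (d,) -> (n,) cosine scores
top10 = np.argsort(scores)[-10:][::-1]   # highest-scoring ids

# Memory check: 10_000 * 768 * 4 bytes ≈ 30.7 MB, matching the table above.
```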

## Schema

One table. An ID column and an embedding BLOB column. That's it.

```sql
CREATE TABLE chunks (
    id TEXT PRIMARY KEY,
    content TEXT,
    embedding BLOB,      -- float32 array, any dimension
    timestamp INTEGER    -- optional: epoch seconds, enables recent:N
);
```

---

## vs sqlite-vec

| | flexvec | sqlite-vec |
|---|---|---|
| install | `pip install` | compile C extension |
| engine | numpy matmul | custom C SIMD |
| pre-filter | any SQL, before vector search | after vector search, limited |
| modulation tokens | 7 composable tokens | none |
| FTS5 hybrid | built-in | manual |
| dependency | numpy | C toolchain |

sqlite-vec is faster at raw KNN on large corpora. flexvec is faster to ship and more composable with SQL. For personal knowledge bases under 500K chunks, brute-force numpy with pre-filtering is fast enough and gives you operations that indexed search can't.

---

## Install

```bash
pip install flexvec              # core: vec_ops + keyword (numpy only)
pip install flexvec[embed]       # + ONNX embedder (onnxruntime, tokenizers)
pip install flexvec[graph]       # + local_communities token (networkx)
pip install flexvec[embed,graph] # everything
```

---

flexvec powers retrieval in [flex](https://github.com/damian-delmas/flex), a SQL-first knowledge engine for AI agents. Extracted as a standalone library — same code, zero dependencies on flex.

MIT · Python 3.10+ · SQLite · numpy
flexvec-0.1.0/flexvec/__init__.py
ADDED
@@ -0,0 +1,7 @@
"""flexvec — numpy-backed semantic search for any SQLite database."""
from .vec_ops import (
    VectorCache, register_vec_ops, materialize_vec_ops, parse_modifiers,
)
from .keyword import materialize_keyword
from .execute import execute
from .embed import get_embed_fn
flexvec-0.1.0/flexvec/embed.py
ADDED
@@ -0,0 +1,31 @@
"""Default embedding function — wraps the bundled ONNX model."""


def get_embed_fn(prefix='search_query: '):
    """Return an embedding function using the bundled Nomic ONNX model.

    Requires: pip install flexvec[embed]

    Usage:
        from flexvec import get_embed_fn
        embed_fn = get_embed_fn()
    """
    try:
        from .onnx.embed import get_model
        from .onnx.fetch import model_ready, download_model
    except ImportError:
        raise ImportError(
            "flexvec[embed] is required for get_embed_fn(). "
            "Install with: pip install flexvec[embed]"
        )

    if not model_ready():
        print("flexvec: downloading embedding model (~87MB)...")
        download_model()

    model = get_model()

    def embed(text):
        return model.encode([text], prefix=prefix)

    return embed
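Note the default prefix here is `'search_query: '`, while the ONNX module below defaults to `'search_document: '`; per the Nomic prefix rules documented in `onnx/embed.py`, you would use one function for querying and another for indexing. A sketch; the `[0]` indexing assumes `model.encode` returns a 2-D array as its signature suggests, which is worth verifying against your installed version.

```python
from flexvec import get_embed_fn

query_embed = get_embed_fn()                          # 'search_query: ' for retrieval
doc_embed = get_embed_fn(prefix='search_document: ')  # for indexing new chunks

vec = doc_embed("chunk text to index")[0]             # (dim,) float32 row
```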
flexvec-0.1.0/flexvec/execute.py
ADDED
@@ -0,0 +1,20 @@
"""Convenience wrapper — chains both materializers, then executes."""

from .vec_ops import materialize_vec_ops
from .keyword import materialize_keyword


def execute(db, sql):
    """Chain vec_ops and keyword materializers, then execute.

    Usage:
        from flexvec import execute
        rows = execute(db, "SELECT v.id FROM vec_ops('chunks', 'auth') v LIMIT 10")
    """
    sql = materialize_vec_ops(db, sql)
    if sql.startswith('{"error"'):
        return sql
    sql = materialize_keyword(db, sql)
    if sql.startswith('{"error"'):
        return sql
    return db.execute(sql).fetchall()
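Because `execute` returns a JSON error string instead of raising when a materializer fails, callers need a type check before iterating. A minimal sketch of that pattern, derived from the code above:

```python
import json

rows = execute(db, "SELECT v.id FROM vec_ops('chunks', 'auth') v LIMIT 10")
if isinstance(rows, str):  # materializer failed; payload is '{"error": ...}'
    raise RuntimeError(json.loads(rows)["error"])
for (chunk_id,) in rows:
    print(chunk_id)
```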
flexvec-0.1.0/flexvec/keyword.py
ADDED
@@ -0,0 +1,146 @@
"""FTS5 keyword materializer — peer primitive to vec_ops.

AI writes:  FROM keyword('search term') k
Becomes:    FROM _kw_results_xxxx k (temp table with id, rank, snippet)
"""

import json
import re
import sqlite3
import uuid


def materialize_keyword(db, sql: str) -> str:
    """Transparently materialize keyword() as a temp table.

    Returns original SQL unchanged if no keyword() table source found.
    Returns JSON error string on failure.
    """
    # Find keyword(...) call
    start = re.search(r'keyword\s*\(', sql)
    if not start:
        return sql

    # Only materialize when used as a table source (FROM/JOIN position)
    before = sql[:start.start()].rstrip().upper()
    if not (before.endswith('FROM') or before.endswith('JOIN') or before.endswith(',')):
        return sql

    # Balanced-paren extraction (handles quoted strings with escaped '' quotes)
    paren_start = start.end() - 1
    depth = 0
    in_quote = False
    end_pos = None
    i = paren_start
    while i < len(sql):
        c = sql[i]
        if in_quote:
            if c == "'":
                if i + 1 < len(sql) and sql[i + 1] == "'":
                    i += 2
                    continue
                else:
                    in_quote = False
        else:
            if c == "'":
                in_quote = True
            elif c == '(':
                depth += 1
            elif c == ')':
                depth -= 1
                if depth == 0:
                    end_pos = i + 1
                    break
        i += 1
    if end_pos is None:
        return sql

    # Extract args from keyword('term', 'modifiers')
    inner = sql[paren_start + 1:end_pos - 1].strip()
    args = _split_args(inner)
    if not args:
        return json.dumps({"error": "keyword() requires a non-empty search term"})

    term = args[0].strip()
    # Strip surrounding quotes
    if len(term) >= 2 and term[0] == "'" and term[-1] == "'":
        term = term[1:-1].replace("''", "'")

    if not term or not term.strip():
        return json.dumps({"error": "keyword() requires a non-empty search term"})

    # Parse modifiers (2nd arg)
    limit = 200
    if len(args) > 1:
        mod_str = args[1].strip()
        if len(mod_str) >= 2 and mod_str[0] == "'" and mod_str[-1] == "'":
            mod_str = mod_str[1:-1]
        m = re.search(r'limit:(\d+)', mod_str)
        if m:
            limit = int(m.group(1)) or 200

    # Execute FTS5 query — raw-first, quote-on-error fallback
    fts_sql = (
        "SELECT c.id, "
        "       -bm25(chunks_fts) as rank, "
        "       snippet(chunks_fts, 0, '>>>', '<<<', '...', 30) as snippet "
        "FROM chunks_fts "
        "JOIN _raw_chunks c ON chunks_fts.rowid = c.rowid "
        "WHERE chunks_fts MATCH ? "
        "ORDER BY bm25(chunks_fts) "
        "LIMIT ?"
    )

    try:
        try:
            rows = db.execute(fts_sql, (term, limit)).fetchall()
        except sqlite3.OperationalError:
            # Fallback: double-quote for literal matching
            escaped = '"' + term.replace('"', '""') + '"'
            rows = db.execute(fts_sql, (escaped, limit)).fetchall()
    except Exception as e:
        return json.dumps({"error": f"keyword() search failed: {e}"})

    # Create temp table (always — even on empty results)
    tmp_name = f"_kw_results_{uuid.uuid4().hex[:8]}"
    db.execute(f"CREATE TEMP TABLE [{tmp_name}] (id TEXT PRIMARY KEY, rank REAL, snippet TEXT)")
    if rows:
        db.executemany(
            f"INSERT INTO [{tmp_name}] VALUES (?, ?, ?)",
            [(r[0], r[1], r[2]) for r in rows]
        )

    # Rewrite: replace keyword(...) with temp table name
    return sql[:start.start()] + tmp_name + sql[end_pos:]


def _split_args(inner: str) -> list[str]:
    """Split comma-separated args respecting single-quoted strings."""
    args = []
    current = []
    in_quote = False
    i = 0
    while i < len(inner):
        c = inner[i]
        if in_quote:
            current.append(c)
            if c == "'":
                if i + 1 < len(inner) and inner[i + 1] == "'":
                    current.append("'")
                    i += 2
                    continue
                else:
                    in_quote = False
        else:
            if c == "'":
                in_quote = True
                current.append(c)
            elif c == ',':
                args.append(''.join(current))
                current = []
            else:
                current.append(c)
        i += 1
    if current:
        args.append(''.join(current))
    return args
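The materializer hard-codes the `chunks_fts` and `_raw_chunks` table names, so a toy database must mirror that layout. A sketch of the rewrite it performs (the temp-table suffix is random; the one shown in the comment is made up):

```python
import sqlite3
from flexvec import materialize_keyword

db = sqlite3.connect(":memory:")
db.execute("CREATE TABLE _raw_chunks (id TEXT PRIMARY KEY, content TEXT)")
db.execute("CREATE VIRTUAL TABLE chunks_fts USING fts5(content)")
db.execute("INSERT INTO _raw_chunks VALUES ('c1', 'token auth flows')")
db.execute("INSERT INTO chunks_fts (rowid, content) SELECT rowid, content FROM _raw_chunks")

sql = "SELECT k.id, k.rank FROM keyword('auth') k ORDER BY k.rank DESC"
rewritten = materialize_keyword(db, sql)
# e.g. "SELECT k.id, k.rank FROM _kw_results_3f2a9c1b k ORDER BY k.rank DESC"
print(db.execute(rewritten).fetchall())
```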
flexvec-0.1.0/flexvec/onnx/embed.py
ADDED
@@ -0,0 +1,265 @@
"""
ONNX-based embedding model — Nomic embed-text-v1.5 (768-dim, Matryoshka).

Drop-in replacement for sentence-transformers. Uses ONNX runtime.
No PyTorch dependency. ~87MB int8 model.

Task prefixes (mandatory for Nomic):
    search_document:   — index time (default)
    search_query:      — query time
    clustering:        — clustering tasks
    classification:    — classification tasks

Memory-safe adaptive batching:
    Attention is O(seq_len²). One long text in a batch pads the entire batch
    to max_seq_len, spiking memory. We sort by tokenized length and scale
    batch_size inversely with sequence length. Budget: 512MB attention ceiling.

Performance:
    Adaptive batching: sorts inputs by tokenized length so short texts batch
    together (fast, low padding) and long texts batch together (predictable).
    Attention memory stays under 512MB at any input mix. Results returned in
    original order.

Usage:
    from flexvec.onnx import ONNXEmbedder

    model = ONNXEmbedder()
    embeddings = model.encode(["text1", "text2"])                   # index
    embeddings = model.encode(["query"], prefix='search_query: ')   # search
"""
import numpy as np
from pathlib import Path
from typing import List, Union

# Lazy imports
_ort = None
_tokenizer = None

ONNX_DIR = Path(__file__).parent


def _resolve_model_path() -> Path:
    """Bundled first, then $FLEX_HOME/models/ (FLEX_HOME-aware)."""
    import os
    bundled = ONNX_DIR / "model.onnx"
    if bundled.exists():
        return bundled
    flex_home = Path(os.environ.get("FLEX_HOME", Path.home() / ".flex"))
    user = flex_home / "models" / "model.onnx"
    if user.exists():
        return user
    raise RuntimeError(
        "Embedding model not found.\n"
        f"  Checked: {bundled}\n"
        f"  Checked: {user}\n"
        "  Run 'flexvec.onnx.fetch.download_model()' to download it."
    )


# Attention memory budget: batch × 12_heads × seq² × 4_bytes ≤ ATTN_BUDGET_BYTES
# 512MB keeps us safe on 8GB machines with headroom for hidden states + OS.
ATTN_BUDGET_BYTES = 512 * 1024 * 1024
ATTN_HEADS = 12
MAX_BATCH = 256
MAX_LENGTH = 512
MAX_CHARS = MAX_LENGTH * 8  # pre-truncate before tokenizer sees the text


def _safe_batch_size(seq_len: int) -> int:
    """Max batch size that keeps attention intermediates under budget."""
    if seq_len <= 0:
        return MAX_BATCH
    max_bs = ATTN_BUDGET_BYTES // (ATTN_HEADS * seq_len * seq_len * 4)
    return max(1, min(max_bs, MAX_BATCH))


def _get_onnxruntime():
    global _ort
    if _ort is None:
        import onnxruntime as ort
        _ort = ort
    return _ort


def has_gpu() -> bool:
    """Return True if CUDAExecutionProvider is available."""
    try:
        ort = _get_onnxruntime()
        return "CUDAExecutionProvider" in ort.get_available_providers()
    except Exception:
        return False


def _get_tokenizer():
    global _tokenizer
    if _tokenizer is None:
        from tokenizers import Tokenizer
        _tokenizer = Tokenizer.from_file(str(ONNX_DIR / "tokenizer.json"))
    return _tokenizer


class ONNXEmbedder:
    """ONNX-based sentence embedder compatible with sentence-transformers API."""

    def __init__(self, model_path: Path = None):
        self.model_path = model_path or _resolve_model_path()
        self._session = None
        self._tokenizer = None

    @property
    def session(self):
        if self._session is None:
            ort = _get_onnxruntime()
            opts = ort.SessionOptions()
            # ORT_ENABLE_ALL: fuses QKV attention, layer norm, GELU, embedding
            # layers into single kernels. 2-5x speedup on transformers.
            opts.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
            # intra_op=0 lets ONNX use all physical cores (default heuristic).
            # inter_op=1 because we run one model sequentially.
            opts.intra_op_num_threads = 0
            opts.inter_op_num_threads = 1
            # Prefer CUDA if available, fall back to CPU transparently
            available = ort.get_available_providers()
            providers = (
                ["CUDAExecutionProvider", "CPUExecutionProvider"]
                if "CUDAExecutionProvider" in available
                else ["CPUExecutionProvider"]
            )
            self._session = ort.InferenceSession(
                str(self.model_path),
                sess_options=opts,
                providers=providers,
            )
        return self._session

    @property
    def tokenizer(self):
        if self._tokenizer is None:
            self._tokenizer = _get_tokenizer()
        return self._tokenizer

    def _encode_batch(self, batch: list, normalize: bool) -> np.ndarray:
        """Tokenize and run inference on a single pre-sorted batch."""
        tok = self.tokenizer
        encoded = tok.encode_batch(batch)
        input_ids = np.array([e.ids for e in encoded], dtype=np.int64)
        attention_mask = np.array([e.attention_mask for e in encoded], dtype=np.int64)

        feed = {"input_ids": input_ids, "attention_mask": attention_mask}
        # token_type_ids is optional — only send if the model expects it
        if any(i.name == "token_type_ids" for i in self.session.get_inputs()):
            feed["token_type_ids"] = np.zeros_like(input_ids)
        outputs = self.session.run(None, feed)

        # Mean pooling
        last_hidden = outputs[0]
        mask_expanded = np.expand_dims(attention_mask, -1).astype(np.float32)
        sum_embeddings = np.sum(last_hidden * mask_expanded, axis=1)
        sum_mask = np.sum(mask_expanded, axis=1)
        embeddings = sum_embeddings / np.maximum(sum_mask, 1e-9)

        if normalize:
            norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
            embeddings = embeddings / np.maximum(norms, 1e-9)

        return embeddings

    def encode(
        self,
        sentences: Union[str, List[str]],
        batch_size: int = 32,
        normalize: bool = True,
        prefix: str = 'search_document: ',
        show_progress_bar: bool = False,  # noqa: ARG002 — sentence-transformers API compat
        matryoshka_dim: int = 128,
    ) -> np.ndarray:
        """
        Encode sentences to embeddings with adaptive batching.

        Sorts inputs by tokenized length so short texts batch together (fast)
        and long texts get smaller batches (safe). Attention memory stays under
        512MB regardless of input mix. Results returned in original order.

        Args:
            sentences: Single sentence or list of sentences
            batch_size: Max batch size hint (actual size may be smaller for long texts)
            normalize: Whether to L2-normalize embeddings
            prefix: Task prefix for Nomic (default 'search_document: ' for indexing,
                use 'search_query: ' for retrieval). Empty string for no prefix.
            show_progress_bar: Ignored. Exists for sentence-transformers API compatibility.
            matryoshka_dim: Truncate output to this many dimensions (Matryoshka).
                Default 128. Set to 768 for full embeddings. Re-normalizes after
                truncation so cosine similarity remains valid.

        Returns:
            numpy array of shape (n_sentences, matryoshka_dim)
        """
        if isinstance(sentences, str):
            sentences = [sentences]
        if prefix:
            sentences = [prefix + s for s in sentences]
        # Pre-truncate: no point tokenizing 214K chars to keep 512 tokens
        sentences = [s[:MAX_CHARS] for s in sentences]

        n = len(sentences)
        if n == 0:
            return np.empty((0, matryoshka_dim or 768), dtype=np.float32)

        tok = self.tokenizer
        tok.enable_truncation(max_length=MAX_LENGTH)
        tok.enable_padding()

        # Tokenize all to get lengths for sorting + adaptive batch sizing
        pre_encoded = tok.encode_batch(sentences)
        lengths = [len(e.ids) for e in pre_encoded]

        # Sort by length: short texts first, long texts last
        order = np.argsort(lengths)
        sorted_sentences = [sentences[i] for i in order]
        sorted_lengths = [lengths[i] for i in order]

        all_embeddings = []
        i = 0
        while i < n:
            # Adaptive batch size from longest text in this slice
            max_seq = sorted_lengths[min(i + batch_size - 1, n - 1)]
            safe_bs = min(batch_size, _safe_batch_size(max_seq))
            end = min(i + safe_bs, n)
            batch = sorted_sentences[i:end]

            all_embeddings.append(self._encode_batch(batch, normalize))
            i = end

        stacked = np.vstack(all_embeddings)

        # Unsort back to original order
        result = np.empty_like(stacked)
        result[order] = stacked

        # Matryoshka truncation: slice to target dim and re-normalize
        if matryoshka_dim and matryoshka_dim < result.shape[1]:
            result = result[:, :matryoshka_dim].copy()
            if normalize:
                norms = np.linalg.norm(result, axis=1, keepdims=True)
                norms = np.where(norms < 1e-9, 1.0, norms)
                result = result / norms

        return result


# Singleton
_model = None


def get_model() -> ONNXEmbedder:
    """Get singleton ONNX embedder instance."""
    global _model
    if _model is None:
        _model = ONNXEmbedder()
    return _model


def encode(sentences: Union[str, List[str]], prefix: str = 'search_document: ', **kwargs) -> np.ndarray:
    """Convenience function to encode sentences."""
    return get_model().encode(sentences, prefix=prefix, **kwargs)
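Working through `_safe_batch_size` at the module's constants shows how the budget trades batch size against sequence length:

```python
ATTN_BUDGET_BYTES = 512 * 1024 * 1024   # constants copied from the module above
ATTN_HEADS, MAX_BATCH = 12, 256

def safe_batch_size(seq_len: int) -> int:
    if seq_len <= 0:
        return MAX_BATCH
    return max(1, min(ATTN_BUDGET_BYTES // (ATTN_HEADS * seq_len * seq_len * 4), MAX_BATCH))

# 512-token texts: 536_870_912 // (12 * 512 * 512 * 4) = 42 -> small batches
assert safe_batch_size(512) == 42
# 64-token texts: the formula allows 2730, clamped to MAX_BATCH
assert safe_batch_size(64) == 256
```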