menteedb 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- menteedb-0.1.0/PKG-INFO +96 -0
- menteedb-0.1.0/README.md +85 -0
- menteedb-0.1.0/menteedb/__init__.py +3 -0
- menteedb-0.1.0/menteedb/core.py +195 -0
- menteedb-0.1.0/menteedb/embeddings.py +29 -0
- menteedb-0.1.0/menteedb/file_handler.py +156 -0
- menteedb-0.1.0/menteedb.egg-info/PKG-INFO +96 -0
- menteedb-0.1.0/menteedb.egg-info/SOURCES.txt +16 -0
- menteedb-0.1.0/menteedb.egg-info/dependency_links.txt +1 -0
- menteedb-0.1.0/menteedb.egg-info/requires.txt +5 -0
- menteedb-0.1.0/menteedb.egg-info/top_level.txt +1 -0
- menteedb-0.1.0/pyproject.toml +26 -0
- menteedb-0.1.0/setup.cfg +4 -0
- menteedb-0.1.0/tests/test_core.py +49 -0
menteedb-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: menteedb
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A lightweight local vector-aware database for Python
|
|
5
|
+
Requires-Python: >=3.9
|
|
6
|
+
Description-Content-Type: text/markdown
|
|
7
|
+
Requires-Dist: numpy>=1.24
|
|
8
|
+
Provides-Extra: dev
|
|
9
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
10
|
+
Requires-Dist: build>=1.2; extra == "dev"
|
|
11
|
+
|
|
12
|
+
# menteedb
|
|
13
|
+
|
|
14
|
+
menteedb is a lightweight local Python library that combines table-like records with optional vector similarity search.
|
|
15
|
+
|
|
16
|
+
## Features
|
|
17
|
+
|
|
18
|
+
- Define tables with a schema.
|
|
19
|
+
- Insert structured records.
|
|
20
|
+
- Enable vector search on one text field per table.
|
|
21
|
+
- Perform fast text "contains" searches per table.
|
|
22
|
+
- Query by field filters and/or semantic similarity.
|
|
23
|
+
- Persist data locally with append-only files for speed.
|
|
24
|
+
|
|
25
|
+
## Quick Start
|
|
26
|
+
|
|
27
|
+
```python
|
|
28
|
+
from menteedb import MenteeDB
|
|
29
|
+
|
|
30
|
+
db = MenteeDB(base_path="./data")
|
|
31
|
+
|
|
32
|
+
db.create_table(
|
|
33
|
+
table_name="notes",
|
|
34
|
+
fields={"title": "str", "body": "str", "tag": "str"},
|
|
35
|
+
vector_field="body",
|
|
36
|
+
)
|
|
37
|
+
|
|
38
|
+
db.insert("notes", {"title": "First", "body": "Vector databases are useful.", "tag": "ml"})
|
|
39
|
+
db.insert("notes", {"title": "Second", "body": "I enjoy local-first tools.", "tag": "dev"})
|
|
40
|
+
|
|
41
|
+
results = db.query("notes", vector_query="local vector tools", top_k=2)
|
|
42
|
+
for item in results:
|
|
43
|
+
print(item["score"], item["record"])
|
|
44
|
+
|
|
45
|
+
text_hits = db.query("notes", text_query="local", text_fields=["body"])
|
|
46
|
+
print(text_hits)
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
## Query Modes
|
|
50
|
+
|
|
51
|
+
- Filter-only:
|
|
52
|
+
- `db.query("notes", conditions={"tag": "ml"})`
|
|
53
|
+
- Text contains search:
|
|
54
|
+
- `db.query("notes", text_query="vector", text_fields=["title", "body"])`
|
|
55
|
+
- Vector-only:
|
|
56
|
+
- `db.query("notes", vector_query="your text")`
|
|
57
|
+
- Hybrid (filter + vector):
|
|
58
|
+
- `db.query("notes", conditions={"tag": "dev"}, vector_query="local tools")`
|
|
59
|
+
|
|
60
|
+
## Storage Layout
|
|
61
|
+
|
|
62
|
+
For `base_path="./data"` and table `notes`, menteedb stores:
|
|
63
|
+
|
|
64
|
+
- `./data/notes/schema.json`
|
|
65
|
+
- `./data/notes/records.jsonl`
|
|
66
|
+
- `./data/notes/vector_ids.jsonl`
|
|
67
|
+
- `./data/notes/vectors.f32`
|
|
68
|
+
|
|
69
|
+
This is local file-based storage. It is not publicly exposed over the network, but anyone with local filesystem access to this folder can read it.
|
|
70
|
+
|
|
71
|
+
## Privacy and Permissions
|
|
72
|
+
|
|
73
|
+
- By default, `MenteeDB(..., secure_permissions=True)` applies best-effort private permissions (`700` for table folders, `600` for files).
|
|
74
|
+
- On Windows, real privacy is controlled by NTFS ACLs; chmod behavior is limited.
|
|
75
|
+
|
|
76
|
+
## Testing
|
|
77
|
+
|
|
78
|
+
Run locally:
|
|
79
|
+
|
|
80
|
+
```bash
|
|
81
|
+
pip install .[dev]
|
|
82
|
+
pytest -q
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
## CI/CD to PyPI
|
|
86
|
+
|
|
87
|
+
Workflow file: `.github/workflows/pypi-publish.yml`
|
|
88
|
+
|
|
89
|
+
- Runs tests on pushes to `main`, tags (`v*`), and releases.
|
|
90
|
+
- Publishes to PyPI on tag push (`v*`) or GitHub Release publish.
|
|
91
|
+
- Uses trusted publishing via GitHub OIDC.
|
|
92
|
+
|
|
93
|
+
## Notes
|
|
94
|
+
|
|
95
|
+
- This initial version supports one vector field per table.
|
|
96
|
+
- Default embeddings use a deterministic local hashing embedder with no external model download.
|
menteedb-0.1.0/README.md
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
# menteedb
|
|
2
|
+
|
|
3
|
+
menteedb is a lightweight local Python library that combines table-like records with optional vector similarity search.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- Define tables with a schema.
|
|
8
|
+
- Insert structured records.
|
|
9
|
+
- Enable vector search on one text field per table.
|
|
10
|
+
- Perform fast text "contains" searches per table.
|
|
11
|
+
- Query by field filters and/or semantic similarity.
|
|
12
|
+
- Persist data locally with append-only files for speed.
|
|
13
|
+
|
|
14
|
+
## Quick Start
|
|
15
|
+
|
|
16
|
+
```python
|
|
17
|
+
from menteedb import MenteeDB
|
|
18
|
+
|
|
19
|
+
db = MenteeDB(base_path="./data")
|
|
20
|
+
|
|
21
|
+
db.create_table(
|
|
22
|
+
table_name="notes",
|
|
23
|
+
fields={"title": "str", "body": "str", "tag": "str"},
|
|
24
|
+
vector_field="body",
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
db.insert("notes", {"title": "First", "body": "Vector databases are useful.", "tag": "ml"})
|
|
28
|
+
db.insert("notes", {"title": "Second", "body": "I enjoy local-first tools.", "tag": "dev"})
|
|
29
|
+
|
|
30
|
+
results = db.query("notes", vector_query="local vector tools", top_k=2)
|
|
31
|
+
for item in results:
|
|
32
|
+
print(item["score"], item["record"])
|
|
33
|
+
|
|
34
|
+
text_hits = db.query("notes", text_query="local", text_fields=["body"])
|
|
35
|
+
print(text_hits)
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## Query Modes
|
|
39
|
+
|
|
40
|
+
- Filter-only:
|
|
41
|
+
- `db.query("notes", conditions={"tag": "ml"})`
|
|
42
|
+
- Text contains search:
|
|
43
|
+
- `db.query("notes", text_query="vector", text_fields=["title", "body"])`
|
|
44
|
+
- Vector-only:
|
|
45
|
+
- `db.query("notes", vector_query="your text")`
|
|
46
|
+
- Hybrid (filter + vector):
|
|
47
|
+
- `db.query("notes", conditions={"tag": "dev"}, vector_query="local tools")`
|
|
48
|
+
|
|
49
|
+
## Storage Layout
|
|
50
|
+
|
|
51
|
+
For `base_path="./data"` and table `notes`, menteedb stores:
|
|
52
|
+
|
|
53
|
+
- `./data/notes/schema.json`
|
|
54
|
+
- `./data/notes/records.jsonl`
|
|
55
|
+
- `./data/notes/vector_ids.jsonl`
|
|
56
|
+
- `./data/notes/vectors.f32`
|
|
57
|
+
|
|
58
|
+
This is local file-based storage. It is not publicly exposed over the network, but anyone with local filesystem access to this folder can read it.
|
|
59
|
+
|
|
60
|
+
## Privacy and Permissions
|
|
61
|
+
|
|
62
|
+
- By default, `MenteeDB(..., secure_permissions=True)` applies best-effort private permissions (`700` for table folders, `600` for files).
|
|
63
|
+
- On Windows, real privacy is controlled by NTFS ACLs; chmod behavior is limited.
|
|
64
|
+
|
|
65
|
+
## Testing
|
|
66
|
+
|
|
67
|
+
Run locally:
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
pip install .[dev]
|
|
71
|
+
pytest -q
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
## CI/CD to PyPI
|
|
75
|
+
|
|
76
|
+
Workflow file: `.github/workflows/pypi-publish.yml`
|
|
77
|
+
|
|
78
|
+
- Runs tests on pushes to `main`, tags (`v*`), and releases.
|
|
79
|
+
- Publishes to PyPI on tag push (`v*`) or GitHub Release publish.
|
|
80
|
+
- Uses trusted publishing via GitHub OIDC.
|
|
81
|
+
|
|
82
|
+
## Notes
|
|
83
|
+
|
|
84
|
+
- This initial version supports one vector field per table.
|
|
85
|
+
- Default embeddings use a deterministic local hashing embedder with no external model download.
|
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
import uuid
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any, Dict, List, Optional, Sequence
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
|
|
10
|
+
from .embeddings import HashingEmbedder
|
|
11
|
+
from .file_handler import (
|
|
12
|
+
append_record,
|
|
13
|
+
append_vector,
|
|
14
|
+
create_table_files,
|
|
15
|
+
ensure_path,
|
|
16
|
+
load_schema,
|
|
17
|
+
load_vectors,
|
|
18
|
+
read_all_records,
|
|
19
|
+
table_exists,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class MenteeDB:
    """A lightweight local table store with optional vector similarity search.

    Every table lives in its own folder beneath ``base_path`` and is persisted
    via the append-only helpers in ``file_handler``. At most one text field per
    table may be embedded for cosine-similarity queries.
    """

    def __init__(
        self,
        base_path: str = "./menteedb_data",
        embedder: Optional[Any] = None,
        secure_permissions: bool = True,
    ) -> None:
        """Open (creating if necessary) a database rooted at ``base_path``.

        Args:
            base_path: Folder holding one sub-folder per table.
            embedder: Object exposing ``encode(text) -> np.ndarray`` and an
                integer ``dimension`` attribute; defaults to the offline
                ``HashingEmbedder``.
            secure_permissions: Apply best-effort 700/600 permissions to
                paths the database creates.
        """
        self.base_path = Path(base_path)
        ensure_path(self.base_path)
        self.embedder = embedder or HashingEmbedder()
        self.secure_permissions = secure_permissions

    def create_table(self, table_name: str, fields: Dict[str, str], vector_field: Optional[str] = None) -> Dict[str, Any]:
        """Create a new table and write its schema to disk.

        Args:
            table_name: Letters, digits, underscore, and dash only — this
                whitelist also blocks path traversal such as ``../..``.
            fields: Mapping of field name -> declared type string.
            vector_field: Optional name of the single field whose text is
                embedded for vector search; must be a key of ``fields``.

        Returns:
            ``{"ok": True, "table": <name>, "vector_field": <name or None>}``.

        Raises:
            ValueError: On an invalid name, empty fields, a duplicate table,
                an unknown ``vector_field``, or an embedder without a
                positive integer ``dimension``.
        """
        if not table_name or not isinstance(table_name, str):
            raise ValueError("table_name must be a non-empty string.")
        if not re.fullmatch(r"[A-Za-z0-9_\-]+", table_name):
            raise ValueError("table_name may only contain letters, numbers, underscore, and dash.")
        if not isinstance(fields, dict) or not fields:
            raise ValueError("fields must be a non-empty dictionary.")
        if table_exists(self.base_path, table_name):
            raise ValueError(f"Table '{table_name}' already exists.")
        if vector_field is not None and vector_field not in fields:
            raise ValueError("vector_field must be one of the schema field names.")

        # The embedding dimension is frozen into the schema at creation time
        # so later loads can validate the binary vector file length.
        embedding_dim = getattr(self.embedder, "dimension", None) if vector_field else None
        if vector_field and (not isinstance(embedding_dim, int) or embedding_dim <= 0):
            raise ValueError("Embedder must expose a positive integer 'dimension' attribute when vector_field is enabled.")

        schema = {
            "table_name": table_name,
            "fields": fields,
            "vector_field": vector_field,
            "embedding_dim": embedding_dim,
            "version": 1,
        }
        create_table_files(self.base_path, table_name, schema, secure_permissions=self.secure_permissions)
        return {"ok": True, "table": table_name, "vector_field": vector_field}

    def insert(self, table_name: str, record: Dict[str, Any], record_id: Optional[str] = None) -> Dict[str, Any]:
        """Insert one record, embedding its vector field when configured.

        Fix over the previous version: the embedding is now computed and
        validated BEFORE the record is appended, so a failed embedding can no
        longer leave a record persisted in ``records.jsonl`` without a
        matching row in the vector files (which would silently exclude the
        record from vector search).

        Args:
            table_name: Existing table name.
            record: Dict containing at least every schema field.
            record_id: Optional explicit id; a UUID4 string is generated
                otherwise.

        Returns:
            ``{"ok": True, "id": <record id>}``.

        Raises:
            ValueError: If the table is missing, the record is not a dict, a
                schema field is absent, the vector field is not a string, or
                the embedding dimension disagrees with the schema.
        """
        schema = load_schema(self.base_path, table_name)
        fields = schema["fields"]

        if not isinstance(record, dict):
            raise ValueError("record must be a dictionary.")

        for field_name in fields:
            if field_name not in record:
                raise ValueError(f"Missing field '{field_name}' in record.")

        rid = record_id or str(uuid.uuid4())

        # Validate and embed FIRST so nothing is written when embedding fails.
        vector_field = schema.get("vector_field")
        embedding: Optional[np.ndarray] = None
        if vector_field:
            value = record.get(vector_field)
            if not isinstance(value, str):
                raise ValueError(f"vector_field '{vector_field}' must contain string data for embedding.")
            embedding = self.embedder.encode(value).astype(np.float32)
            embedding_dim = schema.get("embedding_dim")
            if embedding_dim is None:
                # Legacy schemas may lack the dimension; adopt the first seen.
                embedding_dim = embedding.shape[0]
            if embedding.shape[0] != embedding_dim:
                raise ValueError("Embedding dimension does not match table configuration.")

        stored = {"_id": rid, **record}
        append_record(self.base_path, table_name, stored, secure_permissions=self.secure_permissions)

        if embedding is not None:
            append_vector(
                self.base_path,
                table_name,
                rid,
                embedding,
                secure_permissions=self.secure_permissions,
            )

        return {"ok": True, "id": rid}

    def query(
        self,
        table_name: str,
        conditions: Optional[Dict[str, Any]] = None,
        text_query: Optional[str] = None,
        text_fields: Optional[Sequence[str]] = None,
        case_sensitive: bool = False,
        vector_query: Optional[str] = None,
        top_k: int = 5,
        min_score: Optional[float] = None,
    ) -> List[Dict[str, Any]]:
        """Query a table by exact filters, substring search, and/or vectors.

        Filters are applied in order: ``conditions`` (exact equality), then
        ``text_query`` (substring, case-insensitive unless ``case_sensitive``),
        then optional cosine ranking against ``vector_query``.

        Returns:
            A list of ``{"id", "score", "record"}`` dicts. ``score`` is None
            for non-vector queries; vector results are sorted by descending
            score, filtered by ``min_score``, and truncated to ``top_k``.

        Raises:
            ValueError: If ``vector_query`` is given for a table without a
                vector field, or either query string is empty.
        """
        schema = load_schema(self.base_path, table_name)
        rows = read_all_records(self.base_path, table_name)

        if conditions:
            rows = [r for r in rows if self._record_matches(r, conditions)]

        if text_query is not None:
            rows = self._text_filter_rows(rows, text_query, text_fields=text_fields, case_sensitive=case_sensitive)

        if vector_query is None:
            return [{"id": row["_id"], "score": None, "record": row} for row in rows]

        vector_field = schema.get("vector_field")
        if not vector_field:
            raise ValueError(f"Table '{table_name}' is not configured for vector search.")
        if not isinstance(vector_query, str) or not vector_query.strip():
            raise ValueError("vector_query must be a non-empty string.")

        embedding_dim = schema.get("embedding_dim")
        ids, vectors = load_vectors(self.base_path, table_name, dimension=embedding_dim)
        if vectors is None or len(ids) == 0:
            return []

        query_vec = self.embedder.encode(vector_query).astype(np.float32)
        if query_vec.shape[0] != vectors.shape[1]:
            raise ValueError("Query embedding dimension does not match stored vectors.")

        scores = self._cosine_scores(vectors, query_vec)
        by_id = {row["_id"]: row for row in rows}

        ranked = []
        for idx, rid in enumerate(ids):
            row = by_id.get(rid)
            if row is None:
                # Vector row was filtered out by conditions/text above.
                continue
            score = float(scores[idx])
            if min_score is not None and score < min_score:
                continue
            ranked.append({"id": rid, "score": score, "record": row})

        ranked.sort(key=lambda item: item["score"], reverse=True)
        return ranked[:top_k]

    @staticmethod
    def _record_matches(record: Dict[str, Any], conditions: Dict[str, Any]) -> bool:
        """True when every condition key equals the record's value exactly."""
        for key, expected in conditions.items():
            if record.get(key) != expected:
                return False
        return True

    @staticmethod
    def _text_filter_rows(
        rows: List[Dict[str, Any]],
        text_query: str,
        text_fields: Optional[Sequence[str]] = None,
        case_sensitive: bool = False,
    ) -> List[Dict[str, Any]]:
        """Keep rows whose selected string fields contain ``text_query``.

        When ``text_fields`` is None, every string field except ``_id`` is
        searched.

        Raises:
            ValueError: If ``text_query`` is empty or not a string.
        """
        if not isinstance(text_query, str) or not text_query.strip():
            raise ValueError("text_query must be a non-empty string when provided.")

        query = text_query if case_sensitive else text_query.lower()
        selected_fields = list(text_fields) if text_fields is not None else None

        out: List[Dict[str, Any]] = []
        for row in rows:
            # NOTE(review): an empty text_fields list is falsy and falls back
            # to searching all fields — confirm this is intended behavior.
            field_names = selected_fields or [k for k in row.keys() if k != "_id"]
            haystack_parts: List[str] = []
            for field_name in field_names:
                value = row.get(field_name)
                if isinstance(value, str):
                    haystack_parts.append(value)
            haystack = " ".join(haystack_parts)
            haystack = haystack if case_sensitive else haystack.lower()
            if query in haystack:
                out.append(row)
        return out

    @staticmethod
    def _cosine_scores(matrix: np.ndarray, query_vector: np.ndarray) -> np.ndarray:
        """Cosine similarity of ``query_vector`` against each matrix row.

        A zero query vector yields all-zero scores; zero-norm rows are
        guarded with a tiny epsilon to avoid division by zero.
        """
        matrix_norm = np.linalg.norm(matrix, axis=1)
        query_norm = np.linalg.norm(query_vector)
        if query_norm == 0:
            return np.zeros(matrix.shape[0], dtype=np.float32)

        denom = matrix_norm * query_norm
        denom = np.where(denom == 0, 1e-12, denom)
        return (matrix @ query_vector) / denom
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import hashlib
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass
class HashingEmbedder:
    """Deterministic, fully offline embedder.

    The SHA-256 digest of the input text seeds a NumPy RNG, which then draws
    a unit-normalised Gaussian vector. Identical text therefore always maps
    to the identical embedding, with no external model download.
    """

    # Length of the produced embedding vectors.
    dimension: int = 384

    def encode(self, text: str) -> np.ndarray:
        """Return a deterministic float32 embedding of ``text``.

        Raises:
            TypeError: If ``text`` is not a string.
        """
        if not isinstance(text, str):
            raise TypeError("Embedding input must be a string.")

        seed_bytes = hashlib.sha256(text.encode("utf-8")).digest()[:8]
        seed = int.from_bytes(seed_bytes, byteorder="big", signed=False)
        generator = np.random.default_rng(seed)
        raw = generator.standard_normal(self.dimension, dtype=np.float32)
        length = np.linalg.norm(raw)
        # Guard the (practically impossible) all-zero draw before normalising.
        return raw if length == 0 else raw / length
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import os
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def ensure_path(path: Path) -> None:
    """Create directory *path*, including missing parents; no-op if it exists."""
    path.mkdir(parents=True, exist_ok=True)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def table_dir(base_path: Path, table_name: str) -> Path:
    """Return the folder that holds every file belonging to *table_name*."""
    return base_path.joinpath(table_name)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def schema_path(base_path: Path, table_name: str) -> Path:
    """Return the path of the table's JSON schema file."""
    return base_path / table_name / "schema.json"
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def records_path(base_path: Path, table_name: str) -> Path:
    """Return the path of the table's append-only JSONL records file."""
    return base_path / table_name / "records.jsonl"
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def vectors_path(base_path: Path, table_name: str) -> Path:
    """Return the legacy compressed (.npz) vector-store path for the table."""
    return base_path / table_name / "vectors.npz"
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def vectors_bin_path(base_path: Path, table_name: str) -> Path:
    """Return the path of the table's raw float32 vector payload file."""
    return base_path / table_name / "vectors.f32"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def vector_ids_path(base_path: Path, table_name: str) -> Path:
    """Return the path of the JSONL file listing one record id per vector row."""
    return base_path / table_name / "vector_ids.jsonl"
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _apply_private_permissions(path: Path, is_dir: bool) -> None:
|
|
40
|
+
# Best effort only. On Windows, chmod is limited and ACLs are the real control.
|
|
41
|
+
try:
|
|
42
|
+
os.chmod(path, 0o700 if is_dir else 0o600)
|
|
43
|
+
except OSError:
|
|
44
|
+
pass
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def table_exists(base_path: Path, table_name: str) -> bool:
    """Report whether *table_name* was created (its schema file is on disk)."""
    return schema_path(base_path, table_name).exists()
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def create_table_files(base_path: Path, table_name: str, schema: Dict[str, Any], secure_permissions: bool = True) -> None:
    """Materialise a table on disk: its folder, schema.json, and an empty records.jsonl.

    Args:
        base_path: Root folder of the database.
        table_name: Name of the table; becomes the sub-folder name.
        schema: Schema dict serialised as pretty-printed ASCII JSON.
        secure_permissions: Apply best-effort 700/600 permissions afterwards.
    """
    folder = table_dir(base_path, table_name)
    ensure_path(folder)

    schema_file = schema_path(base_path, table_name)
    records_file = records_path(base_path, table_name)

    with schema_file.open("w", encoding="utf-8") as fh:
        json.dump(schema, fh, ensure_ascii=True, indent=2)

    if not records_file.exists():
        records_file.touch()

    if secure_permissions:
        _apply_private_permissions(folder, is_dir=True)
        for data_file in (schema_file, records_file):
            _apply_private_permissions(data_file, is_dir=False)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def load_schema(base_path: Path, table_name: str) -> Dict[str, Any]:
    """Read and return the table's schema dict.

    Raises:
        ValueError: If the table has never been created.
    """
    path_to_schema = schema_path(base_path, table_name)
    if not path_to_schema.exists():
        raise ValueError(f"Table '{table_name}' does not exist.")

    with path_to_schema.open("r", encoding="utf-8") as fh:
        return json.load(fh)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def append_record(base_path: Path, table_name: str, record: Dict[str, Any], secure_permissions: bool = True) -> None:
    """Append one record as a single ASCII-JSON line to the table's records.jsonl."""
    target = records_path(base_path, table_name)
    line = json.dumps(record, ensure_ascii=True) + "\n"
    with target.open("a", encoding="utf-8") as fh:
        fh.write(line)
    if secure_permissions:
        _apply_private_permissions(target, is_dir=False)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def read_all_records(base_path: Path, table_name: str) -> List[Dict[str, Any]]:
    """Parse every non-blank line of the table's records.jsonl.

    Returns:
        The full record list, or ``[]`` when the table has no records file.
    """
    source = records_path(base_path, table_name)
    if not source.exists():
        return []

    records: List[Dict[str, Any]] = []
    with source.open("r", encoding="utf-8") as fh:
        for raw_line in fh:
            text = raw_line.strip()
            if not text:
                continue
            records.append(json.loads(text))
    return records
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def append_vector(
    base_path: Path,
    table_name: str,
    record_id: str,
    vector: np.ndarray,
    secure_permissions: bool = True,
) -> None:
    """Append one embedding to the table's append-only vector files.

    The record id goes to ``vector_ids.jsonl`` (one JSON string per line) and
    the float32 payload is appended raw to ``vectors.f32``; row order in both
    files must stay in lockstep.
    """
    id_path = vector_ids_path(base_path, table_name)
    bin_path = vectors_bin_path(base_path, table_name)

    with id_path.open("a", encoding="utf-8") as id_file:
        id_file.write(json.dumps(record_id, ensure_ascii=True) + "\n")

    payload = vector.astype(np.float32, copy=False)
    with bin_path.open("ab") as bin_file:
        payload.tofile(bin_file)

    if secure_permissions:
        for target in (id_path, bin_path):
            _apply_private_permissions(target, is_dir=False)
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def load_vectors(base_path: Path, table_name: str, dimension: Optional[int] = None) -> Tuple[List[str], Optional[np.ndarray]]:
    """Load all stored embeddings for a table.

    Args:
        base_path: Root folder of the database.
        table_name: Table whose vectors to load.
        dimension: Embedding width; required for the binary (.f32) format.

    Returns:
        ``(ids, matrix)`` where ``matrix`` has shape ``(len(ids), dimension)``,
        or ``([], None)`` when the table has no vectors.

    Raises:
        ValueError: If ``dimension`` is missing for the binary format, or the
            binary file length disagrees with the id count (corruption).
    """
    # Backward compatibility with the initial compressed storage format.
    legacy_vpath = base_path / table_name / "vectors.npz"
    if legacy_vpath.exists():
        # SECURITY NOTE: allow_pickle=True can execute arbitrary code when
        # loading an untrusted .npz file. Kept only for legacy local archives;
        # never load vector files from untrusted sources.
        data = np.load(legacy_vpath, allow_pickle=True)
        ids = data["ids"].tolist()
        vectors = data["vectors"]
        return ids, vectors

    id_path = base_path / table_name / "vector_ids.jsonl"
    bin_path = base_path / table_name / "vectors.f32"
    if not id_path.exists() or not bin_path.exists():
        return [], None

    ids: List[str] = []
    with id_path.open("r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                ids.append(json.loads(line))

    # Fix: bail out on an empty id list BEFORE reading the binary payload
    # (the old code read the whole file first), and only then require a
    # dimension — same observable behavior, no needless I/O.
    if not ids:
        return [], None
    if dimension is None:
        raise ValueError("dimension is required for binary vector loading.")

    raw = np.fromfile(bin_path, dtype=np.float32)
    expected = len(ids) * dimension
    if raw.size != expected:
        raise ValueError(
            f"Corrupt vector storage for table '{table_name}': expected {expected} float32 values, found {raw.size}."
        )

    return ids, raw.reshape(len(ids), dimension)
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: menteedb
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A lightweight local vector-aware database for Python
|
|
5
|
+
Requires-Python: >=3.9
|
|
6
|
+
Description-Content-Type: text/markdown
|
|
7
|
+
Requires-Dist: numpy>=1.24
|
|
8
|
+
Provides-Extra: dev
|
|
9
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
10
|
+
Requires-Dist: build>=1.2; extra == "dev"
|
|
11
|
+
|
|
12
|
+
# menteedb
|
|
13
|
+
|
|
14
|
+
menteedb is a lightweight local Python library that combines table-like records with optional vector similarity search.
|
|
15
|
+
|
|
16
|
+
## Features
|
|
17
|
+
|
|
18
|
+
- Define tables with a schema.
|
|
19
|
+
- Insert structured records.
|
|
20
|
+
- Enable vector search on one text field per table.
|
|
21
|
+
- Perform fast text "contains" searches per table.
|
|
22
|
+
- Query by field filters and/or semantic similarity.
|
|
23
|
+
- Persist data locally with append-only files for speed.
|
|
24
|
+
|
|
25
|
+
## Quick Start
|
|
26
|
+
|
|
27
|
+
```python
|
|
28
|
+
from menteedb import MenteeDB
|
|
29
|
+
|
|
30
|
+
db = MenteeDB(base_path="./data")
|
|
31
|
+
|
|
32
|
+
db.create_table(
|
|
33
|
+
table_name="notes",
|
|
34
|
+
fields={"title": "str", "body": "str", "tag": "str"},
|
|
35
|
+
vector_field="body",
|
|
36
|
+
)
|
|
37
|
+
|
|
38
|
+
db.insert("notes", {"title": "First", "body": "Vector databases are useful.", "tag": "ml"})
|
|
39
|
+
db.insert("notes", {"title": "Second", "body": "I enjoy local-first tools.", "tag": "dev"})
|
|
40
|
+
|
|
41
|
+
results = db.query("notes", vector_query="local vector tools", top_k=2)
|
|
42
|
+
for item in results:
|
|
43
|
+
print(item["score"], item["record"])
|
|
44
|
+
|
|
45
|
+
text_hits = db.query("notes", text_query="local", text_fields=["body"])
|
|
46
|
+
print(text_hits)
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
## Query Modes
|
|
50
|
+
|
|
51
|
+
- Filter-only:
|
|
52
|
+
- `db.query("notes", conditions={"tag": "ml"})`
|
|
53
|
+
- Text contains search:
|
|
54
|
+
- `db.query("notes", text_query="vector", text_fields=["title", "body"])`
|
|
55
|
+
- Vector-only:
|
|
56
|
+
- `db.query("notes", vector_query="your text")`
|
|
57
|
+
- Hybrid (filter + vector):
|
|
58
|
+
- `db.query("notes", conditions={"tag": "dev"}, vector_query="local tools")`
|
|
59
|
+
|
|
60
|
+
## Storage Layout
|
|
61
|
+
|
|
62
|
+
For `base_path="./data"` and table `notes`, menteedb stores:
|
|
63
|
+
|
|
64
|
+
- `./data/notes/schema.json`
|
|
65
|
+
- `./data/notes/records.jsonl`
|
|
66
|
+
- `./data/notes/vector_ids.jsonl`
|
|
67
|
+
- `./data/notes/vectors.f32`
|
|
68
|
+
|
|
69
|
+
This is local file-based storage. It is not publicly exposed over the network, but anyone with local filesystem access to this folder can read it.
|
|
70
|
+
|
|
71
|
+
## Privacy and Permissions
|
|
72
|
+
|
|
73
|
+
- By default, `MenteeDB(..., secure_permissions=True)` applies best-effort private permissions (`700` for table folders, `600` for files).
|
|
74
|
+
- On Windows, real privacy is controlled by NTFS ACLs; chmod behavior is limited.
|
|
75
|
+
|
|
76
|
+
## Testing
|
|
77
|
+
|
|
78
|
+
Run locally:
|
|
79
|
+
|
|
80
|
+
```bash
|
|
81
|
+
pip install .[dev]
|
|
82
|
+
pytest -q
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
## CI/CD to PyPI
|
|
86
|
+
|
|
87
|
+
Workflow file: `.github/workflows/pypi-publish.yml`
|
|
88
|
+
|
|
89
|
+
- Runs tests on pushes to `main`, tags (`v*`), and releases.
|
|
90
|
+
- Publishes to PyPI on tag push (`v*`) or GitHub Release publish.
|
|
91
|
+
- Uses trusted publishing via GitHub OIDC.
|
|
92
|
+
|
|
93
|
+
## Notes
|
|
94
|
+
|
|
95
|
+
- This initial version supports one vector field per table.
|
|
96
|
+
- Default embeddings use a deterministic local hashing embedder with no external model download.
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
./menteedb/__init__.py
|
|
4
|
+
./menteedb/core.py
|
|
5
|
+
./menteedb/embeddings.py
|
|
6
|
+
./menteedb/file_handler.py
|
|
7
|
+
menteedb/__init__.py
|
|
8
|
+
menteedb/core.py
|
|
9
|
+
menteedb/embeddings.py
|
|
10
|
+
menteedb/file_handler.py
|
|
11
|
+
menteedb.egg-info/PKG-INFO
|
|
12
|
+
menteedb.egg-info/SOURCES.txt
|
|
13
|
+
menteedb.egg-info/dependency_links.txt
|
|
14
|
+
menteedb.egg-info/requires.txt
|
|
15
|
+
menteedb.egg-info/top_level.txt
|
|
16
|
+
tests/test_core.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
menteedb
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "menteedb"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "A lightweight local vector-aware database for Python"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.9"
|
|
11
|
+
dependencies = [
|
|
12
|
+
"numpy>=1.24",
|
|
13
|
+
]
|
|
14
|
+
|
|
15
|
+
[project.optional-dependencies]
|
|
16
|
+
dev = [
|
|
17
|
+
"pytest>=8.0",
|
|
18
|
+
"build>=1.2",
|
|
19
|
+
]
|
|
20
|
+
|
|
21
|
+
[tool.setuptools]
|
|
22
|
+
package-dir = {"" = "."}
|
|
23
|
+
|
|
24
|
+
[tool.setuptools.packages.find]
|
|
25
|
+
where = ["."]
|
|
26
|
+
include = ["menteedb*"]
|
menteedb-0.1.0/setup.cfg
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
from menteedb import MenteeDB
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def test_create_insert_filter_and_text_query(tmp_path: Path) -> None:
    """Filter-only and text-contains queries return the expected records."""
    database = MenteeDB(base_path=str(tmp_path))
    database.create_table(
        table_name="notes",
        fields={"title": "str", "body": "str", "tag": "str"},
        vector_field="body",
    )

    database.insert("notes", {"title": "A", "body": "Vector search is local", "tag": "ml"}, record_id="1")
    database.insert("notes", {"title": "B", "body": "Fast tiny library", "tag": "dev"}, record_id="2")

    by_tag = database.query("notes", conditions={"tag": "dev"})
    assert [hit["id"] for hit in by_tag] == ["2"]

    by_text = database.query("notes", text_query="tiny", text_fields=["body"])
    assert [hit["id"] for hit in by_text] == ["2"]
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def test_vector_query_returns_ranked_results(tmp_path: Path) -> None:
    """A vector query returns top_k hits sorted by descending score."""
    database = MenteeDB(base_path=str(tmp_path))
    database.create_table(
        table_name="docs",
        fields={"title": "str", "body": "str", "tag": "str"},
        vector_field="body",
    )

    database.insert("docs", {"title": "One", "body": "cats and pets", "tag": "a"}, record_id="r1")
    database.insert("docs", {"title": "Two", "body": "dogs and parks", "tag": "b"}, record_id="r2")

    ranked = database.query("docs", vector_query="pets", top_k=2)
    assert len(ranked) == 2
    scores = [hit["score"] for hit in ranked]
    assert scores == sorted(scores, reverse=True)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def test_rejects_invalid_table_name(tmp_path: Path) -> None:
    """Path-traversal style table names must be refused with ValueError."""
    database = MenteeDB(base_path=str(tmp_path))

    raised = False
    try:
        database.create_table("../../bad", fields={"x": "str"})
    except ValueError:
        raised = True
    assert raised, "Expected ValueError"
|