4lt7ab-grimoire 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- 4lt7ab_grimoire-0.0.1/.gitignore +15 -0
- 4lt7ab_grimoire-0.0.1/PKG-INFO +11 -0
- 4lt7ab_grimoire-0.0.1/pyproject.toml +20 -0
- 4lt7ab_grimoire-0.0.1/src/grimoire/__init__.py +20 -0
- 4lt7ab_grimoire-0.0.1/src/grimoire/core.py +209 -0
- 4lt7ab_grimoire-0.0.1/src/grimoire/embedder.py +14 -0
- 4lt7ab_grimoire-0.0.1/src/grimoire/embedders/__init__.py +3 -0
- 4lt7ab_grimoire-0.0.1/src/grimoire/embedders/fastembed.py +49 -0
- 4lt7ab_grimoire-0.0.1/src/grimoire/errors.py +14 -0
- 4lt7ab_grimoire-0.0.1/src/grimoire/models.py +20 -0
- 4lt7ab_grimoire-0.0.1/src/grimoire/schema.py +71 -0
- 4lt7ab_grimoire-0.0.1/tests/test_fastembed_embedder.py +65 -0
- 4lt7ab_grimoire-0.0.1/tests/test_smoke.py +283 -0
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: 4lt7ab-grimoire
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: SQLite + sqlite-vec semantic search datastore
|
|
5
|
+
Requires-Python: >=3.14
|
|
6
|
+
Requires-Dist: python-ulid>=3.0
|
|
7
|
+
Requires-Dist: sqlite-vec>=0.1.6
|
|
8
|
+
Requires-Dist: typing-extensions>=4.0
|
|
9
|
+
Provides-Extra: fastembed
|
|
10
|
+
Requires-Dist: fastembed>=0.4; extra == 'fastembed'
|
|
11
|
+
Requires-Dist: socksio>=1.0; extra == 'fastembed'
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "4lt7ab-grimoire"
|
|
3
|
+
version = "0.0.1"
|
|
4
|
+
description = "SQLite + sqlite-vec semantic search datastore"
|
|
5
|
+
requires-python = ">=3.14"
|
|
6
|
+
dependencies = [
|
|
7
|
+
"python-ulid>=3.0",
|
|
8
|
+
"sqlite-vec>=0.1.6",
|
|
9
|
+
"typing-extensions>=4.0",
|
|
10
|
+
]
|
|
11
|
+
|
|
12
|
+
[project.optional-dependencies]
|
|
13
|
+
fastembed = ["fastembed>=0.4", "socksio>=1.0"]
|
|
14
|
+
|
|
15
|
+
[build-system]
|
|
16
|
+
requires = ["hatchling"]
|
|
17
|
+
build-backend = "hatchling.build"
|
|
18
|
+
|
|
19
|
+
[tool.hatch.build.targets.wheel]
|
|
20
|
+
packages = ["src/grimoire"]
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
from grimoire.core import Grimoire
|
|
2
|
+
from grimoire.embedder import Embedder
|
|
3
|
+
from grimoire.errors import (
|
|
4
|
+
GrimoireError,
|
|
5
|
+
GrimoireMismatch,
|
|
6
|
+
InvalidEmbedder,
|
|
7
|
+
SchemaVersionError,
|
|
8
|
+
)
|
|
9
|
+
from grimoire.models import Entry, Stats
|
|
10
|
+
|
|
11
|
+
__all__ = [
|
|
12
|
+
"Embedder",
|
|
13
|
+
"Entry",
|
|
14
|
+
"Grimoire",
|
|
15
|
+
"GrimoireError",
|
|
16
|
+
"GrimoireMismatch",
|
|
17
|
+
"InvalidEmbedder",
|
|
18
|
+
"SchemaVersionError",
|
|
19
|
+
"Stats",
|
|
20
|
+
]
|
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import sqlite3
|
|
3
|
+
import struct
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Any, Self
|
|
6
|
+
|
|
7
|
+
import sqlite_vec
|
|
8
|
+
from ulid import ULID
|
|
9
|
+
|
|
10
|
+
from grimoire.embedder import Embedder
|
|
11
|
+
from grimoire.models import Entry, Stats
|
|
12
|
+
from grimoire.schema import bootstrap
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# Inside the class body the name ``list`` is rebound to the ``Grimoire.list``
# method, so return annotations written there as ``list[Entry]`` would resolve
# to the method (and fail with TypeError) whenever they are evaluated. The
# alias is created here, at module level, where ``list`` is still the builtin.
_EntryList = list[Entry]


class Grimoire:
    """A semantically-indexed datastore backed by one SQLite file."""

    def __init__(self, *, conn: sqlite3.Connection, embedder: Embedder) -> None:
        """Wrap an already prepared connection and the embedder used with it.

        Prefer :meth:`open`, which also loads sqlite-vec and bootstraps or
        verifies the schema before constructing the instance.
        """
        self._conn = conn
        self._embedder = embedder

    @classmethod
    def open(cls, path: str | Path, *, embedder: Embedder) -> Self:
        """Open (creating if needed) the grimoire stored at ``path``.

        The connection is closed again if bootstrapping raises, so a failed
        open does not leak a handle; the exception is re-raised unchanged.
        """
        conn = _open_conn(str(path))
        try:
            bootstrap(conn, embedder)
            return cls(conn=conn, embedder=embedder)
        except BaseException:
            conn.close()
            raise

    @classmethod
    def peek(cls, path: str | Path) -> Stats | None:
        """Read metadata and counts from a grimoire file without opening it for use.

        Returns None if the file does not exist or is not a grimoire database.
        Does not load sqlite-vec or require an embedder, so it is safe for
        inspection (CLI `info`, model auto-detect) before deciding how to open.
        """
        path = Path(path)
        if not path.exists():
            return None
        try:
            conn = sqlite3.connect(path)
            try:
                row = conn.execute(
                    "SELECT model, dimension FROM grimoire WHERE id = 1"
                ).fetchone()
                if row is None:
                    return None
                version = conn.execute("PRAGMA user_version").fetchone()[0]
                count = conn.execute("SELECT COUNT(*) FROM entries").fetchone()[0]
                kind_rows = conn.execute(
                    "SELECT kind, COUNT(*) FROM entries GROUP BY kind ORDER BY kind"
                ).fetchall()
            finally:
                conn.close()
        except sqlite3.Error:
            # Any SQLite failure (missing tables, not a database, ...) means
            # "not a grimoire" for the purposes of peeking.
            return None
        return Stats(
            model=row[0],
            dimension=row[1],
            schema_version=version,
            entry_count=count,
            kinds=dict(kind_rows),
        )

    def add(
        self,
        *,
        kind: str,
        content: str,
        payload: dict[str, Any] | None = None,
        threshold: float | None = None,
    ) -> Entry:
        """Embed ``content`` and store it together with its vector.

        ``payload`` is serialized with ``json.dumps`` before being stored; the
        returned entry carries that JSON text (or None), not the original dict.
        The entry row and the vector row are written in one transaction.
        """
        entry_id = str(ULID())
        # Embed before opening the transaction so an embedder failure never
        # leaves a half-written entry behind.
        vector = self._embedder.embed(content)
        payload_json = json.dumps(payload) if payload is not None else None

        with self._conn:
            self._conn.execute(
                """
                INSERT INTO entries (id, kind, content, payload, threshold)
                VALUES (?, ?, ?, ?, ?)
                """,
                (entry_id, kind, content, payload_json, threshold),
            )
            self._conn.execute(
                "INSERT INTO vectors (entry_id, kind, embedding) VALUES (?, ?, ?)",
                (entry_id, kind, _pack(vector)),
            )

        return Entry(
            id=entry_id,
            kind=kind,
            content=content,
            payload=payload_json,
            threshold=threshold,
        )

    def get(self, entry_id: str) -> Entry | None:
        """Return the entry with ``entry_id``, or None when no such row exists."""
        row = self._conn.execute(
            "SELECT id, kind, content, payload, threshold FROM entries WHERE id = ?",
            (entry_id,),
        ).fetchone()
        return _row_to_entry(row) if row is not None else None

    def list(
        self,
        *,
        kind: str | None = None,
        limit: int = 100,
        after_id: str | None = None,
    ) -> _EntryList:
        """Return up to ``limit`` entries ordered by id.

        ``kind`` restricts the result to one kind; ``after_id`` returns only
        entries whose id sorts strictly after it, which lets callers page by
        passing the last id of the previous page.
        """
        sql = "SELECT id, kind, content, payload, threshold FROM entries"
        params: list[Any] = []
        clauses: list[str] = []
        if kind is not None:
            clauses.append("kind = ?")
            params.append(kind)
        if after_id is not None:
            clauses.append("id > ?")
            params.append(after_id)
        if clauses:
            sql += " WHERE " + " AND ".join(clauses)
        sql += " ORDER BY id LIMIT ?"
        params.append(limit)

        rows = self._conn.execute(sql, params).fetchall()
        return [_row_to_entry(r) for r in rows]

    def delete(self, entry_id: str) -> bool:
        """Delete the entry and its vector; return False when the id is unknown."""
        with self._conn:
            cursor = self._conn.execute("DELETE FROM entries WHERE id = ?", (entry_id,))
            if cursor.rowcount == 0:
                return False
            self._conn.execute("DELETE FROM vectors WHERE entry_id = ?", (entry_id,))
        return True

    def search(
        self,
        query: str,
        *,
        kind: str | None = None,
        k: int = 10,
        dynamic_threshold: bool = False,
    ) -> _EntryList:
        """Return entries nearest to ``query``, closest first.

        The query text is embedded and matched against the ``vectors`` table
        with ``k`` as the neighbour count; ``kind`` restricts the match to one
        kind. Each result has ``distance`` set. With ``dynamic_threshold``,
        entries that carry a threshold are dropped when their distance exceeds
        it; entries without a threshold are always kept.
        """
        vector = self._embedder.embed(query)

        sql = (
            "SELECT e.id, e.kind, e.content, e.payload, e.threshold, v.distance "
            "FROM vectors v JOIN entries e ON e.id = v.entry_id "
            "WHERE v.embedding MATCH ? AND k = ?"
        )
        params: list[Any] = [_pack(vector), k]
        if kind is not None:
            sql += " AND v.kind = ?"
            params.append(kind)
        sql += " ORDER BY v.distance"

        rows = self._conn.execute(sql, params).fetchall()
        results = [
            Entry(
                id=r[0],
                kind=r[1],
                content=r[2],
                payload=r[3],
                threshold=r[4],
                distance=r[5],
            )
            for r in rows
        ]
        if dynamic_threshold:
            results = [
                r for r in results if r.threshold is None or r.distance <= r.threshold
            ]
        return results

    def close(self) -> None:
        """Close the underlying SQLite connection."""
        self._conn.close()

    def __enter__(self) -> Self:
        """Return the grimoire itself for use in a ``with`` block."""
        return self

    def __exit__(self, *exc: object) -> None:
        """Close the connection on leaving the ``with`` block; never suppresses."""
        self.close()
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def _open_conn(path: str) -> sqlite3.Connection:
    """Open ``path`` and prepare the connection for grimoire use.

    Loads the sqlite-vec extension and enables foreign-key enforcement.
    If any preparation step raises, the connection is closed before the
    exception propagates so a failed open does not leak a handle.
    """
    conn = sqlite3.connect(path)
    try:
        conn.enable_load_extension(True)
        try:
            sqlite_vec.load(conn)
        finally:
            # Never leave extension loading enabled, even when the load fails.
            conn.enable_load_extension(False)
        conn.execute("PRAGMA foreign_keys = ON")
    except BaseException:
        conn.close()
        raise
    return conn
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def _pack(vector: list[float]) -> bytes:
    """Serialize ``vector`` as consecutive native-order 32-bit floats."""
    layout = struct.Struct(f"{len(vector)}f")
    return layout.pack(*vector)
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def _row_to_entry(row: tuple) -> Entry:
    """Build an Entry from a row laid out as (id, kind, content, payload, threshold)."""
    columns = ("id", "kind", "content", "payload", "threshold")
    values = {name: row[position] for position, name in enumerate(columns)}
    return Entry(**values)
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
from typing import Protocol, runtime_checkable
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
@runtime_checkable
class Embedder(Protocol):
    """Structural interface for objects that turn text into fixed-size vectors.

    A Grimoire does not construct one itself; the caller passes it in.
    """

    @property
    def model(self) -> str:
        """Name identifying the embedding model."""
        ...

    @property
    def dimension(self) -> int:
        """Number of components in every vector this embedder produces."""
        ...

    def embed(self, text: str) -> list[float]:
        """Return the embedding of ``text``."""
        ...
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class FastembedEmbedder:
|
|
5
|
+
"""Embedder backed by Qdrant's `fastembed` library (ONNX Runtime).
|
|
6
|
+
|
|
7
|
+
Requires the optional extra: `pip install grimoire[fastembed]`.
|
|
8
|
+
|
|
9
|
+
`cache_folder` is required — the library does not pick a default
|
|
10
|
+
filesystem location on the caller's behalf.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
def __init__(
|
|
14
|
+
self,
|
|
15
|
+
model_name: str = "BAAI/bge-small-en-v1.5",
|
|
16
|
+
*,
|
|
17
|
+
cache_folder: str | Path,
|
|
18
|
+
threads: int | None = None,
|
|
19
|
+
) -> None:
|
|
20
|
+
try:
|
|
21
|
+
from fastembed import TextEmbedding
|
|
22
|
+
except ImportError as exc:
|
|
23
|
+
raise ImportError(
|
|
24
|
+
"FastembedEmbedder requires the `fastembed` extra. "
|
|
25
|
+
"Install with: pip install grimoire[fastembed]"
|
|
26
|
+
) from exc
|
|
27
|
+
|
|
28
|
+
self._model_name = model_name
|
|
29
|
+
self._model = TextEmbedding(
|
|
30
|
+
model_name=model_name,
|
|
31
|
+
cache_dir=str(cache_folder),
|
|
32
|
+
threads=threads,
|
|
33
|
+
)
|
|
34
|
+
# Determine dimension by embedding a probe — works regardless of
|
|
35
|
+
# fastembed's internal model registry shape.
|
|
36
|
+
[probe] = list(self._model.embed(["dimension probe"]))
|
|
37
|
+
self._dimension = len(probe)
|
|
38
|
+
|
|
39
|
+
@property
|
|
40
|
+
def model(self) -> str:
|
|
41
|
+
return self._model_name
|
|
42
|
+
|
|
43
|
+
@property
|
|
44
|
+
def dimension(self) -> int:
|
|
45
|
+
return self._dimension
|
|
46
|
+
|
|
47
|
+
def embed(self, text: str) -> list[float]:
|
|
48
|
+
[vector] = list(self._model.embed([text]))
|
|
49
|
+
return vector.tolist()
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
class GrimoireError(Exception):
    """Root of the grimoire exception hierarchy; catch this to handle any of them."""
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class GrimoireMismatch(GrimoireError):
    """Raised when the supplied embedder's model or dimension differs from the stored one."""
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class SchemaVersionError(GrimoireError):
    """Raised when a database file's schema version is not the one this library expects."""
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class InvalidEmbedder(GrimoireError):
    """Raised when an embedder reports a model or dimension that is not of the allowed shape."""
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
@dataclass
|
|
5
|
+
class Entry:
|
|
6
|
+
id: str
|
|
7
|
+
kind: str
|
|
8
|
+
content: str
|
|
9
|
+
payload: str | None = None
|
|
10
|
+
threshold: float | None = None
|
|
11
|
+
distance: float | None = None
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass
class Stats:
    """Summary of a grimoire: its embedder identity, schema version and counts."""

    # Name of the embedding model.
    model: str
    # Vector dimension.
    dimension: int
    # Schema version number.
    schema_version: int
    # Total number of entries.
    entry_count: int
    # Per-kind entry counts; every instance gets its own fresh dict by default.
    kinds: dict[str, int] = field(default_factory=dict)
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
import sqlite3
|
|
2
|
+
|
|
3
|
+
from grimoire.embedder import Embedder
|
|
4
|
+
from grimoire.errors import GrimoireMismatch, InvalidEmbedder, SchemaVersionError
|
|
5
|
+
|
|
6
|
+
SCHEMA_VERSION = 1
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def bootstrap(conn: sqlite3.Connection, embedder: Embedder) -> None:
    """Create the grimoire schema in a fresh database, or verify an existing one.

    A ``user_version`` of 0 is treated as "not initialized": the tables are
    created, the embedder's model and dimension are recorded, and the version
    is stamped, all in one transaction. Otherwise the stored version and the
    stored model/dimension must match what the library and embedder expect.

    Raises:
        InvalidEmbedder: If the embedder reports an unusable model/dimension.
        SchemaVersionError: If the stored schema version differs from
            ``SCHEMA_VERSION`` or the metadata row is missing.
        GrimoireMismatch: If the stored model or dimension differs from the
            embedder's.
    """
    _validate_embedder(embedder)
    version = conn.execute("PRAGMA user_version").fetchone()[0]
    if version == 0:
        _create_schema(conn, embedder)
        return
    if version != SCHEMA_VERSION:
        raise SchemaVersionError(
            f"Database schema version is {version}, library expects {SCHEMA_VERSION}"
        )
    row = conn.execute("SELECT model, dimension FROM grimoire WHERE id = 1").fetchone()
    if row is None:
        raise SchemaVersionError("Database is missing its grimoire row")
    stored_model, stored_dim = row
    if stored_model != embedder.model or stored_dim != embedder.dimension:
        raise GrimoireMismatch(
            f"Embedder (model={embedder.model!r}, dim={embedder.dimension}) "
            f"does not match grimoire "
            f"(model={stored_model!r}, dim={stored_dim})"
        )


def _create_schema(conn: sqlite3.Connection, embedder: Embedder) -> None:
    """Create all tables and stamp the schema version atomically.

    Everything runs inside a single transaction so that a failure part-way
    through rolls back to an empty file instead of leaving tables behind with
    ``user_version`` still 0 (which would make every later open fail on
    ``CREATE TABLE``).
    """
    statements = (
        """
        CREATE TABLE grimoire (
            id INTEGER PRIMARY KEY CHECK (id = 1),
            model TEXT NOT NULL,
            dimension INTEGER NOT NULL
        )
        """,
        """
        CREATE TABLE entries (
            id TEXT PRIMARY KEY,
            kind TEXT NOT NULL,
            content TEXT NOT NULL,
            payload TEXT,
            threshold REAL
        )
        """,
        "CREATE INDEX entries_kind ON entries(kind)",
        # The dimension cannot be bound as a parameter in DDL; it has already
        # been checked to be a positive int by _validate_embedder.
        f"""
        CREATE VIRTUAL TABLE vectors USING vec0(
            entry_id TEXT PRIMARY KEY,
            kind TEXT partition key,
            embedding FLOAT[{embedder.dimension}]
        )
        """,
    )

    # Flush any pending work first, as executescript() used to do implicitly.
    if conn.in_transaction:
        conn.commit()
    # If the connection keeps a transaction open by itself, reuse it;
    # otherwise open (and later close) one explicitly.
    owns_transaction = not conn.in_transaction
    if owns_transaction:
        conn.execute("BEGIN")
    try:
        for statement in statements:
            conn.execute(statement)
        conn.execute(
            "INSERT INTO grimoire (id, model, dimension) VALUES (1, ?, ?)",
            (embedder.model, embedder.dimension),
        )
        conn.execute(f"PRAGMA user_version = {SCHEMA_VERSION}")
    except BaseException:
        if not owns_transaction:
            conn.rollback()
        elif conn.in_transaction:
            # SQLite may already have rolled back on its own for some errors.
            conn.execute("ROLLBACK")
        raise
    if owns_transaction:
        conn.execute("COMMIT")
    else:
        conn.commit()
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _validate_embedder(embedder: Embedder) -> None:
    """Reject embedders whose dimension or model is not of the allowed shape.

    The dimension must be a genuine ``int`` (``bool`` is refused even though
    it subclasses ``int``) greater than zero, and the model must be a
    non-empty string. Raises InvalidEmbedder otherwise.
    """
    is_plain_int = isinstance(embedder.dimension, int) and not isinstance(
        embedder.dimension, bool
    )
    if not is_plain_int:
        type_name = type(embedder.dimension).__name__
        raise InvalidEmbedder(
            f"Embedder dimension must be an int, "
            f"got {type_name}"
        )
    if not embedder.dimension > 0:
        raise InvalidEmbedder(
            f"Embedder dimension must be positive, got {embedder.dimension}"
        )
    has_model_name = isinstance(embedder.model, str) and bool(embedder.model)
    if not has_model_name:
        raise InvalidEmbedder(
            f"Embedder model must be a non-empty string, got {embedder.model!r}"
        )
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
"""Integration tests for the FastembedEmbedder.
|
|
2
|
+
|
|
3
|
+
Skipped unless the `fastembed` extra is installed:
|
|
4
|
+
uv sync --package grimoire --extra fastembed
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import os
|
|
8
|
+
|
|
9
|
+
import pytest
|
|
10
|
+
|
|
11
|
+
pytest.importorskip("fastembed")
|
|
12
|
+
|
|
13
|
+
from grimoire import Grimoire # noqa: E402
|
|
14
|
+
from grimoire.embedders import FastembedEmbedder # noqa: E402
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@pytest.fixture
def cache_dir(tmp_path, monkeypatch):
    """Point HF_HOME at a per-test directory and return that directory."""
    sandbox = tmp_path / "fastembed_cache"
    monkeypatch.setenv("HF_HOME", str(sandbox))
    return sandbox
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def test_default_model_dimension(cache_dir):
    """The default model reports its own name and a 384-wide embedding."""
    embedder = FastembedEmbedder(cache_folder=cache_dir)
    assert embedder.model == "BAAI/bge-small-en-v1.5"
    assert embedder.dimension == 384
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def test_embed_returns_correct_length_vector(cache_dir):
    """embed() yields `dimension` plain Python floats."""
    embedder = FastembedEmbedder(cache_folder=cache_dir)
    components = embedder.embed("hello world")
    assert len(components) == embedder.dimension
    non_floats = [c for c in components if not isinstance(c, float)]
    assert not non_floats
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def test_round_trip_through_grimoire(tmp_path, cache_dir):
    """An entry searched by its own text comes back first and strictly closest."""
    embedder = FastembedEmbedder(cache_folder=cache_dir)
    with Grimoire.open(tmp_path / "store.db", embedder=embedder) as store:
        store.add(kind="note", content="the moon is full tonight")
        store.add(kind="note", content="dragons fly at midnight")

        hits = store.search("the moon is full tonight", k=2)
        assert len(hits) == 2
        best, runner_up = hits
        assert best.content == "the moon is full tonight"
        assert best.distance < runner_up.distance
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def test_cache_folder_pass_through(tmp_path):
    """An explicit cache folder is accepted and the embedder initializes."""
    models_dir = tmp_path / "models"
    embedder = FastembedEmbedder(cache_folder=models_dir)
    assert embedder.dimension == 384
    # fastembed creates the cache directory lazily; just confirm the embedder
    # initialized without error and the path is at least a directory or its
    # parent exists.
    assert models_dir.exists() or models_dir.parent.exists()
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
# Skip-marker safety: ensure HF_HOME doesn't leak into other tests.
|
|
61
|
+
def test_env_isolation():
    """Placeholder check on HF_HOME; it reads the variable but asserts nothing."""
    leaked = os.environ.get("HF_HOME")
    if not leaked:
        return
    # monkeypatch.setenv from earlier fixtures restores on teardown,
    # so this should never be set when this test runs without the fixture.
|
|
@@ -0,0 +1,283 @@
|
|
|
1
|
+
import hashlib
|
|
2
|
+
|
|
3
|
+
import pytest
|
|
4
|
+
from grimoire import Entry, Grimoire, GrimoireMismatch, InvalidEmbedder, Stats
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class FakeEmbedder:
    """Deterministic, dependency-free embedder for tests.

    Vectors are derived from SHA-256 of the text, so equal texts always map
    to equal vectors and no model download is needed.
    """

    def __init__(self, model: str = "fake-v1", dimension: int = 8) -> None:
        """Remember the model name and vector size to report."""
        self._model = model
        self._dimension = dimension

    @property
    def model(self) -> str:
        """The model name given at construction."""
        return self._model

    @property
    def dimension(self) -> int:
        """The vector size given at construction."""
        return self._dimension

    def embed(self, text: str) -> list[float]:
        """Return exactly ``dimension`` floats in [-1, 1) derived from ``text``.

        The first 32 components come from the plain SHA-256 digest. For larger
        dimensions the byte stream is extended with counter-suffixed digests,
        so the vector never comes up short of the advertised dimension.
        """
        seed = text.encode()
        stream = hashlib.sha256(seed).digest()
        block_index = 1
        while len(stream) < self._dimension:
            stream += hashlib.sha256(seed + block_index.to_bytes(4, "big")).digest()
            block_index += 1
        return [(b - 128) / 128.0 for b in stream[: self._dimension]]
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def test_open_creates_file_idempotently(tmp_path):
    """Opening the same path twice succeeds and leaves the file on disk."""
    db = tmp_path / "store.db"
    for _ in range(2):
        Grimoire.open(db, embedder=FakeEmbedder()).close()
    assert db.exists()
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def test_embedder_model_mismatch_raises(tmp_path):
    """Reopening with a different model name is refused."""
    db = tmp_path / "store.db"
    created = Grimoire.open(db, embedder=FakeEmbedder(model="alpha"))
    created.close()
    other = FakeEmbedder(model="beta")
    with pytest.raises(GrimoireMismatch):
        Grimoire.open(db, embedder=other)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def test_embedder_dimension_mismatch_raises(tmp_path):
    """Reopening with a different vector dimension is refused."""
    db = tmp_path / "store.db"
    created = Grimoire.open(db, embedder=FakeEmbedder(dimension=8))
    created.close()
    other = FakeEmbedder(dimension=16)
    with pytest.raises(GrimoireMismatch):
        Grimoire.open(db, embedder=other)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def test_add_returns_entry(tmp_path):
    """add() hands back an Entry echoing the kind and content it stored."""
    with Grimoire.open(tmp_path / "store.db", embedder=FakeEmbedder()) as store:
        stored = store.add(kind="note", content="the moon is full")
        assert isinstance(stored, Entry)
        assert stored.kind == "note"
        assert stored.content == "the moon is full"
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def test_search_finds_exact_match_first(tmp_path):
    """Searching for a stored text ranks that entry first at distance zero."""
    contents = (
        "the moon is full",
        "dragons fly at midnight",
        "potions bubble in the cauldron",
    )
    with Grimoire.open(tmp_path / "store.db", embedder=FakeEmbedder()) as store:
        for text in contents:
            store.add(kind="note", content=text)

        hits = store.search("the moon is full", k=3)
        assert len(hits) == 3
        top = hits[0]
        assert top.content == "the moon is full"
        assert top.distance == 0.0
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def test_search_filters_by_kind(tmp_path):
    """A kind filter keeps same-content entries of other kinds out of results."""
    with Grimoire.open(tmp_path / "store.db", embedder=FakeEmbedder()) as store:
        store.add(kind="spell", content="lumos")
        store.add(kind="potion", content="lumos")

        hits = store.search("lumos", kind="spell", k=10)
        assert len(hits) == 1
        (only,) = hits
        assert only.kind == "spell"
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def test_dynamic_threshold_drops_low_match(tmp_path):
    """With threshold 0.0 only the exact match survives dynamic gating."""
    with Grimoire.open(tmp_path / "store.db", embedder=FakeEmbedder()) as store:
        for text in ("the moon is full", "dragons fly at midnight"):
            store.add(kind="note", content=text, threshold=0.0)

        ungated = store.search("the moon is full", k=10)
        assert len(ungated) == 2

        gated = store.search("the moon is full", k=10, dynamic_threshold=True)
        assert len(gated) == 1
        assert gated[0].content == "the moon is full"
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def test_two_files_are_independent(tmp_path):
    """Content added to one database file never shows up in another."""
    first_path = tmp_path / "a.db"
    second_path = tmp_path / "b.db"
    with Grimoire.open(first_path, embedder=FakeEmbedder()) as first:
        first.add(kind="note", content="alpha")
    with Grimoire.open(second_path, embedder=FakeEmbedder()) as second:
        second.add(kind="note", content="beta")
        hits = second.search("alpha", k=10)
        assert "alpha" not in [hit.content for hit in hits]
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def test_data_persists_across_reopens(tmp_path):
    """An entry written in one session is searchable after reopening the file."""
    db = tmp_path / "store.db"
    with Grimoire.open(db, embedder=FakeEmbedder()) as writer:
        writer.add(kind="note", content="the moon is full")

    with Grimoire.open(db, embedder=FakeEmbedder()) as reader:
        hits = reader.search("the moon is full", k=1)
        assert len(hits) == 1
        assert hits[0].content == "the moon is full"
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def test_get_returns_entry(tmp_path):
    """get() returns the entry previously created by add()."""
    with Grimoire.open(tmp_path / "store.db", embedder=FakeEmbedder()) as store:
        created = store.add(kind="note", content="lumos")
        loaded = store.get(created.id)
        assert loaded is not None
        assert loaded.id == created.id
        assert loaded.content == "lumos"
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def test_get_returns_none_for_missing_id(tmp_path):
    """get() answers None for an id that was never stored."""
    with Grimoire.open(tmp_path / "store.db", embedder=FakeEmbedder()) as store:
        missing = store.get("01HXXXXXXXXXXXXXXXXXXXXXXX")
        assert missing is None
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def test_list_returns_all_entries_in_chronological_order(tmp_path):
    """list() returns entries in the order they were added."""
    with Grimoire.open(tmp_path / "store.db", embedder=FakeEmbedder()) as store:
        first = store.add(kind="note", content="first")
        second = store.add(kind="note", content="second")
        third = store.add(kind="note", content="third")
        listed_ids = [entry.id for entry in store.list()]
        assert listed_ids == [first.id, second.id, third.id]
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def test_list_filters_by_kind(tmp_path):
    """list(kind=...) returns only entries of that kind."""
    seeded = (
        ("spell", "lumos"),
        ("potion", "felix felicis"),
        ("spell", "alohomora"),
    )
    with Grimoire.open(tmp_path / "store.db", embedder=FakeEmbedder()) as store:
        for kind, content in seeded:
            store.add(kind=kind, content=content)

        spells = store.list(kind="spell")
        assert len(spells) == 2
        assert all(entry.kind == "spell" for entry in spells)
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def test_list_paginates_via_after_id(tmp_path):
    """Feeding the last id of each page as after_id walks all entries in order."""
    with Grimoire.open(tmp_path / "store.db", embedder=FakeEmbedder()) as store:
        ids = [store.add(kind="note", content=f"e{i}").id for i in range(5)]

        first = store.list(limit=2)
        assert [entry.id for entry in first] == [ids[0], ids[1]]

        second = store.list(limit=2, after_id=first[-1].id)
        assert [entry.id for entry in second] == [ids[2], ids[3]]

        third = store.list(limit=2, after_id=second[-1].id)
        assert [entry.id for entry in third] == [ids[4]]

        fourth = store.list(limit=2, after_id=third[-1].id)
        assert fourth == []
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def test_list_respects_limit(tmp_path):
    """list(limit=n) returns at most n entries."""
    with Grimoire.open(tmp_path / "store.db", embedder=FakeEmbedder()) as store:
        for index in range(5):
            store.add(kind="note", content=f"e{index}")
        limited = store.list(limit=3)
        assert len(limited) == 3
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def test_delete_removes_entry_and_vector(tmp_path):
    """After delete() the entry can be neither fetched nor found by search."""
    with Grimoire.open(tmp_path / "store.db", embedder=FakeEmbedder()) as store:
        doomed = store.add(kind="note", content="ephemeral")
        assert store.delete(doomed.id) is True
        assert store.get(doomed.id) is None

        # Search should no longer return it.
        remaining_ids = [hit.id for hit in store.search("ephemeral", k=10)]
        assert doomed.id not in remaining_ids
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def test_delete_returns_false_for_missing_id(tmp_path):
    """delete() reports False when there was nothing to delete."""
    with Grimoire.open(tmp_path / "store.db", embedder=FakeEmbedder()) as store:
        outcome = store.delete("01HXXXXXXXXXXXXXXXXXXXXXXX")
        assert outcome is False
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
class _BadDimensionEmbedder:
    """Embedder whose dimension is an SQL-injection-shaped string, not an int."""

    @property
    def model(self) -> str:
        """Fixed model name."""
        return "bad"

    @property
    def dimension(self):  # not annotated, returns whatever
        """Deliberately hostile, non-integer dimension."""
        return "8); DROP TABLE entries; --"

    def embed(self, text: str) -> list[float]:
        """Return an all-zero vector of eight components for any text."""
        return [0.0 for _ in range(8)]
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
class _NonPositiveDimensionEmbedder:
    """Embedder that reports a dimension of zero."""

    @property
    def model(self) -> str:
        """Fixed model name."""
        return "bad"

    @property
    def dimension(self) -> int:
        """Zero, which is not a usable vector size."""
        return 0

    def embed(self, text: str) -> list[float]:
        """Return an empty vector for any text."""
        return list()
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
class _EmptyModelEmbedder:
    """Embedder with a valid dimension but an empty model name."""

    @property
    def model(self) -> str:
        """Empty string, which is not a usable model name."""
        return ""

    @property
    def dimension(self) -> int:
        """Fixed vector size of eight."""
        return 8

    def embed(self, text: str) -> list[float]:
        """Return an all-zero vector of eight components for any text."""
        return [0.0 for _ in range(8)]
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
def test_embedder_with_non_int_dimension_rejected(tmp_path):
    """A non-int dimension is refused and nothing is written to the file."""
    db = tmp_path / "store.db"
    hostile = _BadDimensionEmbedder()
    with pytest.raises(InvalidEmbedder):
        Grimoire.open(db, embedder=hostile)
    untouched = not db.exists() or db.stat().st_size == 0
    assert untouched
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
def test_embedder_with_zero_dimension_rejected(tmp_path):
    """A dimension of zero is refused at open time."""
    target = tmp_path / "store.db"
    with pytest.raises(InvalidEmbedder):
        Grimoire.open(target, embedder=_NonPositiveDimensionEmbedder())
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
def test_embedder_with_empty_model_rejected(tmp_path):
    """An empty model name is refused at open time."""
    target = tmp_path / "store.db"
    with pytest.raises(InvalidEmbedder):
        Grimoire.open(target, embedder=_EmptyModelEmbedder())
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
# ---------- peek ----------
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
def test_peek_returns_none_for_missing_file(tmp_path):
    """peek() answers None when the path does not exist."""
    absent = tmp_path / "nope.db"
    assert Grimoire.peek(absent) is None
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
def test_peek_returns_none_for_non_grimoire_file(tmp_path):
    """peek() answers None for a SQLite file that lacks the grimoire tables."""
    import sqlite3

    stranger = tmp_path / "stranger.db"
    foreign = sqlite3.connect(stranger)
    foreign.execute("CREATE TABLE other (x INTEGER)")
    foreign.commit()
    foreign.close()
    assert Grimoire.peek(stranger) is None
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
def test_peek_returns_stats_for_initialized_grimoire(tmp_path):
    """peek() reports model, dimension, schema version and per-kind counts."""
    db = tmp_path / "store.db"
    seeded = (("note", "alpha"), ("note", "beta"), ("spell", "lumos"))
    with Grimoire.open(db, embedder=FakeEmbedder(model="m1", dimension=8)) as store:
        for kind, content in seeded:
            store.add(kind=kind, content=content)

    summary = Grimoire.peek(db)
    assert isinstance(summary, Stats)
    assert summary.model == "m1"
    assert summary.dimension == 8
    assert summary.schema_version == 1
    assert summary.entry_count == 3
    assert summary.kinds == {"note": 2, "spell": 1}
|
|
273
|
+
|
|
274
|
+
|
|
275
|
+
def test_peek_does_not_require_embedder_or_extension(tmp_path):
    """peek() works on an empty, freshly created grimoire."""
    # peek must be safe on a freshly-created file from another process,
    # without sqlite-vec or an embedder loaded.
    db = tmp_path / "store.db"
    fresh = Grimoire.open(db, embedder=FakeEmbedder())
    fresh.close()
    summary = Grimoire.peek(db)
    assert summary is not None
    assert summary.entry_count == 0
    assert summary.kinds == {}
|