nerva-mneme 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nerva_mneme-0.1.0/.env.example +15 -0
- nerva_mneme-0.1.0/.gitignore +32 -0
- nerva_mneme-0.1.0/LICENSE +21 -0
- nerva_mneme-0.1.0/PKG-INFO +73 -0
- nerva_mneme-0.1.0/README.md +47 -0
- nerva_mneme-0.1.0/mneme/__init__.py +113 -0
- nerva_mneme-0.1.0/mneme/cache.py +50 -0
- nerva_mneme-0.1.0/mneme/chunker.py +69 -0
- nerva_mneme-0.1.0/mneme/cli.py +66 -0
- nerva_mneme-0.1.0/mneme/config.py +67 -0
- nerva_mneme-0.1.0/mneme/corpus.py +34 -0
- nerva_mneme-0.1.0/mneme/db.py +106 -0
- nerva_mneme-0.1.0/mneme/digest.py +54 -0
- nerva_mneme-0.1.0/mneme/gen.py +59 -0
- nerva_mneme-0.1.0/mneme/loader.py +55 -0
- nerva_mneme-0.1.0/mneme/models.py +74 -0
- nerva_mneme-0.1.0/mneme/presets.py +52 -0
- nerva_mneme-0.1.0/mneme/sweep.py +130 -0
- nerva_mneme-0.1.0/mneme/types.py +65 -0
- nerva_mneme-0.1.0/pyproject.toml +45 -0
- nerva_mneme-0.1.0/uv.lock +328 -0
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
DATABASE_URL=postgresql://postgres@localhost:5432/mneme
|
|
2
|
+
|
|
3
|
+
# Path to documents for digest. Local path or URL.
|
|
4
|
+
DATA_PATH=https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json
|
|
5
|
+
|
|
6
|
+
# API key for cloud providers. Leave empty for local (Ollama, vLLM).
|
|
7
|
+
API_KEY=
|
|
8
|
+
|
|
9
|
+
# Override defaults (Ollama localhost with bge-m3 / llama3).
|
|
10
|
+
# Works with any /v1/-compatible API (Ollama, vLLM, etc).
|
|
11
|
+
# EMBEDDER_URL=http://localhost:11434
|
|
12
|
+
# EMBEDDER_MODEL=bge-m3
|
|
13
|
+
# EMBEDDING_DIM=1024
|
|
14
|
+
# INFERENCE_URL=http://localhost:11434
|
|
15
|
+
# INFERENCE_MODEL=llama3:8b-instruct-q4_K_M
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.egg-info/
|
|
6
|
+
build/
|
|
7
|
+
dist/
|
|
8
|
+
.venv/
|
|
9
|
+
venv/
|
|
10
|
+
|
|
11
|
+
# Env
|
|
12
|
+
.env
|
|
13
|
+
.env.*
|
|
14
|
+
!.env.example
|
|
15
|
+
|
|
16
|
+
# Editors
|
|
17
|
+
.idea/
|
|
18
|
+
.vscode/
|
|
19
|
+
|
|
20
|
+
# Logs
|
|
21
|
+
*.log
|
|
22
|
+
|
|
23
|
+
# Cache
|
|
24
|
+
.cache/
|
|
25
|
+
|
|
26
|
+
# Project-local notes
|
|
27
|
+
CLAUDE.md
|
|
28
|
+
.design/
|
|
29
|
+
.research/
|
|
30
|
+
.eval/
|
|
31
|
+
todo.md
|
|
32
|
+
tests/
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Pavel Dolgov
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: nerva-mneme
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Local-first RAG with built-in eval. Hybrid search, parameter sweep, one command.
|
|
5
|
+
Project-URL: Repository, https://github.com/nerva-project/mneme
|
|
6
|
+
Author-email: Pavel Dolgov <mikepromogratus@proton.me>
|
|
7
|
+
License-Expression: MIT
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
Keywords: embeddings,eval,local-first,pgvector,rag,search
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
15
|
+
Requires-Python: >=3.12
|
|
16
|
+
Requires-Dist: asyncpg>=0.29
|
|
17
|
+
Requires-Dist: click>=8.1
|
|
18
|
+
Requires-Dist: httpx>=0.27
|
|
19
|
+
Requires-Dist: numpy>=2.0
|
|
20
|
+
Requires-Dist: pgvector>=0.3
|
|
21
|
+
Requires-Dist: python-dotenv>=1.0
|
|
22
|
+
Provides-Extra: dev
|
|
23
|
+
Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
|
|
24
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
|
|
27
|
+
# Mneme
|
|
28
|
+
|
|
29
|
+
Local RAG pipeline with built-in evaluation. Postgres + pgvector for hybrid search, any /v1/-compatible LLM backend for embeddings and inference.
|
|
30
|
+
|
|
31
|
+
## Setup
|
|
32
|
+
|
|
33
|
+
Requires Python 3.12+, [uv](https://docs.astral.sh/uv/), Postgres with the `pgvector` extension.
|
|
34
|
+
|
|
35
|
+
Copy `.env.example` to `.env` and fill in your values, then install:
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
cp .env.example .env
|
|
39
|
+
uv sync
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## Usage
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
uv run mneme digest # parse DATA_PATH source into cache
|
|
46
|
+
uv run mneme ingest <file.jsonl>
|
|
47
|
+
uv run mneme ask "query"
|
|
48
|
+
uv run mneme sweep <fast|medium|thorough> --limit 30
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
## Library
|
|
52
|
+
|
|
53
|
+
```python
|
|
54
|
+
from mneme import Mneme, Config
|
|
55
|
+
|
|
56
|
+
cfg = Config(database_url="postgresql://...", api_key="sk-...")
|
|
57
|
+
|
|
58
|
+
async with Mneme(cfg) as m:
|
|
59
|
+
await m.ingest("./corpus")
|
|
60
|
+
answer = await m.ask("What is X?")
|
|
61
|
+
|
|
62
|
+
rows = await Mneme.sweep(cfg, "medium", limit=30)
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
## Input format
|
|
66
|
+
|
|
67
|
+
JSONL, one document per line:
|
|
68
|
+
|
|
69
|
+
```json
|
|
70
|
+
{"content": "...", "source": "optional", "created_at": "2026-04-01T12:00:00Z", "metadata": {}}
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
Only `content` is required. `source` falls back to the file stem, `created_at` to the current time, `metadata` to `{}`.
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
# Mneme
|
|
2
|
+
|
|
3
|
+
Local RAG pipeline with built-in evaluation. Postgres + pgvector for hybrid search, any /v1/-compatible LLM backend for embeddings and inference.
|
|
4
|
+
|
|
5
|
+
## Setup
|
|
6
|
+
|
|
7
|
+
Requires Python 3.12+, [uv](https://docs.astral.sh/uv/), Postgres with the `pgvector` extension.
|
|
8
|
+
|
|
9
|
+
Copy `.env.example` to `.env` and fill in your values, then install:
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
cp .env.example .env
|
|
13
|
+
uv sync
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
## Usage
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
uv run mneme digest # parse DATA_PATH source into cache
|
|
20
|
+
uv run mneme ingest <file.jsonl>
|
|
21
|
+
uv run mneme ask "query"
|
|
22
|
+
uv run mneme sweep <fast|medium|thorough> --limit 30
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
## Library
|
|
26
|
+
|
|
27
|
+
```python
|
|
28
|
+
from mneme import Mneme, Config
|
|
29
|
+
|
|
30
|
+
cfg = Config(database_url="postgresql://...", api_key="sk-...")
|
|
31
|
+
|
|
32
|
+
async with Mneme(cfg) as m:
|
|
33
|
+
await m.ingest("./corpus")
|
|
34
|
+
answer = await m.ask("What is X?")
|
|
35
|
+
|
|
36
|
+
rows = await Mneme.sweep(cfg, "medium", limit=30)
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## Input format
|
|
40
|
+
|
|
41
|
+
JSONL, one document per line:
|
|
42
|
+
|
|
43
|
+
```json
|
|
44
|
+
{"content": "...", "source": "optional", "created_at": "2026-04-01T12:00:00Z", "metadata": {}}
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
Only `content` is required. `source` falls back to the file stem, `created_at` to the current time, `metadata` to `{}`.
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import hashlib
|
|
4
|
+
|
|
5
|
+
import httpx
|
|
6
|
+
|
|
7
|
+
from .chunker import chunk
|
|
8
|
+
from .config import Config
|
|
9
|
+
from .db import Db
|
|
10
|
+
from .loader import load_docs
|
|
11
|
+
from .models import chat, embed
|
|
12
|
+
from .types import Chunk, SearchHit
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class Mneme:
    """RAG engine with built-in eval. Lifecycle: Mneme(cfg) → open() → work → close().
    Or use `async with Mneme(cfg) as m:` for automatic cleanup.

    Attributes:
        cfg: The resolved (defaults filled, validated) configuration.
        db: Database handle, set by `open()`.
        http: Shared `httpx.AsyncClient`, set by `open()`.
    """

    def __init__(self, cfg: Config) -> None:
        """Store the resolved configuration; no I/O happens here.

        Raises:
            ValueError: if `cfg.resolved()` rejects the configuration.
        """
        self.cfg = cfg.resolved()

    async def open(self) -> None:
        """Open the database, initialise its schema and create the HTTP client.

        If anything fails after the database has been opened, the database is
        closed again before the error propagates. `__aexit__` is never invoked
        when `__aenter__` raises, so nobody else would release it.
        """
        self.db = Db(self.cfg.database_url, self.cfg.embedding_dim)
        await self.db.open()
        try:
            await self.db.init_schema()
            self.http = httpx.AsyncClient(timeout=60.0)
        except BaseException:
            await self.db.close()
            raise

    async def close(self) -> None:
        """Close the database and the HTTP client.

        The HTTP client is closed even when closing the database raises.
        """
        try:
            await self.db.close()
        finally:
            await self.http.aclose()

    async def __aenter__(self) -> Mneme:
        await self.open()
        return self

    async def __aexit__(self, *_: object) -> None:
        await self.close()

    async def reset(self) -> None:
        """Truncate the stored data via `Db.truncate()`."""
        await self.db.truncate()

    async def ingest(self, source_path: str) -> None:
        """Load documents, chunk them, embed every chunk and insert the result.

        Args:
            source_path: Location handed to `load_docs`.

        Raises:
            ValueError: if the embedder returns a different number of vectors
                than the number of chunks sent.
        """
        docs = load_docs(source_path)

        # One (document, index within document, chunk data) triple per chunk.
        pieces = [
            (doc, idx, raw)
            for doc in docs
            for idx, raw in enumerate(chunk(doc.content, self.cfg.chunk_size, self.cfg.overlap))
        ]
        if not pieces:
            # Nothing to embed: avoid sending an empty batch to the embedder.
            print(f"ingest done: 0 chunks from {len(docs)} docs")
            return

        # Embeddings are computed from the overlapped text; only `clean` is stored.
        texts = [raw.overlapped() for _, _, raw in pieces]
        vectors = await embed(self.cfg, self.http, texts)
        print(f"embedded {len(texts)} chunks in one call")

        chunks = [
            Chunk(
                # md5 is used as a stable content id, not for security.
                id=hashlib.md5(
                    f"{doc.source}:{idx}:{raw.clean}".encode(),
                    usedforsecurity=False,
                ).hexdigest(),
                source=doc.source,
                chunk_index=idx,
                content=raw.clean,
                embedding=vector,
                metadata=doc.metadata,
                created_at=doc.created_at,
            )
            # strict=True turns a short or long embedder response into an error
            # instead of silently mis-pairing or dropping chunks.
            for (doc, idx, raw), vector in zip(pieces, vectors, strict=True)
        ]
        await self.db.insert(chunks)
        print(f"ingest done: {len(chunks)} chunks from {len(docs)} docs")

    async def ask(self, query: str) -> str:
        """Answer `query`, using retrieved chunks as context when any are found."""
        vectors = await embed(self.cfg, self.http, [query])
        hits = await self.db.search(self.cfg, vectors[0], query)

        if hits:
            return await self._answer_with_context(query, hits)
        return await self._answer_without_context(query)

    async def _answer_with_context(self, query: str, hits: list[SearchHit]) -> str:
        """Build a numbered context block from `hits` and ask the chat model."""
        prompt = "You are a personal knowledge assistant. You answer questions based ONLY on the provided context. If the context doesn't contain enough information, say so honestly. Answer in the same language as the question. Be concise and direct."

        parts: list[str] = []
        for number, hit in enumerate(hits, start=1):
            c = hit.chunk
            date = c.created_at.date().isoformat()
            parts.append(f"[{number}] ({date}, {c.source}, sim={hit.similarity:.3f})\n{c.content}")
        context = "\n\n".join(parts)

        return await chat(self.cfg, self.http, prompt, f"Context:\n{context}\n\nQuestion: {query}")

    async def _answer_without_context(self, query: str) -> str:
        """Ask the chat model directly when retrieval returned nothing."""
        prompt = "You are a knowledge assistant. Answer the question directly based on your general knowledge. Answer in the same language as the question. Be concise and direct."
        return await chat(self.cfg, self.http, prompt, query)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
from .digest import digest as _digest # noqa: E402
|
|
102
|
+
from .sweep import run_sweep, SweepRow, EvalMetrics, EvalResult # noqa: E402
|
|
103
|
+
|
|
104
|
+
Mneme.digest = staticmethod(_digest)
|
|
105
|
+
Mneme.sweep = staticmethod(run_sweep)
|
|
106
|
+
|
|
107
|
+
__all__ = [
|
|
108
|
+
"Mneme",
|
|
109
|
+
"Config",
|
|
110
|
+
"SweepRow",
|
|
111
|
+
"EvalMetrics",
|
|
112
|
+
"EvalResult",
|
|
113
|
+
]
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""File-based JSONL cache for expensive operations."""
|
|
2
|
+
import hashlib
|
|
3
|
+
import json
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
CACHE_DIR = Path(".cache")
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def corpus_hash(source_path: str) -> str:
    """Deterministic hash of source file(s) content.

    A file is hashed directly; a directory is hashed as the concatenation of
    all `*.jsonl` files found recursively, in sorted path order. A path that
    is neither yields the hash of empty input.

    Returns:
        The first 12 hex characters of the MD5 digest.
    """
    # MD5 is only a cache fingerprint here; flagging it as non-security keeps
    # the call working on FIPS-restricted Python builds.
    h = hashlib.md5(usedforsecurity=False)
    p = Path(source_path)

    if p.is_file():
        files = [p]
    elif p.is_dir():
        files = sorted(p.rglob("*.jsonl"))
    else:
        files = []

    for f in files:
        # Stream in 1 MiB blocks so large corpora are not loaded whole.
        with f.open("rb") as fh:
            while block := fh.read(1 << 20):
                h.update(block)

    return h.hexdigest()[:12]
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class Cache:
|
|
22
|
+
"""Generic JSONL cache. Doesn't know what it stores — just dicts."""
|
|
23
|
+
|
|
24
|
+
def __init__(self, **params: object) -> None:
|
|
25
|
+
raw = json.dumps(params, sort_keys=True, default=str)
|
|
26
|
+
key = hashlib.md5(raw.encode()).hexdigest()[:16]
|
|
27
|
+
self._path = CACHE_DIR / f"{key}.jsonl"
|
|
28
|
+
|
|
29
|
+
@property
|
|
30
|
+
def path(self) -> Path:
|
|
31
|
+
return self._path
|
|
32
|
+
|
|
33
|
+
def exists(self) -> bool:
|
|
34
|
+
return self._path.exists()
|
|
35
|
+
|
|
36
|
+
def load(self) -> list[dict] | None:
|
|
37
|
+
if not self._path.exists():
|
|
38
|
+
return None
|
|
39
|
+
try:
|
|
40
|
+
return [json.loads(line) for line in self._path.read_text().splitlines()]
|
|
41
|
+
except (json.JSONDecodeError, UnicodeDecodeError):
|
|
42
|
+
self._path.unlink(missing_ok=True)
|
|
43
|
+
print(f"cache corrupt, deleted: {self._path}")
|
|
44
|
+
return None
|
|
45
|
+
|
|
46
|
+
def save(self, rows: list[dict]) -> None:
|
|
47
|
+
self._path.parent.mkdir(parents=True, exist_ok=True)
|
|
48
|
+
with open(self._path, "w") as f:
|
|
49
|
+
for row in rows:
|
|
50
|
+
f.write(json.dumps(row) + "\n")
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
|
|
3
|
+
# Strongest separators first. Empty string is the final fallback (per-character).
|
|
4
|
+
SEPARATORS = ["\n\n", "\n", ". ", ", ", " ", ""]
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclass(frozen=True)
class ChunkData:
    """One chunk plus the neighbouring text used as overlap context.

    Embeddings are computed from `overlapped()` so the vector carries some
    knowledge of the adjacent chunks, while only `clean` is persisted, so
    no text is duplicated between stored rows.
    """

    # The chunk's own text.
    clean: str
    # Trailing characters of the previous chunk ("" when there is none).
    head: str
    # Leading characters of the next chunk ("" when there is none).
    tail: str

    def overlapped(self) -> str:
        """Return the chunk text wrapped in its overlap context."""
        return "".join((self.head, self.clean, self.tail))
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def chunk(text: str, chunk_size: int, overlap: float) -> list[ChunkData]:
    """Split `text` into chunks with optional overlap context.

    The stripped text is split along the separator hierarchy and adjacent
    pieces are greedily merged up to `chunk_size`. Each resulting chunk gets
    up to `int(chunk_size * overlap)` characters of its neighbours as
    `head` / `tail`.

    Returns:
        One `ChunkData` per chunk; an empty list for empty input.
    """
    if not text:
        return []

    pieces = _merge(_separate(text.strip(), chunk_size), chunk_size)
    window = int(chunk_size * overlap)

    # No overlap requested: every chunk stands alone.
    if window <= 0:
        return [ChunkData(clean=body, head="", tail="") for body in pieces]

    final = len(pieces) - 1
    chunks: list[ChunkData] = []
    for pos, body in enumerate(pieces):
        before = pieces[pos - 1][-window:] if pos > 0 else ""
        after = pieces[pos + 1][:window] if pos < final else ""
        chunks.append(ChunkData(clean=body, head=before, tail=after))
    return chunks
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _separate(text: str, chunk_size: int, depth: int = 0) -> list[str]:
    """Recursively split `text` using the separator at `SEPARATORS[depth]`.

    Parts shorter than `chunk_size` are kept, with the separator reattached
    as a suffix. Longer parts are split again with the next, weaker
    separator. Joining the returned pieces reproduces `text`.

    Args:
        text: Text to split.
        chunk_size: Parts of at least this length are split further.
        depth: Index into `SEPARATORS`; callers normally leave it at 0.

    Returns:
        The pieces in original order.
    """
    if depth >= len(SEPARATORS):
        return [text]

    sep = SEPARATORS[depth]
    # The last separator is "", meaning per-character. str.split("") raises
    # ValueError, so that fallback has to be spelled out explicitly.
    parts = text.split(sep) if sep else list(text)
    last = len(parts) - 1
    result: list[str] = []

    for i, part in enumerate(parts):
        suffix = sep if i < last else ""
        if len(part) < chunk_size:
            # Reattach the separator as a suffix so _merge can reconstruct
            # the original text without losing whitespace or newlines.
            result.append(part + suffix)
            continue

        pieces = _separate(part, chunk_size, depth + 1)
        if suffix and pieces:
            # The separator that followed the oversized part belongs after
            # its final sub-piece; dropping it would glue this part to the
            # next one when the pieces are merged.
            pieces[-1] += suffix
        result.extend(pieces)

    return result
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _merge(splits: list[str], chunk_size: int) -> list[str]:
|
|
59
|
+
raw: list[str] = []
|
|
60
|
+
for s in splits:
|
|
61
|
+
if not s:
|
|
62
|
+
continue
|
|
63
|
+
|
|
64
|
+
if raw and len(raw[-1]) + len(s) < chunk_size:
|
|
65
|
+
raw[-1] += s
|
|
66
|
+
else:
|
|
67
|
+
raw.append(s)
|
|
68
|
+
|
|
69
|
+
return raw
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
|
|
3
|
+
import click
|
|
4
|
+
from dotenv import load_dotenv
|
|
5
|
+
|
|
6
|
+
from . import Mneme
|
|
7
|
+
from .config import Config
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _mneme() -> Mneme:
    """Build a Mneme engine from the environment-derived configuration."""
    env_cfg = Config.from_env()
    return Mneme(env_cfg)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@click.group(help="Mneme — RAG with built-in eval")
def app() -> None:
    """Root command group; subcommands attach themselves via `@app.command`."""
    return None
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@app.command(help="Digest raw source into cached JSONL.")
@click.argument("source", default="")
def digest(source: str) -> None:
    """Digest SOURCE, falling back to the configured data path when it is empty."""
    env_cfg = Config.from_env()
    target = source if source else env_cfg.data_path
    Mneme.digest(target)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@app.command(help="Ingest documents from a JSONL file or directory.")
@click.argument("source")
def ingest(source: str) -> None:
    """Open an engine, ingest SOURCE and close the engine again."""

    async def _ingest() -> None:
        async with _mneme() as engine:
            await engine.ingest(source)

    asyncio.run(_ingest())
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@app.command(help="Ask a question against the ingested corpus.")
@click.argument("query")
def ask(query: str) -> None:
    """Open an engine, answer QUERY and print the answer framed by blank lines."""

    async def _ask() -> None:
        async with _mneme() as engine:
            reply = await engine.ask(query)
            print(f"\n{reply}\n")

    asyncio.run(_ask())
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@app.command(help="Run an eval sweep across preset configurations.")
@click.argument("level")
@click.option("--limit", "-l", default=30, type=int, help="Number of sample chunks for eval")
def sweep(level: str, limit: int) -> None:
    """Run the sweep for preset LEVEL using the environment configuration."""

    async def _sweep() -> None:
        env_cfg = Config.from_env()
        await Mneme.sweep(env_cfg, level, limit)

    asyncio.run(_sweep())
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def main() -> None:
    """CLI entry point: load `.env`, run the command group, report errors.

    `RuntimeError` and `ValueError` raised while running a command are
    reported as a one-line `error: ...` message and turned into exit status 1.

    Raises:
        SystemExit: with status 1 after a reported error.
    """
    import sys

    load_dotenv()
    try:
        app()
    except (RuntimeError, ValueError) as exc:
        # Diagnostics belong on stderr so stdout carries only command output.
        print(f"error: {exc}", file=sys.stderr)
        raise SystemExit(1) from exc
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
if __name__ == "__main__":
|
|
66
|
+
main()
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
from dataclasses import dataclass, replace
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def _env_int(name: str, default: int = 0) -> int:
    """Read an integer environment variable, treating unset or blank as `default`.

    Raises:
        ValueError: if the variable is set to something that is not an integer.
    """
    raw = os.environ.get(name, "").strip()
    if not raw:
        return default
    try:
        return int(raw)
    except ValueError:
        raise ValueError(f"config: {name} must be an integer, got {raw!r}") from None


@dataclass(frozen=True)
class Config:
    """Immutable settings for the engine.

    Empty strings and 0 mean "not set"; `resolved()` replaces them with the
    values from `DEFAULTS` for the embedder and inference fields.
    """

    database_url: str = ""
    data_path: str = ""
    api_key: str = ""
    embedder_url: str = ""
    embedder_model: str = ""
    embedding_dim: int = 0
    inference_url: str = ""
    inference_model: str = ""
    chunk_size: int = 600
    overlap: float = 0.0
    alpha: float = 0.7
    k: int = 5

    def resolved(self) -> Config:
        """Fills empty fields from defaults, validates, returns new Config.

        Raises:
            ValueError: if `database_url` is empty or any numeric setting is
                outside its allowed range.
        """
        if not self.database_url:
            raise ValueError("config: database_url is required")

        cfg = replace(
            self,
            embedder_url=self.embedder_url or DEFAULTS.embedder_url,
            embedder_model=self.embedder_model or DEFAULTS.embedder_model,
            embedding_dim=self.embedding_dim or DEFAULTS.embedding_dim,
            inference_url=self.inference_url or DEFAULTS.inference_url,
            inference_model=self.inference_model or DEFAULTS.inference_model,
        )

        # A negative dimension is truthy, so it survives the defaulting above
        # and has to be rejected explicitly.
        if cfg.embedding_dim < 1:
            raise ValueError(f"config: embedding_dim must be positive, got {cfg.embedding_dim}")
        if not 100 <= cfg.chunk_size <= 10000:
            raise ValueError(f"config: chunk_size must be 100..10000, got {cfg.chunk_size}")
        if not 0 <= cfg.overlap <= 0.5:
            raise ValueError(f"config: overlap must be 0..0.5, got {cfg.overlap}")
        if not 0 <= cfg.alpha <= 1:
            raise ValueError(f"config: alpha must be 0..1, got {cfg.alpha}")
        if not 1 <= cfg.k <= 20:
            raise ValueError(f"config: k must be 1..20, got {cfg.k}")

        return cfg

    @classmethod
    def from_env(cls) -> Config:
        """Build an unresolved Config from environment variables.

        Unset variables become empty values. `EMBEDDING_DIM` may also be
        present but blank (as in a partially filled `.env`), which is treated
        as unset.

        Raises:
            ValueError: if `EMBEDDING_DIM` is set to a non-integer value.
        """
        env = os.environ
        return cls(
            database_url=env.get("DATABASE_URL", ""),
            data_path=env.get("DATA_PATH", ""),
            api_key=env.get("API_KEY", ""),
            embedder_url=env.get("EMBEDDER_URL", ""),
            embedder_model=env.get("EMBEDDER_MODEL", ""),
            embedding_dim=_env_int("EMBEDDING_DIM"),
            inference_url=env.get("INFERENCE_URL", ""),
            inference_model=env.get("INFERENCE_MODEL", ""),
        )


# Fallbacks applied by Config.resolved() for fields left empty.
DEFAULTS = Config(
    embedder_url="http://localhost:11434",
    embedder_model="bge-m3",
    embedding_dim=1024,
    inference_url="http://localhost:11434",
    inference_model="llama3:8b-instruct-q4_K_M",
)
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""Built-in SQuAD parser. Used by digest when DATA_PATH is a URL."""
|
|
2
|
+
import json
|
|
3
|
+
from urllib.error import URLError
|
|
4
|
+
from urllib.request import urlopen
|
|
5
|
+
|
|
6
|
+
SQUAD_LIMIT = 200
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def download_squad(url: str) -> list[dict]:
|
|
10
|
+
print(f"downloading from {url}...")
|
|
11
|
+
try:
|
|
12
|
+
raw = json.loads(urlopen(url).read())
|
|
13
|
+
except (URLError, json.JSONDecodeError) as exc:
|
|
14
|
+
raise RuntimeError(f"failed to download {url}: {exc}") from exc
|
|
15
|
+
|
|
16
|
+
docs = []
|
|
17
|
+
for article in raw["data"]:
|
|
18
|
+
title = article["title"]
|
|
19
|
+
for para in article["paragraphs"]:
|
|
20
|
+
context = para["context"].strip()
|
|
21
|
+
if len(context) < 50:
|
|
22
|
+
continue
|
|
23
|
+
docs.append({
|
|
24
|
+
"content": context,
|
|
25
|
+
"source": title,
|
|
26
|
+
"metadata": {"dataset": "squad-v2"},
|
|
27
|
+
})
|
|
28
|
+
if len(docs) >= SQUAD_LIMIT:
|
|
29
|
+
break
|
|
30
|
+
if len(docs) >= SQUAD_LIMIT:
|
|
31
|
+
break
|
|
32
|
+
|
|
33
|
+
print(f"downloaded {len(docs)} paragraphs")
|
|
34
|
+
return docs
|