epochdb 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epochdb-0.1.0/LICENSE +21 -0
- epochdb-0.1.0/PKG-INFO +39 -0
- epochdb-0.1.0/README.md +21 -0
- epochdb-0.1.0/epochdb/__init__.py +4 -0
- epochdb-0.1.0/epochdb/atom.py +46 -0
- epochdb-0.1.0/epochdb/cold_tier.py +81 -0
- epochdb-0.1.0/epochdb/engine.py +132 -0
- epochdb-0.1.0/epochdb/hot_tier.py +72 -0
- epochdb-0.1.0/epochdb/retrieval.py +88 -0
- epochdb-0.1.0/epochdb/transaction.py +73 -0
- epochdb-0.1.0/epochdb.egg-info/PKG-INFO +39 -0
- epochdb-0.1.0/epochdb.egg-info/SOURCES.txt +15 -0
- epochdb-0.1.0/epochdb.egg-info/dependency_links.txt +1 -0
- epochdb-0.1.0/epochdb.egg-info/requires.txt +3 -0
- epochdb-0.1.0/epochdb.egg-info/top_level.txt +1 -0
- epochdb-0.1.0/pyproject.toml +27 -0
- epochdb-0.1.0/setup.cfg +4 -0
epochdb-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Jeff
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
epochdb-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: epochdb
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: An agentic memory engine designed for lossless, tiered verbatim storage and multi-hop retrieval.
|
|
5
|
+
Author: Jeff
|
|
6
|
+
License: MIT License
|
|
7
|
+
Project-URL: Homepage, https://github.com/jeff/epochdb
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Requires-Python: >=3.8
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
License-File: LICENSE
|
|
14
|
+
Requires-Dist: numpy
|
|
15
|
+
Requires-Dist: pyarrow
|
|
16
|
+
Requires-Dist: hnswlib
|
|
17
|
+
Dynamic: license-file
|
|
18
|
+
|
|
19
|
+
# EpochDB
|
|
20
|
+
|
|
21
|
+
**EpochDB** is an agentic memory engine designed for lossless, tiered verbatim storage and multi-hop retrieval.
|
|
22
|
+
|
|
23
|
+
## Why
|
|
24
|
+
I had this idea while playing with LMDB. I wanted to create a memory system that could store conversations in a hybrid way, using in-memory for the most recent conversations and on-disk for older conversations. So, in order to have immutable data, I decided to use Parquet files for the on-disk storage.
|
|
25
|
+
|
|
26
|
+
## Overview
|
|
27
|
+
Traditional AI memory systems compress conversations through destructive summarization. EpochDB bypasses this constraint by storing "Unified Memory Atoms"—the raw text intrinsically paired with dense embeddings.
|
|
28
|
+
|
|
29
|
+
EpochDB uses a tiered architecture reminiscent of CPU caching:
|
|
30
|
+
1. **L1: Working Memory**: Sub-millisecond HNSW vector index in RAM.
|
|
31
|
+
2. **L2: Historical Archive**: Cold storage in immutable, time-partitioned `.parquet` files via PyArrow.
|
|
32
|
+
|
|
33
|
+
It uniquely handles multi-hop retrieval over time-partitioned data using a **Global Entity Index**.
|
|
34
|
+
|
|
35
|
+
## How It Works
|
|
36
|
+
See [`how_it_works.md`](how_it_works.md) for a detailed technical dive into the architecture.
|
|
37
|
+
|
|
38
|
+
## Benchmarks & Examples
|
|
39
|
+
See [`benchmark.md`](benchmark.md) for traces of EpochDB successfully integrated via `LangGraph`. Check out [`example_langgraph.py`](example_langgraph.py) for the source code.
|
epochdb-0.1.0/README.md
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
# EpochDB
|
|
2
|
+
|
|
3
|
+
**EpochDB** is an agentic memory engine designed for lossless, tiered verbatim storage and multi-hop retrieval.
|
|
4
|
+
|
|
5
|
+
## Why
|
|
6
|
+
I had this idea while playing with LMDB. I wanted to create a memory system that could store conversations in a hybrid way, using in-memory for the most recent conversations and on-disk for older conversations. So, in order to have immutable data, I decided to use Parquet files for the on-disk storage.
|
|
7
|
+
|
|
8
|
+
## Overview
|
|
9
|
+
Traditional AI memory systems compress conversations through destructive summarization. EpochDB bypasses this constraint by storing "Unified Memory Atoms"—the raw text intrinsically paired with dense embeddings.
|
|
10
|
+
|
|
11
|
+
EpochDB uses a tiered architecture reminiscent of CPU caching:
|
|
12
|
+
1. **L1: Working Memory**: Sub-millisecond HNSW vector index in RAM.
|
|
13
|
+
2. **L2: Historical Archive**: Cold storage in immutable, time-partitioned `.parquet` files via PyArrow.
|
|
14
|
+
|
|
15
|
+
It uniquely handles multi-hop retrieval over time-partitioned data using a **Global Entity Index**.
|
|
16
|
+
|
|
17
|
+
## How It Works
|
|
18
|
+
See [`how_it_works.md`](how_it_works.md) for a detailed technical dive into the architecture.
|
|
19
|
+
|
|
20
|
+
## Benchmarks & Examples
|
|
21
|
+
See [`benchmark.md`](benchmark.md) for traces of EpochDB successfully integrated via `LangGraph`. Check out [`example_langgraph.py`](example_langgraph.py) for the source code.
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
import uuid
|
|
2
|
+
import time
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from typing import Any, List, Tuple
|
|
5
|
+
import numpy as np
|
|
6
|
+
|
|
7
|
+
@dataclass
class UnifiedMemoryAtom:
    """A single lossless memory record: raw payload paired with its embedding.

    Carries everything needed to round-trip the atom between the hot (RAM)
    tier and the cold (Parquet) tier without information loss.
    """

    # Stable identity, shared across tiers and the global entity index.
    id: str = field(default_factory=lambda: str(uuid.uuid4()))
    # Verbatim stored content (typically raw conversation text).
    payload: Any = None
    # Dense vector representation of the payload.
    embedding: np.ndarray = field(default_factory=lambda: np.array([]))
    # Knowledge-graph facts as (subject, predicate, object) triples.
    triples: List[Tuple[str, str, str]] = field(default_factory=list)
    created_at: float = field(default_factory=time.time)
    access_count: int = 0
    epoch_id: str = "active"

    def calculate_saliency(self) -> float:
        """Return the saliency score S = R / (T + 1).

        R is the access count and T is the atom's age in hours, so
        frequently-accessed, recent atoms score highest.
        """
        age_hours = (time.time() - self.created_at) / 3600.0
        return self.access_count / (age_hours + 1.0)

    def to_dict(self):
        """Serialize to a plain dict (embedding becomes a list), e.g. for the WAL."""
        return {
            "id": self.id,
            "payload": self.payload,
            "embedding": self.embedding.tolist(),
            "triples": self.triples,
            "created_at": self.created_at,
            "access_count": self.access_count,
            "epoch_id": self.epoch_id,
        }

    @classmethod
    def from_dict(cls, data):
        """Rebuild an atom from :meth:`to_dict` output (inverse operation)."""
        return cls(
            id=data["id"],
            payload=data["payload"],
            embedding=np.array(data["embedding"], dtype=np.float32),
            triples=[tuple(t) for t in data["triples"]],
            created_at=data["created_at"],
            access_count=data["access_count"],
            epoch_id=data["epoch_id"],
        )
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import pyarrow as pa
|
|
3
|
+
import pyarrow.parquet as pq
|
|
4
|
+
import numpy as np
|
|
5
|
+
from typing import List, Dict
|
|
6
|
+
from .atom import UnifiedMemoryAtom
|
|
7
|
+
import logging
|
|
8
|
+
|
|
9
|
+
logger = logging.getLogger(__name__)
|
|
10
|
+
|
|
11
|
+
class ColdTier:
    """
    L2 Historical Archive. Resides on Disk.
    Uses Parquet format: one immutable file per expired epoch.
    """
    def __init__(self, storage_dir: str):
        self.storage_dir = storage_dir
        os.makedirs(self.storage_dir, exist_ok=True)

    def _epoch_path(self, epoch_id: str) -> str:
        """Return the Parquet file path for the given epoch id."""
        return os.path.join(self.storage_dir, f"epoch_{epoch_id}.parquet")

    def serialize_epoch(self, epoch_id: str, atoms: List[UnifiedMemoryAtom]):
        """Flushes hot partition to Parquet blocks.

        No-op for an empty atom list so callers may flush unconditionally.
        """
        if not atoms:
            return

        file_path = self._epoch_path(epoch_id)

        # Columnar layout: one Arrow column per atom field. Triples are
        # stored as their repr string and parsed back via ast.literal_eval
        # on load.
        table = pa.table({
            "id": [a.id for a in atoms],
            "payload": [str(a.payload) for a in atoms],
            "embedding": [a.embedding.tolist() for a in atoms],
            "triples": [str(a.triples) for a in atoms],
            "created_at": [a.created_at for a in atoms],
            "access_count": [a.access_count for a in atoms],
            "epoch_id": [epoch_id] * len(atoms)
        })

        pq.write_table(table, file_path)
        logger.info(f"Serialized {len(atoms)} atoms to {file_path}")

    def load_epoch(self, epoch_id: str) -> List[UnifiedMemoryAtom]:
        """Load every atom from one epoch file; returns [] if the epoch is unknown."""
        file_path = self._epoch_path(epoch_id)
        if not os.path.exists(file_path):
            return []

        import ast

        rows = pq.read_table(file_path).to_pylist()

        atoms = []
        for row in rows:
            try:
                triples = ast.literal_eval(row['triples'])
            # Was a bare `except:` — narrowed to what literal_eval raises
            # on a malformed repr string, so we no longer swallow
            # KeyboardInterrupt/SystemExit. We degrade to "no relational
            # facts" rather than failing the whole epoch load.
            except (ValueError, SyntaxError):
                triples = []

            atoms.append(UnifiedMemoryAtom(
                id=row['id'],
                payload=row['payload'],
                embedding=np.array(row['embedding'], dtype=np.float32),
                triples=triples,
                created_at=row['created_at'],
                access_count=row['access_count'],
                epoch_id=row['epoch_id']
            ))
        return atoms

    def get_all_epochs(self) -> List[str]:
        """List the epoch ids of every Parquet epoch file in the storage dir."""
        prefix, suffix = "epoch_", ".parquet"
        return [
            f[len(prefix):-len(suffix)]
            for f in os.listdir(self.storage_dir)
            if f.startswith(prefix) and f.endswith(suffix)
        ]
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import time
|
|
3
|
+
import json
|
|
4
|
+
import logging
|
|
5
|
+
from typing import List, Optional, Dict
|
|
6
|
+
import numpy as np
|
|
7
|
+
|
|
8
|
+
from .atom import UnifiedMemoryAtom
|
|
9
|
+
from .hot_tier import HotTier
|
|
10
|
+
from .cold_tier import ColdTier
|
|
11
|
+
from .transaction import WriteAheadLog, FileLock, MultiIndexTransaction
|
|
12
|
+
from .retrieval import RetrievalManager
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
class EpochDB:
    """The main client for the Agentic Memory Engine.

    Coordinates the hot tier (RAM vector index), the cold tier (immutable
    Parquet epochs), a write-ahead log for crash recovery, a file lock for
    single-writer safety, and the global entity index used for multi-hop
    retrieval across epochs.
    """

    def __init__(self, storage_dir: str = "./.epochdb_data", dim: int = 384, epoch_duration_secs: int = 3600, saliency_threshold: float = 0.1):
        """Open (and lock) the database rooted at *storage_dir*.

        Args:
            storage_dir: Directory holding the lock file, WAL, global KG
                and epoch Parquet files.
            dim: Embedding dimensionality for the HNSW index.
            epoch_duration_secs: Wall-clock length of one hot epoch before
                it is checkpointed to cold storage.
            saliency_threshold: Stored but not read here — presumably
                reserved for saliency-based eviction. TODO confirm intent.

        Raises:
            RuntimeError: If another process already holds the lock.
        """
        self.storage_dir = os.path.abspath(storage_dir)
        self.dim = dim
        self.epoch_duration_secs = epoch_duration_secs
        self.saliency_threshold = saliency_threshold

        os.makedirs(self.storage_dir, exist_ok=True)

        # Concurrency Lock: only one EpochDB instance may own this dir.
        self.lock = FileLock(os.path.join(self.storage_dir, ".lock"))
        self.lock.acquire()

        # WAL
        self.wal = WriteAheadLog(os.path.join(self.storage_dir, "wal.jsonl"))

        # Global Entity Index: entity -> [[atom_id, epoch_id], ...]
        self.global_kg_file = os.path.join(self.storage_dir, "global_kg.json")
        self.global_kg: Dict[str, List[List[str]]] = {}
        if os.path.exists(self.global_kg_file):
            try:
                with open(self.global_kg_file, "r") as f:
                    self.global_kg = json.load(f)
            except Exception as e:
                # A corrupt/unreadable KG file must not prevent startup;
                # we log it and start with an empty index.
                logger.error(f"Failed to load global KG: {e}")

        # Epoch State
        self.current_epoch_id = f"epoch_{int(time.time())}"
        self.epoch_start_time = time.time()

        # Tiers
        self.hot_tier = HotTier(dim=self.dim)
        self.cold_tier = ColdTier(self.storage_dir)

        # Retrieval
        # We pass self.global_kg explicitly to the retriever now; the dict
        # is shared by reference, so mutations here are visible to it.
        self.retriever = RetrievalManager(self.hot_tier, self.cold_tier, self.global_kg)

    def _save_global_kg(self):
        """Persist the global entity index to disk as JSON."""
        with open(self.global_kg_file, "w") as f:
            json.dump(self.global_kg, f)

    def add_memory(self, payload: any, embedding: np.ndarray, triples: List[tuple] = None) -> str:
        """Agent adds a new memory atom.

        Args:
            payload: Verbatim content to store.
            embedding: Dense vector; expected to have length ``self.dim``.
            triples: Optional (subject, predicate, object) facts.

        Returns:
            The id of the newly created atom.
        """
        if triples is None:
            triples = []

        atom = UnifiedMemoryAtom(
            payload=payload,
            embedding=embedding,
            triples=triples,
            epoch_id=self.current_epoch_id
        )

        # ACID Multi-Index Transaction: journal to WAL first, apply to the
        # hot tier only on commit.
        with MultiIndexTransaction(self.wal, self.hot_tier) as tx:
            tx.add(atom)

        # Update Global Index so multi-hop queries can find this atom by
        # entity name even after it migrates to cold storage.
        for subj, pred, obj in triples:
            if subj not in self.global_kg: self.global_kg[subj] = []
            if obj not in self.global_kg: self.global_kg[obj] = []

            # Subj/Obj -> [atom.id, epoch_id]
            self.global_kg[subj].append([atom.id, self.current_epoch_id])
            self.global_kg[obj].append([atom.id, self.current_epoch_id])

        self._save_global_kg()

        self._check_epoch_expiry()
        return atom.id

    def recall(self, query_emb: np.ndarray, top_k: int = 5) -> List[UnifiedMemoryAtom]:
        """Agent queries memory; returns atoms ranked by saliency."""
        results = self.retriever.search(query_emb, top_k=top_k)
        self._check_epoch_expiry()
        return results

    def _check_epoch_expiry(self):
        """Lifecycle Management: Hot -> Cold once the epoch has aged out."""
        if time.time() - self.epoch_start_time > self.epoch_duration_secs:
            self._checkpoint()

    def _checkpoint(self):
        """Epoch Checkpoint: Flush to disk, clear memory, start a new epoch."""
        logger.info(f"Triggering Epoch Checkpoint for {self.current_epoch_id}")

        # Gather atoms
        atoms = list(self.hot_tier.atoms.values())

        if atoms:
            self.cold_tier.serialize_epoch(self.current_epoch_id, atoms)

        # Clear Hot Tier & WAL — safe only after the Parquet write above
        # has durably captured the epoch.
        self.hot_tier.clear()
        self.wal.clear()

        # Start new Epoch
        self.current_epoch_id = f"epoch_{int(time.time())}"
        self.epoch_start_time = time.time()

    def force_checkpoint(self):
        """Manually trigger checkpoint for testing."""
        self._checkpoint()

    def close(self):
        """Persist the KG, close the WAL, and release the process lock.

        NOTE(review): close() does not checkpoint the hot tier, so unflushed
        atoms survive only in the WAL — confirm that is the intended contract.
        """
        self._save_global_kg()
        self.wal.close()
        self.lock.release()

    def __del__(self):
        # Best-effort lock release on garbage collection; never let
        # finalization raise. (Was a bare `except:`, which would also
        # swallow SystemExit/KeyboardInterrupt during interpreter teardown.)
        try:
            self.lock.release()
        except Exception:
            pass
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
import hnswlib
|
|
2
|
+
import numpy as np
|
|
3
|
+
import logging
|
|
4
|
+
from typing import Dict, List, Optional
|
|
5
|
+
from .atom import UnifiedMemoryAtom
|
|
6
|
+
|
|
7
|
+
logger = logging.getLogger(__name__)
|
|
8
|
+
|
|
9
|
+
class HotTier:
    """
    L1 Working Memory. Resides in RAM.
    Houses Active Partition, vector index. (Global KG is now in engine).
    """
    def __init__(self, dim: int, max_elements: int = 10000):
        self.dim = dim
        self.max_elements = max_elements

        # HNSW Index for Vectors
        self.vector_index = self._new_index()

        # Atom storage: id -> UnifiedMemoryAtom
        self.atoms: Dict[str, UnifiedMemoryAtom] = {}

        # map string uuid to integer for hnsw (hnswlib labels are ints)
        self.uuid_to_int = {}
        self.int_to_uuid = {}
        self._next_int_id = 0

    def _new_index(self):
        """Build a fresh, empty HNSW cosine index for this tier's dim."""
        index = hnswlib.Index(space='cosine', dim=self.dim)
        index.init_index(max_elements=self.max_elements, ef_construction=200, M=16)
        return index

    def _add_atom(self, atom: UnifiedMemoryAtom):
        """Internal method called by MIT transaction."""
        if atom.id in self.atoms:
            return  # Already exists

        int_id = self._next_int_id
        self.uuid_to_int[atom.id] = int_id
        self.int_to_uuid[int_id] = atom.id
        self._next_int_id += 1

        # 1. Add to Vector space. Skipped when the embedding is missing or
        # has the wrong dimensionality — the atom is then payload-only and
        # reachable via the KG but not via vector search.
        if atom.embedding is not None and len(atom.embedding) == self.dim:
            self.vector_index.add_items([atom.embedding], [int_id])
        else:
            logger.warning(f"Atom {atom.id} has no valid embedding for dim {self.dim}.")

        # 2. Store payload
        self.atoms[atom.id] = atom

    def query_vector(self, query_emb: np.ndarray, top_k: int = 5) -> List[UnifiedMemoryAtom]:
        """Return up to top_k nearest atoms by cosine similarity.

        BUGFIX: k is clamped to the number of vectors actually in the
        index, not len(self.atoms) — atoms with invalid embeddings are
        stored but never indexed, so the old clamp could over-request and
        make knn_query raise.
        """
        indexed = self.vector_index.get_current_count()
        if indexed == 0:
            return []

        actual_k = min(top_k, indexed)
        # knn_query returns (labels, distances)
        labels, distances = self.vector_index.knn_query([query_emb], k=actual_k)

        results = []
        for int_lbl in labels[0]:
            if int_lbl in self.int_to_uuid:
                uuid_str = self.int_to_uuid[int_lbl]
                results.append(self.atoms[uuid_str])

        return results

    def clear(self):
        """Called upon Epoch Expiry after serialization."""
        self.atoms.clear()
        self.uuid_to_int.clear()
        self.int_to_uuid.clear()
        self._next_int_id = 0
        # Replace rather than reset: hnswlib has no bulk-delete, so a
        # fresh index is the cheapest way to empty it.
        self.vector_index = self._new_index()
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from typing import List, Dict, Set
|
|
3
|
+
from .atom import UnifiedMemoryAtom
|
|
4
|
+
from .hot_tier import HotTier
|
|
5
|
+
from .cold_tier import ColdTier
|
|
6
|
+
import logging
|
|
7
|
+
|
|
8
|
+
logger = logging.getLogger(__name__)
|
|
9
|
+
|
|
10
|
+
class RetrievalManager:
    """Multi-stage retrieval process with Global KG.

    Pipeline: (1) semantic search in both tiers, (2) multi-hop relational
    expansion through the global entity index, (3) saliency re-ranking.
    """
    def __init__(self, hot_tier: HotTier, cold_tier: ColdTier, global_kg: Dict[str, List[List[str]]]):
        self.hot_tier = hot_tier
        self.cold_tier = cold_tier
        # Shared by reference with the engine; its mutations are visible here.
        self.global_kg = global_kg

    def _fetch_atom_by_id(self, atom_id: str, epoch_id: str):
        """Targeted fetch of a single atom; returns None when not found.

        Checks the hot tier first, then loads only the one cold epoch
        named by the KG entry instead of scanning all of cold storage.
        """
        # Check Hot Tier
        if atom_id in self.hot_tier.atoms:
            return self.hot_tier.atoms[atom_id]

        # Check Cold Tier targeted by epoch
        # epoch_id might be "epoch_X" but load_epoch expects X
        epoch_str = epoch_id.replace("epoch_", "")
        atoms = self.cold_tier.load_epoch(epoch_str)
        for a in atoms:
            if a.id == atom_id:
                return a
        return None

    def search(self, query_emb: np.ndarray, top_k: int = 5, expand_hops: int = 1) -> List[UnifiedMemoryAtom]:
        """Retrieve atoms for *query_emb*.

        Returns at most ``top_k * 2`` atoms (semantic hits plus up to
        ``expand_hops`` hops of KG neighbors), sorted by saliency.
        """
        candidates: Dict[str, UnifiedMemoryAtom] = {}

        # 1. Semantic Hook (Hot Tier)
        hot_hits = self.hot_tier.query_vector(query_emb, top_k=top_k)
        for atom in hot_hits:
            candidates[atom.id] = atom

        # Cold tier Semantic Hook: brute-force cosine over each epoch.
        epochs = self.cold_tier.get_all_epochs()
        for epoch in epochs:
            cold_atoms = self.cold_tier.load_epoch(epoch)
            # BUGFIX: drop atoms whose embedding is missing or has the
            # wrong length — mixing lengths would make np.array build a
            # ragged object array and crash np.linalg.norm below.
            cold_atoms = [
                a for a in cold_atoms
                if a.embedding is not None and len(a.embedding) == len(query_emb)
            ]
            if cold_atoms:
                embeddings = np.array([a.embedding for a in cold_atoms])
                norms = np.linalg.norm(embeddings, axis=1) * np.linalg.norm(query_emb)
                # Guard against division by zero for all-zero vectors.
                norms = np.where(norms == 0, 1e-10, norms)
                dots = np.dot(embeddings, query_emb)
                sims = dots / norms

                best_idx = np.argsort(sims)[-top_k:][::-1]
                for idx in best_idx:
                    if sims[idx] > 0.0:
                        atom = cold_atoms[idx]
                        candidates[atom.id] = atom

        # 2. Relational Expansion (Global KG)
        if expand_hops > 0:
            expansion_set = set(candidates.keys())
            for _ in range(expand_hops):
                new_neighbors = set()
                # Step B: Identify Entities mentioned by the frontier atoms
                for a_id in expansion_set:
                    atom = candidates.get(a_id)
                    if not atom: continue
                    entities = set()
                    for subj, pred, obj in atom.triples:
                        entities.add(subj)
                        entities.add(obj)

                    # Step C: Query KG for neighbors
                    for ent in entities:
                        if ent in self.global_kg:
                            for neighbor_atom_id, epoch_id in self.global_kg[ent]:
                                if neighbor_atom_id not in candidates:
                                    # Step D: Targeted Fetch
                                    n_atom = self._fetch_atom_by_id(neighbor_atom_id, epoch_id)
                                    if n_atom:
                                        new_neighbors.add(n_atom.id)
                                        candidates[n_atom.id] = n_atom
                # Next hop starts from the atoms discovered in this hop.
                expansion_set = new_neighbors

        # 3. Temporal Re-ranking and Update Access
        results = list(candidates.values())
        for r in results:
            # Every retrieved candidate counts as an access for saliency.
            r.access_count += 1

        results.sort(key=lambda x: x.calculate_saliency(), reverse=True)
        return results[:top_k * 2]
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import json
|
|
3
|
+
import logging
|
|
4
|
+
from typing import Dict, Any, Optional
|
|
5
|
+
|
|
6
|
+
logger = logging.getLogger(__name__)
|
|
7
|
+
|
|
8
|
+
class FileLock:
    """A simple file-based lock to prevent concurrent epochdb instances
    from writing to the same storage directory.

    The lock file stores the owning process id for debugging.
    """
    def __init__(self, lock_path: str):
        self.lock_path = lock_path

    def acquire(self):
        """Create the lock file, failing if it already exists.

        BUGFIX: uses O_CREAT | O_EXCL so the existence check and the file
        creation are a single atomic step; the previous
        exists()-then-open() pattern was a TOCTOU race where two processes
        could both acquire the lock.

        Raises:
            RuntimeError: If another process already holds the lock.
        """
        try:
            fd = os.open(self.lock_path, os.O_CREAT | os.O_EXCL | os.O_WRONLY)
        except FileExistsError:
            raise RuntimeError(f"Database is locked by another process: {self.lock_path}") from None
        try:
            os.write(fd, str(os.getpid()).encode())
        finally:
            os.close(fd)

    def release(self):
        """Remove the lock file; safe to call when the lock is not held."""
        if os.path.exists(self.lock_path):
            os.remove(self.lock_path)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class WriteAheadLog:
    """Append-only JSONL log for crash recovery.

    Each record is one JSON object per line: {"op": ..., "data": ...}.
    The file handle is held open in append mode for the log's lifetime.
    """
    def __init__(self, wal_path: str):
        self.wal_path = wal_path
        self._file = open(self.wal_path, "a")

    def append(self, operation: str, data: Dict[str, Any]):
        """Durably append one record: write, flush, then fsync to disk."""
        line = json.dumps({"op": operation, "data": data})
        self._file.write(line + "\n")
        self._file.flush()
        os.fsync(self._file.fileno())

    def close(self):
        """Close the underlying file handle."""
        self._file.close()

    def clear(self):
        """Called upon successful Epoch Checkpoint.

        Truncates the log by reopening it in write mode, then resumes
        appending on a fresh handle.
        """
        self._file.close()
        open(self.wal_path, "w").close()
        self._file = open(self.wal_path, "a")
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class MultiIndexTransaction:
    """
    Context manager to ensure an atom is written to the WAL,
    the Vector Index, and the Knowledge Graph atomically.

    Atoms are journaled to the WAL as soon as ``add`` is called; they are
    applied to the hot tier only if the ``with`` body completes without
    raising, followed by a COMMIT marker. On failure a ROLLBACK marker is
    journaled and the hot tier is left untouched.
    """
    def __init__(self, wal: WriteAheadLog, hot_tier):
        self.wal = wal
        self.hot_tier = hot_tier
        self.pending_atoms = []

    def __enter__(self):
        # Fresh staging area for every transaction scope.
        self.pending_atoms = []
        return self

    def add(self, atom):
        """Stage *atom* and journal it to the WAL immediately."""
        self.pending_atoms.append(atom)
        self.wal.append("ADD", atom.to_dict())

    def __exit__(self, exc_type, exc_val, exc_tb):
        if exc_type is None:
            # Success path: apply staged atoms, then mark the commit.
            for staged in self.pending_atoms:
                self.hot_tier._add_atom(staged)
            self.wal.append("COMMIT", {})
            return True
        # Failure path: journal the rollback and propagate the exception.
        logger.error(f"Transaction failed, rolling back. Reason: {exc_val}")
        self.wal.append("ROLLBACK", {})
        return False
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: epochdb
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: An agentic memory engine designed for lossless, tiered verbatim storage and multi-hop retrieval.
|
|
5
|
+
Author: Jeff
|
|
6
|
+
License: MIT License
|
|
7
|
+
Project-URL: Homepage, https://github.com/jeff/epochdb
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Requires-Python: >=3.8
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
License-File: LICENSE
|
|
14
|
+
Requires-Dist: numpy
|
|
15
|
+
Requires-Dist: pyarrow
|
|
16
|
+
Requires-Dist: hnswlib
|
|
17
|
+
Dynamic: license-file
|
|
18
|
+
|
|
19
|
+
# EpochDB
|
|
20
|
+
|
|
21
|
+
**EpochDB** is an agentic memory engine designed for lossless, tiered verbatim storage and multi-hop retrieval.
|
|
22
|
+
|
|
23
|
+
## Why
|
|
24
|
+
I had this idea while playing with LMDB. I wanted to create a memory system that could store conversations in a hybrid way, using in-memory for the most recent conversations and on-disk for older conversations. So, in order to have immutable data, I decided to use Parquet files for the on-disk storage.
|
|
25
|
+
|
|
26
|
+
## Overview
|
|
27
|
+
Traditional AI memory systems compress conversations through destructive summarization. EpochDB bypasses this constraint by storing "Unified Memory Atoms"—the raw text intrinsically paired with dense embeddings.
|
|
28
|
+
|
|
29
|
+
EpochDB uses a tiered architecture reminiscent of CPU caching:
|
|
30
|
+
1. **L1: Working Memory**: Sub-millisecond HNSW vector index in RAM.
|
|
31
|
+
2. **L2: Historical Archive**: Cold storage in immutable, time-partitioned `.parquet` files via PyArrow.
|
|
32
|
+
|
|
33
|
+
It uniquely handles multi-hop retrieval over time-partitioned data using a **Global Entity Index**.
|
|
34
|
+
|
|
35
|
+
## How It Works
|
|
36
|
+
See [`how_it_works.md`](how_it_works.md) for a detailed technical dive into the architecture.
|
|
37
|
+
|
|
38
|
+
## Benchmarks & Examples
|
|
39
|
+
See [`benchmark.md`](benchmark.md) for traces of EpochDB successfully integrated via `LangGraph`. Check out [`example_langgraph.py`](example_langgraph.py) for the source code.
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
epochdb/__init__.py
|
|
5
|
+
epochdb/atom.py
|
|
6
|
+
epochdb/cold_tier.py
|
|
7
|
+
epochdb/engine.py
|
|
8
|
+
epochdb/hot_tier.py
|
|
9
|
+
epochdb/retrieval.py
|
|
10
|
+
epochdb/transaction.py
|
|
11
|
+
epochdb.egg-info/PKG-INFO
|
|
12
|
+
epochdb.egg-info/SOURCES.txt
|
|
13
|
+
epochdb.egg-info/dependency_links.txt
|
|
14
|
+
epochdb.egg-info/requires.txt
|
|
15
|
+
epochdb.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
epochdb
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "epochdb"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "An agentic memory engine designed for lossless, tiered verbatim storage and multi-hop retrieval."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = { text = "MIT License" }
|
|
11
|
+
authors = [
|
|
12
|
+
{ name = "Jeff" }
|
|
13
|
+
]
|
|
14
|
+
requires-python = ">=3.8"
|
|
15
|
+
classifiers = [
|
|
16
|
+
"Programming Language :: Python :: 3",
|
|
17
|
+
"License :: OSI Approved :: MIT License",
|
|
18
|
+
"Operating System :: OS Independent",
|
|
19
|
+
]
|
|
20
|
+
dependencies = [
|
|
21
|
+
"numpy",
|
|
22
|
+
"pyarrow",
|
|
23
|
+
"hnswlib"
|
|
24
|
+
]
|
|
25
|
+
|
|
26
|
+
[project.urls]
|
|
27
|
+
"Homepage" = "https://github.com/jeff/epochdb"
|
epochdb-0.1.0/setup.cfg
ADDED