bubble-memory 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bubble/__init__.py +101 -0
- bubble/_shared.py +49 -0
- bubble/archive.py +48 -0
- bubble/chain.py +317 -0
- bubble/cluster.py +66 -0
- bubble/db.py +48 -0
- bubble/decomposer.py +130 -0
- bubble/embed.py +26 -0
- bubble/ingest.py +165 -0
- bubble/main.py +275 -0
- bubble/promote.py +118 -0
- bubble/rerank.py +30 -0
- bubble/retrieve.py +211 -0
- bubble_memory-0.1.0.dist-info/METADATA +156 -0
- bubble_memory-0.1.0.dist-info/RECORD +18 -0
- bubble_memory-0.1.0.dist-info/WHEEL +5 -0
- bubble_memory-0.1.0.dist-info/licenses/LICENSE +21 -0
- bubble_memory-0.1.0.dist-info/top_level.txt +1 -0
bubble/__init__.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
"""
|
|
2
|
+
bubble — Hierarchical Memory Consolidation System
|
|
3
|
+
|
|
4
|
+
Typical agent usage
|
|
5
|
+
-------------------
|
|
6
|
+
import bubble
|
|
7
|
+
|
|
8
|
+
# Once, when a user session starts:
|
|
9
|
+
await bubble.init_graph(user_id)
|
|
10
|
+
|
|
11
|
+
# On every user message — retrieve and store in one call (preferred):
|
|
12
|
+
result = await bubble.observe(user_id, message, prior=agent_reply)
|
|
13
|
+
context = result["retrieved"] # SnapshotNode results relevant to this message
|
|
14
|
+
stored = result["stored"] # ingested node descriptors
|
|
15
|
+
|
|
16
|
+
# Or separately:
|
|
17
|
+
await bubble.process(user_id, message, prior=agent_reply)
|
|
18
|
+
context = await bubble.retrieve(user_id, query)
|
|
19
|
+
|
|
20
|
+
# Periodically (runs HDBSCAN + promotion):
|
|
21
|
+
await bubble.consolidate(user_id)
|
|
22
|
+
|
|
23
|
+
# retrieved is a list of dicts:
|
|
24
|
+
# {id, summary, members: [{id, summary, confidence_label}],
|
|
25
|
+
# context: [{rel, id, summary, confidence_label}]}
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
import asyncio
|
|
29
|
+
|
|
30
|
+
from .db import get_graph, init_graph
|
|
31
|
+
from .embed import embed as _embed
|
|
32
|
+
from .decomposer import decompose as _decompose
|
|
33
|
+
from .ingest import _route_segments, ingest, replay
|
|
34
|
+
from .promote import promote
|
|
35
|
+
from .retrieve import _retrieve_from_vecs, retrieve
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
async def observe(user_id: str, message: str, prior: str | None = None, top_k: int = 3, verbose: bool = False) -> dict:
    """
    Decompose once, retrieve relevant memories, then store — all in a single call.

    Shares the decompose+embed step between retrieval and ingestion.
    Retrieval runs before storage so newly ingested segments don't appear in results.

    Args:
        user_id: owner of the memory graph.
        message: new user message to decompose, retrieve against, and store.
        prior: optional conversational context the user is responding to.
        top_k: number of SnapshotNode results to retrieve.
        verbose: passed through to retrieval for extra diagnostics.

    Returns:
        {
            "retrieved": [...],  # same format as retrieve()
            "stored": [...],     # same format as process()
        }
    """
    segments = await _decompose(message, prior)
    embeddings = list(await asyncio.gather(*[_embed(s["text"]) for s in segments]))

    g = get_graph(user_id)
    # Retrieve BEFORE storing: the documented contract is that segments
    # ingested by this call must not show up in this same call's results.
    # (The original stored first, contradicting the docstring.)
    retrieved = await _retrieve_from_vecs(g, message, embeddings, top_k, verbose)
    stored = await _route_segments(user_id, segments, embeddings, prior)
    return {"retrieved": retrieved, "stored": stored}
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
async def process(user_id: str, message: str, prior: str | None = None) -> list[dict]:
    """
    Ingest a message into the user's memory graph.

    Each decomposed segment is routed to one of:
      - Episodic Episode (intensity >= 0.6): JSONL + Layer 1 node immediately
      - Layer 0 active pool (everything else): waits for consolidate()

    prior: optional conversational context the user is responding to.
    Returns the list of created node descriptors.
    """
    created = await ingest(user_id, message, prior)
    # Opportunistic promotion pass after every ingest.
    await promote(user_id)
    return created
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
async def consolidate(user_id: str) -> dict:
    """
    Run the full consolidation pipeline on a user's graph:
      1. HDBSCAN over the Layer 0 active pool
      2. Promotion of clusters crossing the t_promo_score threshold to Episodes
         (includes JSONL archival, SegmentNode deletion, L2 assignment)

    Intended to be called periodically rather than on every message.

    Returns:
        {"promoted": [...]}  # newly created Episode descriptors
    """
    newly_promoted = await promote(user_id)
    return {"promoted": newly_promoted}
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
# Public API surface re-exported from submodules (db, ingest, promote, retrieve).
__all__ = [
    "init_graph",
    "observe",
    "process",
    "consolidate",
    "retrieve",
    "ingest",
    "promote",
    "replay",
]
|
bubble/_shared.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from datetime import datetime, timezone
|
|
3
|
+
|
|
4
|
+
import numpy as np
|
|
5
|
+
from anthropic import AsyncAnthropic
|
|
6
|
+
from dotenv import load_dotenv
|
|
7
|
+
|
|
8
|
+
load_dotenv()
|
|
9
|
+
|
|
10
|
+
MODEL = os.getenv("BUBBLE_MODEL", "claude-sonnet-4-6")
|
|
11
|
+
_client = AsyncAnthropic()
|
|
12
|
+
|
|
13
|
+
_SUMMARIZE_SYSTEM = """\
|
|
14
|
+
You distill one or more user statements into a single memory record.
|
|
15
|
+
|
|
16
|
+
Rules:
|
|
17
|
+
- Capture the belief, preference, event, or tendency the statements express.
|
|
18
|
+
- When multiple statements are given, identify the common pattern they share.
|
|
19
|
+
- Write exactly one sentence with no grammatical subject.
|
|
20
|
+
- Start with a verb or descriptor that names the belief, event, or pattern.
|
|
21
|
+
- Do not explain, qualify, or ask for clarification.\
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _now() -> str:
|
|
26
|
+
return datetime.now(timezone.utc).isoformat()
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _normalize(vec: np.ndarray) -> list[float]:
|
|
30
|
+
"""L2-normalize a numpy vector and return as a Python list."""
|
|
31
|
+
norm = np.linalg.norm(vec)
|
|
32
|
+
return (vec / norm if norm > 0 else vec).tolist()
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _centroid(nodes: list[dict]) -> list[float]:
    """Mean of the nodes' source embeddings, L2-normalized."""
    embeddings = [node["embedding"] for node in nodes]
    matrix = np.asarray(embeddings, dtype=np.float32)
    return _normalize(matrix.mean(axis=0))
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
async def _summarize(nodes: list[dict]) -> str:
    """Distill the nodes' raw texts into a single one-sentence memory record via the LLM."""
    bullet_list = "\n".join(f"- {n['raw_text']}" for n in nodes)
    request = dict(
        model=MODEL,
        max_tokens=128,
        system=_SUMMARIZE_SYSTEM,
        messages=[{"role": "user", "content": bullet_list}],
    )
    response = await _client.messages.create(**request)
    return response.content[0].text.strip()
|
bubble/archive.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import os
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
_ARCHIVE_DIR = os.getenv("BUBBLE_ARCHIVE_DIR", "./data/archive")
|
|
6
|
+
_MKDIR_DONE = False
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def _path(user_id: str) -> Path:
    """Path to the user's JSONL archive, creating the archive dir on first use."""
    global _MKDIR_DONE
    base = Path(_ARCHIVE_DIR)
    # Create the directory only once per process; subsequent calls skip the syscall.
    if not _MKDIR_DONE:
        base.mkdir(parents=True, exist_ok=True)
        _MKDIR_DONE = True
    return base / f"{user_id}.jsonl"
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def read_segments(user_id: str):
    """Yield all archived segment records for a user, oldest first."""
    archive = _path(user_id)
    if not archive.exists():
        return
    with archive.open("r", encoding="utf-8") as fh:
        for raw_line in fh:
            record = raw_line.strip()
            if record:
                yield json.loads(record)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def write_segment(
    user_id: str,
    *,
    text: str,
    prior: str | None,
    intensity: float,
    valence: str,
    timestamp: str,
) -> None:
    """Append one segment record to the user's JSONL archive."""
    record = dict(
        text=text,
        prior=prior,
        intensity=intensity,
        valence=valence,
        timestamp=timestamp,
    )
    serialized = json.dumps(record)
    with _path(user_id).open("a", encoding="utf-8") as fh:
        fh.write(serialized + "\n")
|
bubble/chain.py
ADDED
|
@@ -0,0 +1,317 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Chain assignment pipeline — spec §3.8.
|
|
3
|
+
|
|
4
|
+
When a new Episode is created, assign it to a topic chain:
|
|
5
|
+
1 — ANN top-1 against Episode centroids.
|
|
6
|
+
2 — Similarity threshold check. Below → new isolated SnapshotNode.
|
|
7
|
+
3 — LLM relatedness check (binary). Not related → new isolated SnapshotNode.
|
|
8
|
+
4 — Related → traverse FOLLOWED_BY edges to chain tail, wire FOLLOWED_BY edge, join SnapshotNode.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import os
|
|
12
|
+
import uuid
|
|
13
|
+
|
|
14
|
+
import numpy as np
|
|
15
|
+
|
|
16
|
+
from ._shared import MODEL, _client, _normalize, _now
|
|
17
|
+
from .db import get_graph
|
|
18
|
+
|
|
19
|
+
_CHAIN_MAX_DISTANCE = float(os.getenv("BUBBLE_CHAIN_MAX_DISTANCE", "0.4"))
|
|
20
|
+
_NLI_ENABLED = os.getenv("BUBBLE_ENABLE_NLI", "false").lower() == "true"
|
|
21
|
+
_NLI_ENDPOINT = os.getenv("BUBBLE_NLI_ENDPOINT", "http://localhost:8999/predict")
|
|
22
|
+
|
|
23
|
+
_SNAPSHOT_SYSTEM = """\
|
|
24
|
+
A sequence of memory records about the user is listed below, from earliest to most recent.
|
|
25
|
+
|
|
26
|
+
Rules:
|
|
27
|
+
- The most recent memory takes precedence over earlier ones.
|
|
28
|
+
- Earlier memory provides historical context.
|
|
29
|
+
- Synthesize all memory into a single simplified coherent narrative that represents the full arc.
|
|
30
|
+
- Output one concise paragraph.
|
|
31
|
+
- No subject, start with verb.
|
|
32
|
+
- Do not explain or justify.\
|
|
33
|
+
"""
|
|
34
|
+
|
|
35
|
+
# ---------------------------------------------------------------------------
|
|
36
|
+
# Relatedness check (LLM or NLI)
|
|
37
|
+
# ---------------------------------------------------------------------------
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
async def _related(summary_a: str, summary_b: str) -> bool:
    """True if the two summaries are about the same topic.

    BUBBLE_ENABLE_NLI=true — local NLI endpoint; argmax label != neutral → related.
    Otherwise (default)    — LLM yes/no classification.

    (Docstring fix: the toggle is the BUBBLE_ENABLE_NLI env var read into
    _NLI_ENABLED at module load; there is no BUBBLE_CHAIN_BACKEND variable.)
    """
    if _NLI_ENABLED:
        # Imported lazily so httpx is only required when the NLI backend is on.
        import httpx

        async with httpx.AsyncClient() as client:
            resp = await client.post(
                _NLI_ENDPOINT,
                json={"inputs": [summary_a, summary_b]},
                headers={"Content-Type": "application/json", "Accept": "application/json"},
            )
            resp.raise_for_status()
            scores = resp.json()  # [{"label": ..., "score": ...}, ...]
            # Entailment OR contradiction both imply "same topic"; only neutral means unrelated.
            return max(scores, key=lambda x: x["score"])["label"].lower() != "neutral"

    response = await _client.messages.create(
        model=MODEL,
        max_tokens=8,
        system="You are a memory topic classifier. Reply with exactly 'yes' or 'no'.",
        messages=[
            {
                "role": "user",
                "content": (f"Are these two beliefs about the same topic or subject?\n\nA: {summary_a}\nB: {summary_b}"),
            }
        ],
    )
    answer = response.content[0].text.strip().lower()
    return answer.startswith("y")
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
# ---------------------------------------------------------------------------
|
|
81
|
+
# Chain traversal
|
|
82
|
+
# ---------------------------------------------------------------------------
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
async def _traverse_to_tail(g, start_id: str) -> str:
    """Walk FOLLOWED_BY edges from start_id and return the chain tail's id.

    The tail is the Episode with no outgoing FOLLOWED_BY edge; if the query
    returns no rows, start_id itself is returned.
    """
    result = await g.query(
        "MATCH (start:Episode {id: $id})-[:FOLLOWED_BY*0..]->(tail:Episode) WHERE NOT (tail)-[:FOLLOWED_BY]->() RETURN tail.id LIMIT 1",
        {"id": start_id},
    )
    rows = result.result_set
    return rows[0][0] if rows else start_id
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
# ---------------------------------------------------------------------------
|
|
100
|
+
# Graph writers
|
|
101
|
+
# ---------------------------------------------------------------------------
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
async def _wire_follows(g, from_id: str, to_id: str) -> None:
    """Create a FOLLOWED_BY edge from Episode `from_id` to Episode `to_id`."""
    params = {"a": from_id, "b": to_id}
    await g.query(
        "MATCH (a:Episode {id: $a}), (b:Episode {id: $b}) CREATE (a)-[:FOLLOWED_BY]->(b)",
        params,
    )
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
# ---------------------------------------------------------------------------
|
|
112
|
+
# Graph loaders
|
|
113
|
+
# ---------------------------------------------------------------------------
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def _rows_to_episode_dicts(rows) -> list[dict]:
|
|
117
|
+
return [
|
|
118
|
+
{
|
|
119
|
+
"id": r[0],
|
|
120
|
+
"summary": r[1],
|
|
121
|
+
"centroid": r[2],
|
|
122
|
+
"episodic": bool(r[3]),
|
|
123
|
+
"timestamp": r[4],
|
|
124
|
+
"valence": r[5] or "neu",
|
|
125
|
+
}
|
|
126
|
+
for r in rows
|
|
127
|
+
if r[2] is not None
|
|
128
|
+
]
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
async def _load_node(g, episode_id: str) -> dict | None:
    """Fetch one Episode by id as a dict; None if absent or centroid-less."""
    result = await g.query(
        "MATCH (t:Episode {id: $id}) RETURN t.id, t.summary, t.centroid, t.episodic, t.timestamp, t.valence",
        {"id": episode_id},
    )
    episodes = _rows_to_episode_dicts(result.result_set)
    if episodes:
        return episodes[0]
    return None
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
# ---------------------------------------------------------------------------
|
|
141
|
+
# SnapshotNode management
|
|
142
|
+
# ---------------------------------------------------------------------------
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
async def _recompute_snapshot_centroid(g, snap_id: str) -> None:
    """Reset a SnapshotNode's centroid to the normalized mean of its members' centroids."""
    result = await g.query(
        "MATCH (snap:SnapshotNode {id: $id})-[:SYNTHESIZES]->(t:Episode) RETURN t.centroid",
        {"id": snap_id},
    )
    member_centroids = [row[0] for row in result.result_set if row[0] is not None]
    if not member_centroids:
        return  # nothing to average — leave the stored centroid untouched
    mean_vec = np.array(member_centroids, dtype=np.float32).mean(axis=0)
    await g.query(
        "MATCH (snap:SnapshotNode {id: $id}) SET snap.centroid = vecf32($centroid)",
        {"id": snap_id, "centroid": _normalize(mean_vec)},
    )
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
async def _join_snapshot(g, snap_id: str, new_node_id: str) -> None:
    """Attach a new Episode to an existing SnapshotNode and mark its summary stale."""
    await g.query(
        "MATCH (snap:SnapshotNode {id: $snap_id}), (t:Episode {id: $tid}) CREATE (snap)-[:SYNTHESIZES]->(t) SET snap.valid = false",
        {"snap_id": snap_id, "tid": new_node_id},
    )
    # Membership changed, so the cached centroid must be refreshed as well.
    await _recompute_snapshot_centroid(g, snap_id)
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
async def _create_snapshot(g, episode_id: str, summary: str, centroid: list[float]) -> None:
    """Create a fresh single-member SnapshotNode synthesizing the given Episode."""
    snapshot_id = str(uuid.uuid4())
    create_params = {"id": snapshot_id, "summary": summary, "centroid": centroid, "ts": _now()}
    await g.query(
        "CREATE (snap:SnapshotNode { id: $id, summary: $summary, centroid: vecf32($centroid), valid: true, timestamp: $ts})",
        create_params,
    )
    await g.query(
        "MATCH (snap:SnapshotNode {id: $snap_id}), (t:Episode {id: $tid}) CREATE (snap)-[:SYNTHESIZES]->(t)",
        {"snap_id": snapshot_id, "tid": episode_id},
    )
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
# ---------------------------------------------------------------------------
|
|
181
|
+
# Lazy snapshot summary generation
|
|
182
|
+
# ---------------------------------------------------------------------------
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
async def ensure_snapshot_summary(
    g,
    snap_id: str,
    *,
    valid: bool | None = None,
    summary: str | None = None,
) -> str | None:
    """
    Return the SnapshotNode summary, generating it lazily if valid=false or summary is null.

    Pass `valid` and `summary` when already fetched (e.g. from ANN query) to skip the
    redundant DB round-trip.

    Single-member chains: copy Episode summary directly (no LLM).
    Multi-member chains: LLM synthesis ordered by timestamp, episodic members last.

    Returns None when the snapshot does not exist or has no members.
    """
    # Fetch cached state only if the caller didn't already supply it.
    if valid is None or summary is None:
        result = await g.query(
            "MATCH (snap:SnapshotNode {id: $id}) RETURN snap.summary, snap.valid",
            {"id": snap_id},
        )
        if not result.result_set:
            return None
        summary, valid = result.result_set[0]
    # Cache hit: summary exists and hasn't been invalidated by a membership change.
    if valid and summary:
        return summary

    result = await g.query(
        "MATCH (snap:SnapshotNode {id: $id})-[:SYNTHESIZES]->(t:Episode) RETURN t.id, t.summary, t.episodic, t.timestamp",
        {"id": snap_id},
    )
    members = [
        {
            "id": r[0],
            "summary": r[1],
            "episodic": bool(r[2]),
            "timestamp": r[3] or "",
        }
        for r in result.result_set
    ]

    if not members:
        return None

    if len(members) == 1:
        # Single-member chain: no synthesis needed, reuse the Episode summary as-is.
        new_summary = members[0]["summary"]
    else:
        # Ordering: non-episodic members by timestamp, then episodic members by
        # timestamp — so episodic memories land last (the "most recent" slot the
        # prompt tells the model to weight highest).
        non_ep = sorted([m for m in members if not m["episodic"]], key=lambda m: m["timestamp"])
        ep = sorted([m for m in members if m["episodic"]], key=lambda m: m["timestamp"])
        ordered = non_ep + ep

        count = len(ordered)
        lines = []
        for i, m in enumerate(ordered):
            tag = " [episodic]" if m["episodic"] else ""
            if i == 0:
                label = f"Belief 1 (earliest){tag}"
            elif i == count - 1:
                label = f"Belief {i + 1} (most recent){tag}"
            else:
                label = f"Belief {i + 1}{tag}"
            lines.append(f"{label}: {m['summary']}")

        new_summary = (
            (
                await _client.messages.create(
                    model=MODEL,
                    max_tokens=256,
                    system=_SNAPSHOT_SYSTEM,
                    messages=[{"role": "user", "content": "\n\n".join(lines)}],
                )
            )
            .content[0]
            .text.strip()
        )

    # Persist and mark valid so the next call is a cache hit.
    await g.query(
        "MATCH (snap:SnapshotNode {id: $id}) SET snap.summary = $summary, snap.valid = true",
        {"id": snap_id, "summary": new_summary},
    )
    return new_summary
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
# ---------------------------------------------------------------------------
|
|
269
|
+
# Public API
|
|
270
|
+
# ---------------------------------------------------------------------------
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
async def check_new(user_id: str, new_episode_id: str) -> None:
    """
    Assign a newly created Episode to a topic chain.
    Called by promote() and ingest._store_episodic() after Episode creation.

    Cascade (spec §3.8): ANN nearest neighbor → distance threshold gate →
    relatedness gate → append to chain tail + join its SnapshotNode.
    Failing any gate creates a new isolated SnapshotNode instead.
    """
    g = get_graph(user_id)
    new_node = await _load_node(g, new_episode_id)
    if new_node is None or new_node["centroid"] is None:
        return

    # Step 1 — ANN top-1 (k=2: self may occupy one slot if already indexed,
    # so request two and filter self out in the WHERE clause)
    result = await g.query(
        "CALL db.idx.vector.queryNodes('Episode', 'centroid', 2, vecf32($vec)) YIELD node, score WHERE node.id <> $id RETURN node.id, node.summary, score LIMIT 1",
        {"vec": new_node["centroid"], "id": new_episode_id},
    )

    if not result.result_set:
        # No other Episodes exist yet — start a new isolated chain.
        await _create_snapshot(g, new_episode_id, new_node["summary"], new_node["centroid"])
        return

    closest_id, closest_summary, score = result.result_set[0]

    # Step 2 — Similarity threshold (score compared as a distance against
    # _CHAIN_MAX_DISTANCE: larger → less similar)
    if score > _CHAIN_MAX_DISTANCE:
        await _create_snapshot(g, new_episode_id, new_node["summary"], new_node["centroid"])
        return

    # Step 3 — relatedness check (LLM or NLI, see _related)
    if not await _related(new_node["summary"], closest_summary):
        await _create_snapshot(g, new_episode_id, new_node["summary"], new_node["centroid"])
        return

    # Step 4 — Append to chain: wire FOLLOWED_BY from the chain tail.
    tail_id = await _traverse_to_tail(g, closest_id)
    await _wire_follows(g, tail_id, new_episode_id)

    snap_result = await g.query(
        "MATCH (snap:SnapshotNode)-[:SYNTHESIZES]->(t:Episode {id: $id}) RETURN snap.id",
        {"id": closest_id},
    )
    if snap_result.result_set:
        await _join_snapshot(g, snap_result.result_set[0][0], new_episode_id)
    else:
        # Neighbor has no SnapshotNode — fall back to an isolated snapshot
        # for the new Episode so it remains retrievable.
        await _create_snapshot(g, new_episode_id, new_node["summary"], new_node["centroid"])
|
bubble/cluster.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
import os
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
from sklearn.cluster import HDBSCAN
|
|
5
|
+
|
|
6
|
+
from .db import get_graph
|
|
7
|
+
|
|
8
|
+
_CLUSTER_MIN_SIZE = int(os.getenv("BUBBLE_CLUSTER_MIN_SIZE", "3"))
|
|
9
|
+
_CLUSTER_DIMS = int(os.getenv("BUBBLE_CLUSTER_DIMS", "128"))
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
async def get_clusters(user_id: str) -> dict[int, list[dict]]:
    """
    Run HDBSCAN over every SegmentNode in a user's graph.

    All SegmentNodes in the graph form the active pool — promoted nodes are
    deleted at promotion time, so no extra filtering is needed.

    Returns {cluster_label: [node_dicts]}, excluding HDBSCAN noise (label -1).
    Returns an empty dict when the pool has fewer than _CLUSTER_MIN_SIZE nodes.

    Each node dict carries: id, raw_text, embedding (768-dim), intensity,
    valence, prior (str|None), timestamp (str).
    """
    g = get_graph(user_id)
    result = await g.query(
        "MATCH (n:SegmentNode) "
        "RETURN n.id, n.raw_text, n.embedding, n.intensity, n.valence, n.prior, n.timestamp"
    )

    rows = result.result_set
    if not rows:
        return {}

    keys = ("id", "raw_text", "embedding", "intensity", "valence", "prior", "timestamp")
    nodes = [dict(zip(keys, row)) for row in rows]

    if len(nodes) < _CLUSTER_MIN_SIZE:
        return {}

    # Matryoshka truncation to _CLUSTER_DIMS dims, then re-normalization:
    # the full embedding is unit-norm but its prefix is not, and restoring
    # unit norm keeps euclidean HDBSCAN equivalent to cosine distance.
    truncated = np.array([n["embedding"][:_CLUSTER_DIMS] for n in nodes], dtype=np.float32)
    lengths = np.linalg.norm(truncated, axis=1, keepdims=True)
    unit_rows = truncated / np.where(lengths > 0, lengths, 1.0)

    labels = HDBSCAN(
        min_cluster_size=_CLUSTER_MIN_SIZE,
        metric="euclidean",  # equivalent to cosine on unit vectors
    ).fit_predict(unit_rows)

    grouped: dict[int, list[dict]] = {}
    for node, label in zip(nodes, labels):
        if label == -1:
            continue  # noise — accumulates at Layer 0, never promotes
        grouped.setdefault(int(label), []).append(node)

    return grouped
|
bubble/db.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from dotenv import load_dotenv
|
|
3
|
+
from falkordb.asyncio import FalkorDB
|
|
4
|
+
|
|
5
|
+
load_dotenv()
|
|
6
|
+
|
|
7
|
+
_client: FalkorDB | None = None
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def get_client() -> FalkorDB:
    """Return the process-wide FalkorDB client, creating it on first call."""
    global _client
    if _client is None:
        host = os.getenv("FALKORDB_HOST", "localhost")
        port = int(os.getenv("FALKORDB_PORT", "6379"))
        _client = FalkorDB(host=host, port=port)
    return _client
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def get_graph(user_id: str):
    """Select the per-user graph, namespaced as 'bubble:<user_id>'."""
    graph_name = f"bubble:{user_id}"
    return get_client().select_graph(graph_name)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
async def _ensure_vector_index(g, label: str, attribute: str, dim: int) -> None:
    """Best-effort creation of an HNSW cosine vector index; no-op if it already exists."""
    try:
        await g.query(
            f"CREATE VECTOR INDEX FOR (n:{label}) ON (n.{attribute}) "
            f"OPTIONS {{dimension: {dim}, similarityFunction: 'cosine'}}"
        )
    except Exception:
        pass  # already exists (deliberate best-effort, matching original behavior)


async def init_graph(user_id: str) -> None:
    """Create indexes for a user graph. Safe to call on an already-initialized graph."""
    g = get_graph(user_id)

    from .embed import EMBED_DIM

    # HNSW vector index on SnapshotNode.centroid — retrieval (L2 entry point).
    await _ensure_vector_index(g, "SnapshotNode", "centroid", EMBED_DIM)

    # HNSW vector index on Episode.centroid — chain assignment ANN search.
    # Used by check_new (chain.py) to find the nearest existing Episode
    # for the three-gate chain assignment cascade.
    await _ensure_vector_index(g, "Episode", "centroid", EMBED_DIM)
|