@simbimbo/memory-ocmemog 0.1.4 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +27 -0
- package/README.md +9 -7
- package/brain/runtime/inference.py +31 -1
- package/brain/runtime/memory/api.py +824 -5
- package/brain/runtime/memory/context_builder.py +101 -76
- package/brain/runtime/memory/distill.py +156 -13
- package/brain/runtime/memory/freshness.py +24 -1
- package/brain/runtime/memory/integrity.py +22 -6
- package/brain/runtime/memory/pondering_engine.py +87 -8
- package/brain/runtime/memory/promote.py +6 -0
- package/brain/runtime/memory/provenance.py +52 -0
- package/brain/runtime/memory/retrieval.py +116 -50
- package/brain/runtime/memory/vector_index.py +67 -5
- package/docs/notes/2026-03-18-memory-repair-and-backfill.md +70 -0
- package/docs/notes/local-model-role-matrix-2026-03-18.md +50 -0
- package/docs/usage.md +16 -14
- package/index.ts +1 -1
- package/ocmemog/sidecar/app.py +381 -9
- package/ocmemog/sidecar/compat.py +7 -1
- package/ocmemog/sidecar/transcript_watcher.py +2 -2
- package/package.json +1 -1
- package/scripts/install-ocmemog.sh +2 -2
- package/scripts/ocmemog-backfill-vectors.py +33 -0
- package/scripts/ocmemog-context.sh +1 -1
- package/scripts/ocmemog-demo.py +1 -1
- package/scripts/ocmemog-load-test.py +1 -1
- package/scripts/ocmemog-ponder.sh +2 -2
- package/scripts/ocmemog-recall-test.py +1 -1
- package/scripts/ocmemog-reindex-vectors.py +8 -0
- package/scripts/ocmemog-reliability-soak.py +1 -1
- package/scripts/ocmemog-sidecar.sh +33 -7
- package/scripts/ocmemog-test-rig.py +1 -1
|
@@ -106,6 +106,16 @@ def normalize_metadata(metadata: Optional[Dict[str, Any]], *, source: Optional[s
|
|
|
106
106
|
"derived_from_promotion_id",
|
|
107
107
|
"derived_via",
|
|
108
108
|
"kind",
|
|
109
|
+
"memory_status",
|
|
110
|
+
"superseded_by",
|
|
111
|
+
"supersedes",
|
|
112
|
+
"duplicate_of",
|
|
113
|
+
"duplicate_candidates",
|
|
114
|
+
"contradicts",
|
|
115
|
+
"contradiction_candidates",
|
|
116
|
+
"contradiction_status",
|
|
117
|
+
"canonical_reference",
|
|
118
|
+
"supersession_recommendation",
|
|
109
119
|
):
|
|
110
120
|
if raw.get(key) is not None and provenance.get(key) is None:
|
|
111
121
|
provenance[key] = raw.get(key)
|
|
@@ -177,6 +187,20 @@ def apply_links(reference: str, metadata: Optional[Dict[str, Any]]) -> None:
|
|
|
177
187
|
_link_once(reference, "candidate", f"candidate:{provenance['derived_from_candidate_id']}")
|
|
178
188
|
if provenance.get("derived_from_promotion_id"):
|
|
179
189
|
_link_once(reference, "promotion", f"promotions:{provenance['derived_from_promotion_id']}")
|
|
190
|
+
if provenance.get("superseded_by"):
|
|
191
|
+
_link_once(reference, "superseded_by", str(provenance.get("superseded_by")))
|
|
192
|
+
if provenance.get("supersedes"):
|
|
193
|
+
_link_once(reference, "supersedes", str(provenance.get("supersedes")))
|
|
194
|
+
if provenance.get("duplicate_of"):
|
|
195
|
+
_link_once(reference, "duplicate_of", str(provenance.get("duplicate_of")))
|
|
196
|
+
for candidate in provenance.get("duplicate_candidates") or []:
|
|
197
|
+
_link_once(reference, "duplicate_candidate", str(candidate))
|
|
198
|
+
for target in provenance.get("contradicts") or []:
|
|
199
|
+
_link_once(reference, "contradicts", str(target))
|
|
200
|
+
for target in provenance.get("contradiction_candidates") or []:
|
|
201
|
+
_link_once(reference, "contradiction_candidate", str(target))
|
|
202
|
+
if provenance.get("canonical_reference"):
|
|
203
|
+
_link_once(reference, "canonical", str(provenance.get("canonical_reference")))
|
|
180
204
|
|
|
181
205
|
|
|
182
206
|
def update_memory_metadata(reference: str, updates: Dict[str, Any]) -> Optional[Dict[str, Any]]:
|
|
@@ -201,6 +225,34 @@ def update_memory_metadata(reference: str, updates: Dict[str, Any]) -> Optional[
|
|
|
201
225
|
return merged
|
|
202
226
|
|
|
203
227
|
|
|
228
|
+
def force_update_memory_metadata(reference: str, updates: Dict[str, Any]) -> Optional[Dict[str, Any]]:
|
|
229
|
+
table, sep, raw_id = reference.partition(":")
|
|
230
|
+
if not sep or table not in _MEMORY_TABLES or not raw_id.isdigit():
|
|
231
|
+
return None
|
|
232
|
+
conn = store.connect()
|
|
233
|
+
try:
|
|
234
|
+
row = conn.execute(f"SELECT metadata_json FROM {table} WHERE id = ?", (int(raw_id),)).fetchone()
|
|
235
|
+
if not row:
|
|
236
|
+
return None
|
|
237
|
+
current = _load_json(row["metadata_json"], {})
|
|
238
|
+
provenance_meta = current.get("provenance") if isinstance(current.get("provenance"), dict) else {}
|
|
239
|
+
for key, value in updates.items():
|
|
240
|
+
if value is None or value == "":
|
|
241
|
+
provenance_meta.pop(key, None)
|
|
242
|
+
else:
|
|
243
|
+
provenance_meta[key] = value
|
|
244
|
+
current["provenance"] = provenance_meta
|
|
245
|
+
conn.execute(
|
|
246
|
+
f"UPDATE {table} SET metadata_json = ? WHERE id = ?",
|
|
247
|
+
(json.dumps(current, ensure_ascii=False), int(raw_id)),
|
|
248
|
+
)
|
|
249
|
+
conn.commit()
|
|
250
|
+
finally:
|
|
251
|
+
conn.close()
|
|
252
|
+
apply_links(reference, current)
|
|
253
|
+
return current
|
|
254
|
+
|
|
255
|
+
|
|
204
256
|
def fetch_reference(reference: str) -> Optional[Dict[str, Any]]:
|
|
205
257
|
prefix, sep, raw_id = reference.partition(":")
|
|
206
258
|
if not sep or not prefix:
|
|
@@ -1,19 +1,57 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
from datetime import datetime, timezone
|
|
3
4
|
from typing import Dict, List, Any, Iterable, Tuple
|
|
4
5
|
|
|
6
|
+
import json
|
|
7
|
+
|
|
5
8
|
from brain.runtime.instrumentation import emit_event
|
|
6
9
|
from brain.runtime import state_store
|
|
7
10
|
from brain.runtime.memory import memory_links, provenance, store, vector_index
|
|
8
11
|
|
|
9
12
|
|
|
13
|
+
def _tokenize(text: str) -> List[str]:
|
|
14
|
+
return [token for token in "".join(ch.lower() if ch.isalnum() else " " for ch in (text or "")).split() if token]
|
|
15
|
+
|
|
16
|
+
|
|
10
17
|
def _match_score(text: str, query: str) -> float:
|
|
11
|
-
if not text:
|
|
18
|
+
if not text or not query:
|
|
12
19
|
return 0.0
|
|
13
20
|
text_l = text.lower()
|
|
14
21
|
query_l = query.lower()
|
|
15
22
|
if query_l in text_l:
|
|
16
23
|
return 1.0
|
|
24
|
+
query_tokens = set(_tokenize(query_l))
|
|
25
|
+
if not query_tokens:
|
|
26
|
+
return 0.0
|
|
27
|
+
text_tokens = set(_tokenize(text_l))
|
|
28
|
+
if not text_tokens:
|
|
29
|
+
return 0.0
|
|
30
|
+
overlap = len(query_tokens & text_tokens) / max(1, len(query_tokens))
|
|
31
|
+
return round(min(0.95, overlap * 0.85), 3)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _recency_score(timestamp: str | None) -> float:
|
|
35
|
+
if not timestamp:
|
|
36
|
+
return 0.0
|
|
37
|
+
parsed = None
|
|
38
|
+
for fmt in ("%Y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%dT%H:%M:%S.%f"):
|
|
39
|
+
try:
|
|
40
|
+
parsed = datetime.strptime(timestamp, fmt).replace(tzinfo=timezone.utc)
|
|
41
|
+
break
|
|
42
|
+
except ValueError:
|
|
43
|
+
continue
|
|
44
|
+
if parsed is None:
|
|
45
|
+
return 0.0
|
|
46
|
+
age_days = max(0.0, (datetime.now(timezone.utc) - parsed).total_seconds() / 86400.0)
|
|
47
|
+
if age_days <= 1:
|
|
48
|
+
return 0.2
|
|
49
|
+
if age_days <= 7:
|
|
50
|
+
return 0.15
|
|
51
|
+
if age_days <= 30:
|
|
52
|
+
return 0.08
|
|
53
|
+
if age_days <= 180:
|
|
54
|
+
return 0.03
|
|
17
55
|
return 0.0
|
|
18
56
|
|
|
19
57
|
|
|
@@ -31,6 +69,31 @@ def _empty_results() -> Dict[str, List[Dict[str, Any]]]:
|
|
|
31
69
|
return {bucket: [] for bucket in MEMORY_BUCKETS}
|
|
32
70
|
|
|
33
71
|
|
|
72
|
+
def _parse_metadata(raw: Any) -> Dict[str, Any]:
|
|
73
|
+
if isinstance(raw, dict):
|
|
74
|
+
return raw
|
|
75
|
+
try:
|
|
76
|
+
return json.loads(raw or "{}")
|
|
77
|
+
except Exception:
|
|
78
|
+
return {}
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _governance_state(metadata: Dict[str, Any]) -> tuple[str, Dict[str, Any]]:
|
|
82
|
+
preview = provenance.preview_from_metadata(metadata)
|
|
83
|
+
prov = metadata.get("provenance") if isinstance(metadata.get("provenance"), dict) else {}
|
|
84
|
+
state = {
|
|
85
|
+
"memory_status": prov.get("memory_status") or metadata.get("memory_status") or "active",
|
|
86
|
+
"superseded_by": prov.get("superseded_by") or metadata.get("superseded_by"),
|
|
87
|
+
"supersedes": prov.get("supersedes") or metadata.get("supersedes"),
|
|
88
|
+
"duplicate_of": prov.get("duplicate_of") or metadata.get("duplicate_of"),
|
|
89
|
+
"contradicts": prov.get("contradicts") or metadata.get("contradicts") or [],
|
|
90
|
+
"contradiction_status": prov.get("contradiction_status") or metadata.get("contradiction_status"),
|
|
91
|
+
"canonical_reference": prov.get("canonical_reference") or metadata.get("canonical_reference"),
|
|
92
|
+
"provenance_preview": preview,
|
|
93
|
+
}
|
|
94
|
+
return str(state["memory_status"] or "active"), state
|
|
95
|
+
|
|
96
|
+
|
|
34
97
|
def retrieve(prompt: str, limit: int = 5, categories: Iterable[str] | None = None) -> Dict[str, List[Dict[str, Any]]]:
|
|
35
98
|
emit_event(state_store.reports_dir() / "brain_memory.log.jsonl", "brain_memory_retrieval_start", status="ok")
|
|
36
99
|
emit_event(state_store.reports_dir() / "brain_memory.log.jsonl", "brain_memory_retrieval_rank_start", status="ok")
|
|
@@ -39,10 +102,7 @@ def retrieve(prompt: str, limit: int = 5, categories: Iterable[str] | None = Non
|
|
|
39
102
|
results = _empty_results()
|
|
40
103
|
selected_categories = tuple(dict.fromkeys(category for category in (categories or MEMORY_BUCKETS) if category in MEMORY_BUCKETS))
|
|
41
104
|
|
|
42
|
-
|
|
43
|
-
reinf_rows = conn.execute(
|
|
44
|
-
"SELECT memory_reference, reward_score, confidence FROM experiences",
|
|
45
|
-
).fetchall()
|
|
105
|
+
reinf_rows = conn.execute("SELECT memory_reference, reward_score, confidence FROM experiences").fetchall()
|
|
46
106
|
reinforcement: Dict[str, Dict[str, float]] = {}
|
|
47
107
|
for row in reinf_rows:
|
|
48
108
|
reference = str(row[0] or "")
|
|
@@ -57,66 +117,72 @@ def retrieve(prompt: str, limit: int = 5, categories: Iterable[str] | None = Non
|
|
|
57
117
|
current["reward_score"] = float(current.get("reward_score") or 0.0) / count
|
|
58
118
|
current["confidence"] = float(current.get("confidence") or 0.0) / count
|
|
59
119
|
|
|
60
|
-
|
|
120
|
+
semantic_scores: Dict[str, float] = {}
|
|
121
|
+
if prompt.strip():
|
|
122
|
+
for item in vector_index.search_memory(prompt, limit=max(limit * 6, 20)):
|
|
123
|
+
source_type = item.get("source_type") or "knowledge"
|
|
124
|
+
source_id = str(item.get("source_id") or "")
|
|
125
|
+
if source_type in selected_categories and source_id:
|
|
126
|
+
semantic_scores[f"{source_type}:{source_id}"] = float(item.get("score") or 0.0)
|
|
127
|
+
|
|
128
|
+
def score_record(*, content: str, memory_ref: str, promo_conf: float, timestamp: str | None) -> tuple[float, Dict[str, float]]:
|
|
61
129
|
keyword = _match_score(content, prompt)
|
|
130
|
+
semantic = float(semantic_scores.get(memory_ref, 0.0))
|
|
62
131
|
reinf = reinforcement.get(memory_ref, {})
|
|
63
|
-
reinf_score = float(reinf.get("reward_score", 0.0)) * 0.
|
|
64
|
-
promo_score = float(promo_conf) * 0.
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
132
|
+
reinf_score = float(reinf.get("reward_score", 0.0)) * 0.35
|
|
133
|
+
promo_score = float(promo_conf) * 0.2
|
|
134
|
+
recency = _recency_score(timestamp)
|
|
135
|
+
score = round((keyword * 0.45) + (semantic * 0.35) + reinf_score + promo_score + recency, 3)
|
|
136
|
+
return score, {
|
|
137
|
+
"keyword": round(keyword, 3),
|
|
138
|
+
"semantic": round(semantic, 3),
|
|
139
|
+
"reinforcement": round(reinf_score, 3),
|
|
140
|
+
"promotion": round(promo_score, 3),
|
|
141
|
+
"recency": round(recency, 3),
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
for table in selected_categories:
|
|
145
|
+
candidates: Dict[str, Dict[str, Any]] = {}
|
|
68
146
|
try:
|
|
69
147
|
rows = conn.execute(
|
|
70
|
-
f"SELECT id, content, confidence, metadata_json FROM {table} ORDER BY id DESC LIMIT ?",
|
|
71
|
-
(limit *
|
|
148
|
+
f"SELECT id, timestamp, content, confidence, metadata_json FROM {table} ORDER BY id DESC LIMIT ?",
|
|
149
|
+
(max(limit * 20, 50),),
|
|
72
150
|
).fetchall()
|
|
73
151
|
except Exception:
|
|
74
152
|
continue
|
|
75
153
|
for row in rows:
|
|
76
|
-
content = row["content"] if isinstance(row, dict) else row[
|
|
77
|
-
if not _match_score(content, prompt):
|
|
78
|
-
continue
|
|
154
|
+
content = row["content"] if isinstance(row, dict) else row[2]
|
|
79
155
|
mem_ref = f"{table}:{row[0]}"
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
"content": content,
|
|
84
|
-
"score": score_record(content, mem_ref, promo_conf),
|
|
85
|
-
"memory_reference": mem_ref,
|
|
86
|
-
"links": memory_links.get_memory_links(mem_ref),
|
|
87
|
-
"provenance_preview": (metadata or {}).get("provenance_preview") or provenance.preview_from_metadata((metadata or {}).get("metadata")),
|
|
88
|
-
})
|
|
89
|
-
|
|
90
|
-
results[key] = sorted(results[key], key=lambda x: x["score"], reverse=True)[:limit]
|
|
91
|
-
|
|
92
|
-
if prompt.strip() and all(not results.get(bucket) for bucket in selected_categories):
|
|
93
|
-
semantic = vector_index.search_memory(prompt, limit=limit)
|
|
94
|
-
for item in semantic:
|
|
95
|
-
source_type = item.get("source_type") or "knowledge"
|
|
96
|
-
if source_type not in selected_categories:
|
|
97
|
-
continue
|
|
98
|
-
try:
|
|
99
|
-
row = conn.execute(
|
|
100
|
-
f"SELECT id, content, confidence, metadata_json FROM {source_type} WHERE id=?",
|
|
101
|
-
(int(item.get("source_id") or 0),),
|
|
102
|
-
).fetchone()
|
|
103
|
-
except Exception:
|
|
156
|
+
keyword = _match_score(content, prompt)
|
|
157
|
+
semantic = float(semantic_scores.get(mem_ref, 0.0))
|
|
158
|
+
if prompt.strip() and keyword <= 0.0 and semantic <= 0.0:
|
|
104
159
|
continue
|
|
105
|
-
if
|
|
160
|
+
promo_conf = row["confidence"] if isinstance(row, dict) else row[3]
|
|
161
|
+
timestamp = row["timestamp"] if isinstance(row, dict) else row[1]
|
|
162
|
+
raw_metadata = row["metadata_json"] if isinstance(row, dict) else row[4]
|
|
163
|
+
metadata_payload = _parse_metadata(raw_metadata)
|
|
164
|
+
memory_status, governance = _governance_state(metadata_payload)
|
|
165
|
+
if memory_status in {"superseded", "duplicate"}:
|
|
106
166
|
continue
|
|
107
|
-
content = row["content"] if isinstance(row, dict) else row[1]
|
|
108
|
-
mem_ref = f"{source_type}:{row[0]}"
|
|
109
|
-
promo_conf = row["confidence"] if isinstance(row, dict) else row[2]
|
|
110
167
|
metadata = provenance.fetch_reference(mem_ref)
|
|
111
|
-
|
|
168
|
+
score, signals = score_record(content=content, memory_ref=mem_ref, promo_conf=promo_conf, timestamp=timestamp)
|
|
169
|
+
if memory_status == "contested":
|
|
170
|
+
score = round(max(0.0, score - 0.15), 3)
|
|
171
|
+
signals["contradiction_penalty"] = 0.15
|
|
172
|
+
selected_because = max(signals, key=signals.get) if signals else "keyword"
|
|
173
|
+
candidates[mem_ref] = {
|
|
112
174
|
"content": content,
|
|
113
|
-
"score":
|
|
175
|
+
"score": score,
|
|
114
176
|
"memory_reference": mem_ref,
|
|
115
177
|
"links": memory_links.get_memory_links(mem_ref),
|
|
116
|
-
"provenance_preview": (metadata or {}).get("provenance_preview") or provenance.preview_from_metadata((metadata or {}).get("metadata")),
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
178
|
+
"provenance_preview": (metadata or {}).get("provenance_preview") or governance.get("provenance_preview") or provenance.preview_from_metadata((metadata or {}).get("metadata")),
|
|
179
|
+
"retrieval_signals": signals,
|
|
180
|
+
"selected_because": selected_because,
|
|
181
|
+
"timestamp": timestamp,
|
|
182
|
+
"memory_status": memory_status,
|
|
183
|
+
"governance": governance,
|
|
184
|
+
}
|
|
185
|
+
results[table] = sorted(candidates.values(), key=lambda x: x["score"], reverse=True)[:limit]
|
|
120
186
|
|
|
121
187
|
conn.close()
|
|
122
188
|
emit_event(state_store.reports_dir() / "brain_memory.log.jsonl", "brain_memory_retrieval_rank_complete", status="ok")
|
|
@@ -1,7 +1,9 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
import hashlib
|
|
3
4
|
import json
|
|
4
5
|
import math
|
|
6
|
+
import re
|
|
5
7
|
import threading
|
|
6
8
|
from typing import Any, Dict, List, Iterable
|
|
7
9
|
|
|
@@ -22,6 +24,9 @@ EMBEDDING_TABLES: tuple[str, ...] = (
|
|
|
22
24
|
)
|
|
23
25
|
_REBUILD_LOCK = threading.Lock()
|
|
24
26
|
_WRITE_CHUNK_SIZE = 64
|
|
27
|
+
_EMBEDDING_TEXT_LIMIT = 8000
|
|
28
|
+
_HTML_TAG_RE = re.compile(r"<[^>]+>")
|
|
29
|
+
_WHITESPACE_RE = re.compile(r"\s+")
|
|
25
30
|
|
|
26
31
|
|
|
27
32
|
def _ensure_vector_table(conn) -> None:
|
|
@@ -94,30 +99,66 @@ def insert_memory(memory_id: int, content: str, confidence: float, *, source_typ
|
|
|
94
99
|
store.submit_write(_write, timeout=30.0)
|
|
95
100
|
|
|
96
101
|
|
|
97
|
-
def _load_table_rows(table: str, *, limit: int | None = None, descending: bool = False) -> List[Dict[str, Any]]:
|
|
102
|
+
def _load_table_rows(table: str, *, limit: int | None = None, descending: bool = False, missing_only: bool = False) -> List[Dict[str, Any]]:
|
|
98
103
|
conn = store.connect()
|
|
99
104
|
try:
|
|
100
105
|
order = "DESC" if descending else "ASC"
|
|
106
|
+
where = ""
|
|
107
|
+
params: list[Any] = []
|
|
108
|
+
if missing_only:
|
|
109
|
+
where = " WHERE CAST(id AS TEXT) NOT IN (SELECT source_id FROM vector_embeddings WHERE source_type = ?)"
|
|
110
|
+
params.append(table)
|
|
101
111
|
if limit is None:
|
|
102
112
|
rows = conn.execute(
|
|
103
|
-
f"SELECT id, content, confidence, metadata_json FROM {table} ORDER BY id {order}",
|
|
113
|
+
f"SELECT id, content, confidence, metadata_json FROM {table}{where} ORDER BY id {order}",
|
|
114
|
+
tuple(params),
|
|
104
115
|
).fetchall()
|
|
105
116
|
else:
|
|
106
117
|
rows = conn.execute(
|
|
107
|
-
f"SELECT id, content, confidence, metadata_json FROM {table} ORDER BY id {order} LIMIT ?",
|
|
108
|
-
(limit
|
|
118
|
+
f"SELECT id, content, confidence, metadata_json FROM {table}{where} ORDER BY id {order} LIMIT ?",
|
|
119
|
+
tuple(params + [limit]),
|
|
109
120
|
).fetchall()
|
|
110
121
|
finally:
|
|
111
122
|
conn.close()
|
|
112
123
|
return [dict(row) for row in rows]
|
|
113
124
|
|
|
114
125
|
|
|
126
|
+
def _embedding_input(text: str, *, table: str = "knowledge") -> str:
|
|
127
|
+
cleaned = _HTML_TAG_RE.sub(" ", text)
|
|
128
|
+
cleaned = _WHITESPACE_RE.sub(" ", cleaned).strip()
|
|
129
|
+
lowered = cleaned.lower()
|
|
130
|
+
artifactish = (
|
|
131
|
+
"| chunk " in lowered
|
|
132
|
+
or ".sql" in lowered
|
|
133
|
+
or "topology/" in lowered
|
|
134
|
+
or cleaned.count("),(") >= 8
|
|
135
|
+
)
|
|
136
|
+
if table == "knowledge" and artifactish:
|
|
137
|
+
return cleaned[:500]
|
|
138
|
+
if table == "knowledge" and len(cleaned) > 9000:
|
|
139
|
+
return cleaned[:1000]
|
|
140
|
+
if table == "reflections" and len(cleaned) > 8000:
|
|
141
|
+
return cleaned[:1200]
|
|
142
|
+
if len(cleaned) > 20000:
|
|
143
|
+
return cleaned[:2000]
|
|
144
|
+
if len(cleaned) > 12000:
|
|
145
|
+
return cleaned[:4000]
|
|
146
|
+
return cleaned[:_EMBEDDING_TEXT_LIMIT]
|
|
147
|
+
|
|
148
|
+
|
|
115
149
|
def _prepare_embedding_rows(rows: Iterable[Dict[str, Any]], *, table: str) -> List[Dict[str, Any]]:
|
|
116
150
|
prepared: List[Dict[str, Any]] = []
|
|
151
|
+
embedding_cache: Dict[str, List[float] | None] = {}
|
|
117
152
|
for row in rows:
|
|
118
153
|
content = str(row.get("content") or "")
|
|
119
154
|
redacted_content, changed = redaction.redact_text(content)
|
|
120
|
-
|
|
155
|
+
embedding_input = _embedding_input(redacted_content, table=table)
|
|
156
|
+
cache_key = hashlib.sha256(embedding_input.encode("utf-8", errors="ignore")).hexdigest()
|
|
157
|
+
if cache_key in embedding_cache:
|
|
158
|
+
embedding = embedding_cache[cache_key]
|
|
159
|
+
else:
|
|
160
|
+
embedding = embedding_engine.generate_embedding(embedding_input)
|
|
161
|
+
embedding_cache[cache_key] = embedding
|
|
121
162
|
if not embedding:
|
|
122
163
|
continue
|
|
123
164
|
try:
|
|
@@ -213,6 +254,27 @@ def rebuild_vector_index(*, tables: Iterable[str] | None = None) -> int:
|
|
|
213
254
|
return count
|
|
214
255
|
|
|
215
256
|
|
|
257
|
+
def backfill_missing_vectors(*, tables: Iterable[str] | None = None, limit_per_table: int | None = None) -> int:
|
|
258
|
+
emit_event(LOGFILE, "brain_memory_vector_backfill_start", status="ok")
|
|
259
|
+
if not _REBUILD_LOCK.acquire(blocking=False):
|
|
260
|
+
emit_event(LOGFILE, "brain_memory_vector_backfill_complete", status="skipped", reason="already_running")
|
|
261
|
+
return 0
|
|
262
|
+
count = 0
|
|
263
|
+
try:
|
|
264
|
+
requested_tables = [table for table in (tables or EMBEDDING_TABLES) if table in EMBEDDING_TABLES]
|
|
265
|
+
for table in requested_tables:
|
|
266
|
+
prepared = _prepare_embedding_rows(
|
|
267
|
+
_load_table_rows(table, limit=limit_per_table, missing_only=True),
|
|
268
|
+
table=table,
|
|
269
|
+
)
|
|
270
|
+
for offset in range(0, len(prepared), _WRITE_CHUNK_SIZE):
|
|
271
|
+
count += _write_embedding_chunk(table, prepared[offset: offset + _WRITE_CHUNK_SIZE])
|
|
272
|
+
finally:
|
|
273
|
+
_REBUILD_LOCK.release()
|
|
274
|
+
emit_event(LOGFILE, "brain_memory_vector_backfill_complete", status="ok", indexed=count)
|
|
275
|
+
return count
|
|
276
|
+
|
|
277
|
+
|
|
216
278
|
def search_memory(query: str, limit: int = 5) -> List[Dict[str, Any]]:
|
|
217
279
|
emit_event(LOGFILE, "brain_memory_vector_search_start", status="ok")
|
|
218
280
|
conn = store.connect()
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
# 2026-03-18 — Memory repair, integrity cleanup, and backfill tooling
|
|
2
|
+
|
|
3
|
+
## Summary
|
|
4
|
+
This pass focused on turning `ocmemog` from a noisy/fragile memory stack into a more repairable and laptop-safe system. The work addressed:
|
|
5
|
+
- bad default vector rebuild behavior
|
|
6
|
+
- misleading health/compat signals
|
|
7
|
+
- missing `memory_reference` writer debt
|
|
8
|
+
- poor freshness summaries
|
|
9
|
+
- lack of an incremental vector backfill path
|
|
10
|
+
- battery-unfriendly defaults in the sidecar launcher
|
|
11
|
+
|
|
12
|
+
## Changes landed
|
|
13
|
+
|
|
14
|
+
### Embedding and rebuild behavior
|
|
15
|
+
- Fixed the vector reindex entrypoint so it defaults to provider-backed Ollama embeddings instead of silently rebuilding weak hash/simple vectors.
|
|
16
|
+
- Confirmed local Ollama embeddings (`nomic-embed-text:latest`) are available and produce 768-dim vectors.
|
|
17
|
+
- Added a new incremental repair path:
|
|
18
|
+
- `backfill_missing_vectors()` in `brain/runtime/memory/vector_index.py`
|
|
19
|
+
- `scripts/ocmemog-backfill-vectors.py`
|
|
20
|
+
- This gives a non-destructive, table-by-table, chunkable way to backfill missing vectors without requiring a full destructive rebuild.
|
|
21
|
+
|
|
22
|
+
### Integrity and writer correctness
|
|
23
|
+
- Fixed `record_reinforcement()` so new `experiences` rows preserve a deterministic `memory_reference`.
|
|
24
|
+
- Added repair support for legacy rows missing `memory_reference`.
|
|
25
|
+
- Ran integrity repair and backfilled `1807` missing references.
|
|
26
|
+
- Fixed duplicate promotion integrity reporting so grouped duplicate counts are reported accurately.
|
|
27
|
+
|
|
28
|
+
### Health and output quality
|
|
29
|
+
- Fixed sidecar compat/health reporting so provider-backed embeddings do not falsely report local hash fallback warnings.
|
|
30
|
+
- Cleaned freshness summaries so placeholder content like `promoted`, `candidate_promoted`, `summary`, and `No local memory summary available` do not pollute advisories.
|
|
31
|
+
- Junk-only rows now surface as `(needs summary cleanup)` instead of pretending they contain a meaningful summary.
|
|
32
|
+
|
|
33
|
+
### Laptop/battery-aware behavior
|
|
34
|
+
- Added battery-aware defaults to `scripts/ocmemog-sidecar.sh`.
|
|
35
|
+
- `OCMEMOG_LAPTOP_MODE=auto|ac|battery` now controls watcher/ingest aggressiveness.
|
|
36
|
+
- On battery, the sidecar uses slower polling, smaller batches, and disables sentiment reinforcement by default.
|
|
37
|
+
|
|
38
|
+
## Current integrity state
|
|
39
|
+
After writer/reference repair:
|
|
40
|
+
- `missing_memory_reference` debt is cleared
|
|
41
|
+
- remaining integrity issue is primarily vector backlog:
|
|
42
|
+
- `vector_missing:19935`
|
|
43
|
+
|
|
44
|
+
Observed coverage snapshot during staged backfill work:
|
|
45
|
+
- `knowledge`: 15999 rows, 0 vectors
|
|
46
|
+
- `runbooks`: 179 rows, 152 vectors
|
|
47
|
+
- `lessons`: 76 rows, 76 vectors
|
|
48
|
+
- `directives`: 233 rows, 206 vectors
|
|
49
|
+
- `reflections`: 3460 rows, 83 vectors
|
|
50
|
+
- `tasks`: 505 rows, 0 vectors
|
|
51
|
+
|
|
52
|
+
## Why backlog remains
|
|
53
|
+
The remaining `vector_missing` debt is mostly historical backlog rather than an active write-path failure. New writes index correctly; the older corpus simply was never fully rebuilt under the corrected provider-backed embedding path.
|
|
54
|
+
|
|
55
|
+
## Recommended staged follow-up
|
|
56
|
+
For laptop-friendly backlog burn-down, use staged backfills in roughly this order:
|
|
57
|
+
1. directives
|
|
58
|
+
2. tasks
|
|
59
|
+
3. runbooks
|
|
60
|
+
4. lessons
|
|
61
|
+
5. reflections
|
|
62
|
+
6. knowledge last
|
|
63
|
+
|
|
64
|
+
## Commits from this sweep
|
|
65
|
+
- `f3d3dd9` — fix: default vector reindex to ollama embeddings
|
|
66
|
+
- `759d23d` — feat: add battery-aware sidecar defaults
|
|
67
|
+
- `4a102eb` — fix: clean memory freshness summaries
|
|
68
|
+
- `9ee7966` — fix: report duplicate promotion counts accurately
|
|
69
|
+
- `8704db9` — fix: preserve and repair experience memory references
|
|
70
|
+
- `5dc3cb9` — feat: add incremental vector backfill tooling
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
# Local model role matrix — 2026-03-18
|
|
2
|
+
|
|
3
|
+
Purpose: document which installed local model is best suited for which `ocmemog` task so background cognition can be smarter without putting heavy/slow models on every path.
|
|
4
|
+
|
|
5
|
+
Installed local models observed:
|
|
6
|
+
- `phi3:latest`
|
|
7
|
+
- `qwen2.5:7b`
|
|
8
|
+
- `llama3.1:8b`
|
|
9
|
+
- embeddings: `nomic-embed-text:latest`
|
|
10
|
+
|
|
11
|
+
## Intended decision areas
|
|
12
|
+
- unresolved-state rewrite
|
|
13
|
+
- lesson extraction
|
|
14
|
+
- ponder/reflection shaping
|
|
15
|
+
- cluster recommendation wording
|
|
16
|
+
- fallback/speed path
|
|
17
|
+
|
|
18
|
+
## Bakeoff results
|
|
19
|
+
|
|
20
|
+
### Unresolved-state rewrite
|
|
21
|
+
- **Winner:** `qwen2.5:7b`
|
|
22
|
+
- Why: cleanest concise rewrite, best instruction-following, least rambling.
|
|
23
|
+
- Notes:
|
|
24
|
+
- `phi3:latest` tended to be verbose and occasionally hallucination-prone.
|
|
25
|
+
- `llama3.1:8b` produced one outright unusable response ("None found...").
|
|
26
|
+
|
|
27
|
+
### Lesson extraction
|
|
28
|
+
- **Winner:** `qwen2.5:7b`
|
|
29
|
+
- Strong alternate: `llama3.1:8b`
|
|
30
|
+
- Why: `qwen2.5:7b` produced the clearest operational lesson with good cause/effect preservation.
|
|
31
|
+
- Notes:
|
|
32
|
+
- `phi3:latest` was weaker and more generic.
|
|
33
|
+
|
|
34
|
+
### Cluster insight / recommendation shaping
|
|
35
|
+
- **Winner:** `qwen2.5:7b`
|
|
36
|
+
- Why: best structured output, least fluff, most concrete recommendation wording.
|
|
37
|
+
- Notes:
|
|
38
|
+
- `llama3.1:8b` was decent but more wordy/stylized.
|
|
39
|
+
- `phi3:latest` timed out or underperformed on this task.
|
|
40
|
+
|
|
41
|
+
## Recommended model-role split
|
|
42
|
+
- embeddings: `nomic-embed-text:latest`
|
|
43
|
+
- fast fallback cognition: `phi3:latest`
|
|
44
|
+
- default structured memory refinement / ponder model: `qwen2.5:7b`
|
|
45
|
+
- richer optional background cognition: `llama3.1:8b`
|
|
46
|
+
|
|
47
|
+
## Operational recommendation
|
|
48
|
+
- Keep `OCMEMOG_OLLAMA_MODEL=phi3:latest` for lightweight local fallback behavior.
|
|
49
|
+
- Set `OCMEMOG_PONDER_MODEL=qwen2.5:7b` for unresolved-state rewrite, lesson extraction, and cluster recommendation shaping.
|
|
50
|
+
- Consider `llama3.1:8b` for optional deeper background cognition passes where latency is acceptable.
|
package/docs/usage.md
CHANGED
|
@@ -23,7 +23,7 @@ Manual watcher:
|
|
|
23
23
|
```bash
|
|
24
24
|
# defaults to ~/.openclaw/workspace/memory/transcripts if not set
|
|
25
25
|
export OCMEMOG_TRANSCRIPT_DIR="$HOME/.openclaw/workspace/memory/transcripts"
|
|
26
|
-
export OCMEMOG_INGEST_ENDPOINT="http://127.0.0.1:17890/memory/ingest"
|
|
26
|
+
export OCMEMOG_INGEST_ENDPOINT="http://127.0.0.1:17891/memory/ingest"
|
|
27
27
|
./scripts/ocmemog-transcript-watcher.sh
|
|
28
28
|
```
|
|
29
29
|
|
|
@@ -34,11 +34,13 @@ export OCMEMOG_TRANSCRIPT_WATCHER=true
|
|
|
34
34
|
./scripts/ocmemog-sidecar.sh
|
|
35
35
|
```
|
|
36
36
|
|
|
37
|
+
On macOS laptops, the launcher defaults to `OCMEMOG_LAPTOP_MODE=auto`, which detects battery power and uses lower-impact watcher settings automatically. Override with `OCMEMOG_LAPTOP_MODE=ac` for wall-power behavior or `OCMEMOG_LAPTOP_MODE=battery` to force conservative mode.
|
|
38
|
+
|
|
37
39
|
Useful environment variables:
|
|
38
40
|
|
|
39
41
|
```bash
|
|
40
42
|
export OCMEMOG_HOST=127.0.0.1
|
|
41
|
-
export OCMEMOG_PORT=17890
|
|
43
|
+
export OCMEMOG_PORT=17891
|
|
42
44
|
export OCMEMOG_STATE_DIR=/path/to/state
|
|
43
45
|
export OCMEMOG_DB_PATH=/path/to/brain_memory.sqlite3
|
|
44
46
|
export OCMEMOG_MEMORY_MODEL=gpt-4o-mini
|
|
@@ -62,26 +64,26 @@ Default state location in this repo is `.ocmemog-state/`.
|
|
|
62
64
|
Health:
|
|
63
65
|
|
|
64
66
|
```bash
|
|
65
|
-
curl http://127.0.0.1:17890/healthz
|
|
67
|
+
curl http://127.0.0.1:17891/healthz
|
|
66
68
|
```
|
|
67
69
|
|
|
68
70
|
Realtime metrics + events:
|
|
69
71
|
|
|
70
72
|
```bash
|
|
71
|
-
curl http://127.0.0.1:17890/metrics
|
|
72
|
-
curl http://127.0.0.1:17890/events
|
|
73
|
+
curl http://127.0.0.1:17891/metrics
|
|
74
|
+
curl http://127.0.0.1:17891/events
|
|
73
75
|
```
|
|
74
76
|
|
|
75
77
|
Dashboard:
|
|
76
78
|
|
|
77
79
|
```bash
|
|
78
|
-
open http://127.0.0.1:17890/dashboard
|
|
80
|
+
open http://127.0.0.1:17891/dashboard
|
|
79
81
|
```
|
|
80
82
|
|
|
81
83
|
Search:
|
|
82
84
|
|
|
83
85
|
```bash
|
|
84
|
-
curl -s http://127.0.0.1:17890/memory/search \
|
|
86
|
+
curl -s http://127.0.0.1:17891/memory/search \
|
|
85
87
|
-H 'content-type: application/json' \
|
|
86
88
|
-d '{"query":"deploy risk","limit":5,"categories":["knowledge","tasks"]}'
|
|
87
89
|
```
|
|
@@ -95,7 +97,7 @@ If `OCMEMOG_API_TOKEN` is set, include the header:
|
|
|
95
97
|
Get by reference:
|
|
96
98
|
|
|
97
99
|
```bash
|
|
98
|
-
curl -s http://127.0.0.1:17890/memory/get \
|
|
100
|
+
curl -s http://127.0.0.1:17891/memory/get \
|
|
99
101
|
-H 'content-type: application/json' \
|
|
100
102
|
-d '{"reference":"knowledge:12"}'
|
|
101
103
|
```
|
|
@@ -103,7 +105,7 @@ curl -s http://127.0.0.1:17890/memory/get \
|
|
|
103
105
|
Fetch linked context (transcript snippet):
|
|
104
106
|
|
|
105
107
|
```bash
|
|
106
|
-
curl -s http://127.0.0.1:17890/memory/context \
|
|
108
|
+
curl -s http://127.0.0.1:17891/memory/context \
|
|
107
109
|
-H 'content-type: application/json' \
|
|
108
110
|
-d '{"reference":"knowledge:12","radius":10}'
|
|
109
111
|
```
|
|
@@ -117,7 +119,7 @@ Helper script:
|
|
|
117
119
|
Run pondering (writes summaries into reflections):
|
|
118
120
|
|
|
119
121
|
```bash
|
|
120
|
-
curl -s http://127.0.0.1:17890/memory/ponder \
|
|
122
|
+
curl -s http://127.0.0.1:17891/memory/ponder \
|
|
121
123
|
-H 'content-type: application/json' \
|
|
122
124
|
-d '{"max_items":5}'
|
|
123
125
|
```
|
|
@@ -125,13 +127,13 @@ curl -s http://127.0.0.1:17890/memory/ponder \
|
|
|
125
127
|
Fetch latest ponder recommendations:
|
|
126
128
|
|
|
127
129
|
```bash
|
|
128
|
-
curl -s http://127.0.0.1:17890/memory/ponder/latest?limit=5
|
|
130
|
+
curl -s http://127.0.0.1:17891/memory/ponder/latest?limit=5
|
|
129
131
|
```
|
|
130
132
|
|
|
131
133
|
Ingest content:
|
|
132
134
|
|
|
133
135
|
```bash
|
|
134
|
-
curl -s http://127.0.0.1:17890/memory/ingest \
|
|
136
|
+
curl -s http://127.0.0.1:17891/memory/ingest \
|
|
135
137
|
-H 'content-type: application/json' \
|
|
136
138
|
-d '{"content":"remember this","kind":"memory","memory_type":"knowledge"}'
|
|
137
139
|
```
|
|
@@ -139,7 +141,7 @@ curl -s http://127.0.0.1:17890/memory/ingest \
|
|
|
139
141
|
Ingest with context anchors (links to chat/transcript):
|
|
140
142
|
|
|
141
143
|
```bash
|
|
142
|
-
curl -s http://127.0.0.1:17890/memory/ingest \
|
|
144
|
+
curl -s http://127.0.0.1:17891/memory/ingest \
|
|
143
145
|
-H 'content-type: application/json' \
|
|
144
146
|
-d '{
|
|
145
147
|
"content":"remember this",
|
|
@@ -157,7 +159,7 @@ curl -s http://127.0.0.1:17890/memory/ingest \
|
|
|
157
159
|
Distill recent experiences:
|
|
158
160
|
|
|
159
161
|
```bash
|
|
160
|
-
curl -s http://127.0.0.1:17890/memory/distill \
|
|
162
|
+
curl -s http://127.0.0.1:17891/memory/distill \
|
|
161
163
|
-H 'content-type: application/json' \
|
|
162
164
|
-d '{"limit":10}'
|
|
163
165
|
```
|
package/index.ts
CHANGED