nexo-brain 7.15.2 → 7.16.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1044 @@
1
+ from __future__ import annotations
2
+ """NEXO DB - Memory Observations v2 primitives.
3
+
4
+ Phase 1 only owns the append-only ``memory_events`` log. Later phases can
5
+ derive observations, indexes, viewer rows, and promotion records from this
6
+ stable substrate without changing hook behaviour again.
7
+ """
8
+
9
+ import hashlib
10
+ import importlib
11
+ import json
12
+ import re
13
+ import sqlite3
14
+ import sys
15
+ from datetime import datetime
16
+ from typing import Any
17
+
18
+
19
+ def _core():
20
+ module = sys.modules.get("db._core")
21
+ if module is None:
22
+ module = importlib.import_module("db._core")
23
+ return module
24
+
25
+
26
+ _REDACT_PATTERNS = (
27
+ re.compile(r"sk-[a-zA-Z0-9_\-]{20,}"),
28
+ re.compile(r"ghp_[a-zA-Z0-9]{20,}"),
29
+ re.compile(r"shpat_[a-f0-9]{20,}"),
30
+ re.compile(r"AKIA[A-Z0-9]{16}"),
31
+ re.compile(r"xox[bp]-[a-zA-Z0-9\-]{20,}"),
32
+ re.compile(r"Bearer\s+[a-zA-Z0-9_\-.=+/]{20,}", re.IGNORECASE),
33
+ re.compile(r"(token\s*[=:]\s*['\"]?)[a-zA-Z0-9_\-]{20,}", re.IGNORECASE),
34
+ re.compile(r"(password\s*[=:]\s*['\"]?)[^\s'\"]{8,}", re.IGNORECASE),
35
+ re.compile(r"(api[_-]?key\s*[=:]\s*['\"]?)[a-zA-Z0-9_\-]{16,}", re.IGNORECASE),
36
+ )
37
+
38
+
39
+ def _table_exists(conn, table_name: str) -> bool:
40
+ try:
41
+ row = conn.execute(
42
+ "SELECT 1 FROM sqlite_master WHERE type='table' AND name = ? LIMIT 1",
43
+ (table_name,),
44
+ ).fetchone()
45
+ except sqlite3.OperationalError:
46
+ return False
47
+ return row is not None
48
+
49
+
50
+ def _is_virtual_fts_table(conn, table_name: str) -> bool:
51
+ try:
52
+ row = conn.execute(
53
+ "SELECT sql FROM sqlite_master WHERE name = ? LIMIT 1",
54
+ (table_name,),
55
+ ).fetchone()
56
+ except sqlite3.OperationalError:
57
+ return False
58
+ sql = str(row["sql"] if row else "").upper()
59
+ return "VIRTUAL TABLE" in sql and "FTS5" in sql
60
+
61
+
62
+ def _truncate(text: str | None, limit: int) -> str:
63
+ clean = str(text or "").strip()
64
+ return clean if len(clean) <= limit else clean[: limit - 3] + "..."
65
+
66
+
67
+ def _json(value: Any, default: Any) -> str:
68
+ if value in (None, ""):
69
+ value = default
70
+ try:
71
+ return json.dumps(value, ensure_ascii=True, sort_keys=True)
72
+ except Exception:
73
+ return json.dumps(default, ensure_ascii=True, sort_keys=True)
74
+
75
+
76
+ def _parse_json(value: str, default: Any) -> Any:
77
+ try:
78
+ parsed = json.loads(value or "")
79
+ return parsed if parsed is not None else default
80
+ except Exception:
81
+ return default
82
+
83
+
84
+ def _normalize_paths(paths: Any) -> list[str]:
85
+ if isinstance(paths, str):
86
+ items = [item.strip() for item in paths.split(",")]
87
+ elif isinstance(paths, (list, tuple, set)):
88
+ items = [str(item).strip() for item in paths]
89
+ else:
90
+ items = []
91
+ seen: set[str] = set()
92
+ result: list[str] = []
93
+ for item in items:
94
+ if not item or item in seen:
95
+ continue
96
+ seen.add(item)
97
+ result.append(item)
98
+ return result[:50]
99
+
100
+
101
def _redact_text(value: str) -> tuple[str, bool]:
    """Apply every ``_REDACT_PATTERNS`` regex to *value*.

    Returns the scrubbed text (each match replaced by ``[REDACTED]``) and a
    flag telling whether anything was replaced. Falsy input is treated as "".
    """
    source = str(value or "")
    scrubbed = source
    for matcher in _REDACT_PATTERNS:
        scrubbed = matcher.sub("[REDACTED]", scrubbed)
    was_changed = scrubbed != source
    return scrubbed, was_changed
107
+
108
+
109
def _redact_value(value: Any) -> tuple[Any, bool]:
    """Recursively redact strings inside *value*.

    Strings go through ``_redact_text``. Dicts are rebuilt with stringified
    keys. Lists, tuples and sets are rebuilt as lists. Anything else is
    returned untouched. The second element reports whether any nested string
    was changed.
    """
    if isinstance(value, str):
        return _redact_text(value)
    if isinstance(value, dict):
        scrubbed_map: dict[str, Any] = {}
        touched = False
        for key, member in value.items():
            scrubbed_member, hit = _redact_value(member)
            scrubbed_map[str(key)] = scrubbed_member
            touched = touched or hit
        return scrubbed_map, touched
    if isinstance(value, (list, tuple, set)):
        pairs = [_redact_value(member) for member in value]
        return [scrubbed for scrubbed, _ in pairs], any(hit for _, hit in pairs)
    return value, False
129
+
130
+
131
def _stable_hash(value: Any) -> str:
    """Return a 24-hex-character SHA-1 prefix of the redacted form of *value*.

    ``None`` and ``""`` hash to ``""``. Non-string values are serialized with
    ``_json`` before redaction and hashing.
    """
    if value in (None, ""):
        return ""
    text = value if isinstance(value, str) else _json(value, {})
    scrubbed = _redact_text(text)[0]
    hasher = hashlib.sha1(scrubbed.encode("utf-8", "replace"), usedforsecurity=False)
    return hasher.hexdigest()[:24]
138
+
139
+
140
+ def _parse_created_at(value: Any) -> float:
141
+ if value in (None, ""):
142
+ return _core().now_epoch()
143
+ if isinstance(value, (int, float)):
144
+ return float(value)
145
+ text = str(value).strip()
146
+ try:
147
+ return float(text)
148
+ except Exception:
149
+ pass
150
+ for fmt in ("%Y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d %H:%M:%S.%f"):
151
+ try:
152
+ return datetime.strptime(text.replace("Z", "").split("+")[0], fmt).timestamp()
153
+ except Exception:
154
+ continue
155
+ return _core().now_epoch()
156
+
157
+
158
+ def build_memory_event_uid(
159
+ *,
160
+ source_type: str,
161
+ source_id: str = "",
162
+ event_type: str,
163
+ session_id: str = "",
164
+ tool_name: str = "",
165
+ idempotency_key: str = "",
166
+ created_at: float | None = None,
167
+ ) -> str:
168
+ clean_key = (idempotency_key or "").strip()
169
+ if clean_key:
170
+ base = clean_key
171
+ else:
172
+ stable_source = (source_id or "").strip()
173
+ if stable_source:
174
+ base = "|".join([
175
+ (source_type or "").strip(),
176
+ stable_source,
177
+ (event_type or "").strip(),
178
+ (session_id or "").strip(),
179
+ (tool_name or "").strip(),
180
+ ])
181
+ else:
182
+ base = "|".join([
183
+ (source_type or "").strip(),
184
+ (event_type or "").strip(),
185
+ (session_id or "").strip(),
186
+ (tool_name or "").strip(),
187
+ str(created_at if created_at is not None else _core().now_epoch()),
188
+ ])
189
+ digest = hashlib.sha1(base.encode("utf-8", "replace"), usedforsecurity=False).hexdigest()[:32]
190
+ return f"ME-{digest}"
191
+
192
+
193
def _row_to_event(row) -> dict:
    """Convert a ``memory_events`` row into a dict with decoded JSON columns.

    ``file_paths_json`` and ``metadata_json`` are replaced by ``file_paths``
    and ``metadata``; ``redaction_applied`` is coerced to ``bool``.
    """
    event = dict(row)
    raw_paths = event.pop("file_paths_json", "[]")
    event["file_paths"] = _parse_json(raw_paths, [])
    raw_meta = event.pop("metadata_json", "{}")
    event["metadata"] = _parse_json(raw_meta, {})
    event["redaction_applied"] = bool(event.get("redaction_applied"))
    return event
199
+
200
+
201
def _row_to_observation(row) -> dict:
    """Convert a ``memory_observations`` row into a dict with decoded JSON columns.

    Each ``*_json`` column is removed and its decoded value is stored under
    the name without the suffix.
    """
    record = dict(row)
    for field, missing_text, fallback in (
        ("facts", "{}", {}),
        ("evidence_refs", "[]", []),
        ("entities", "[]", []),
        ("metadata", "{}", {}),
    ):
        record[field] = _parse_json(record.pop(f"{field}_json", missing_text), fallback)
    return record
208
+
209
+
210
def _enqueue_memory_event(conn, event_uid: str, created_at: float) -> None:
    """Add *event_uid* to ``memory_observation_queue`` as ``pending``.

    Does nothing when the queue table is absent. The statement is
    ``INSERT OR IGNORE`` and is not committed here.
    """
    if _table_exists(conn, "memory_observation_queue"):
        conn.execute(
            """
            INSERT OR IGNORE INTO memory_observation_queue (event_uid, status, created_at, updated_at)
            VALUES (?, 'pending', ?, ?)
            """,
            (event_uid, created_at, created_at),
        )
220
+
221
+
222
def record_memory_event(
    *,
    event_type: str,
    source_type: str,
    source_id: str = "",
    session_id: str = "",
    external_session_id: str = "",
    client: str = "",
    conversation_id: str = "",
    project_key: str = "",
    actor: str = "",
    tool_name: str = "",
    file_paths: Any = None,
    command_digest: str = "",
    tool_input: Any = None,
    tool_output: Any = None,
    raw_ref: str = "",
    privacy_level: str = "normal",
    confidence: float = 1.0,
    metadata: dict[str, Any] | None = None,
    event_uid: str = "",
    idempotency_key: str = "",
    created_at: float | None = None,
    enqueue_observation: bool = True,
) -> dict:
    """Append one row to ``memory_events`` and optionally queue it for observation.

    *event_type* and *source_type* are lower-cased and required. The command
    digest, raw reference and metadata are redacted before storage. The tool
    input and output are stored only as hashes. The row is written with
    ``INSERT OR IGNORE``, so an existing ``event_uid`` is left untouched.

    Returns:
        * ``{"ok": False, "error": ...}`` when a required field is missing or
          the write raises (the latter also carries ``event_uid``);
        * ``{"ok": True, "skipped": True, ...}`` when the table is absent;
        * otherwise the stored event dict plus ``ok`` and ``inserted``
          (``False`` when the insert was ignored).
    """
    kind = (event_type or "").strip().lower()
    origin = (source_type or "").strip().lower()
    if not kind:
        return {"ok": False, "error": "event_type is required"}
    if not origin:
        return {"ok": False, "error": "source_type is required"}

    core = _core()
    conn = core.get_db()
    if not _table_exists(conn, "memory_events"):
        return {"ok": True, "skipped": True, "reason": "memory_events table unavailable"}

    stamp = float(created_at if created_at is not None else core.now_epoch())
    paths = _normalize_paths(file_paths)
    meta_copy = dict(metadata or {})

    scrubbed_command, command_hit = _redact_text(command_digest)
    scrubbed_ref, ref_hit = _redact_text(raw_ref)
    scrubbed_meta, meta_hit = _redact_value(meta_copy)
    # Tool payloads are never stored; they are scanned only to set the flag.
    input_hit = _redact_value(tool_input)[1]
    output_hit = _redact_value(tool_output)[1]
    was_redacted = any((command_hit, ref_hit, meta_hit, input_hit, output_hit))

    input_hash = _stable_hash(tool_input)
    output_hash = _stable_hash(tool_output)

    uid = (event_uid or "").strip()
    if not uid:
        uid = build_memory_event_uid(
            source_type=origin,
            source_id=source_id,
            event_type=kind,
            session_id=session_id,
            tool_name=tool_name,
            idempotency_key=idempotency_key,
            created_at=stamp,
        )

    insert_sql = """
        INSERT OR IGNORE INTO memory_events (
        event_uid, created_at, session_id, external_session_id, client, conversation_id,
        project_key, source_type, source_id, event_type, actor, tool_name,
        file_paths_json, command_digest, input_hash, output_digest, raw_ref,
        privacy_level, redaction_applied, confidence, metadata_json
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """
    try:
        # Values are computed inside the try so a conversion failure is
        # reported through the error dict instead of being raised.
        values = (
            uid,
            stamp,
            _truncate(session_id, 160),
            _truncate(external_session_id, 160),
            _truncate(client, 80),
            _truncate(conversation_id, 160),
            _truncate(project_key, 120),
            origin,
            _truncate(source_id, 200),
            kind,
            _truncate(actor, 120),
            _truncate(tool_name, 120),
            _json(paths, []),
            _truncate(scrubbed_command, 240),
            input_hash,
            output_hash,
            _truncate(scrubbed_ref, 500),
            _truncate(privacy_level or "normal", 40),
            1 if was_redacted else 0,
            max(0.0, min(1.0, float(confidence or 0.0))),
            _json(scrubbed_meta, {}),
        )
        cursor = conn.execute(insert_sql, values)
        if enqueue_observation:
            _enqueue_memory_event(conn, uid, stamp)
        conn.commit()
    except Exception as exc:
        return {"ok": False, "error": str(exc), "event_uid": uid}

    stored = conn.execute("SELECT * FROM memory_events WHERE event_uid = ?", (uid,)).fetchone()
    if stored:
        event = _row_to_event(stored)
    else:
        event = {"event_uid": uid}
    event["ok"] = True
    event["inserted"] = bool(cursor.rowcount)
    return event
330
+
331
+
332
def _derive_observation(event: dict) -> dict:
    """Build an observation payload from one decoded memory-event dict.

    The event type selects the observation type, subject, default summary and
    salience:

    * ``tool_write``: ``code_change`` about the first file path;
    * ``protocol_task_*``: ``task_result`` (salience 0.72 for ``done``);
    * contains ``correction``: ``correction``;
    * contains ``decision``: ``decision``;
    * anything else: a generic ``event``.

    A ``summary`` found in the event metadata always wins over the generated
    text. The observation uid is ``MO-`` plus a SHA-1 prefix of the event
    uid, so one event always maps to the same observation.

    An explicit event confidence of ``0.0`` is preserved; only a missing
    confidence (``None`` or ``""``) defaults to ``0.5``.
    """
    metadata = event.get("metadata") or {}
    paths = event.get("file_paths") or []
    event_type = event.get("event_type") or ""
    source_type = event.get("source_type") or ""
    source_id = event.get("source_id") or ""
    tool_name = event.get("tool_name") or ""

    # Defaults used when no branch below matches the event type.
    observation_type = "event"
    subject = source_id or event.get("event_uid") or ""
    salience = 0.45
    summary = metadata.get("summary") or ""
    entities: list[str] = []

    if event_type == "tool_write":
        observation_type = "code_change"
        subject = paths[0] if paths else tool_name or source_id
        count = len(paths)
        file_note = ", ".join(paths[:4]) if paths else "unknown files"
        summary = summary or f"{tool_name or 'Tool'} wrote {count} file(s): {file_note}."
        entities.extend(paths[:8])
        salience = 0.62
    elif event_type.startswith("protocol_task_"):
        observation_type = "task_result"
        outcome = event_type.removeprefix("protocol_task_") or "closed"
        subject = metadata.get("goal") or source_id
        goal = metadata.get("goal") or source_id or "protocol task"
        summary = summary or f"Protocol task {outcome}: {goal}"
        if metadata.get("outcome"):
            entities.append(str(metadata["outcome"]))
        salience = 0.72 if outcome == "done" else 0.58
    elif "correction" in event_type:
        observation_type = "correction"
        subject = metadata.get("subject") or source_id
        summary = summary or f"Correction captured from {source_type}:{source_id}"
        salience = 0.9
    elif "decision" in event_type:
        observation_type = "decision"
        subject = metadata.get("subject") or source_id
        summary = summary or f"Decision captured from {source_type}:{source_id}"
        salience = 0.82

    raw_ref = event.get("raw_ref") or ""
    evidence_refs = [f"memory_event:{event.get('event_uid')}"]
    if raw_ref:
        evidence_refs.append(raw_ref)
    if source_type and source_id:
        evidence_refs.append(f"{source_type}:{source_id}")

    facts = {
        "event_uid": event.get("event_uid"),
        "event_type": event_type,
        "source_type": source_type,
        "source_id": source_id,
        "tool_name": tool_name,
        "file_paths": paths,
        "created_at": event.get("created_at"),
    }
    if metadata:
        facts["metadata"] = metadata

    source_hash = hashlib.sha1(
        _json(
            {
                "event_uid": event.get("event_uid"),
                "summary": summary,
                "facts": facts,
            },
            {},
        ).encode("utf-8", "replace"),
        usedforsecurity=False,
    ).hexdigest()[:24]
    uid = f"MO-{hashlib.sha1(str(event.get('event_uid')).encode('utf-8'), usedforsecurity=False).hexdigest()[:32]}"

    # `or 0.5` would turn a legitimate stored confidence of 0.0 into 0.5,
    # so only a missing value gets the default.
    raw_confidence = event.get("confidence")
    confidence = 0.5 if raw_confidence in (None, "") else float(raw_confidence)

    return {
        "observation_uid": uid,
        "created_at": float(event.get("created_at") or _core().now_epoch()),
        "updated_at": _core().now_epoch(),
        "project_key": event.get("project_key") or "",
        "session_id": event.get("session_id") or "",
        "observation_type": observation_type,
        "subject": _truncate(subject, 240),
        "summary": _truncate(summary, 1000),
        "facts": facts,
        "evidence_refs": evidence_refs,
        "entities": sorted({str(item) for item in entities if str(item).strip()}),
        "salience": salience,
        "confidence": confidence,
        "stability": 1.0,
        "status": "active",
        "promotion_state": "observation",
        "decay_policy": "normal",
        "source_hash": source_hash,
        "metadata": {
            "source_event_id": event.get("id"),
            "phase": "passive_observation",
        },
    }
430
+
431
+
432
def _unit_score(value: Any, default: float) -> float:
    """Clamp *value* to ``[0.0, 1.0]``; use *default* only when it is ``None`` or ``""``."""
    number = default if value in (None, "") else float(value)
    return max(0.0, min(1.0, number))


def upsert_memory_observation(observation: dict) -> dict:
    """Insert or update one row of ``memory_observations`` keyed by ``observation_uid``.

    Subject, summary, facts, evidence refs, entities and metadata are
    redacted before storage. When anything was redacted, the stored metadata
    gains ``redaction_applied = True``. On conflict every column except
    ``created_at`` is overwritten. The write is committed.

    ``salience`` and ``confidence`` are clamped to ``[0, 1]``. An explicit
    ``0.0`` is stored as given; only a missing value defaults to ``0.5``.
    ``stability`` is clamped to ``[0.1, 3.0]`` and defaults to ``1.0``.

    Returns:
        * ``{"ok": True, "skipped": True, ...}`` when the table is absent;
        * ``{"ok": False, "error": ...}`` when ``observation_uid`` is empty;
        * otherwise the stored observation dict plus ``ok = True``.
    """
    conn = _core().get_db()
    if not _table_exists(conn, "memory_observations"):
        return {"ok": True, "skipped": True, "reason": "memory_observations table unavailable"}
    uid = (observation.get("observation_uid") or "").strip()
    if not uid:
        return {"ok": False, "error": "observation_uid is required"}
    now = float(observation.get("updated_at") or _core().now_epoch())
    clean_subject, subject_redacted = _redact_text(observation.get("subject"))
    clean_summary, summary_redacted = _redact_text(observation.get("summary"))
    clean_facts, facts_redacted = _redact_value(observation.get("facts"))
    clean_refs, refs_redacted = _redact_value(observation.get("evidence_refs"))
    clean_entities, entities_redacted = _redact_value(observation.get("entities"))
    clean_metadata, metadata_redacted = _redact_value(observation.get("metadata"))
    if any((subject_redacted, summary_redacted, facts_redacted, refs_redacted, entities_redacted, metadata_redacted)):
        # Metadata may be missing or not a dict; start fresh so the flag can be set.
        if not isinstance(clean_metadata, dict):
            clean_metadata = {}
        clean_metadata["redaction_applied"] = True
    conn.execute(
        """
        INSERT INTO memory_observations (
        observation_uid, created_at, updated_at, project_key, session_id,
        observation_type, subject, summary, facts_json, evidence_refs_json,
        entities_json, salience, confidence, stability, status, promotion_state,
        decay_policy, source_hash, metadata_json
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        ON CONFLICT(observation_uid) DO UPDATE SET
        updated_at = excluded.updated_at,
        project_key = excluded.project_key,
        session_id = excluded.session_id,
        observation_type = excluded.observation_type,
        subject = excluded.subject,
        summary = excluded.summary,
        facts_json = excluded.facts_json,
        evidence_refs_json = excluded.evidence_refs_json,
        entities_json = excluded.entities_json,
        salience = excluded.salience,
        confidence = excluded.confidence,
        stability = excluded.stability,
        status = excluded.status,
        promotion_state = excluded.promotion_state,
        decay_policy = excluded.decay_policy,
        source_hash = excluded.source_hash,
        metadata_json = excluded.metadata_json
        """,
        (
            uid,
            float(observation.get("created_at") or now),
            now,
            _truncate(observation.get("project_key"), 120),
            _truncate(observation.get("session_id"), 160),
            _truncate(observation.get("observation_type"), 80),
            _truncate(clean_subject, 240),
            _truncate(clean_summary, 1000),
            _json(clean_facts, {}),
            _json(clean_refs, []),
            _json(clean_entities, []),
            # 0.0 is inside the valid range, so it must not fall back to 0.5.
            _unit_score(observation.get("salience"), 0.5),
            _unit_score(observation.get("confidence"), 0.5),
            # 0 is below the 0.1 floor, so treating it as "missing" stays as before.
            max(0.1, min(3.0, float(observation.get("stability") or 1.0))),
            _truncate(observation.get("status") or "active", 40),
            _truncate(observation.get("promotion_state") or "observation", 60),
            _truncate(observation.get("decay_policy") or "normal", 60),
            _truncate(observation.get("source_hash"), 80),
            _json(clean_metadata, {}),
        ),
    )
    conn.commit()
    row = conn.execute("SELECT * FROM memory_observations WHERE observation_uid = ?", (uid,)).fetchone()
    result = _row_to_observation(row) if row else {"observation_uid": uid}
    result["ok"] = True
    return result
504
+
505
+
506
def process_memory_observation_queue(limit: int = 25) -> dict:
    """Turn queued memory events into observations.

    Reads up to *limit* (clamped to 1..200) queue rows in ``pending`` or
    ``failed`` state, oldest first, joined with their event. Each one is
    derived and upserted, then marked ``processed``. Any exception marks the
    row ``failed`` with the truncated error text. Queue updates are committed
    once at the end.

    Returns counters ``processed``, ``failed`` and ``total_seen``; ``ok`` is
    true only when nothing failed. When the tables are absent, a ``skipped``
    result is returned instead.
    """
    core = _core()
    conn = core.get_db()
    tables_ready = _table_exists(conn, "memory_observation_queue") and _table_exists(conn, "memory_observations")
    if not tables_ready:
        return {"ok": True, "processed": 0, "failed": 0, "skipped": True, "reason": "observation tables unavailable"}

    batch_size = max(1, min(int(limit or 25), 200))
    queued_rows = conn.execute(
        """
        SELECT q.id AS queue_id, q.event_uid, e.*
        FROM memory_observation_queue q
        JOIN memory_events e ON e.event_uid = q.event_uid
        WHERE q.status IN ('pending', 'failed')
        ORDER BY q.created_at ASC
        LIMIT ?
        """,
        (batch_size,),
    ).fetchall()

    mark_processed = """
        UPDATE memory_observation_queue
        SET status = 'processed',
        attempts = attempts + 1,
        last_error = '',
        updated_at = ?,
        processed_at = ?
        WHERE id = ?
        """
    mark_failed = """
        UPDATE memory_observation_queue
        SET status = 'failed',
        attempts = attempts + 1,
        last_error = ?,
        updated_at = ?
        WHERE id = ?
        """

    stamp = core.now_epoch()
    done = 0
    errors = 0
    for queued in queued_rows:
        event = _row_to_event(queued)
        queue_id = queued["queue_id"]
        try:
            upsert_memory_observation(_derive_observation(event))
            conn.execute(mark_processed, (stamp, stamp, queue_id))
        except Exception as exc:
            conn.execute(mark_failed, (_truncate(str(exc), 500), stamp, queue_id))
            errors += 1
        else:
            done += 1
    conn.commit()
    return {"ok": errors == 0, "processed": done, "failed": errors, "total_seen": len(queued_rows)}
558
+
559
+
560
def list_memory_events(
    *,
    query: str = "",
    event_type: str = "",
    source_type: str = "",
    source_id: str = "",
    session_id: str = "",
    project_key: str = "",
    limit: int = 20,
) -> list[dict]:
    """List stored memory events, newest first.

    Non-blank keyword filters are matched exactly (``event_type`` and
    ``source_type`` after lower-casing). *query* is a substring match over
    the uid, source id, event type, tool name, file paths and metadata
    columns. *limit* is clamped to 1..200. Returns ``[]`` when the table is
    absent.
    """
    conn = _core().get_db()
    if not _table_exists(conn, "memory_events"):
        return []

    clauses = ["1=1"]
    params: list[Any] = []
    exact_filters = (
        ("event_type", event_type, True),
        ("source_type", source_type, True),
        ("source_id", source_id, False),
        ("session_id", session_id, False),
        ("project_key", project_key, False),
    )
    for column, raw, lowered in exact_filters:
        needle = raw.strip()
        if not needle:
            continue
        clauses.append(f"{column} = ?")
        params.append(needle.lower() if lowered else needle)

    needle = query.strip()
    if needle:
        clauses.append(
            "(event_uid LIKE ? OR source_id LIKE ? OR event_type LIKE ? OR tool_name LIKE ? OR file_paths_json LIKE ? OR metadata_json LIKE ?)"
        )
        params.extend([f"%{needle}%"] * 6)

    where = " AND ".join(clauses)
    row_cap = max(1, min(int(limit or 20), 200))
    rows = conn.execute(
        f"""
        SELECT * FROM memory_events
        WHERE {where}
        ORDER BY created_at DESC, id DESC
        LIMIT ?
        """,
        params + [row_cap],
    ).fetchall()
    return [_row_to_event(row) for row in rows]
607
+
608
+
609
def list_memory_observations(
    *,
    query: str = "",
    observation_type: str = "",
    session_id: str = "",
    project_key: str = "",
    status: str = "",
    limit: int = 20,
) -> list[dict]:
    """List stored observations ordered by salience, then recency.

    Non-blank keyword filters are matched exactly (``observation_type`` and
    ``status`` after lower-casing). *query* is a substring match over the
    uid, type, subject, summary, facts and entities columns. *limit* is
    clamped to 1..200. Returns ``[]`` when the table is absent.
    """
    conn = _core().get_db()
    if not _table_exists(conn, "memory_observations"):
        return []

    clauses = ["1=1"]
    params: list[Any] = []
    exact_filters = (
        ("observation_type", observation_type, True),
        ("session_id", session_id, False),
        ("project_key", project_key, False),
        ("status", status, True),
    )
    for column, raw, lowered in exact_filters:
        needle = raw.strip()
        if not needle:
            continue
        clauses.append(f"{column} = ?")
        params.append(needle.lower() if lowered else needle)

    needle = query.strip()
    if needle:
        clauses.append(
            "(observation_uid LIKE ? OR observation_type LIKE ? OR subject LIKE ? OR summary LIKE ? OR facts_json LIKE ? OR entities_json LIKE ?)"
        )
        params.extend([f"%{needle}%"] * 6)

    where = " AND ".join(clauses)
    row_cap = max(1, min(int(limit or 20), 200))
    rows = conn.execute(
        f"""
        SELECT * FROM memory_observations
        WHERE {where}
        ORDER BY salience DESC, created_at DESC, id DESC
        LIMIT ?
        """,
        params + [row_cap],
    ).fetchall()
    return [_row_to_observation(row) for row in rows]
652
+
653
+
654
+ def _fts_query(query: str) -> str:
655
+ words = [word for word in re.findall(r"[A-Za-z0-9_./:-]{2,}", query or "") if len(word) >= 2]
656
+ return " OR ".join(f'"{word}"' for word in words[:12])
657
+
658
+
659
def search_memory_observations_fts(
    query: str,
    *,
    project_key: str = "",
    limit: int = 20,
) -> list[dict]:
    """Full-text search over observations through ``memory_observations_fts``.

    Returns ``[]`` when the FTS table is absent, when *query* yields no
    usable token, or when the search statement raises. Results are ordered
    by ``rank``; *limit* is clamped to 1..200.
    """
    conn = _core().get_db()
    if not _table_exists(conn, "memory_observations_fts"):
        return []
    match_expr = _fts_query(query)
    if not match_expr:
        return []

    pieces = [
        """
        SELECT o.*
        FROM memory_observations_fts f
        JOIN memory_observations o ON o.id = f.rowid
        WHERE memory_observations_fts MATCH ?
        """
    ]
    params: list[Any] = [match_expr]
    project = project_key.strip()
    if project:
        pieces.append(" AND o.project_key = ?")
        params.append(project)
    pieces.append(" ORDER BY rank LIMIT ?")
    params.append(max(1, min(int(limit or 20), 200)))

    try:
        hits = conn.execute("".join(pieces), params).fetchall()
    except Exception:
        return []
    return [_row_to_observation(hit) for hit in hits]
688
+
689
+
690
def memory_observation_stats(days: int = 7) -> dict:
    """Summarize observations created in the last *days* days.

    *days* is normalized to at least 1 (a falsy value means 7). The same
    normalized figure is reported as ``window_days`` on every return path,
    including the one taken when the observations table is absent.

    Returns a dict with ``window_days``, ``total``, ``by_observation_type``
    (counts per type inside the window) and ``queue`` (counts per queue
    status over the whole queue, empty when the queue table is absent).
    """
    conn = _core().get_db()
    window_days = max(1, int(days or 7))
    if not _table_exists(conn, "memory_observations"):
        # Report the normalized window here too, not the raw argument.
        return {"total": 0, "by_observation_type": {}, "queue": {}, "window_days": window_days}
    cutoff = _core().now_epoch() - (window_days * 86400)
    total = int(
        conn.execute(
            "SELECT COUNT(*) FROM memory_observations WHERE created_at >= ?",
            (cutoff,),
        ).fetchone()[0]
    )
    type_rows = conn.execute(
        """
        SELECT observation_type, COUNT(*) AS cnt
        FROM memory_observations
        WHERE created_at >= ?
        GROUP BY observation_type
        ORDER BY cnt DESC, observation_type ASC
        """,
        (cutoff,),
    ).fetchall()
    queue = {}
    if _table_exists(conn, "memory_observation_queue"):
        queue_rows = conn.execute(
            "SELECT status, COUNT(*) AS cnt FROM memory_observation_queue GROUP BY status"
        ).fetchall()
        queue = {row["status"]: int(row["cnt"]) for row in queue_rows}
    return {
        "window_days": window_days,
        "total": total,
        "by_observation_type": {row["observation_type"]: int(row["cnt"]) for row in type_rows},
        "queue": queue,
    }
724
+
725
+
726
+ def _backfill_uid(source_type: str, source_id: str) -> str:
727
+ digest = hashlib.sha1(f"{source_type}:{source_id}".encode("utf-8"), usedforsecurity=False).hexdigest()[:32]
728
+ return f"MB-{digest}"
729
+
730
+
731
def _register_backfill_sql_functions(conn) -> None:
    """Expose ``_backfill_uid`` to SQL as ``memory_backfill_uid(source_type, source_id)``.

    The function is registered on ``conn._conn`` when that attribute exists,
    otherwise on *conn* itself. Registration is best effort: any exception
    raised while registering is swallowed.
    """

    def _sql_backfill_uid(source_type, source_id):
        # SQL NULLs and other falsy values are hashed as empty strings.
        return _backfill_uid(str(source_type or ""), str(source_id or ""))

    target = getattr(conn, "_conn", conn)
    try:
        target.create_function("memory_backfill_uid", 2, _sql_backfill_uid)
    except Exception:
        pass
737
+
738
+
739
+ def _backfill_limit(value: int) -> int:
740
+ return max(1, min(int(value or 100), 1000))
741
+
742
+
743
def _backfill_observation(
    *,
    source_type: str,
    source_id: str,
    created_at: Any,
    session_id: str = "",
    project_key: str = "",
    observation_type: str,
    subject: str,
    summary: str,
    facts: dict[str, Any] | None = None,
    salience: float = 0.45,
) -> dict:
    """Upsert one observation reconstructed from a legacy source row.

    The uid comes from ``_backfill_uid(source_type, source_id)``, so running
    the backfill again updates the same observation. A summary that is empty
    after truncation is not stored; a ``skipped`` result with ``ok = False``
    is returned instead. Otherwise the result of
    ``upsert_memory_observation`` is returned.
    """
    short_summary = _truncate(summary, 1000)
    if not short_summary:
        return {"ok": False, "skipped": True, "reason": "empty_summary"}

    evidence = [f"{source_type}:{source_id}"]
    payload = {
        "observation_uid": _backfill_uid(source_type, source_id),
        "created_at": _parse_created_at(created_at),
        "updated_at": _core().now_epoch(),
        "project_key": project_key,
        "session_id": session_id,
        "observation_type": observation_type,
        "subject": subject,
        "summary": short_summary,
        # Caller-supplied facts may override the two source keys.
        "facts": {"source_type": source_type, "source_id": source_id, **(facts or {})},
        "evidence_refs": evidence,
        "entities": [subject] if subject else [],
        "salience": salience,
        "confidence": 0.72,
        "stability": 1.0,
        "status": "active",
        "promotion_state": "backfilled",
        "decay_policy": "normal",
        "source_hash": _stable_hash({"source_type": source_type, "source_id": source_id, "summary": short_summary}),
        "metadata": {"phase": "controlled_backfill"},
    }
    return upsert_memory_observation(payload)
783
+
784
+
785
def backfill_memory_observations(
    *,
    sources: list[str] | None = None,
    limit: int = 100,
) -> dict:
    """Create observations from legacy tables that have none yet.

    *sources* selects among ``protocol_tasks``, ``change_log``,
    ``session_diary`` and ``recent_events``; an empty selection means all
    four. For each selected table that exists, up to *limit* rows (clamped
    by ``_backfill_limit``) without a matching backfill observation are read,
    newest first, and passed to ``_backfill_observation``.

    Returns ``ok``, the sorted list of requested sources, ``seen`` (rows
    read) and ``created_or_updated`` (rows whose upsert reported ``ok``).
    When the observations table is absent, a ``skipped`` result is returned.
    """
    conn = _core().get_db()
    if not _table_exists(conn, "memory_observations"):
        return {"ok": True, "created": 0, "skipped": True, "reason": "memory_observations table unavailable"}

    requested = {name.strip() for name in (sources or []) if name.strip()}
    if not requested:
        requested = {"protocol_tasks", "change_log", "session_diary", "recent_events"}
    # The queries below call memory_backfill_uid() from SQL.
    _register_backfill_sql_functions(conn)
    batch = _backfill_limit(limit)
    stored = 0
    seen = 0

    if "protocol_tasks" in requested and _table_exists(conn, "protocol_tasks"):
        task_rows = conn.execute(
            """
            SELECT * FROM protocol_tasks
            WHERE status != 'open'
            AND NOT EXISTS (
            SELECT 1 FROM memory_observations
            WHERE observation_uid = memory_backfill_uid('protocol_task', protocol_tasks.task_id)
            )
            ORDER BY COALESCE(closed_at, opened_at) DESC
            LIMIT ?
            """,
            (batch,),
        ).fetchall()
        for task_row in task_rows:
            task = dict(task_row)
            seen += 1
            outcome = _backfill_observation(
                source_type="protocol_task",
                source_id=task.get("task_id") or "",
                created_at=task.get("closed_at") or task.get("opened_at"),
                session_id=task.get("session_id") or "",
                project_key=task.get("project_hint") or task.get("area") or "",
                observation_type="task_result",
                subject=task.get("goal") or task.get("task_id") or "",
                summary=f"Protocol task {task.get('status')}: {task.get('goal') or ''}".strip(),
                facts={"status": task.get("status"), "files_changed": task.get("files_changed")},
                salience=0.68,
            )
            if outcome.get("ok"):
                stored += 1

    if "change_log" in requested and _table_exists(conn, "change_log"):
        change_rows = conn.execute(
            """
            SELECT * FROM change_log
            WHERE NOT EXISTS (
            SELECT 1 FROM memory_observations
            WHERE observation_uid = memory_backfill_uid('change_log', change_log.id)
            )
            ORDER BY created_at DESC
            LIMIT ?
            """,
            (batch,),
        ).fetchall()
        for change_row in change_rows:
            change = dict(change_row)
            seen += 1
            outcome = _backfill_observation(
                source_type="change_log",
                source_id=str(change.get("id") or ""),
                created_at=change.get("created_at"),
                session_id=change.get("session_id") or "",
                project_key=change.get("affects") or "",
                observation_type="code_change",
                subject=change.get("files") or "",
                summary=change.get("what_changed") or "",
                facts={"why": change.get("why"), "verify": change.get("verify"), "files": change.get("files")},
                salience=0.62,
            )
            if outcome.get("ok"):
                stored += 1

    if "session_diary" in requested and _table_exists(conn, "session_diary"):
        diary_rows = conn.execute(
            """
            SELECT * FROM session_diary
            WHERE NOT EXISTS (
            SELECT 1 FROM memory_observations
            WHERE observation_uid = memory_backfill_uid('session_diary', session_diary.id)
            )
            ORDER BY created_at DESC
            LIMIT ?
            """,
            (batch,),
        ).fetchall()
        for diary_row in diary_rows:
            entry = dict(diary_row)
            seen += 1
            outcome = _backfill_observation(
                source_type="session_diary",
                source_id=str(entry.get("id") or ""),
                created_at=entry.get("created_at"),
                session_id=entry.get("session_id") or "",
                project_key=entry.get("domain") or "",
                observation_type="conversation_summary",
                subject=entry.get("domain") or entry.get("session_id") or "",
                summary=entry.get("summary") or "",
                facts={"decisions": entry.get("decisions"), "pending": entry.get("pending")},
                salience=0.52,
            )
            if outcome.get("ok"):
                stored += 1

    if "recent_events" in requested and _table_exists(conn, "recent_events"):
        recent_rows = conn.execute(
            """
            SELECT * FROM recent_events
            WHERE NOT EXISTS (
            SELECT 1 FROM memory_observations
            WHERE observation_uid = memory_backfill_uid('recent_event', recent_events.id)
            )
            ORDER BY created_at DESC
            LIMIT ?
            """,
            (batch,),
        ).fetchall()
        for recent_row in recent_rows:
            recent = dict(recent_row)
            seen += 1
            outcome = _backfill_observation(
                source_type="recent_event",
                source_id=str(recent.get("id") or ""),
                created_at=recent.get("created_at"),
                session_id=recent.get("session_id") or "",
                project_key=recent.get("context_key") or "",
                observation_type="recent_context",
                subject=recent.get("title") or recent.get("context_key") or "",
                summary=recent.get("summary") or recent.get("body") or recent.get("title") or "",
                facts={"event_type": recent.get("event_type"), "context_key": recent.get("context_key")},
                salience=0.48,
            )
            if outcome.get("ok"):
                stored += 1

    return {"ok": True, "sources": sorted(requested), "seen": seen, "created_or_updated": stored}
923
+
924
+
925
def memory_observation_health() -> dict:
    """Summarise the state of the memory-observation tables.

    Returns:
        A dict with these keys:

        * ``ok`` - True only when all three required tables exist and the
          queue has no rows whose status is ``failed``.
        * ``tables`` - existence flag per table, as reported by
          ``_table_exists``.
        * ``missing_required`` - names of required tables that are absent.
        * ``counts`` - row totals for events and observations, plus a
          ``status -> count`` mapping for the queue.
        * ``latest`` - ``MAX(created_at)`` for events and observations
          (``None`` when the table is absent or empty).
        * ``fts_enabled`` - result of ``_is_virtual_fts_table`` for the FTS
          table.
        * ``fts_degraded`` - the FTS table exists but is not reported as a
          virtual FTS table.
        * ``fts_queryable`` - a trivial ``SELECT`` against the FTS table
          succeeded.
    """
    db = _core().get_db()
    required = ("memory_events", "memory_observations", "memory_observation_queue")
    fts_table = "memory_observations_fts"
    tables = {name: _table_exists(db, name) for name in (*required, fts_table)}

    def _scalar(sql: str):
        # First column of the first row of a single-value query.
        return db.execute(sql).fetchone()[0]

    counts = {"events": 0, "observations": 0, "queue": {}}
    latest = {"event_created_at": None, "observation_created_at": None}

    if tables["memory_events"]:
        counts["events"] = int(_scalar("SELECT COUNT(*) FROM memory_events"))
        latest["event_created_at"] = _scalar("SELECT MAX(created_at) FROM memory_events")

    if tables["memory_observations"]:
        counts["observations"] = int(_scalar("SELECT COUNT(*) FROM memory_observations"))
        latest["observation_created_at"] = _scalar("SELECT MAX(created_at) FROM memory_observations")

    if tables["memory_observation_queue"]:
        status_rows = db.execute(
            "SELECT status, COUNT(*) AS cnt FROM memory_observation_queue GROUP BY status"
        ).fetchall()
        # Rows are accessed by column name, so the connection is expected to
        # use a name-addressable row factory.
        queue_counts = {}
        for status_row in status_rows:
            queue_counts[status_row["status"]] = int(status_row["cnt"])
        counts["queue"] = queue_counts

    fts_enabled = _is_virtual_fts_table(db, fts_table)
    fts_queryable = False
    if tables[fts_table]:
        # Best-effort probe: any failure simply means "not queryable".
        try:
            db.execute("SELECT rowid FROM memory_observations_fts LIMIT 1").fetchone()
        except Exception:
            fts_queryable = False
        else:
            fts_queryable = True

    missing_required = [name for name in required if not tables[name]]
    failed_queue = int(counts["queue"].get("failed", 0))

    report = {
        "ok": not missing_required and failed_queue == 0,
        "tables": tables,
        "missing_required": missing_required,
        "counts": counts,
        "latest": latest,
        "fts_enabled": fts_enabled,
        "fts_degraded": tables[fts_table] and not fts_enabled,
        "fts_queryable": fts_queryable,
    }
    return report
968
+
969
+
970
def maintain_memory_observations(
    *,
    process_limit: int = 100,
    retry_failed: bool = True,
    backfill_sources: list[str] | None = None,
    backfill_limit: int = 0,
) -> dict:
    """Run one maintenance pass over the memory-observation pipeline.

    Steps, in order:

    1. When ``retry_failed`` is true and the queue table exists, flip queue
       rows from ``failed`` back to ``pending`` if they have fewer than five
       attempts, then commit.
    2. Process the queue via ``process_memory_observation_queue``.
    3. Run ``backfill_memory_observations`` only when ``backfill_limit`` is a
       positive number; otherwise record the backfill as skipped.
    4. Collect a health report via ``memory_observation_health``.

    Args:
        process_limit: Passed as ``limit`` to the queue processor.
        retry_failed: Whether to re-queue failed rows before processing.
        backfill_sources: Passed as ``sources`` to the backfill step.
        backfill_limit: Passed as ``limit`` to the backfill step; zero or
            a falsy value skips the backfill.

    Returns:
        A dict with ``ok`` (all three step results report ``ok``),
        ``reset_failed`` (number of re-queued rows), and the raw
        ``processed``, ``backfill`` and ``health`` results.
    """
    core = _core()
    db = core.get_db()

    requeued = 0
    if retry_failed and _table_exists(db, "memory_observation_queue"):
        outcome = db.execute(
            """
            UPDATE memory_observation_queue
            SET status = 'pending',
            updated_at = ?
            WHERE status = 'failed'
            AND attempts < 5
            """,
            (core.now_epoch(),),
        )
        # rowcount may be None/0 depending on the driver; normalise to an int.
        requeued = int(outcome.rowcount or 0)
        db.commit()

    processed = process_memory_observation_queue(limit=process_limit)

    if int(backfill_limit or 0) > 0:
        backfill = backfill_memory_observations(sources=backfill_sources, limit=backfill_limit)
    else:
        backfill = {"ok": True, "skipped": True}

    health = memory_observation_health()

    overall_ok = all(bool(step.get("ok")) for step in (processed, backfill, health))
    return {
        "ok": overall_ok,
        "reset_failed": requeued,
        "processed": processed,
        "backfill": backfill,
        "health": health,
    }
1005
+
1006
+
1007
def memory_event_stats(days: int = 7) -> dict:
    """Count ``memory_events`` rows created within a recent time window.

    Args:
        days: Size of the window in days. Falsy values fall back to 7 and
            the result is clamped to a minimum of 1.

    Returns:
        A dict with:

        * ``window_days`` - the normalised window actually used.
        * ``total`` - number of events inside the window.
        * ``by_event_type`` - ``event_type -> count``, most frequent first.
        * ``by_source_type`` - ``source_type -> count``, most frequent first.

        When the ``memory_events`` table does not exist the counts are
        empty, and ``window_days`` is still the normalised value so both
        code paths report the same window for the same input.
    """
    # Normalise before the table check so the "table missing" result reports
    # the same window as the regular path (previously it echoed raw ``days``).
    window_days = max(1, int(days or 7))

    conn = _core().get_db()
    if not _table_exists(conn, "memory_events"):
        return {
            "window_days": window_days,
            "total": 0,
            "by_event_type": {},
            "by_source_type": {},
        }

    # NOTE(review): assumes memory_events.created_at holds epoch seconds, since
    # it is compared against now_epoch() minus a number of seconds - confirm
    # against the table schema.
    cutoff = _core().now_epoch() - (window_days * 86400)

    total = int(
        conn.execute(
            "SELECT COUNT(*) FROM memory_events WHERE created_at >= ?",
            (cutoff,),
        ).fetchone()[0]
    )
    event_rows = conn.execute(
        """
        SELECT event_type, COUNT(*) AS cnt
        FROM memory_events
        WHERE created_at >= ?
        GROUP BY event_type
        ORDER BY cnt DESC, event_type ASC
        """,
        (cutoff,),
    ).fetchall()
    source_rows = conn.execute(
        """
        SELECT source_type, COUNT(*) AS cnt
        FROM memory_events
        WHERE created_at >= ?
        GROUP BY source_type
        ORDER BY cnt DESC, source_type ASC
        """,
        (cutoff,),
    ).fetchall()

    return {
        "window_days": window_days,
        "total": total,
        # Dicts preserve insertion order, so the SQL ordering is kept.
        "by_event_type": {row["event_type"]: int(row["cnt"]) for row in event_rows},
        "by_source_type": {row["source_type"]: int(row["cnt"]) for row in source_rows},
    }