nexo-brain 0.2.1 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +158 -72
- package/bin/nexo-brain 2.js +610 -0
- package/package.json +2 -2
- package/scripts/pre-commit-check 2.sh +55 -0
- package/src/cognitive.py +1582 -56
- package/src/db.py +49 -25
- package/src/hooks/auto_capture.py +208 -0
- package/src/plugins/cognitive_memory.py +276 -17
- package/src/scripts/nexo-catchup.py +32 -15
- package/src/scripts/nexo-cognitive-decay.py +2 -4
- package/src/scripts/nexo-daily-self-audit.py +148 -29
- package/src/scripts/nexo-immune.py +869 -0
- package/src/scripts/nexo-postmortem-consolidator.py +42 -40
- package/src/scripts/nexo-sleep.py +90 -39
- package/src/scripts/nexo-synthesis.py +78 -76
- package/src/tools_sessions.py +2 -2
- package/templates/CLAUDE.md 2.template +89 -0
- package/templates/CLAUDE.md.template +1 -1
package/src/cognitive.py
CHANGED
|
@@ -1,19 +1,29 @@
|
|
|
1
1
|
"""NEXO Cognitive Engine — Vector memory with Atkinson-Shiffrin model."""
|
|
2
2
|
|
|
3
|
+
import base64
|
|
4
|
+
import json
|
|
3
5
|
import math
|
|
4
6
|
import os
|
|
7
|
+
import re
|
|
5
8
|
import sqlite3
|
|
6
9
|
import numpy as np
|
|
7
10
|
from datetime import datetime, timedelta
|
|
8
11
|
from pathlib import Path
|
|
9
12
|
from typing import Optional
|
|
10
13
|
|
|
11
|
-
|
|
12
|
-
COGNITIVE_DB = str(NEXO_HOME / "cognitive.db")
|
|
14
|
+
COGNITIVE_DB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "cognitive.db")
|
|
13
15
|
EMBEDDING_DIM = 384
|
|
14
16
|
LAMBDA_STM = 0.1 # half-life ~7 days
|
|
15
17
|
LAMBDA_LTM = 0.012 # half-life ~60 days
|
|
16
18
|
|
|
19
|
+
# Prediction Error Gate thresholds
|
|
20
|
+
PE_GATE_REJECT = 0.85 # similarity > this → reject (not novel enough)
|
|
21
|
+
PE_GATE_REFINE = 0.70 # similarity between REFINE and REJECT → refinement (update existing)
|
|
22
|
+
# similarity < REFINE → novel (store as new)
|
|
23
|
+
|
|
24
|
+
# Session-level gate stats (reset each process lifetime)
|
|
25
|
+
_gate_stats = {"accepted_novel": 0, "accepted_refinement": 0, "rejected": 0}
|
|
26
|
+
|
|
17
27
|
# Discriminating entities — if these differ between two high-similarity memories,
|
|
18
28
|
# they are siblings (similar-but-incompatible), NOT duplicates to merge.
|
|
19
29
|
DISCRIMINATING_ENTITIES = {
|
|
@@ -36,29 +46,31 @@ POSITIVE_SIGNALS = {
|
|
|
36
46
|
"gracias", "genial", "perfecto", "bien", "excelente", "bueno", "me gusta",
|
|
37
47
|
"correcto", "sí", "dale", "hazlo", "adelante", "ok", "vale", "great",
|
|
38
48
|
"nice", "good", "exactly", "buen trabajo", "bien hecho", "fenomenal",
|
|
39
|
-
"thanks", "perfect", "awesome", "excellent", "well done",
|
|
40
49
|
}
|
|
41
50
|
NEGATIVE_SIGNALS = {
|
|
42
51
|
"no", "mal", "otra vez", "ya te dije", "frustr", "error", "fallo",
|
|
43
52
|
"cansad", "siempre", "nunca", "por qué no", "no funciona", "roto",
|
|
44
|
-
"no sirve", "horrible", "desastre", "
|
|
45
|
-
"
|
|
53
|
+
"no sirve", "horrible", "desastre", "qué coño", "joder", "mierda",
|
|
54
|
+
"hostia", "me cago", "irritad", "harto",
|
|
55
|
+
"broken", "nothing works", "doesn't work", "not working", "fix it",
|
|
56
|
+
"wrong", "failed", "failing", "annoying", "frustrated", "damn", "shit",
|
|
57
|
+
"wtf", "terrible", "useless", "stupid", "hate", "worst", "sucks",
|
|
58
|
+
"again",
|
|
46
59
|
}
|
|
47
60
|
URGENCY_SIGNALS = {
|
|
48
61
|
"rápido", "ya", "ahora", "urgente", "asap", "inmediatamente", "corre",
|
|
49
|
-
"urgent", "now", "immediately", "hurry",
|
|
50
62
|
}
|
|
51
63
|
|
|
52
64
|
# Trust score events and their point values
|
|
53
65
|
TRUST_EVENTS = {
|
|
54
66
|
# Positive
|
|
55
67
|
"explicit_thanks": +3,
|
|
56
|
-
"delegation": +2, #
|
|
57
|
-
"paradigm_shift": +2, #
|
|
68
|
+
"delegation": +2, # the user delegates new task without micromanaging
|
|
69
|
+
"paradigm_shift": +2, # the user teaches, NEXO learns
|
|
58
70
|
"sibling_detected": +3, # NEXO avoided context error on its own
|
|
59
71
|
"proactive_action": +2, # NEXO did something useful without being asked
|
|
60
72
|
# Negative
|
|
61
|
-
"correction": -3, #
|
|
73
|
+
"correction": -3, # the user corrects NEXO
|
|
62
74
|
"repeated_error": -7, # Error on something NEXO already had a learning for
|
|
63
75
|
"override": -5, # NEXO's memory was wrong
|
|
64
76
|
"correction_fatigue": -10, # Same memory corrected 3+ times
|
|
@@ -68,20 +80,103 @@ TRUST_EVENTS = {
|
|
|
68
80
|
_model = None
|
|
69
81
|
_conn = None
|
|
70
82
|
|
|
83
|
+
# --- Secret redaction patterns ---
# Each entry is (compiled_regex, replacement). The replacement is either a
# literal placeholder or a callable receiving the match object; the callable
# form is used when a non-secret prefix (e.g. ``password=``) must be kept.
# Patterns are applied in list order.
_REDACT_PATTERNS = [
    # Specific API key formats
    (re.compile(r'sk-[a-zA-Z0-9_\-]{20,}'), '[REDACTED:api_key]'),
    (re.compile(r'ghp_[a-zA-Z0-9]{20,}'), '[REDACTED:api_key]'),
    (re.compile(r'shpat_[a-f0-9]{20,}'), '[REDACTED:api_key]'),
    (re.compile(r'AKIA[A-Z0-9]{16}'), '[REDACTED:api_key]'),
    (re.compile(r'xox[bp]-[a-zA-Z0-9\-]{20,}'), '[REDACTED:api_key]'),
    # Bearer tokens
    (re.compile(r'Bearer\s+[a-zA-Z0-9_\-\.=+/]{20,}'), '[REDACTED:bearer_token]'),
    # Connection strings with credentials
    (re.compile(r'(mysql|postgresql|postgres|mongodb|redis)://[^\s"\']+@[^\s"\']+'), '[REDACTED:connection_string]'),
    # Generic token assignments (keeps the "token=" prefix, hides the value)
    (re.compile(r'(token\s*[=:]\s*["\']?)([a-zA-Z0-9_\-]{20,})', re.IGNORECASE),
     lambda m: m.group(1) + '[REDACTED:token]'),
    # Password assignments (keeps the "password=" prefix, hides the value)
    (re.compile(r'(password\s*[=:]\s*["\']?)([^\s"\']{8,})', re.IGNORECASE),
     lambda m: m.group(1) + '[REDACTED:password]'),
    # "ssh user@<dotted-quad>" invocations. The regex matches any IPv4-looking
    # address, not only private ranges.
    (re.compile(r'ssh\s+\S+@\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}'), '[REDACTED:ssh_credential]'),
]


def redact_secrets(text: str) -> str:
    """Replace recognizable secrets in *text* with ``[REDACTED:<type>]`` placeholders.

    Detection is regex-only, driven by ``_REDACT_PATTERNS``. Every pattern
    requires a recognizable prefix or assignment context (``sk-``, ``Bearer``,
    ``password=``, a URL scheme, ...), so bare hex strings or identifiers with
    no such context are left untouched.

    Args:
        text: Text to scan. Falsy values (``""`` or ``None``) are returned as-is.

    Returns:
        The text with every match of every pattern replaced.
    """
    if not text:
        return text
    result = text
    for pattern, replacement in _REDACT_PATTERNS:
        # Pattern.sub accepts both a literal string and a callable, so the two
        # replacement flavours need no separate handling.
        result = pattern.sub(replacement, result)
    return result
|
|
121
|
+
|
|
71
122
|
|
|
72
123
|
def _get_db() -> sqlite3.Connection:
    """Return the module-wide SQLite connection, creating it on first use.

    On first call the connection is opened on ``COGNITIVE_DB`` with WAL
    journaling, ``synchronous=NORMAL`` and ``sqlite3.Row`` rows, then the
    schema is created and the migrations are run. The connection is only
    cached in ``_conn`` once all of that has succeeded, so a failed setup is
    retried on the next call instead of leaving a half-initialised connection
    behind.

    Returns:
        The shared ``sqlite3.Connection``.

    Raises:
        sqlite3.Error: If opening the database, creating tables, or running a
            migration fails (the partially set-up connection is closed first).
    """
    global _conn
    if _conn is not None:
        return _conn

    conn = sqlite3.connect(COGNITIVE_DB, check_same_thread=False)
    try:
        conn.execute("PRAGMA journal_mode=WAL")
        conn.execute("PRAGMA synchronous=NORMAL")
        conn.row_factory = sqlite3.Row
        _init_tables(conn)
        _migrate_lifecycle(conn)
        _migrate_co_activation(conn)
    except Exception:
        # Do not cache (or leak) a connection whose schema setup failed.
        conn.close()
        raise

    _conn = conn
    return _conn
|
|
83
135
|
|
|
84
136
|
|
|
137
|
+
def _migrate_lifecycle(conn: sqlite3.Connection):
    """Ensure the lifecycle columns exist on both memory tables (idempotent).

    Adds ``lifecycle_state``, ``snooze_until`` and ``redaction_applied`` to
    ``stm_memories`` and ``ltm_memories``. A column that is already present
    makes SQLite raise a "duplicate column" error, which is ignored; any other
    ``sqlite3.OperationalError`` propagates.

    Args:
        conn: Open connection to the cognitive database.
    """
    lifecycle_columns = (
        ("lifecycle_state", "TEXT DEFAULT 'active'"),
        ("snooze_until", "TEXT"),
        ("redaction_applied", "INTEGER DEFAULT 0"),
    )
    for table in ("stm_memories", "ltm_memories"):
        for col, col_type in lifecycle_columns:
            try:
                conn.execute(f"ALTER TABLE {table} ADD COLUMN {col} {col_type}")
                conn.commit()
            except sqlite3.OperationalError as err:
                # Only "column already there" is expected; re-raise the rest.
                if "duplicate column" not in str(err).lower():
                    raise
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def _migrate_co_activation(conn: sqlite3.Connection):
    """Create the ``co_activation`` and ``prospective_triggers`` tables if missing.

    Both statements use ``CREATE TABLE IF NOT EXISTS``, so calling this on an
    already-migrated database is a no-op.

    Args:
        conn: Open connection to the cognitive database.
    """
    schema_sql = """
        CREATE TABLE IF NOT EXISTS co_activation (
            memory_a_id INTEGER NOT NULL,
            memory_b_id INTEGER NOT NULL,
            strength REAL DEFAULT 1.0,
            co_access_count INTEGER DEFAULT 1,
            last_co_access TEXT DEFAULT (datetime('now')),
            PRIMARY KEY (memory_a_id, memory_b_id)
        );

        CREATE TABLE IF NOT EXISTS prospective_triggers (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            trigger_pattern TEXT NOT NULL,
            action TEXT NOT NULL,
            context TEXT DEFAULT '',
            created_at TEXT DEFAULT (datetime('now')),
            fired_at TEXT,
            status TEXT DEFAULT 'armed'
        );
    """
    conn.executescript(schema_sql)
    conn.commit()
|
|
178
|
+
|
|
179
|
+
|
|
85
180
|
def _init_tables(conn: sqlite3.Connection):
|
|
86
181
|
"""Create tables if they don't exist."""
|
|
87
182
|
conn.executescript("""
|
|
@@ -136,6 +231,16 @@ def _init_tables(conn: sqlite3.Connection):
|
|
|
136
231
|
UNIQUE(memory_a_id, memory_b_id)
|
|
137
232
|
);
|
|
138
233
|
|
|
234
|
+
-- Dreamed pairs: track which memory pairs have been processed by dream_cycle
|
|
235
|
+
CREATE TABLE IF NOT EXISTS dreamed_pairs (
|
|
236
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
237
|
+
memory_a_id INTEGER NOT NULL,
|
|
238
|
+
memory_b_id INTEGER NOT NULL,
|
|
239
|
+
insight_id INTEGER, -- LTM ID of the generated insight
|
|
240
|
+
created_at TEXT DEFAULT (datetime('now')),
|
|
241
|
+
UNIQUE(memory_a_id, memory_b_id)
|
|
242
|
+
);
|
|
243
|
+
|
|
139
244
|
-- Trust score: NEXO's alignment index (0-100, starts at 50)
|
|
140
245
|
CREATE TABLE IF NOT EXISTS trust_score (
|
|
141
246
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
@@ -146,7 +251,7 @@ def _init_tables(conn: sqlite3.Connection):
|
|
|
146
251
|
created_at TEXT DEFAULT (datetime('now'))
|
|
147
252
|
);
|
|
148
253
|
|
|
149
|
-
-- Sentiment readings: user's detected mood per interaction
|
|
254
|
+
-- Sentiment readings: the user's detected mood per interaction
|
|
150
255
|
CREATE TABLE IF NOT EXISTS sentiment_log (
|
|
151
256
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
152
257
|
sentiment TEXT NOT NULL, -- 'positive', 'negative', 'neutral', 'urgent'
|
|
@@ -155,7 +260,24 @@ def _init_tables(conn: sqlite3.Connection):
|
|
|
155
260
|
created_at TEXT DEFAULT (datetime('now'))
|
|
156
261
|
);
|
|
157
262
|
|
|
158
|
-
--
|
|
263
|
+
-- Quarantine: new memories held for validation before promotion to STM
|
|
264
|
+
CREATE TABLE IF NOT EXISTS quarantine (
|
|
265
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
266
|
+
content TEXT NOT NULL,
|
|
267
|
+
embedding BLOB NOT NULL,
|
|
268
|
+
source TEXT DEFAULT 'inferred',
|
|
269
|
+
source_type TEXT NOT NULL,
|
|
270
|
+
source_id TEXT DEFAULT '',
|
|
271
|
+
source_title TEXT DEFAULT '',
|
|
272
|
+
domain TEXT DEFAULT '',
|
|
273
|
+
confidence REAL DEFAULT 0.5,
|
|
274
|
+
promotion_checks INTEGER DEFAULT 0,
|
|
275
|
+
created_at TEXT DEFAULT (datetime('now')),
|
|
276
|
+
promoted_at TEXT,
|
|
277
|
+
status TEXT DEFAULT 'pending'
|
|
278
|
+
);
|
|
279
|
+
|
|
280
|
+
-- Correction tracking: when the user overrides a memory's guidance
|
|
159
281
|
CREATE TABLE IF NOT EXISTS memory_corrections (
|
|
160
282
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
161
283
|
memory_id INTEGER NOT NULL,
|
|
@@ -205,6 +327,319 @@ def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
|
|
|
205
327
|
return float(np.dot(a, b) / (norm_a * norm_b))
|
|
206
328
|
|
|
207
329
|
|
|
330
|
+
|
|
331
|
+
# ============================================================================
|
|
332
|
+
# FEATURE 1: HyDE Query Expansion (adapted from Vestige hyde.rs)
|
|
333
|
+
# Template-based Hypothetical Document Embeddings for improved search recall.
|
|
334
|
+
# Classifies query intent, generates 3-5 semantic variants, embeds all,
|
|
335
|
+
# averages into centroid embedding for broader semantic coverage.
|
|
336
|
+
# ============================================================================
|
|
337
|
+
|
|
338
|
+
def _classify_query_intent(query: str) -> str:
    """Classify a search query into one of six intent labels.

    Checks run in a fixed order and the first match wins:
    ``howto`` → ``definition`` → ``reasoning`` → ``temporal`` → ``technical``
    → ``lookup`` (fallback).

    Keyword checks are done on whole letter-only words rather than raw
    substrings, so e.g. "update" or "validate" no longer count as the temporal
    keyword "date". Words are split on any non-letter (including ``_`` and
    digits), so ``release_date`` still yields the word "date".

    Args:
        query: Raw user query.

    Returns:
        One of ``"howto"``, ``"definition"``, ``"reasoning"``, ``"temporal"``,
        ``"technical"`` or ``"lookup"``.
    """
    lower = query.lower().strip()
    if lower.startswith(("how to", "how do", "steps", "cómo")):
        return "howto"
    if lower.startswith(("what is", "what are", "define", "explain", "qué es")):
        return "definition"

    # Letter-only tokens: [^\W\d_] is "word character that is not digit or _".
    words = set(re.findall(r"[^\W\d_]+", lower))

    if (
        lower.startswith(("why", "por qué"))
        or "porque" in words
        or any(w.startswith("reason") for w in words)
    ):
        return "reasoning"
    if lower.startswith(("when", "cuándo")) or words & {
        "date", "dates", "timeline", "timelines", "fecha", "fechas",
    }:
        return "temporal"
    # Code-looking markers are checked on the original (case-preserved) query.
    if any(marker in query for marker in ("(", "{", "::", "def ", "class ", "fn ", "function ")):
        return "technical"
    return "lookup"
|
|
352
|
+
|
|
353
|
+
|
|
354
|
+
def _expand_query_variants(query: str) -> list[str]:
    """Build the list of texts to embed for template-based HyDE.

    The first element is always the unmodified query. It is followed by the
    templates registered for the query's intent (as returned by
    ``_classify_query_intent``), filled with the query stripped of surrounding
    whitespace and trailing ``?``, ``.`` or ``!``. Unknown intents fall back
    to the ``lookup`` templates.

    Args:
        query: Raw user query.

    Returns:
        The original query plus 2-4 templated variants.
    """
    intent = _classify_query_intent(query)
    clean = query.strip().rstrip("?.!")

    templates_by_intent = {
        "definition": [
            f"{clean} is a concept that involves",
            f"The definition of {clean} in the context of this project",
            f"{clean} refers to a type of",
        ],
        "howto": [
            f"The steps to {clean} are as follows",
            f"To accomplish {clean}, you need to",
            f"A guide for {clean} including",
        ],
        "reasoning": [
            f"The reason {clean} is because",
            f"{clean} happens due to the following factors",
            f"The explanation for {clean} involves",
        ],
        "temporal": [
            f"{clean} occurred at a specific time",
            f"The timeline of {clean} shows",
            f"Events related to {clean} in chronological order",
        ],
        "lookup": [
            f"Information about {clean} including details",
            f"{clean} is related to the following topics",
            f"Key facts about {clean}",
            f"Previously we handled {clean} by",
        ],
        "technical": [
            f"{clean} implementation details and code",
            f"Code pattern for {clean}",
        ],
    }

    chosen = templates_by_intent.get(intent)
    if chosen is None:
        chosen = templates_by_intent["lookup"]
    return [query, *chosen]
|
|
395
|
+
|
|
396
|
+
|
|
397
|
+
def hyde_expand_query(query: str) -> np.ndarray:
    """Embed the expanded variants of *query* and return their unit-length centroid.

    Template-based HyDE: no LLM is called. The variants produced by
    ``_expand_query_variants`` are embedded with the shared model, averaged
    component-wise, and the mean vector is L2-normalised (left as-is when its
    norm is zero).

    Args:
        query: Raw user query.

    Returns:
        A ``float32`` vector: the normalised mean of the variant embeddings.
    """
    texts = _expand_query_variants(query)
    encoder = _get_model()
    vectors = [np.array(vec, dtype=np.float32) for vec in encoder.embed(texts)]

    centroid = np.mean(vectors, axis=0).astype(np.float32)
    length = np.linalg.norm(centroid)
    if length > 0:
        centroid = centroid / length

    return centroid
|
|
418
|
+
|
|
419
|
+
|
|
420
|
+
# ============================================================================
|
|
421
|
+
# FEATURE 2: Spreading Activation / Co-Activation Reinforcement
|
|
422
|
+
# Adapted from Vestige spreading_activation.rs and ClawMem store.ts
|
|
423
|
+
# Memories retrieved together get co-activation links that boost
|
|
424
|
+
# future retrievals of associated memories.
|
|
425
|
+
# ============================================================================
|
|
426
|
+
|
|
427
|
+
CO_ACTIVATION_DECAY = 0.7
|
|
428
|
+
CO_ACTIVATION_BOOST = 0.05
|
|
429
|
+
CO_ACTIVATION_MIN_STRENGTH = 0.1
|
|
430
|
+
|
|
431
|
+
|
|
432
|
+
def _canonical_co_id(store: str, mid: int) -> int:
    """Return a stable 31-bit ID for the memory ``(store, mid)``.

    The ID is persisted in the ``co_activation`` table, so it must be the same
    in every process. The built-in ``hash()`` is salted per interpreter run
    for ``str`` (hash randomization), which made previously stored links
    unmatchable after a restart; CRC-32 of the UTF-8 key is used instead.

    Args:
        store: Store name, e.g. ``"stm"`` or ``"ltm"``.
        mid: Row ID of the memory inside that store.

    Returns:
        An integer in ``[0, 2**31)``.
    """
    import zlib  # local import: keeps this fix self-contained

    return zlib.crc32(f"{store}:{mid}".encode("utf-8")) % (2**31)
|
|
435
|
+
|
|
436
|
+
|
|
437
|
+
def record_co_activation(memory_ids: list[tuple[str, int]]):
    """Reinforce the co-activation link between every pair of the given memories.

    For each unordered pair, a row keyed by the two canonical IDs (smaller
    first) is inserted with strength 1.0, or — if it already exists — its
    strength is raised by 0.3 (capped at 5.0), its access count incremented
    and its timestamp refreshed. Does nothing for fewer than two memories.

    Args:
        memory_ids: ``(store, id)`` tuples of memories that were retrieved together.
    """
    if len(memory_ids) < 2:
        return

    db = _get_db()
    timestamp = datetime.utcnow().isoformat()
    canon = [_canonical_co_id(store, mid) for store, mid in memory_ids]

    upsert_sql = """
        INSERT INTO co_activation (memory_a_id, memory_b_id, strength, co_access_count, last_co_access)
        VALUES (?, ?, 1.0, 1, ?)
        ON CONFLICT(memory_a_id, memory_b_id) DO UPDATE SET
            strength = MIN(5.0, strength + 0.3),
            co_access_count = co_access_count + 1,
            last_co_access = excluded.last_co_access
    """

    for pos, first in enumerate(canon):
        for second in canon[pos + 1:]:
            # Store each pair in a fixed (low, high) order so A-B and B-A share a row.
            low, high = min(first, second), max(first, second)
            db.execute(upsert_sql, (low, high, timestamp))

    db.commit()
|
|
464
|
+
|
|
465
|
+
|
|
466
|
+
def _get_co_activated_neighbors(memory_ids: list[tuple[str, int]], depth: int = 1) -> dict[int, float]:
    """Compute score boosts for memories linked to *memory_ids* by co-activation.

    Walks the ``co_activation`` graph level by level for ``depth`` hops,
    starting from the canonical IDs of *memory_ids*. A link of strength ``s``
    reached on hop ``h`` (1-based) contributes
    ``s * CO_ACTIVATION_DECAY**h * CO_ACTIVATION_BOOST``; links weaker than
    ``CO_ACTIVATION_MIN_STRENGTH`` are ignored, the starting memories are never
    boosted, and each neighbor keeps the largest boost seen.

    Args:
        memory_ids: ``(store, id)`` tuples to spread from.
        depth: Number of hops to follow.

    Returns:
        Mapping of canonical neighbor ID to its boost.
    """
    db = _get_db()
    boosts = {}

    seeds = {_canonical_co_id(store, mid) for store, mid in memory_ids}
    frontier = list(seeds)

    for hop in range(1, depth + 1):
        hop_decay = CO_ACTIVATION_DECAY ** hop
        discovered = []

        for origin in frontier:
            links = db.execute("""
                SELECT memory_a_id, memory_b_id, strength FROM co_activation
                WHERE (memory_a_id = ? OR memory_b_id = ?) AND strength >= ?
            """, (origin, origin, CO_ACTIVATION_MIN_STRENGTH)).fetchall()

            for link in links:
                # The neighbor is whichever end of the link is not the origin.
                if link["memory_a_id"] == origin:
                    other = link["memory_b_id"]
                else:
                    other = link["memory_a_id"]
                if other in seeds:
                    continue

                candidate = link["strength"] * hop_decay * CO_ACTIVATION_BOOST
                if other in boosts and not boosts[other] < candidate:
                    continue
                boosts[other] = candidate
                discovered.append(other)

        frontier = discovered

    return boosts
|
|
501
|
+
|
|
502
|
+
|
|
503
|
+
# ============================================================================
|
|
504
|
+
# FEATURE 3: Prospective Memory (adapted from Vestige prospective_memory.rs)
|
|
505
|
+
# "Remember to do X when Y happens" — intention-based triggers that fire
|
|
506
|
+
# when incoming text matches a pattern (keyword or semantic).
|
|
507
|
+
# ============================================================================
|
|
508
|
+
|
|
509
|
+
def create_trigger(pattern: str, action: str, context: str = "") -> int:
    """Store a new prospective-memory trigger and return its ID.

    Args:
        pattern: Keyword or phrase to match; several alternatives may be given
            comma-separated (matching is case-insensitive).
        action: What should be done when the trigger fires.
        context: Optional note on why the trigger exists.

    Returns:
        Row ID of the newly inserted trigger.
    """
    db = _get_db()
    insert_sql = "INSERT INTO prospective_triggers (trigger_pattern, action, context) VALUES (?, ?, ?)"
    cursor = db.execute(insert_sql, (pattern, action, context))
    db.commit()
    return cursor.lastrowid
|
|
526
|
+
|
|
527
|
+
|
|
528
|
+
def check_triggers(text: str, use_semantic: bool = False, semantic_threshold: float = 0.7) -> list[dict]:
    """Match *text* against every armed trigger and fire the ones that match.

    A trigger matches when any of its comma-separated keywords occurs in the
    lower-cased text. When ``use_semantic`` is set, triggers that did not match
    by keyword are additionally compared by embedding cosine similarity.
    Matching triggers are marked ``'fired'`` with the current UTC timestamp.

    Args:
        text: Input text to check. Empty or whitespace-only text matches nothing.
        use_semantic: Also try embedding-similarity matching.
        semantic_threshold: Minimum cosine similarity for a semantic match.

    Returns:
        One dict per fired trigger with keys ``id``, ``pattern``, ``action``,
        ``context``, ``match_type`` and ``created_at``.
    """
    if not text or not text.strip():
        return []

    db = _get_db()
    armed = db.execute(
        "SELECT * FROM prospective_triggers WHERE status = 'armed'"
    ).fetchall()
    if not armed:
        return []

    haystack = text.lower()
    # The text embedding is computed once, and only when semantic matching is on.
    text_vec = embed(text) if use_semantic else None

    fired = []
    fired_at = datetime.utcnow().isoformat()

    for trigger in armed:
        match_type = ""

        # Keyword match (comma-separated alternatives, any one suffices)
        keywords = [part.strip() for part in trigger["trigger_pattern"].lower().split(",") if part.strip()]
        matched = any(keyword in haystack for keyword in keywords)
        if matched:
            match_type = "keyword"

        # Semantic match (optional, more expensive)
        if not matched and use_semantic and text_vec is not None:
            pattern_vec = embed(trigger["trigger_pattern"])
            similarity = cosine_similarity(text_vec, pattern_vec)
            if similarity >= semantic_threshold:
                matched = True
                match_type = f"semantic({similarity:.3f})"

        if not matched:
            continue

        db.execute(
            "UPDATE prospective_triggers SET status = 'fired', fired_at = ? WHERE id = ?",
            (fired_at, trigger["id"])
        )
        fired.append({
            "id": trigger["id"],
            "pattern": trigger["trigger_pattern"],
            "action": trigger["action"],
            "context": trigger["context"],
            "match_type": match_type,
            "created_at": trigger["created_at"],
        })

    if fired:
        db.commit()

    return fired
|
|
597
|
+
|
|
598
|
+
|
|
599
|
+
def list_triggers(status: str = "armed") -> list[dict]:
    """Return prospective triggers, newest first.

    Args:
        status: Status to filter on (default ``"armed"``); the special value
            ``"all"`` disables the filter.

    Returns:
        One plain dict per trigger row.
    """
    db = _get_db()
    if status == "all":
        sql = "SELECT * FROM prospective_triggers ORDER BY created_at DESC"
        params = ()
    else:
        sql = "SELECT * FROM prospective_triggers WHERE status = ? ORDER BY created_at DESC"
        params = (status,)
    return [dict(record) for record in db.execute(sql, params).fetchall()]
|
|
610
|
+
|
|
611
|
+
|
|
612
|
+
def delete_trigger(trigger_id: int) -> str:
    """Remove the trigger with the given ID.

    Args:
        trigger_id: ID of the trigger to delete.

    Returns:
        A human-readable message saying whether a row was deleted or not found.
    """
    db = _get_db()
    cursor = db.execute("DELETE FROM prospective_triggers WHERE id = ?", (trigger_id,))
    db.commit()
    outcome = "deleted" if cursor.rowcount else "not found"
    return f"Trigger #{trigger_id} {outcome}."
|
|
618
|
+
|
|
619
|
+
|
|
620
|
+
def rearm_trigger(trigger_id: int) -> str:
    """Set a trigger back to ``'armed'`` and clear its ``fired_at`` timestamp.

    Args:
        trigger_id: ID of the trigger to re-arm.

    Returns:
        A human-readable message saying whether a row was re-armed or not found.
    """
    db = _get_db()
    rearm_sql = "UPDATE prospective_triggers SET status = 'armed', fired_at = NULL WHERE id = ?"
    cursor = db.execute(rearm_sql, (trigger_id,))
    db.commit()
    outcome = "re-armed" if cursor.rowcount else "not found"
    return f"Trigger #{trigger_id} {outcome}."
|
|
629
|
+
|
|
630
|
+
|
|
631
|
+
def _auto_restore_snoozed(db: sqlite3.Connection):
    """Wake up snoozed memories whose ``snooze_until`` is now or in the past.

    In both ``stm_memories`` and ``ltm_memories``, rows in state ``'snoozed'``
    with a non-NULL ``snooze_until`` not later than the current UTC time
    (compared as ISO-8601 strings) are set back to ``'active'`` and their
    ``snooze_until`` is cleared. The change is committed.

    Args:
        db: Open connection to the cognitive database.
    """
    cutoff = datetime.utcnow().isoformat()
    restore_clause = (
        " SET lifecycle_state = 'active', snooze_until = NULL "
        "WHERE lifecycle_state = 'snoozed' AND snooze_until IS NOT NULL AND snooze_until <= ?"
    )
    for table in ("stm_memories", "ltm_memories"):
        db.execute("UPDATE " + table + restore_clause, (cutoff,))
    db.commit()
|
|
641
|
+
|
|
642
|
+
|
|
208
643
|
def search(
|
|
209
644
|
query_text: str,
|
|
210
645
|
top_k: int = 10,
|
|
@@ -212,29 +647,52 @@ def search(
|
|
|
212
647
|
stores: str = "both",
|
|
213
648
|
exclude_dormant: bool = True,
|
|
214
649
|
rehearse: bool = True,
|
|
215
|
-
source_type_filter: str = ""
|
|
650
|
+
source_type_filter: str = "",
|
|
651
|
+
include_archived: bool = False,
|
|
652
|
+
use_hyde: bool = False,
|
|
653
|
+
spreading_depth: int = 0
|
|
216
654
|
) -> list[dict]:
|
|
217
|
-
"""Full vector search across STM and/or LTM with rehearsal and dormant reactivation.
|
|
655
|
+
"""Full vector search across STM and/or LTM with rehearsal and dormant reactivation.
|
|
656
|
+
|
|
657
|
+
Args:
|
|
658
|
+
use_hyde: If True, use HyDE query expansion for richer embedding (default False)
|
|
659
|
+
spreading_depth: If >0, fetch co-activated neighbors and boost their scores (default 0)
|
|
660
|
+
"""
|
|
218
661
|
db = _get_db()
|
|
219
|
-
|
|
662
|
+
if use_hyde:
|
|
663
|
+
query_vec = hyde_expand_query(query_text)
|
|
664
|
+
else:
|
|
665
|
+
query_vec = embed(query_text)
|
|
220
666
|
if np.linalg.norm(query_vec) == 0:
|
|
221
667
|
return []
|
|
222
668
|
|
|
669
|
+
# Auto-restore snoozed memories whose snooze_until has passed
|
|
670
|
+
_auto_restore_snoozed(db)
|
|
671
|
+
|
|
223
672
|
results = []
|
|
224
673
|
reactivated_ids = set()
|
|
225
674
|
|
|
675
|
+
# Lifecycle filter: exclude snoozed always; exclude archived unless requested
|
|
676
|
+
_lc = " AND (lifecycle_state IS NULL OR lifecycle_state = 'active' OR lifecycle_state = 'pinned'"
|
|
677
|
+
if include_archived:
|
|
678
|
+
_lc += " OR lifecycle_state = 'archived'"
|
|
679
|
+
_lc += ")"
|
|
680
|
+
|
|
226
681
|
# Search STM
|
|
227
682
|
if stores in ("both", "stm"):
|
|
228
|
-
where = "WHERE promoted_to_ltm = 0"
|
|
683
|
+
where = "WHERE promoted_to_ltm = 0" + _lc
|
|
684
|
+
params = []
|
|
229
685
|
if source_type_filter:
|
|
230
|
-
where +=
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
rows = db.execute(f"SELECT * FROM stm_memories {where}").fetchall()
|
|
686
|
+
where += " AND source_type = ?"
|
|
687
|
+
params.append(source_type_filter)
|
|
688
|
+
rows = db.execute(f"SELECT * FROM stm_memories {where}", params).fetchall()
|
|
234
689
|
|
|
235
690
|
for row in rows:
|
|
236
691
|
vec = _blob_to_array(row["embedding"])
|
|
237
692
|
score = cosine_similarity(query_vec, vec)
|
|
693
|
+
lifecycle = row["lifecycle_state"] or "active"
|
|
694
|
+
if lifecycle == "pinned":
|
|
695
|
+
score = min(1.0, score + 0.2)
|
|
238
696
|
if score >= min_score:
|
|
239
697
|
results.append({
|
|
240
698
|
"store": "stm",
|
|
@@ -248,20 +706,24 @@ def search(
|
|
|
248
706
|
"strength": row["strength"],
|
|
249
707
|
"access_count": row["access_count"],
|
|
250
708
|
"score": score,
|
|
709
|
+
"lifecycle_state": lifecycle,
|
|
251
710
|
})
|
|
252
711
|
|
|
253
712
|
# Search LTM (active)
|
|
254
713
|
if stores in ("both", "ltm"):
|
|
255
|
-
where = "WHERE is_dormant = 0"
|
|
714
|
+
where = "WHERE is_dormant = 0" + _lc
|
|
715
|
+
params = []
|
|
256
716
|
if source_type_filter:
|
|
257
717
|
where += " AND source_type = ?"
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
rows = db.execute(f"SELECT * FROM ltm_memories {where}").fetchall()
|
|
718
|
+
params.append(source_type_filter)
|
|
719
|
+
rows = db.execute(f"SELECT * FROM ltm_memories {where}", params).fetchall()
|
|
261
720
|
|
|
262
721
|
for row in rows:
|
|
263
722
|
vec = _blob_to_array(row["embedding"])
|
|
264
723
|
score = cosine_similarity(query_vec, vec)
|
|
724
|
+
lifecycle = row["lifecycle_state"] or "active"
|
|
725
|
+
if lifecycle == "pinned":
|
|
726
|
+
score = min(1.0, score + 0.2)
|
|
265
727
|
if score >= min_score:
|
|
266
728
|
results.append({
|
|
267
729
|
"store": "ltm",
|
|
@@ -276,6 +738,7 @@ def search(
|
|
|
276
738
|
"access_count": row["access_count"],
|
|
277
739
|
"score": score,
|
|
278
740
|
"tags": row["tags"],
|
|
741
|
+
"lifecycle_state": lifecycle,
|
|
279
742
|
})
|
|
280
743
|
|
|
281
744
|
# Check dormant LTM for reactivation
|
|
@@ -313,6 +776,78 @@ def search(
|
|
|
313
776
|
results.sort(key=lambda x: x["score"], reverse=True)
|
|
314
777
|
results = results[:top_k]
|
|
315
778
|
|
|
779
|
+
# Spreading activation: boost co-activated neighbors (Feature 2)
|
|
780
|
+
co_activation_applied = False
|
|
781
|
+
if spreading_depth > 0 and results:
|
|
782
|
+
memory_ids = [(r["store"], r["id"]) for r in results]
|
|
783
|
+
neighbor_boosts = _get_co_activated_neighbors(memory_ids, depth=spreading_depth)
|
|
784
|
+
|
|
785
|
+
if neighbor_boosts:
|
|
786
|
+
co_activation_applied = True
|
|
787
|
+
# Boost existing results that are neighbors
|
|
788
|
+
existing_hashes = set()
|
|
789
|
+
for r in results:
|
|
790
|
+
co_hash = _canonical_co_id(r["store"], r["id"])
|
|
791
|
+
existing_hashes.add(co_hash)
|
|
792
|
+
if co_hash in neighbor_boosts:
|
|
793
|
+
boost = neighbor_boosts[co_hash]
|
|
794
|
+
r["score"] = min(1.0, r["score"] + boost)
|
|
795
|
+
r["co_activation_boost"] = boost
|
|
796
|
+
|
|
797
|
+
# Add neighbor memories not already in results
|
|
798
|
+
new_neighbor_hashes = set(neighbor_boosts.keys()) - existing_hashes
|
|
799
|
+
if new_neighbor_hashes:
|
|
800
|
+
for store_name, table in [("stm", "stm_memories"), ("ltm", "ltm_memories")]:
|
|
801
|
+
rows = db.execute(f"SELECT * FROM {table}").fetchall()
|
|
802
|
+
for row in rows:
|
|
803
|
+
nh = _canonical_co_id(store_name, row["id"])
|
|
804
|
+
if nh in new_neighbor_hashes:
|
|
805
|
+
boost = neighbor_boosts[nh]
|
|
806
|
+
results.append({
|
|
807
|
+
"store": store_name,
|
|
808
|
+
"id": row["id"],
|
|
809
|
+
"content": row["content"],
|
|
810
|
+
"source_type": row.get("source_type", ""),
|
|
811
|
+
"source_id": row.get("source_id", ""),
|
|
812
|
+
"tags": row.get("tags", ""),
|
|
813
|
+
"domain": row.get("domain", ""),
|
|
814
|
+
"created_at": row.get("created_at", ""),
|
|
815
|
+
"strength": row.get("strength", 0.0),
|
|
816
|
+
"access_count": row.get("access_count", 0),
|
|
817
|
+
"score": min(1.0, boost),
|
|
818
|
+
"co_activation_boost": boost,
|
|
819
|
+
"lifecycle_state": row.get("lifecycle_state", "active"),
|
|
820
|
+
})
|
|
821
|
+
new_neighbor_hashes.discard(nh)
|
|
822
|
+
|
|
823
|
+
# Re-sort after applying boosts
|
|
824
|
+
results.sort(key=lambda x: x["score"], reverse=True)
|
|
825
|
+
|
|
826
|
+
# Add rank explanations
|
|
827
|
+
for rank, r in enumerate(results, 1):
|
|
828
|
+
score = r["score"]
|
|
829
|
+
store = r["store"].upper()
|
|
830
|
+
strength = r.get("strength", 0.0)
|
|
831
|
+
access_count = r.get("access_count", 0)
|
|
832
|
+
created = r.get("created_at", "")
|
|
833
|
+
tags = r.get("tags", "")
|
|
834
|
+
reactivated = r.get("reactivated", False)
|
|
835
|
+
|
|
836
|
+
ranking_desc = "semantic_similarity"
|
|
837
|
+
if use_hyde:
|
|
838
|
+
ranking_desc = "hyde_centroid_similarity"
|
|
839
|
+
parts = [f"Ranked #{rank}: {ranking_desc}={score:.3f}"]
|
|
840
|
+
parts.append(f"store={store}, strength={strength:.2f}, accesses={access_count}")
|
|
841
|
+
if r.get("co_activation_boost"):
|
|
842
|
+
parts.append(f"co_activation_boost=+{r['co_activation_boost']:.3f}")
|
|
843
|
+
if created:
|
|
844
|
+
parts.append(f"created={created[:10]}")
|
|
845
|
+
if tags:
|
|
846
|
+
parts.append(f"tags={tags}")
|
|
847
|
+
if reactivated:
|
|
848
|
+
parts.append("REACTIVATED (was dormant, score>0.8 triggered revival)")
|
|
849
|
+
r["explanation"] = " | ".join(parts)
|
|
850
|
+
|
|
316
851
|
# Rehearsal: update strength and access_count for returned results
|
|
317
852
|
if rehearse and results:
|
|
318
853
|
now = datetime.utcnow().isoformat()
|
|
@@ -326,6 +861,13 @@ def search(
|
|
|
326
861
|
)
|
|
327
862
|
db.commit()
|
|
328
863
|
|
|
864
|
+
# Record co-activation for future spreading (Feature 2)
|
|
865
|
+
if results and len(results) >= 2:
|
|
866
|
+
try:
|
|
867
|
+
record_co_activation([(r["store"], r["id"]) for r in results])
|
|
868
|
+
except Exception:
|
|
869
|
+
pass # Non-critical — don't break search
|
|
870
|
+
|
|
329
871
|
# Log retrieval
|
|
330
872
|
top_score = results[0]["score"] if results else 0.0
|
|
331
873
|
db.execute(
|
|
@@ -342,19 +884,92 @@ def ingest(
|
|
|
342
884
|
source_type: str,
|
|
343
885
|
source_id: str = "",
|
|
344
886
|
source_title: str = "",
|
|
345
|
-
domain: str = ""
|
|
887
|
+
domain: str = "",
|
|
888
|
+
source: str = "inferred",
|
|
889
|
+
skip_quarantine: bool = False,
|
|
890
|
+
bypass_gate: bool = False,
|
|
891
|
+
bypass_security: bool = False
|
|
346
892
|
) -> int:
|
|
347
|
-
"""Embed and store content
|
|
893
|
+
"""Embed and store content. Routes through quarantine unless skip_quarantine=True or source='user_direct'.
|
|
894
|
+
|
|
895
|
+
Security scan runs FIRST (unless bypass_security=True).
|
|
896
|
+
Prediction Error Gate runs BEFORE storage unless bypass_gate=True.
|
|
897
|
+
If gate rejects (content too similar to existing memory), returns 0.
|
|
898
|
+
If gate says 'refinement', merges into existing memory and returns its ID.
|
|
899
|
+
|
|
900
|
+
Args:
|
|
901
|
+
content: Text content to store
|
|
902
|
+
source_type: Type of source (e.g. 'learning', 'change', 'diary')
|
|
903
|
+
source_id: Optional source identifier
|
|
904
|
+
source_title: Optional title
|
|
905
|
+
domain: Optional domain tag
|
|
906
|
+
source: Origin — 'user_direct', 'inferred', or 'agent_observation'
|
|
907
|
+
skip_quarantine: If True, bypass quarantine and store directly in STM (backward compat)
|
|
908
|
+
bypass_gate: If True, skip prediction error gate and store regardless
|
|
909
|
+
bypass_security: If True, skip security scan (for trusted sources)
|
|
910
|
+
|
|
911
|
+
Returns:
|
|
912
|
+
Row ID (negative if quarantined, 0 if gate-rejected, positive if stored/refined)
|
|
913
|
+
"""
|
|
914
|
+
# Security scan BEFORE prediction error gate (adapted from ShieldCortex pipeline)
|
|
915
|
+
if not bypass_security:
|
|
916
|
+
scan = security_scan(content)
|
|
917
|
+
if scan["risk_score"] >= 0.8:
|
|
918
|
+
# High risk — reject with reason logged
|
|
919
|
+
return 0
|
|
920
|
+
if scan["sanitized_content"] != content:
|
|
921
|
+
# Use sanitized content going forward
|
|
922
|
+
content = scan["sanitized_content"]
|
|
923
|
+
|
|
924
|
+
# Run prediction error gate unless bypassed
|
|
925
|
+
if not bypass_gate:
|
|
926
|
+
should_store, novelty, reason, match = prediction_error_gate(content)
|
|
927
|
+
if not should_store:
|
|
928
|
+
return 0 # Gate rejected — content is redundant
|
|
929
|
+
if reason == "refinement" and match:
|
|
930
|
+
return _refine_memory(match, content)
|
|
931
|
+
|
|
348
932
|
db = _get_db()
|
|
349
|
-
|
|
933
|
+
clean_content = redact_secrets(content)
|
|
934
|
+
was_redacted = 1 if clean_content != content else 0
|
|
935
|
+
vec = embed(clean_content)
|
|
350
936
|
blob = _array_to_blob(vec)
|
|
937
|
+
|
|
938
|
+
# user_direct = fast-track: quarantine then immediate promote
|
|
939
|
+
if source == "user_direct" and not skip_quarantine:
|
|
940
|
+
cur = db.execute(
|
|
941
|
+
"""INSERT INTO quarantine (content, embedding, source, source_type, source_id, source_title, domain, confidence, status, promoted_at)
|
|
942
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, 1.0, 'promoted', datetime('now'))""",
|
|
943
|
+
(clean_content, blob, source, source_type, source_id, source_title, domain)
|
|
944
|
+
)
|
|
945
|
+
db.commit()
|
|
946
|
+
# Now actually store in STM
|
|
947
|
+
cur2 = db.execute(
|
|
948
|
+
"""INSERT INTO stm_memories (content, embedding, source_type, source_id, source_title, domain, redaction_applied)
|
|
949
|
+
VALUES (?, ?, ?, ?, ?, ?, ?)""",
|
|
950
|
+
(clean_content, blob, source_type, source_id, source_title, domain, was_redacted)
|
|
951
|
+
)
|
|
952
|
+
db.commit()
|
|
953
|
+
return cur2.lastrowid
|
|
954
|
+
|
|
955
|
+
# skip_quarantine = direct STM (backward compatibility)
|
|
956
|
+
if skip_quarantine:
|
|
957
|
+
cur = db.execute(
|
|
958
|
+
"""INSERT INTO stm_memories (content, embedding, source_type, source_id, source_title, domain, redaction_applied)
|
|
959
|
+
VALUES (?, ?, ?, ?, ?, ?, ?)""",
|
|
960
|
+
(clean_content, blob, source_type, source_id, source_title, domain, was_redacted)
|
|
961
|
+
)
|
|
962
|
+
db.commit()
|
|
963
|
+
return cur.lastrowid
|
|
964
|
+
|
|
965
|
+
# Route to quarantine
|
|
351
966
|
cur = db.execute(
|
|
352
|
-
"""INSERT INTO
|
|
353
|
-
VALUES (?, ?, ?, ?, ?, ?)""",
|
|
354
|
-
(
|
|
967
|
+
"""INSERT INTO quarantine (content, embedding, source, source_type, source_id, source_title, domain)
|
|
968
|
+
VALUES (?, ?, ?, ?, ?, ?, ?)""",
|
|
969
|
+
(clean_content, blob, source, source_type, source_id, source_title, domain)
|
|
355
970
|
)
|
|
356
971
|
db.commit()
|
|
357
|
-
return cur.lastrowid
|
|
972
|
+
return -cur.lastrowid # Negative = quarantined
|
|
358
973
|
|
|
359
974
|
|
|
360
975
|
def ingest_to_ltm(
|
|
@@ -363,16 +978,31 @@ def ingest_to_ltm(
|
|
|
363
978
|
source_id: str = "",
|
|
364
979
|
source_title: str = "",
|
|
365
980
|
domain: str = "",
|
|
366
|
-
tags: str = ""
|
|
981
|
+
tags: str = "",
|
|
982
|
+
bypass_gate: bool = False
|
|
367
983
|
) -> int:
|
|
368
|
-
"""Embed and store content directly in LTM. Returns row ID.
|
|
984
|
+
"""Embed and store content directly in LTM. Returns row ID.
|
|
985
|
+
|
|
986
|
+
Prediction Error Gate runs BEFORE storage unless bypass_gate=True.
|
|
987
|
+
If gate rejects, returns 0. If refinement, merges and returns existing ID.
|
|
988
|
+
"""
|
|
989
|
+
# Run prediction error gate unless bypassed
|
|
990
|
+
if not bypass_gate:
|
|
991
|
+
should_store, novelty, reason, match = prediction_error_gate(content)
|
|
992
|
+
if not should_store:
|
|
993
|
+
return 0 # Gate rejected
|
|
994
|
+
if reason == "refinement" and match:
|
|
995
|
+
return _refine_memory(match, content)
|
|
996
|
+
|
|
369
997
|
db = _get_db()
|
|
370
|
-
|
|
998
|
+
clean_content = redact_secrets(content)
|
|
999
|
+
was_redacted = 1 if clean_content != content else 0
|
|
1000
|
+
vec = embed(clean_content)
|
|
371
1001
|
blob = _array_to_blob(vec)
|
|
372
1002
|
cur = db.execute(
|
|
373
|
-
"""INSERT INTO ltm_memories (content, embedding, source_type, source_id, source_title, domain, tags)
|
|
374
|
-
VALUES (?, ?, ?, ?, ?, ?, ?)""",
|
|
375
|
-
(
|
|
1003
|
+
"""INSERT INTO ltm_memories (content, embedding, source_type, source_id, source_title, domain, tags, redaction_applied)
|
|
1004
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?)""",
|
|
1005
|
+
(clean_content, blob, source_type, source_id, source_title, domain, tags, was_redacted)
|
|
376
1006
|
)
|
|
377
1007
|
db.commit()
|
|
378
1008
|
return cur.lastrowid
|
|
@@ -383,16 +1013,16 @@ def apply_decay():
|
|
|
383
1013
|
db = _get_db()
|
|
384
1014
|
now = datetime.utcnow()
|
|
385
1015
|
|
|
386
|
-
# STM decay
|
|
387
|
-
rows = db.execute("SELECT id, last_accessed, strength FROM stm_memories WHERE promoted_to_ltm = 0").fetchall()
|
|
1016
|
+
# STM decay (skip pinned)
|
|
1017
|
+
rows = db.execute("SELECT id, last_accessed, strength FROM stm_memories WHERE promoted_to_ltm = 0 AND (lifecycle_state IS NULL OR lifecycle_state != 'pinned')").fetchall()
|
|
388
1018
|
for row in rows:
|
|
389
1019
|
last = datetime.fromisoformat(row["last_accessed"])
|
|
390
1020
|
hours = (now - last).total_seconds() / 3600.0
|
|
391
1021
|
new_strength = row["strength"] * math.exp(-LAMBDA_STM * hours)
|
|
392
1022
|
db.execute("UPDATE stm_memories SET strength = ? WHERE id = ?", (new_strength, row["id"]))
|
|
393
1023
|
|
|
394
|
-
# LTM decay
|
|
395
|
-
rows = db.execute("SELECT id, last_accessed, strength FROM ltm_memories WHERE is_dormant = 0").fetchall()
|
|
1024
|
+
# LTM decay (skip pinned)
|
|
1025
|
+
rows = db.execute("SELECT id, last_accessed, strength FROM ltm_memories WHERE is_dormant = 0 AND (lifecycle_state IS NULL OR lifecycle_state != 'pinned')").fetchall()
|
|
396
1026
|
for row in rows:
|
|
397
1027
|
last = datetime.fromisoformat(row["last_accessed"])
|
|
398
1028
|
hours = (now - last).total_seconds() / 3600.0
|
|
@@ -414,11 +1044,12 @@ def promote_stm_to_ltm():
|
|
|
414
1044
|
|
|
415
1045
|
promoted = 0
|
|
416
1046
|
for row in rows:
|
|
1047
|
+
redacted = row["redaction_applied"] if "redaction_applied" in row.keys() else 0
|
|
417
1048
|
db.execute(
|
|
418
|
-
"""INSERT INTO ltm_memories (content, embedding, source_type, source_id, source_title, domain, original_stm_id)
|
|
419
|
-
VALUES (?, ?, ?, ?, ?, ?, ?)""",
|
|
1049
|
+
"""INSERT INTO ltm_memories (content, embedding, source_type, source_id, source_title, domain, original_stm_id, redaction_applied)
|
|
1050
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?)""",
|
|
420
1051
|
(row["content"], row["embedding"], row["source_type"], row["source_id"],
|
|
421
|
-
row["source_title"], row["domain"], row["id"])
|
|
1052
|
+
row["source_title"], row["domain"], row["id"], redacted)
|
|
422
1053
|
)
|
|
423
1054
|
db.execute("UPDATE stm_memories SET promoted_to_ltm = 1 WHERE id = ?", (row["id"],))
|
|
424
1055
|
promoted += 1
|
|
@@ -456,18 +1087,175 @@ def ingest_sensory(
|
|
|
456
1087
|
) -> int:
|
|
457
1088
|
"""Embed and store a sensory register event in STM with source_type='sensory'."""
|
|
458
1089
|
db = _get_db()
|
|
459
|
-
|
|
1090
|
+
clean_content = redact_secrets(content)
|
|
1091
|
+
was_redacted = 1 if clean_content != content else 0
|
|
1092
|
+
vec = embed(clean_content)
|
|
460
1093
|
blob = _array_to_blob(vec)
|
|
461
1094
|
ts = created_at or datetime.utcnow().isoformat()
|
|
462
1095
|
cur = db.execute(
|
|
463
|
-
"""INSERT INTO stm_memories (content, embedding, source_type, source_id, domain, created_at)
|
|
464
|
-
VALUES (?, ?, 'sensory', ?, ?, ?)""",
|
|
465
|
-
(
|
|
1096
|
+
"""INSERT INTO stm_memories (content, embedding, source_type, source_id, domain, created_at, redaction_applied)
|
|
1097
|
+
VALUES (?, ?, 'sensory', ?, ?, ?, ?)""",
|
|
1098
|
+
(clean_content, blob, source_id, domain, ts, was_redacted)
|
|
466
1099
|
)
|
|
467
1100
|
db.commit()
|
|
468
1101
|
return cur.lastrowid
|
|
469
1102
|
|
|
470
1103
|
|
|
1104
|
+
# ---------------------------------------------------------------------------
|
|
1105
|
+
# Prediction Error Gate — hippocampal novelty filter
|
|
1106
|
+
# ---------------------------------------------------------------------------
|
|
1107
|
+
|
|
1108
|
+
def prediction_error_gate(
    content: str,
    threshold: float = PE_GATE_REJECT,
    refine_threshold: float = PE_GATE_REFINE,
) -> tuple[bool, float, str, Optional[dict]]:
    """Novelty filter applied to content before it is stored as a memory.

    Embeds the first 500 characters of ``content`` and looks for the single
    most similar stored memory among STM rows not yet promoted to LTM and
    LTM rows that are not dormant. The best cosine similarity decides the
    outcome:

    * above ``threshold``: rejected as redundant, unless the sibling check
      says the two texts differ in their discriminating entities, in which
      case the content is accepted as a ``'novel_sibling'``;
    * at or above ``refine_threshold``: accepted as a ``'refinement'`` of
      the matched memory;
    * otherwise: accepted as ``'novel'``.

    Session counters in ``_gate_stats`` are updated for every decision
    except the early rejection of blank or zero-vector input.

    Args:
        content: Text to evaluate.
        threshold: Similarity above which content is considered redundant.
        refine_threshold: Lower bound of the refinement zone.

    Returns:
        ``(should_store, novelty_score, reason, best_match_info)`` where
        ``novelty_score`` is ``1 - best_similarity`` rounded to 4 places and
        ``best_match_info`` describes the closest memory (``None`` when no
        stored memory scored above zero).
    """
    global _gate_stats

    # Blank text cannot be embedded meaningfully.
    if not content or not content.strip():
        return (False, 0.0, "rejected", None)

    query_vec = embed(content[:500])
    if np.linalg.norm(query_vec) == 0:
        return (False, 0.0, "rejected", None)

    db = _get_db()
    closest_score = 0.0
    closest = None

    # (table, store label, additional filter) for each store that is scanned.
    stores = (
        ("stm_memories", "stm", " AND promoted_to_ltm = 0"),
        ("ltm_memories", "ltm", " AND is_dormant = 0"),
    )
    for table, store_name, extra_where in stores:
        candidates = db.execute(
            f"SELECT id, content, embedding, source_type, domain FROM {table} WHERE 1=1{extra_where}"
        ).fetchall()

        for cand in candidates:
            cand_vec = _blob_to_array(cand["embedding"])
            similarity = cosine_similarity(query_vec, cand_vec)
            if similarity > closest_score:
                closest_score = similarity
                closest = {
                    "store": store_name,
                    "id": cand["id"],
                    "content": cand["content"],
                    "source_type": cand["source_type"],
                    "domain": cand["domain"],
                    "similarity": round(similarity, 4),
                }

    novelty_score = round(1.0 - closest_score, 4)

    if closest_score > threshold:
        # A near-duplicate may still be a sibling (e.g. the same fix for a
        # different platform); siblings are kept rather than rejected.
        if closest:
            is_sibling, discriminators = _memories_are_siblings(content, closest["content"])
            if is_sibling:
                _gate_stats["accepted_novel"] += 1
                closest["discriminators"] = discriminators
                return (True, novelty_score, "novel_sibling", closest)

        _gate_stats["rejected"] += 1
        return (False, novelty_score, "rejected", closest)

    if closest_score >= refine_threshold:
        # Similar to an existing memory, but different enough to update it.
        _gate_stats["accepted_refinement"] += 1
        return (True, novelty_score, "refinement", closest)

    # Nothing close enough was found.
    _gate_stats["accepted_novel"] += 1
    return (True, novelty_score, "novel", closest)
|
|
1198
|
+
|
|
1199
|
+
|
|
1200
|
+
def _refine_memory(match_info: dict, new_content: str) -> int:
    """Merge new content into an existing memory (refinement, not replacement).

    The new text is passed through ``redact_secrets`` before anything is
    stored or embedded. ``ingest`` and ``ingest_to_ltm`` call the gate (and
    therefore this function) before they redact, so without this step the
    refinement path would persist unredacted secrets.

    If the redacted text adds fewer than three words that the existing
    memory does not already contain, the memory is only reinforced
    (strength +0.1, access count +1). Otherwise the text is appended under a
    ``[REFINED]:`` marker, the merged text is re-embedded, and strength is
    raised by 0.15. Strength is capped at 1.0 in both cases.

    Args:
        match_info: Dict from prediction_error_gate with store, id, content.
        new_content: The new content that refines the existing memory.

    Returns:
        The ID of the updated memory.
    """
    db = _get_db()
    table = "stm_memories" if match_info["store"] == "stm" else "ltm_memories"
    memory_id = match_info["id"]

    # Redact first: everything below works on the sanitized text only.
    clean_new = redact_secrets(new_content)
    was_redacted = clean_new != new_content

    # Word-level diff to avoid appending near-identical text.
    existing_words = set(match_info["content"].lower().split())
    unique_new = set(clean_new.lower().split()) - existing_words

    now = datetime.utcnow().isoformat()

    if len(unique_new) < 3:
        # Almost nothing new: just strengthen the existing memory.
        db.execute(
            f"UPDATE {table} SET strength = MIN(1.0, strength + 0.1), "
            f"access_count = access_count + 1, last_accessed = ? WHERE id = ?",
            (now, memory_id)
        )
        db.commit()
        return memory_id

    # Append the new content as a refinement and re-embed the merged text.
    merged_content = match_info["content"] + "\n\n[REFINED]: " + clean_new
    new_blob = _array_to_blob(embed(merged_content))

    # Record that redaction happened; never clear a flag that is already set.
    redaction_clause = ", redaction_applied = 1" if was_redacted else ""
    db.execute(
        f"UPDATE {table} SET content = ?, embedding = ?, strength = MIN(1.0, strength + 0.15), "
        f"access_count = access_count + 1, last_accessed = ?{redaction_clause} WHERE id = ?",
        (merged_content, new_blob, now, memory_id)
    )
    db.commit()
    return memory_id
|
|
1245
|
+
|
|
1246
|
+
|
|
1247
|
+
def get_gate_stats() -> dict:
    """Summarize the prediction error gate counters held in ``_gate_stats``.

    Returns:
        Dict with the three raw counters, their sum as ``total_evaluated``,
        and ``rejection_rate_pct`` (rejected share as a percentage rounded
        to one decimal; 0.0 when nothing has been evaluated).
    """
    total = sum(_gate_stats.values())
    rejected = _gate_stats["rejected"]

    if total > 0:
        rejection_rate = round(rejected / total * 100, 1)
    else:
        rejection_rate = 0.0

    return {
        "accepted_novel": _gate_stats["accepted_novel"],
        "accepted_refinement": _gate_stats["accepted_refinement"],
        "rejected": rejected,
        "total_evaluated": total,
        "rejection_rate_pct": rejection_rate,
    }
|
|
1257
|
+
|
|
1258
|
+
|
|
471
1259
|
def detect_patterns(content_vec: np.ndarray, threshold: float = 0.65) -> list[dict]:
|
|
472
1260
|
"""Compare a vector against LTM to find matching patterns (potential repetitions)."""
|
|
473
1261
|
db = _get_db()
|
|
@@ -512,6 +1300,234 @@ def gc_ltm_dormant(min_age_days: int = 30) -> int:
|
|
|
512
1300
|
return cur.rowcount or 0
|
|
513
1301
|
|
|
514
1302
|
|
|
1303
|
+
def _check_quarantine_contradiction(content_vec: np.ndarray) -> list[dict]:
    """Find strong LTM memories that are highly similar to a quarantined vector.

    Scans non-dormant LTM rows with strength above 0.5 and collects every row
    whose cosine similarity to ``content_vec`` is at least 0.8.

    NOTE(review): this function only measures similarity; it performs no
    sentiment or polarity comparison, so a memory that merely restates an
    existing LTM entry is reported the same way as one that opposes it.

    Args:
        content_vec: Embedding of the quarantined content.

    Returns:
        One dict per matching LTM row with ``ltm_id``, the first 200
        characters of ``content``, ``similarity`` (3 decimals) and
        ``strength``. Empty list when nothing matches.
    """
    db = _get_db()
    strong_ltm = db.execute(
        "SELECT id, content, embedding, strength FROM ltm_memories WHERE is_dormant = 0 AND strength > 0.5"
    ).fetchall()

    matches = []
    for memory in strong_ltm:
        similarity = cosine_similarity(content_vec, _blob_to_array(memory["embedding"]))
        if similarity >= 0.8:
            entry = {
                "ltm_id": memory["id"],
                "content": memory["content"][:200],
                "similarity": round(similarity, 3),
                "strength": memory["strength"],
            }
            matches.append(entry)
    return matches
|
|
1322
|
+
|
|
1323
|
+
|
|
1324
|
+
def _check_quarantine_second_occurrence(content_vec: np.ndarray, exclude_id: int) -> bool:
    """Report whether similar content has been seen elsewhere.

    Looks first at other quarantine rows (status 'pending' or 'promoted',
    excluding ``exclude_id``) and then at STM rows not yet promoted to LTM.
    A cosine similarity of at least 0.75 with any of them counts as a
    confirming second occurrence.

    Args:
        content_vec: Embedding of the quarantined item being evaluated.
        exclude_id: Quarantine row ID of that item, so it is not matched
            against itself.

    Returns:
        True if a sufficiently similar row exists, otherwise False.
    """
    db = _get_db()

    def _any_close(rows) -> bool:
        # Stops at the first row that reaches the similarity threshold.
        return any(
            cosine_similarity(content_vec, _blob_to_array(r["embedding"])) >= 0.75
            for r in rows
        )

    quarantine_peers = db.execute(
        "SELECT id, embedding FROM quarantine WHERE id != ? AND status IN ('pending', 'promoted')",
        (exclude_id,)
    ).fetchall()
    if _any_close(quarantine_peers):
        return True

    # STM is only queried when quarantine had no match.
    live_stm = db.execute(
        "SELECT embedding FROM stm_memories WHERE promoted_to_ltm = 0"
    ).fetchall()
    return _any_close(live_stm)
|
|
1348
|
+
|
|
1349
|
+
|
|
1350
|
+
def process_quarantine() -> dict:
    """Process the quarantine queue: promote, reject, or expire pending items.

    Policy, applied to each row with status 'pending' in this order:

    1. Older than 7 days -> status 'expired'.
    2. ``_check_quarantine_contradiction`` returns any match -> status
       'rejected'.
    3. source 'inferred' and confirmed by a second occurrence -> promote.
    4. source 'agent_observation' and at least 24 hours old -> promote.
    5. Anything else stays pending and its ``promotion_checks`` counter is
       incremented. (Items with source 'user_direct' are already stored as
       promoted at ingest time and normally never appear here.)

    Ages are computed on parsed datetimes, not on raw strings. A stored
    timestamp may be space-separated ("YYYY-MM-DD HH:MM:SS") while the
    cutoffs come from ``datetime.isoformat()`` ("...T..."); comparing those
    as text treats every item created on the cutoff's calendar day as older
    than the cutoff.

    Returns:
        Dict with counts: promoted, rejected, expired, still_pending,
        total_processed.
    """
    db = _get_db()
    now = datetime.utcnow()
    expire_cutoff = now - timedelta(days=7)
    age_24h = now - timedelta(hours=24)

    def _parse_created(raw):
        """Return ``raw`` as a naive datetime, or None if it cannot be parsed."""
        try:
            parsed = datetime.fromisoformat(raw)
        except (TypeError, ValueError):
            return None
        offset = parsed.utcoffset()
        if offset is not None:
            # Normalize aware values so they compare with utcnow().
            parsed = (parsed - offset).replace(tzinfo=None)
        return parsed

    pending = db.execute(
        "SELECT * FROM quarantine WHERE status = 'pending'"
    ).fetchall()

    promoted = 0
    rejected = 0
    expired = 0
    still_pending = 0

    for row in pending:
        q_id = row["id"]
        content = row["content"]
        source = row["source"]
        # NOTE(review): naive stored timestamps are assumed to be UTC, which
        # matches utcnow() used for the cutoffs -- confirm against the schema.
        created_dt = _parse_created(row["created_at"])
        content_vec = _blob_to_array(row["embedding"])

        # Expiration takes precedence. An unparseable timestamp never expires
        # or ages an item; it just stays pending.
        if created_dt is not None and created_dt < expire_cutoff:
            db.execute("UPDATE quarantine SET status = 'expired' WHERE id = ?", (q_id,))
            expired += 1
            continue

        # Conflict with strong LTM content blocks promotion.
        contradictions = _check_quarantine_contradiction(content_vec)
        if contradictions:
            db.execute("UPDATE quarantine SET status = 'rejected', promotion_checks = promotion_checks + 1 WHERE id = ?", (q_id,))
            rejected += 1
            continue

        should_promote = False

        if source == "inferred":
            # Promote only once the pattern has been seen a second time.
            if _check_quarantine_second_occurrence(content_vec, q_id):
                should_promote = True

        elif source == "agent_observation":
            # Promote after 24h; the contradiction check already passed.
            if created_dt is not None and created_dt <= age_24h:
                should_promote = True

        if should_promote:
            # The quarantine row does not carry the redaction flag, so the
            # STM copy is written with redaction_applied = 0.
            db.execute(
                """INSERT INTO stm_memories (content, embedding, source_type, source_id, source_title, domain, redaction_applied)
                   VALUES (?, ?, ?, ?, ?, ?, 0)""",
                (content, row["embedding"], row["source_type"], row["source_id"],
                 row["source_title"], row["domain"])
            )
            db.execute(
                "UPDATE quarantine SET status = 'promoted', promoted_at = datetime('now'), confidence = 1.0 WHERE id = ?",
                (q_id,)
            )
            promoted += 1
        else:
            db.execute("UPDATE quarantine SET promotion_checks = promotion_checks + 1 WHERE id = ?", (q_id,))
            still_pending += 1

    db.commit()

    return {
        "promoted": promoted,
        "rejected": rejected,
        "expired": expired,
        "still_pending": still_pending,
        "total_processed": promoted + rejected + expired + still_pending,
    }
|
|
1435
|
+
|
|
1436
|
+
|
|
1437
|
+
def quarantine_list(status: str = "pending", limit: int = 20) -> list[dict]:
    """Return quarantine entries, newest first, optionally filtered by status.

    Args:
        status: 'pending', 'promoted', 'rejected', 'expired', or 'all' to
            disable the status filter.
        limit: Maximum number of rows to return.

    Returns:
        One dict per row; ``content`` is truncated to its first 200
        characters.
    """
    db = _get_db()

    if status == "all":
        query = "SELECT * FROM quarantine ORDER BY created_at DESC LIMIT ?"
        params = (limit,)
    else:
        query = "SELECT * FROM quarantine WHERE status = ? ORDER BY created_at DESC LIMIT ?"
        params = (status, limit)

    return [
        {
            "id": item["id"],
            "content": item["content"][:200],
            "source": item["source"],
            "source_type": item["source_type"],
            "domain": item["domain"],
            "confidence": item["confidence"],
            "promotion_checks": item["promotion_checks"],
            "status": item["status"],
            "created_at": item["created_at"],
            "promoted_at": item["promoted_at"],
        }
        for item in db.execute(query, params).fetchall()
    ]
|
|
1470
|
+
|
|
1471
|
+
|
|
1472
|
+
def quarantine_promote(quarantine_id: int) -> str:
    """Manually move one quarantine entry into STM.

    The entry's content and stored embedding are copied into
    ``stm_memories`` (with ``redaction_applied`` written as 0) and the
    quarantine row is marked 'promoted' with confidence 1.0. Only an
    already-promoted entry is refused; any other status can be promoted.

    Args:
        quarantine_id: ID of the quarantine entry to promote.

    Returns:
        A human-readable status message (error text if the ID is unknown).
    """
    db = _get_db()
    item = db.execute("SELECT * FROM quarantine WHERE id = ?", (quarantine_id,)).fetchone()

    if item is None:
        return f"ERROR: Quarantine item #{quarantine_id} not found."
    if item["status"] == "promoted":
        return f"Quarantine item #{quarantine_id} is already promoted."

    # Copy the entry into STM, reusing the embedding computed at ingest time.
    stm_values = (
        item["content"], item["embedding"], item["source_type"],
        item["source_id"], item["source_title"], item["domain"],
    )
    db.execute(
        """INSERT INTO stm_memories (content, embedding, source_type, source_id, source_title, domain, redaction_applied)
           VALUES (?, ?, ?, ?, ?, ?, 0)""",
        stm_values
    )
    db.execute(
        "UPDATE quarantine SET status = 'promoted', promoted_at = datetime('now'), confidence = 1.0 WHERE id = ?",
        (quarantine_id,)
    )
    db.commit()
    return f"Quarantine item #{quarantine_id} promoted to STM."
|
|
1498
|
+
|
|
1499
|
+
|
|
1500
|
+
def quarantine_reject(quarantine_id: int, reason: str = "") -> str:
    """Manually mark one quarantine entry as rejected.

    Entries that are already promoted or rejected are left untouched.
    ``reason`` is only echoed in the returned message; it is not written to
    the database.

    Args:
        quarantine_id: ID of the quarantine entry to reject.
        reason: Optional rejection reason.

    Returns:
        A human-readable status message (error text if the ID is unknown).
    """
    db = _get_db()
    item = db.execute("SELECT * FROM quarantine WHERE id = ?", (quarantine_id,)).fetchone()

    if item is None:
        return f"ERROR: Quarantine item #{quarantine_id} not found."

    current_status = item["status"]
    if current_status in ("promoted", "rejected"):
        return f"Quarantine item #{quarantine_id} is already {current_status}."

    db.execute("UPDATE quarantine SET status = 'rejected' WHERE id = ?", (quarantine_id,))
    db.commit()

    reason_suffix = " Reason: " + reason if reason else ""
    return f"Quarantine item #{quarantine_id} rejected.{reason_suffix}"
|
|
1517
|
+
|
|
1518
|
+
|
|
1519
|
+
def quarantine_stats() -> dict:
    """Count quarantine rows per status.

    Returns:
        Dict with one count each for 'pending', 'promoted', 'rejected' and
        'expired', plus 'total' as the sum of those four.
    """
    db = _get_db()
    tracked_statuses = ("pending", "promoted", "rejected", "expired")

    counts = {
        name: db.execute(
            "SELECT COUNT(*) FROM quarantine WHERE status = ?", (name,)
        ).fetchone()[0]
        for name in tracked_statuses
    }
    counts["total"] = sum(counts.values())
    return counts
|
|
1529
|
+
|
|
1530
|
+
|
|
515
1531
|
def format_results(results: list[dict]) -> str:
|
|
516
1532
|
"""Format search results with enriched context."""
|
|
517
1533
|
if not results:
|
|
@@ -541,7 +1557,9 @@ def format_results(results: list[dict]) -> str:
|
|
|
541
1557
|
|
|
542
1558
|
store_tag = r["store"].upper()
|
|
543
1559
|
reactivated = " [REACTIVATED]" if r.get("reactivated") else ""
|
|
544
|
-
|
|
1560
|
+
explanation = r.get("explanation", "")
|
|
1561
|
+
explain_line = f"\n ⚙ {explanation}" if explanation else ""
|
|
1562
|
+
lines.append(f"{header} [{store_tag}]{reactivated}\n {preview}{explain_line}")
|
|
545
1563
|
|
|
546
1564
|
# Sibling mention: if this LTM memory has siblings, note them
|
|
547
1565
|
if r["store"] == "ltm":
|
|
@@ -938,7 +1956,7 @@ def resolve_dissonance(memory_id: int, resolution: str, context: str = "") -> st
|
|
|
938
1956
|
Args:
|
|
939
1957
|
memory_id: The LTM memory that conflicts with the new instruction
|
|
940
1958
|
resolution: One of:
|
|
941
|
-
- 'paradigm_shift':
|
|
1959
|
+
- 'paradigm_shift': the user changed his mind permanently. Decay old memory,
|
|
942
1960
|
new instruction becomes the standard.
|
|
943
1961
|
- 'exception': This is a one-time override. Keep old memory as standard.
|
|
944
1962
|
- 'override': Old memory was wrong. Mark as corrupted and decay to dormant.
|
|
@@ -1037,7 +2055,7 @@ def check_correction_fatigue() -> list[dict]:
|
|
|
1037
2055
|
|
|
1038
2056
|
|
|
1039
2057
|
def detect_sentiment(text: str) -> dict:
|
|
1040
|
-
"""Analyze user text for sentiment signals.
|
|
2058
|
+
"""Analyze the user's text for sentiment signals.
|
|
1041
2059
|
|
|
1042
2060
|
Returns detected sentiment, intensity, and action guidance for NEXO.
|
|
1043
2061
|
Not a model — keyword + heuristic based. Fast and deterministic.
|
|
@@ -1076,17 +2094,17 @@ def detect_sentiment(text: str) -> dict:
|
|
|
1076
2094
|
sentiment = "negative"
|
|
1077
2095
|
intensity = min(1.0, 0.3 + neg_score * 0.15)
|
|
1078
2096
|
if intensity > 0.7:
|
|
1079
|
-
guidance = "MODE: Ultra-
|
|
2097
|
+
guidance = "MODE: Ultra-conciso. Cero explicaciones. Resolver y mostrar resultado."
|
|
1080
2098
|
else:
|
|
1081
|
-
guidance = "MODE:
|
|
2099
|
+
guidance = "MODE: Conciso. Menos contexto, más acción directa."
|
|
1082
2100
|
elif pos_score > neg_score and pos_score >= 1:
|
|
1083
2101
|
sentiment = "positive"
|
|
1084
2102
|
intensity = min(1.0, 0.3 + pos_score * 0.15)
|
|
1085
|
-
guidance = "MODE: Normal.
|
|
2103
|
+
guidance = "MODE: Normal. Buen momento para proponer ideas de backlog o mejoras."
|
|
1086
2104
|
elif urgency_hits:
|
|
1087
2105
|
sentiment = "urgent"
|
|
1088
2106
|
intensity = 0.8
|
|
1089
|
-
guidance = "MODE:
|
|
2107
|
+
guidance = "MODE: Acción inmediata. Sin preámbulos."
|
|
1090
2108
|
else:
|
|
1091
2109
|
sentiment = "neutral"
|
|
1092
2110
|
intensity = 0.5
|
|
@@ -1101,7 +2119,7 @@ def detect_sentiment(text: str) -> dict:
|
|
|
1101
2119
|
|
|
1102
2120
|
|
|
1103
2121
|
def log_sentiment(text: str) -> dict:
|
|
1104
|
-
"""Detect and log user sentiment. Returns the detection result."""
|
|
2122
|
+
"""Detect and log the user's sentiment. Returns the detection result."""
|
|
1105
2123
|
result = detect_sentiment(text)
|
|
1106
2124
|
if result["sentiment"] != "neutral":
|
|
1107
2125
|
db = _get_db()
|
|
@@ -1190,6 +2208,155 @@ def get_trust_history(days: int = 7) -> dict:
|
|
|
1190
2208
|
}
|
|
1191
2209
|
|
|
1192
2210
|
|
|
2211
|
+
def dream_cycle(max_insights: int = 50) -> dict:
    """Memory Dreaming — discover hidden connections between recent memories.

    Retrieves memories accessed in the last 24h (STM + LTM), finds pairs with
    moderate similarity (0.4-0.7 — related but not duplicates), and creates
    'dream_insight' LTM memories linking them. Skips pairs already dreamed about.

    Uses pure vector math — no LLM calls.

    Args:
        max_insights: Upper bound on insights created in one run. Values
            below zero are treated as zero.

    Returns:
        Dict with 'insights_created' count, 'insights' list of details,
        'memories_scanned' and 'candidates_found'.

    NOTE(review): dreamed_pairs is keyed by bare memory ids with no store
    column, so an STM row and an LTM row that happen to share an id cannot be
    told apart by the "already dreamed" check. Fixing that needs a schema
    change and is deliberately left untouched here.
    """
    db = _get_db()
    cutoff_24h = (datetime.utcnow() - timedelta(hours=24)).isoformat()

    # 1. Gather all memories accessed in the last 24 hours
    stm_rows = db.execute(
        """SELECT id, content, embedding, source_type, source_title, domain, 'stm' as store
           FROM stm_memories
           WHERE last_accessed >= ? AND promoted_to_ltm = 0""",
        (cutoff_24h,)
    ).fetchall()

    ltm_rows = db.execute(
        """SELECT id, content, embedding, source_type, source_title, domain, 'ltm' as store
           FROM ltm_memories
           WHERE last_accessed >= ? AND is_dormant = 0""",
        (cutoff_24h,)
    ).fetchall()

    recent_memories = [
        {
            "id": row["id"],
            "content": row["content"],
            "vec": _blob_to_array(row["embedding"]),
            "source_type": row["source_type"],
            "source_title": row["source_title"] or "",
            "domain": row["domain"] or "",
            "store": row["store"],
        }
        for row in stm_rows + ltm_rows
    ]

    if len(recent_memories) < 2:
        return {
            "insights_created": 0,
            "insights": [],
            "memories_scanned": len(recent_memories),
            "candidates_found": 0,
        }

    # 2. Get already-dreamed pairs to skip (stored in both orders so a single
    #    membership test is enough later on)
    dreamed = set()
    for row in db.execute("SELECT memory_a_id, memory_b_id FROM dreamed_pairs").fetchall():
        dreamed.add((row["memory_a_id"], row["memory_b_id"]))
        dreamed.add((row["memory_b_id"], row["memory_a_id"]))

    # 3. Batch compute all pairwise cosine similarities
    n = len(recent_memories)
    vecs = np.array([m["vec"] for m in recent_memories], dtype=np.float32)
    norms = np.linalg.norm(vecs, axis=1, keepdims=True)
    norms[norms == 0] = 1.0  # avoid division by zero
    normalized = vecs / norms
    sim_matrix = normalized @ normalized.T  # (n x n) cosine similarity matrix

    # 4. Find pairs in the sweet spot (0.4-0.7) — related but not duplicates
    candidate_pairs = []
    for i in range(n):
        for j in range(i + 1, n):
            score = float(sim_matrix[i, j])
            if not 0.4 <= score <= 0.7:
                continue
            if (recent_memories[i]["id"], recent_memories[j]["id"]) in dreamed:
                continue
            candidate_pairs.append((i, j, score))

    # Sort by similarity descending (strongest connections first)
    candidate_pairs.sort(key=lambda x: x[2], reverse=True)

    # 5. Generate insights (capped at max_insights). Clamp at zero: a negative
    #    cap would otherwise slice "everything except the last k" candidates.
    limit = max(0, max_insights)
    insights = []
    for i, j, score in candidate_pairs[:limit]:
        mem_a = recent_memories[i]
        mem_b = recent_memories[j]

        # Build titles — use source_title if available, else first 60 chars of content
        title_a = mem_a["source_title"] or mem_a["content"][:60].replace("\n", " ").strip()
        title_b = mem_b["source_title"] or mem_b["content"][:60].replace("\n", " ").strip()

        # Build domain context
        domains = set(filter(None, [mem_a["domain"], mem_b["domain"]]))
        domain_str = ", ".join(domains) if domains else "general"

        # Create insight content
        insight_content = (
            f"[Dream Insight] Connection found between:\n"
            f"  A: {title_a}\n"
            f"  B: {title_b}\n"
            f"Similarity: {score:.3f} | Domains: {domain_str}\n"
            f"These memories appeared together in the same 24h window and share moderate semantic overlap, "
            f"suggesting a potential relationship worth investigating."
        )

        # Create embedding as average of the two source vectors (midpoint in vector space)
        insight_vec = (mem_a["vec"] + mem_b["vec"]) / 2.0
        insight_vec = insight_vec / (np.linalg.norm(insight_vec) or 1.0)  # re-normalize
        blob = _array_to_blob(insight_vec)

        # Store as LTM with dream_insight tag
        cur = db.execute(
            """INSERT INTO ltm_memories (content, embedding, source_type, source_id, source_title, domain, tags, strength)
               VALUES (?, ?, 'dream_insight', ?, ?, ?, 'dream_insight', 0.5)""",
            (insight_content, blob,
             f"{mem_a['store']}:{mem_a['id']},{mem_b['store']}:{mem_b['id']}",
             f"Dream: {title_a[:30]} <-> {title_b[:30]}",
             domain_str)
        )
        insight_id = cur.lastrowid

        # Track the dreamed pair (ids stored in ascending order)
        a_id, b_id = mem_a["id"], mem_b["id"]
        try:
            db.execute(
                "INSERT OR IGNORE INTO dreamed_pairs (memory_a_id, memory_b_id, insight_id) VALUES (?, ?, ?)",
                (min(a_id, b_id), max(a_id, b_id), insight_id)
            )
        except sqlite3.Error:
            # Best effort: failing to record the pair must not discard the
            # insight that was just created.
            pass

        insights.append({
            "insight_id": insight_id,
            "title_a": title_a[:80],
            "title_b": title_b[:80],
            "similarity": round(score, 4),
            "domain": domain_str,
        })

    db.commit()

    return {
        "insights_created": len(insights),
        "insights": insights,
        "memories_scanned": len(recent_memories),
        "candidates_found": len(candidate_pairs),
    }
|
|
2358
|
+
|
|
2359
|
+
|
|
1193
2360
|
def get_stats() -> dict:
|
|
1194
2361
|
"""Return statistics about the cognitive memory system."""
|
|
1195
2362
|
db = _get_db()
|
|
@@ -1211,6 +2378,9 @@ def get_stats() -> dict:
|
|
|
1211
2378
|
"SELECT domain, COUNT(*) as cnt FROM ltm_memories WHERE is_dormant = 0 AND domain != '' GROUP BY domain ORDER BY cnt DESC LIMIT 5"
|
|
1212
2379
|
).fetchall()
|
|
1213
2380
|
|
|
2381
|
+
# Quarantine stats
|
|
2382
|
+
q_stats = quarantine_stats()
|
|
2383
|
+
|
|
1214
2384
|
return {
|
|
1215
2385
|
"stm_active": stm_active,
|
|
1216
2386
|
"ltm_active": ltm_active,
|
|
@@ -1221,4 +2391,360 @@ def get_stats() -> dict:
|
|
|
1221
2391
|
"avg_retrieval_score": round(avg_retrieval_score, 3),
|
|
1222
2392
|
"top_domains_stm": [(r["domain"], r["cnt"]) for r in top_domains_stm],
|
|
1223
2393
|
"top_domains_ltm": [(r["domain"], r["cnt"]) for r in top_domains_ltm],
|
|
2394
|
+
"quarantine": q_stats,
|
|
2395
|
+
"prediction_error_gate": get_gate_stats(),
|
|
1224
2396
|
}
|
|
2397
|
+
|
|
2398
|
+
def set_lifecycle(memory_id: int, state: str, store: str = "auto", snooze_until: str = "") -> str:
    """Move a memory into a new lifecycle state and report the outcome.

    Args:
        memory_id: Memory ID
        state: One of 'active', 'pinned', 'snoozed', 'archived'
        store: 'stm', 'ltm', or 'auto' (STM is checked first, then LTM)
        snooze_until: Mandatory when state is 'snoozed' — ISO date string
            (YYYY-MM-DD or full datetime). Ignored for every other state,
            where the stored snooze_until is cleared.

    Returns:
        A human-readable message: either a validation/lookup error or a
        confirmation of the change.
    """
    allowed_states = ("active", "pinned", "snoozed", "archived")
    if state not in allowed_states:
        return f"Invalid state: {state}. Must be active, pinned, snoozed, or archived."

    snoozing = state == "snoozed"
    if snoozing and not snooze_until:
        return "snooze_until is required when setting state to 'snoozed'."

    # The connection is opened before the store argument is validated,
    # exactly as before.
    db = _get_db()

    tables_by_store = {
        "auto": ["stm_memories", "ltm_memories"],
        "stm": ["stm_memories"],
        "ltm": ["ltm_memories"],
    }
    candidate_tables = tables_by_store.get(store)
    if candidate_tables is None:
        return f"Invalid store: {store}. Must be stm, ltm, or auto."

    # First table (in order) that actually holds this id, or None.
    target_table = next(
        (
            name
            for name in candidate_tables
            if db.execute(f"SELECT id FROM {name} WHERE id = ?", (memory_id,)).fetchone()
        ),
        None,
    )
    if target_table is None:
        return f"Memory #{memory_id} not found in {store}."

    db.execute(
        f"UPDATE {target_table} SET lifecycle_state = ?, snooze_until = ? WHERE id = ?",
        (state, snooze_until if snoozing else None, memory_id)
    )
    db.commit()

    label = "STM" if target_table == "stm_memories" else "LTM"
    suffix = f" until {snooze_until}" if snoozing else ""
    return f"Memory #{memory_id} ({label}) → {state}{suffix}"
|
|
2447
|
+
|
|
2448
|
+
|
|
2449
|
+
# ---------------------------------------------------------------------------
|
|
2450
|
+
# Feature 1: Auto-Merge Duplicates
|
|
2451
|
+
# Inspired by Vestige's union-find clustering and claude-cortex's Jaccard
|
|
2452
|
+
# similarity merge. Runs during sleep cycle AFTER dream_cycle.
|
|
2453
|
+
# ---------------------------------------------------------------------------
|
|
2454
|
+
|
|
2455
|
+
def auto_merge_duplicates(threshold: float = 0.92) -> dict:
    """Auto-merge near-duplicate LTM memories with cosine similarity >= threshold.

    Unlike consolidate_semantic (threshold=0.9, runs during decay), this uses a
    higher threshold (0.92) and is designed for the sleep cycle. It respects
    sibling detection: memories with differing discriminating entities are never
    merged, even at 0.99 similarity.

    Merge strategy (adapted from claude-cortex):
    - Keep the longer memory; on equal length keep the one accessed more often
    - Append the shorter one's content when it has >5 unique words
    - Re-embed merged content
    - Sum access_count from both
    - Delete the duplicate
    - Log every merge for audit

    Args:
        threshold: Minimum cosine similarity for two memories to be merged.

    Returns:
        Dict with scanned, merged, kept counts and merge_log details.
    """
    db = _get_db()
    rows = db.execute(
        "SELECT id, content, embedding, source_type, domain, access_count, strength, tags "
        "FROM ltm_memories WHERE is_dormant = 0 AND "
        "(lifecycle_state IS NULL OR lifecycle_state = 'active')"
    ).fetchall()

    if len(rows) < 2:
        return {"scanned": len(rows), "merged": 0, "kept": len(rows), "merge_log": []}

    # Build memory list with vectors (batch load like dream_cycle)
    memories = [
        {
            "id": row["id"],
            "content": row["content"],
            "vec": _blob_to_array(row["embedding"]),
            "source_type": row["source_type"],
            "domain": row["domain"] or "",
            # A NULL counter would break the summation further down.
            "access_count": row["access_count"] or 0,
            "strength": row["strength"],
            "tags": row["tags"] or "",
        }
        for row in rows
    ]

    n = len(memories)

    # Batch cosine similarity matrix (same approach as dream_cycle).
    # NOTE: the matrix is computed once from the stored embeddings; it is not
    # refreshed after a survivor is re-embedded during this pass.
    vecs = np.array([m["vec"] for m in memories], dtype=np.float32)
    norms = np.linalg.norm(vecs, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    normalized = vecs / norms
    sim_matrix = normalized @ normalized.T

    merged_ids = set()
    merge_log = []

    for i in range(n):
        if memories[i]["id"] in merged_ids:
            continue
        for j in range(i + 1, n):
            if memories[j]["id"] in merged_ids:
                continue

            score = float(sim_matrix[i, j])
            if score < threshold:
                continue

            # Sibling check — never merge if discriminating entities differ
            is_sibling, _discriminators = _memories_are_siblings(
                memories[i]["content"], memories[j]["content"]
            )
            if is_sibling:
                continue

            # Domain compatibility check: two different non-empty domains never merge
            if memories[i]["domain"] and memories[j]["domain"]:
                if memories[i]["domain"] != memories[j]["domain"]:
                    continue

            # Determine keep vs drop: longer content wins, access_count only
            # breaks length ties (a shorter memory must not win just because
            # it was accessed more).
            first, second = memories[i], memories[j]
            first_rank = (len(first["content"]), first["access_count"])
            second_rank = (len(second["content"]), second["access_count"])
            if first_rank >= second_rank:
                keep, drop = first, second
            else:
                keep, drop = second, first

            # Merge content: append unique info from drop (Jaccard-style word diff)
            keep_words = set(keep["content"].lower().split())
            drop_words = set(drop["content"].lower().split())
            unique_words = drop_words - keep_words
            appended = len(unique_words) > 5

            new_content = keep["content"]
            if appended:
                new_content = keep["content"] + "\n\n[AUTO-MERGED]: " + drop["content"]

            # Re-embed merged content
            new_vec = embed(new_content)
            new_blob = _array_to_blob(new_vec)

            # Merge tags
            keep_tags = set(filter(None, keep["tags"].split(",")))
            drop_tags = set(filter(None, drop["tags"].split(",")))
            merged_tags = ",".join(sorted(keep_tags | drop_tags))

            # Update keep, delete drop
            new_access = keep["access_count"] + drop["access_count"]
            db.execute(
                "UPDATE ltm_memories SET content = ?, embedding = ?, "
                "access_count = ?, tags = ?, strength = MIN(1.0, strength + 0.1) WHERE id = ?",
                (new_content, new_blob, new_access, merged_tags, keep["id"])
            )
            db.execute("DELETE FROM ltm_memories WHERE id = ?", (drop["id"],))
            merged_ids.add(drop["id"])

            merge_log.append({
                "kept_id": keep["id"],
                "dropped_id": drop["id"],
                "similarity": round(score, 4),
                "unique_words_appended": len(unique_words) if appended else 0,
                "kept_preview": keep["content"][:80],
                "dropped_preview": drop["content"][:80],
            })

            # Mirror the DB write in memory so a later merge into the same
            # survivor builds on the merged state instead of overwriting it
            # with the pre-merge content / counter / tags.
            keep["content"] = new_content
            keep["vec"] = new_vec
            keep["access_count"] = new_access
            keep["tags"] = merged_tags

            # The outer memory itself was just deleted: stop pairing it with
            # anything else, otherwise a later match could "merge" a live
            # memory into a row that no longer exists and delete it.
            if drop is first:
                break

    if merge_log:
        db.commit()

    return {
        "scanned": n,
        "merged": len(merge_log),
        "kept": n - len(merge_log),
        "merge_log": merge_log,
    }
|
|
2587
|
+
|
|
2588
|
+
|
|
2589
|
+
# ---------------------------------------------------------------------------
|
|
2590
|
+
# Feature 2: Security Pipeline (Memory Poisoning Defense)
|
|
2591
|
+
# Adapted from ShieldCortex's 6-layer defence pipeline:
|
|
2592
|
+
# - instruction-detector.ts → pattern groups with weights
|
|
2593
|
+
# - encoding-detector.ts → base64, homoglyphs, invisible chars
|
|
2594
|
+
# - credential-leak scanner → reuses existing redact_secrets()
|
|
2595
|
+
# ---------------------------------------------------------------------------
|
|
2596
|
+
|
|
2597
|
+
# Injection patterns (adapted from ShieldCortex instruction-detector.ts).
# Each entry is a (compiled regex, category label, weight) tuple. security_scan
# flags every pattern that matches, uses the highest matching weight as the
# base of its risk score, and replaces the matched text with "[SANITIZED]".
_INJECTION_PATTERNS = [
    # System prompt markers (weight 0.9) — chat-template control tokens
    (re.compile(r'\[SYSTEM:', re.IGNORECASE), "system_prompt_marker", 0.9),
    (re.compile(r'<<SYS>>', re.IGNORECASE), "system_prompt_marker", 0.9),
    (re.compile(r'\[INST\]', re.IGNORECASE), "system_prompt_marker", 0.9),
    (re.compile(r'<\|im_start\|>', re.IGNORECASE), "system_prompt_marker", 0.9),
    (re.compile(r'<\|system\|>', re.IGNORECASE), "system_prompt_marker", 0.9),
    # MULTILINE so "SYSTEM:" is caught at the start of any line, not only the first
    (re.compile(r'^SYSTEM\s*:', re.IGNORECASE | re.MULTILINE), "system_prompt_marker", 0.9),

    # Hidden instructions (weight 0.8) — attempts to discard prior context
    (re.compile(r'ignore\s+(all\s+)?previous\s+(instructions?|prompts?|context)', re.IGNORECASE), "hidden_instruction", 0.8),
    (re.compile(r'forget\s+everything', re.IGNORECASE), "hidden_instruction", 0.8),
    (re.compile(r'new\s+instructions?\s*:', re.IGNORECASE), "hidden_instruction", 0.8),
    (re.compile(r'you\s+are\s+now\b', re.IGNORECASE), "hidden_instruction", 0.8),
    (re.compile(r'disregard\s+(all\s+)?(previous|above|prior)', re.IGNORECASE), "hidden_instruction", 0.8),
    (re.compile(r'override\s+(previous|all|system)', re.IGNORECASE), "hidden_instruction", 0.8),

    # Memory manipulation (weight 0.7) — content asking to be persisted
    (re.compile(r'save\s+(this\s+)?to\s+memory', re.IGNORECASE), "memory_manipulation", 0.7),
    (re.compile(r'remember\s+this\s+(instruction|command|rule)', re.IGNORECASE), "memory_manipulation", 0.7),
    (re.compile(r'from\s+now\s+on\s*(,\s*)?always', re.IGNORECASE), "memory_manipulation", 0.7),
    (re.compile(r'inject\s+(into\s+)?memory', re.IGNORECASE), "memory_manipulation", 0.7),

    # Behavioral modification (weight 0.7)
    (re.compile(r'your\s+new\s+rule\s+is', re.IGNORECASE), "behavioral_mod", 0.7),
    (re.compile(r'always\s+respond\s+with', re.IGNORECASE), "behavioral_mod", 0.7),
    (re.compile(r'when\s+(the\s+)?user\s+asks', re.IGNORECASE), "behavioral_mod", 0.7),

    # Delimiter attacks (weight 0.75) — a keyword hidden after a run of blank
    # lines, or inside an HTML comment. The bounded {0,500} / {0,200} windows
    # cap how far each pattern can scan.
    (re.compile(r'\n{5,}[\s\S]{0,500}\b(instruction|command|system|ignore)\b', re.IGNORECASE), "delimiter_attack", 0.75),
    (re.compile(r'<!--[\s\S]{0,200}?(instruction|command|system|ignore|inject|override)[\s\S]{0,200}?-->', re.IGNORECASE), "delimiter_attack", 0.75),
]

# Max content length (in characters) that security_scan runs detection on;
# bounds regex work to prevent ReDoS (adapted from ShieldCortex).
_MAX_SECURITY_SCAN_LENGTH = 50000
|
|
2633
|
+
|
|
2634
|
+
|
|
2635
|
+
# Detection regexes for security_scan, compiled once at import time.
# Base64 blocks of at least 100 characters (25 groups of 4), optional padding.
_B64_BLOCK_RE = re.compile(r'(?:[A-Za-z0-9+/]{4}){25,}(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?')
# Characters outside printable ASCII, used to measure how "text-like" a decoded payload is.
_NON_PRINTABLE_RE = re.compile(r'[^\x20-\x7E]')
# Zero-width / invisible characters (from ShieldCortex).
_INVISIBLE_CHARS_RE = re.compile(r'[\u200B\u200C\u200D\uFEFF\u202E]')
# Cyrillic characters that look like Latin letters (from ShieldCortex).
_CYRILLIC_HOMOGLYPH_RE = re.compile(
    r'[\u0430\u0435\u043E\u0440\u0441\u0443\u0445\u0410\u0412\u0415\u041A\u041C\u041D\u041E\u0420\u0421\u0422\u0423\u0425]'
)
# Phrases that try to alter behavior; each hit is scored with weight 0.4.
_BEHAVIORAL_PATTERNS = [
    (re.compile(r'\balways\s+do\b', re.IGNORECASE), "behavioral:always_do"),
    (re.compile(r'\bnever\s+do\b', re.IGNORECASE), "behavioral:never_do"),
    (re.compile(r'\byour\s+new\s+rule\b', re.IGNORECASE), "behavioral:new_rule"),
    (re.compile(r'\byou\s+must\s+always\b', re.IGNORECASE), "behavioral:must_always"),
    (re.compile(r'\bchange\s+your\s+behavior\b', re.IGNORECASE), "behavioral:change_behavior"),
]


def security_scan(content: str) -> dict:
    """Security scan for memory poisoning defense.

    Adapted from ShieldCortex's 6-layer defence pipeline. Checks:
    1. Injection patterns — flagged, scored, and replaced with "[SANITIZED]"
    2. Encoding/obfuscation — base64 blocks, invisible chars, Cyrillic homoglyphs
    3. Behavioral scoring — content trying to modify NEXO behavior
    4. Credential detection — reuses existing redact_secrets(); flagged and
       redacted but does not raise the risk score

    Detection runs on at most the first _MAX_SECURITY_SCAN_LENGTH characters.
    NOTE(review): text beyond that limit is returned in sanitized_content
    without having been inspected by the detection layers.

    Args:
        content: Text content to scan

    Returns:
        Dict with safe (bool, True when risk_score < 0.5), flags (list of str),
        sanitized_content (str), risk_score (float 0-1, rounded to 3 places).
        Empty or whitespace-only content yields a safe result with an empty
        sanitized_content.
    """
    if not content or not content.strip():
        return {"safe": True, "flags": [], "sanitized_content": "", "risk_score": 0.0}

    flags = []
    weights = []  # one entry per scored finding
    sanitized = content

    # Truncate for safety (ShieldCortex pattern); slicing is a no-op for short input
    scan_text = content[:_MAX_SECURITY_SCAN_LENGTH]

    # --- Layer 1: Injection pattern detection ---
    for pattern, category, weight in _INJECTION_PATTERNS:
        if pattern.search(scan_text):
            flags.append(f"{category}:{pattern.pattern[:50]}")
            weights.append(weight)
            # Sanitize: remove the matched pattern
            sanitized = pattern.sub("[SANITIZED]", sanitized)

    # --- Layer 2: Encoding/obfuscation detection (from ShieldCortex encoding-detector.ts) ---

    # Base64 blocks: only flagged when they decode to mostly printable text
    for b64_match in _B64_BLOCK_RE.findall(scan_text):
        try:
            decoded = base64.b64decode(b64_match).decode("utf-8", errors="ignore")
        except ValueError:
            # binascii.Error (a ValueError) — not valid base64, nothing to flag
            continue
        printable_ratio = len(_NON_PRINTABLE_RE.sub('', decoded)) / max(len(decoded), 1)
        if printable_ratio > 0.7 and len(decoded) > 10:
            flags.append(f"base64_encoded:{decoded[:60]}")
            weights.append(0.6)

    # Zero-width / invisible characters: tolerated up to two occurrences
    invisible_chars = _INVISIBLE_CHARS_RE.findall(scan_text)
    if len(invisible_chars) > 2:
        flags.append(f"invisible_chars:{len(invisible_chars)}_found")
        weights.append(0.5)
        # Remove invisible chars
        sanitized = _INVISIBLE_CHARS_RE.sub('', sanitized)

    # Unicode homoglyphs — Cyrillic chars that look like Latin; flagged, not removed
    homoglyphs = _CYRILLIC_HOMOGLYPH_RE.findall(scan_text)
    if len(homoglyphs) > 3:
        flags.append(f"unicode_homoglyphs:{len(homoglyphs)}_cyrillic")
        weights.append(0.5)

    # --- Layer 3: Behavioral scoring ---
    for behavioral_re, label in _BEHAVIORAL_PATTERNS:
        if behavioral_re.search(scan_text):
            flags.append(label)
            weights.append(0.4)

    # --- Layer 4: Credential detection (reuse existing redact_secrets) ---
    if redact_secrets(scan_text) != scan_text:
        flags.append("credentials_detected")
        sanitized = redact_secrets(sanitized)
        # Don't increase risk score for creds — still store (redacted)
        # but flag for awareness

    # Risk score (0-1). ShieldCortex approach: the highest weight dominates,
    # each additional finding adds a small fixed increment.
    if weights:
        risk_score = min(1.0, max(weights) + (len(weights) - 1) * 0.05)
    else:
        risk_score = 0.0

    return {
        "safe": risk_score < 0.5,
        "flags": flags,
        "sanitized_content": sanitized,
        "risk_score": round(risk_score, 3),
    }
|
|
2750
|
+
|