engram-lite 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- engram/__init__.py +30 -0
- engram/cli/__init__.py +5 -0
- engram/cli/demo.py +60 -0
- engram/cli/main.py +99 -0
- engram/config.py +79 -0
- engram/core/__init__.py +5 -0
- engram/core/anchors.py +101 -0
- engram/core/compaction.py +51 -0
- engram/core/consolidation.py +91 -0
- engram/core/dates.py +129 -0
- engram/core/entities.py +61 -0
- engram/core/eviction.py +36 -0
- engram/core/extraction.py +107 -0
- engram/core/memory.py +583 -0
- engram/core/promotion.py +153 -0
- engram/core/redaction.py +44 -0
- engram/core/retrieval.py +188 -0
- engram/core/rrf.py +25 -0
- engram/core/salience.py +213 -0
- engram/core/subjects.py +20 -0
- engram/core/tags.py +210 -0
- engram/embeddings/__init__.py +50 -0
- engram/embeddings/base.py +19 -0
- engram/embeddings/hashing.py +30 -0
- engram/embeddings/local.py +46 -0
- engram/integrations/__init__.py +4 -0
- engram/integrations/hermes.py +248 -0
- engram/settings.py +102 -0
- engram/storage/__init__.py +1 -0
- engram/storage/db.py +66 -0
- engram/storage/repository.py +329 -0
- engram/storage/schema.py +171 -0
- engram_lite-0.1.0.dist-info/METADATA +199 -0
- engram_lite-0.1.0.dist-info/RECORD +37 -0
- engram_lite-0.1.0.dist-info/WHEEL +4 -0
- engram_lite-0.1.0.dist-info/entry_points.txt +2 -0
- engram_lite-0.1.0.dist-info/licenses/LICENSE +202 -0
engram/__init__.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
"""engram — a small, local-first agentic memory engine.
|
|
2
|
+
|
|
3
|
+
One SQLite file holds the facts, a keyword index (FTS5), and a vector index
|
|
4
|
+
(sqlite-vec). Meant to be shared across your AI tools (Claude, Cursor, …) and
|
|
5
|
+
across sessions. See docs/memory-docs/ for the design and docs/code-docs/ for the
|
|
6
|
+
code guide.
|
|
7
|
+
|
|
8
|
+
Quick start:
|
|
9
|
+
|
|
10
|
+
from engram import Memory
|
|
11
|
+
mem = Memory("my_memory.db")
|
|
12
|
+
mem.save("Bob owns the payments service", subject="Bob")
|
|
13
|
+
mem.search("who owns payments?", subject="Bob")
|
|
14
|
+
|
|
15
|
+
Layout:
|
|
16
|
+
engram.core the engine logic (memory, consolidation, retrieval, …)
|
|
17
|
+
engram.storage SQLite connection, schema, row/index operations
|
|
18
|
+
engram.embeddings text → vector (local model, with a stub fallback)
|
|
19
|
+
engram.cli the `engram` command (demo / status / rebuild)
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
from .core.memory import Memory
|
|
23
|
+
from .embeddings import get_embedder
|
|
24
|
+
|
|
25
|
+
__all__ = ["Memory", "get_embedder"]
|
|
26
|
+
try:
|
|
27
|
+
from importlib.metadata import version as _pkg_version
|
|
28
|
+
__version__ = _pkg_version("engram-lite")
|
|
29
|
+
except Exception: # not installed (running from a checkout)
|
|
30
|
+
__version__ = "0.1.0"
|
engram/cli/__init__.py
ADDED
engram/cli/demo.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
"""`engram demo` — an interactive REPL to see the engine work (no AI tool needed).
|
|
2
|
+
|
|
3
|
+
For each line you type: FIND relevant memories, then SAVE the salient ones.
|
|
4
|
+
"""
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import argparse
|
|
8
|
+
|
|
9
|
+
from ..core.memory import Memory
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def run(args: argparse.Namespace) -> int:
    """Run the interactive demo REPL against ``engram_demo.db``.

    Every non-command line is first used to search the store (top 3 hits are
    printed), then handed to the salience-gated ``remember`` write path.
    Lines starting with ``:`` are REPL commands; ``:help`` lists them.

    Args:
        args: Parsed CLI namespace. Unused; accepted so the function fits the
            ``func(args)`` dispatch of the ``engram`` command.

    Returns:
        Process exit code, always 0.
    """
    mem = Memory(path="engram_demo.db")
    try:
        print("engram demo — type a message, or :quit. (:help for commands)\n")
        pinned_subject = None
        while True:
            try:
                line = input("you> ").strip()
            except (EOFError, KeyboardInterrupt):
                # Ctrl-D / Ctrl-C ends the session cleanly
                print()
                break
            if not line:
                continue
            if line in (":quit", ":q"):
                break
            if line in (":help", ":h"):
                print(" :list :forget <id> :subject <name> :quit")
                continue
            if line == ":list":
                for f in mem.all_current():
                    print(f" [{f['id'][:8]}] ({f['block_id']}) {f['value']}")
                continue
            if line.startswith(":forget "):
                mem.forget(line.split(" ", 1)[1].strip())
                print(" forgotten.")
                continue
            if line.startswith(":subject "):
                pinned_subject = line.split(" ", 1)[1].strip()
                print(f" subject pinned: {pinned_subject}")
                continue

            # FIND: show what the store already knows about this line
            hits = mem.search(line, k=3)
            if hits:
                print(" 🧠 recalled:")
                for h in hits:
                    print(f" - {h['value']}")
            else:
                print(" 🧠 (nothing relevant remembered yet)")
            # SAVE: gated save (salience)
            res = mem.remember(line, subject=pinned_subject)
            if res["decision"] == "SKIP":
                print(f" · skipped — {res['reason']}")
            else:
                print(f" 💾 {res['decision']} → block '{res['block_id']}'")
            print()
    finally:
        # close the store even when search/remember/forget raises mid-session
        mem.close()
    return 0
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def add_parser(sub) -> None:
    """Register the ``demo`` subcommand on *sub* and route it to ``run``."""
    demo_parser = sub.add_parser("demo", help="interactive demo REPL")
    demo_parser.set_defaults(func=run)
|
engram/cli/main.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
"""The `engram` command — dispatches to subcommands.
|
|
2
|
+
|
|
3
|
+
engram demo interactive REPL
|
|
4
|
+
engram status print current env settings
|
|
5
|
+
engram rebuild re-embed a store after changing the embedding model
|
|
6
|
+
"""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import argparse
|
|
10
|
+
|
|
11
|
+
from . import demo
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _run_selftest(args: argparse.Namespace) -> int:
    """Verify the engine works end-to-end (offline, no model download).

    Creates a throwaway ``.db`` file, stores one fact through ``remember``
    with the ``HashEmbedder``, and checks that ``search`` recalls something.

    Args:
        args: Parsed CLI namespace (unused).

    Returns:
        0 when recall succeeds; 1 when the engine raises or recalls nothing.
    """
    import os
    import tempfile

    from ..core.memory import Memory
    from ..embeddings import HashEmbedder

    fd, path = tempfile.mkstemp(suffix=".db")
    os.close(fd)  # only the path is used; Memory is handed the path, not the descriptor
    try:
        m = Memory(path, embedder=HashEmbedder())
        try:
            m.remember("selftest: the payments service is owned by Bob", subject="selftest")
            hits = m.search("who owns payments?", subject="selftest")
        finally:
            # release the store even when remember/search raises, so the
            # temp file is not left open when it is unlinked below
            m.close()
        if hits:
            print("✓ engine OK — sqlite-vec + FTS5 + vector search working")
            return 0
        print("✗ engine ran but recall failed")
        return 1
    except Exception as exc:  # noqa: BLE001
        print(f"✗ engine FAILED: {exc}")
        return 1
    finally:
        # best-effort cleanup: a failed unlink must not replace the exit code
        try:
            os.unlink(path)
        except OSError:
            pass
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _run_rebuild(args) -> int:
    """Re-embed the store at ``args.db`` and print a one-line summary.

    Returns:
        Process exit code, always 0.
    """
    from ..core.memory import Memory

    outcome = Memory.reembed(args.db)
    count = outcome["reembedded"]
    dim = outcome["dim"]
    model = outcome["model"]
    print(f"re-embedded {count} facts at {dim}-dim ({model}) — store ready")
    return 0
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _run_status(args: argparse.Namespace) -> int:
    """Print the environment settings and, if a DB exists, how full it is.

    Usage is read with plain ``sqlite3`` so the embedding model is never
    loaded just to report status.

    Returns:
        Process exit code, always 0.
    """
    import os
    import sqlite3

    from .. import config
    from ..settings import Settings

    settings = Settings.from_env()
    print(settings.summary())

    if not os.path.exists(settings.db_path):
        print("usage: (no DB at this path yet)")
        return 0

    con = sqlite3.connect(settings.db_path)
    try:
        current_sql = (
            "SELECT COUNT(*) FROM facts WHERE superseded_by IS NULL "
            "AND validation_status = 'fresh'"
        )
        current = con.execute(current_sql).fetchone()[0]
        total = con.execute("SELECT COUNT(*) FROM facts").fetchone()[0]
        page_count = con.execute("PRAGMA page_count").fetchone()[0]
        page_size = con.execute("PRAGMA page_size").fetchone()[0]
        size_kb = page_count * page_size / 1024
        print(
            f"usage: {current} current / {total} total facts · cap {config.MAX_FACTS} "
            f"· db {size_kb:.0f} KB"
        )
    except sqlite3.Error:
        # e.g. the file exists but has no `facts` table yet
        print("usage: (no memory yet)")
    finally:
        con.close()
    return 0
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def main(argv: list[str] | None = None) -> int:
    """Parse *argv* and dispatch to the chosen subcommand.

    Args:
        argv: Argument list; ``None`` lets argparse read ``sys.argv``.

    Returns:
        The subcommand's exit code, or 0 after printing help when no
        subcommand was given.
    """
    parser = argparse.ArgumentParser(prog="engram", description="engram — local agentic memory")
    sub = parser.add_subparsers(dest="command")

    demo.add_parser(sub)

    status_parser = sub.add_parser("status", help="print the current env settings")
    status_parser.set_defaults(func=_run_status)

    selftest_parser = sub.add_parser("selftest", help="verify the engine works (offline)")
    selftest_parser.set_defaults(func=_run_selftest)

    rebuild_parser = sub.add_parser(
        "rebuild", help="re-embed a store after changing ENGRAM_EMBEDDER_MODEL"
    )
    rebuild_parser.add_argument("db", help="path to the memory .db file")
    rebuild_parser.set_defaults(func=_run_rebuild)

    args = parser.parse_args(argv)
    handler = getattr(args, "func", None)
    if handler:
        return handler(args)
    # no subcommand given: show usage instead of failing
    parser.print_help()
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
engram/config.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
"""Build-time knobs — constants the engine is tuned with.
|
|
2
|
+
|
|
3
|
+
Distinct from `settings.py`, which reads *runtime* flags from environment variables.
|
|
4
|
+
"""
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
# ── storage ──────────────────────────────────────────────────────────────────
|
|
8
|
+
DEFAULT_DB_PATH = "engram.db" # one SQLite file holds everything (STORAGE_AND_RETRIEVAL §3)
|
|
9
|
+
|
|
10
|
+
# ── embeddings ───────────────────────────────────────────────────────────────
|
|
11
|
+
EMBED_MODEL = "BAAI/bge-small-en-v1.5" # small, local, offline model (fastembed)
|
|
12
|
+
FALLBACK_DIM = 384 # dim of the hash-stub embedder + bge-small
|
|
13
|
+
# Pin the model cache to a STABLE dir so it downloads once and stays offline —
|
|
14
|
+
# fastembed's default is $TMPDIR, which the OS wipes (silent re-downloads).
|
|
15
|
+
# Override with ENGRAM_MODEL_CACHE.
|
|
16
|
+
MODEL_CACHE_DIR = "~/.cache/engram/models"
|
|
17
|
+
|
|
18
|
+
# ── consolidation thresholds (cosine similarity, 0..1) ───────────────────────
|
|
19
|
+
# Four-way operation set: ADD / UPDATE / DELETE / NOOP — here NOOP is upgraded
|
|
20
|
+
# to REINFORCE (bump confidence instead of silently dropping the duplicate).
|
|
21
|
+
REINFORCE_SIM = 0.97 # ~identical to an existing fact → just bump it, no new row
|
|
22
|
+
UPDATE_SIM = 0.86 # close enough to be the same fact → refine, contradict, or supersede
|
|
23
|
+
|
|
24
|
+
# ── extraction (one interaction → several atomic facts) ──────────────────────
|
|
25
|
+
# The write path's biggest quality lever: split a message into discrete, reusable facts
|
|
26
|
+
# instead of storing it as one blob (MEMORY_LIFECYCLE §2).
|
|
27
|
+
MAX_FACTS_PER_INTERACTION = 8 # cap on candidates pulled from a single remember() call
|
|
28
|
+
|
|
29
|
+
# ── retrieval ────────────────────────────────────────────────────────────────
|
|
30
|
+
CANDIDATES_PER_CHANNEL = 30 # how many to pull from keyword, vector, and entity search
|
|
31
|
+
RRF_K = 60 # Reciprocal Rank Fusion constant (INDEXING_DEEP_DIVE §7)
|
|
32
|
+
DEFAULT_TOP_K = 5 # how many memories to return by default
|
|
33
|
+
SEARCH_K_CAP = 200 # k is clamped to [1, this]: a mistyped/hostile k must
|
|
34
|
+
# neither blow the knn query limit nor blackout serving
|
|
35
|
+
LEDGER_CAP = 2000 # decision-ledger rows kept (capped rotation)
|
|
36
|
+
ENTITY_RERANK_BONUS = 0.15 # multiplicative boost per query entity a fact names
|
|
37
|
+
# (max 3); tuned on LoCoMo dev convs 26+30 ONLY
|
|
38
|
+
|
|
39
|
+
# framework-stamped pseudo-tags that carry no task information — treating them
|
|
40
|
+
# as a lane caused a total silent serving blackout (loss census P1)
|
|
41
|
+
GENERIC_TASK_TAGS = frozenset({
|
|
42
|
+
"conversation", "chat", "general", "message", "misc", "context", "default",
|
|
43
|
+
"session", "dialogue", "turn",
|
|
44
|
+
})
|
|
45
|
+
MIN_SIMILARITY = 0.30 # vector candidates below this cosine are dropped (noise floor)
|
|
46
|
+
|
|
47
|
+
# recency: at equal relevance, fresher memories rank higher (INDEXING_DEEP_DIVE §7).
|
|
48
|
+
# final_score = rrf_score × ((1 - RECENCY_WEIGHT) + RECENCY_WEIGHT × freshness),
|
|
49
|
+
# where freshness = 0.5 ** (age_days / RECENCY_HALFLIFE_DAYS) ∈ (0, 1].
|
|
50
|
+
RECENCY_WEIGHT = 0.30 # how much recency may tilt the ranking (0 = off)
|
|
51
|
+
RECENCY_HALFLIFE_DAYS = 30.0 # a memory this old contributes half its freshness
|
|
52
|
+
|
|
53
|
+
# ── entities (the third retrieval signal) ────────────────────────────────────
|
|
54
|
+
ENTITY_MIN_LEN = 2 # ignore entity tokens shorter than this
|
|
55
|
+
|
|
56
|
+
# common words ignored in the keyword (FTS) query so it doesn't match everything
|
|
57
|
+
STOPWORDS = {
|
|
58
|
+
"a", "an", "the", "is", "are", "was", "were", "be", "to", "of", "in", "on",
|
|
59
|
+
"for", "and", "or", "i", "you", "we", "it", "this", "that", "what", "who",
|
|
60
|
+
"whom", "how", "why", "when", "where", "should", "would", "could", "my",
|
|
61
|
+
"your", "our", "here", "there", "do", "does", "did", "use", "used", "can",
|
|
62
|
+
"will", "with", "about", "from", "at", "as", "by", "me", "s",
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
# ── conditioned promotion (the lane model — see core/promotion.py) ───────────
|
|
66
|
+
PROMOTION_FLOOR_FRAC = 0.34 # keep a fact only if score >= this fraction of the top score
|
|
67
|
+
PROMOTION_EPSILON = 0.05 # weight of the word-overlap tie-break signal
|
|
68
|
+
PROMOTION_OVERFETCH = 4 # promotion re-ranks OVERFETCH×k retrieval candidates
|
|
69
|
+
PROMOTION_MIN_CANDIDATES = 30 # ...but never fewer than this many candidates
|
|
70
|
+
LANE_FETCH_LIMIT = 100 # lane channel: max lane-tagged facts added as candidates
|
|
71
|
+
TAGS_PER_FACT_CAP = 8 # max tags stored per fact
|
|
72
|
+
|
|
73
|
+
# ── salience (what to save) + compaction (how compactly) ─────────────────────
|
|
74
|
+
KEY_CHAR_CAP = 160 # the short embedded label is trimmed to this
|
|
75
|
+
VALUE_CHAR_CAP = 1000 # the stored value is capped (guardrail against blobs)
|
|
76
|
+
MAX_TEXT_CHARS = 2000 # longer than this → assumed file/output dump, skip saving
|
|
77
|
+
|
|
78
|
+
# ── size limit / eviction ────────────────────────────────────────────────────
|
|
79
|
+
MAX_FACTS = 5000 # per DB; least-used facts are evicted beyond this
|
engram/core/__init__.py
ADDED
engram/core/anchors.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
"""Anchor tokens — the parts of a fact that make it THAT fact.
|
|
2
|
+
|
|
3
|
+
"orders p99 is 450ms" and "refunds p99 is 450ms" share almost every word, but
|
|
4
|
+
each carries a content token the other lacks. Any merge / collapse / invalidate
|
|
5
|
+
decision between two such texts is a guess, and losing a true fact is worse
|
|
6
|
+
than keeping a near-sibling.
|
|
7
|
+
|
|
8
|
+
The guard is only consulted AFTER a caller has established near-identity
|
|
9
|
+
(consolidation: cosine ≥ 0.86 on the same block; serve-time collapse:
|
|
10
|
+
Jaccard ≥ 0.6) — inside a frame that similar, a differing content stem is not
|
|
11
|
+
paraphrase variance, it is the identity of a different fact. One-sided
|
|
12
|
+
differences (only one text has a negator, or extra detail) do NOT trigger:
|
|
13
|
+
that's the genuine update/negation case the write path exists to handle.
|
|
14
|
+
|
|
15
|
+
Loss census (2026-07-05) receipts: the UPDATE band silently superseded sibling
|
|
16
|
+
facts at cos 0.86-0.95, DELETE false-fired on motion verbs, and serve-time
|
|
17
|
+
near-dup collapse hid 8/12 distinct sibling pairs.
|
|
18
|
+
"""
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
import re
|
|
22
|
+
from typing import Set
|
|
23
|
+
|
|
24
|
+
# words that flip a statement's polarity — genuine negators only. Motion /
# completion verbs ("dropped the kids at school", "the meeting ended") are NOT
# negation; they false-fired the DELETE path (census) and live nowhere here.
NEGATORS = {
    "not", "no", "never", "isn't", "aren't", "wasn't", "weren't", "don't",
    "doesn't", "didn't", "won't", "can't", "cannot", "longer",  # "no longer"
    "cancelled", "canceled", "discontinued", "deprecated",
}

_STOP = {
    "the", "a", "an", "of", "to", "and", "or", "for", "with", "on", "in", "at",
    "is", "are", "was", "were", "has", "have", "had", "be", "been", "its",
    "it's", "this", "that", "these", "those", "every", "each", "all", "some",
    "we", "our", "they", "their", "he", "she", "his", "her", "you", "your",
    "i", "my", "me", "now", "then", "also", "very", "just", "will", "would",
}

_TOKEN = re.compile(r"[A-Za-z0-9'’\-]+")


def _stem(tok: str) -> str:
    """Cheap derivational stem so 'deploys'/'deploying'/'deployed' agree.

    Strips the first matching suffix among ing/ed/es/s (only when at least
    three characters remain), undoubles a trailing doubled letter left by the
    strip ("dropped" → "dropp" → "drop"), then drops a final "e".
    Only used for anchor comparison — never stored.
    """
    for suf in ("ing", "ed", "es", "s"):
        if tok.endswith(suf) and len(tok) - len(suf) >= 3:
            tok = tok[: -len(suf)]
            if len(tok) >= 2 and tok[-1] == tok[-2]:
                tok = tok[:-1]
            break
    return tok[:-1] if tok.endswith("e") else tok


def _strong_weak(text: str) -> tuple[Set[str], Set[str]]:
    """Split the tokens of *text* into (strong, weak) anchor sets.

    STRONG anchors are definitive identity tokens: anything containing a digit
    ("450ms", "10am"), negators, and capitalized words that are not the first
    token (names, days, products). WEAK anchors are the remaining content
    stems: at least 3 characters and not in the stop list.

    The typographic apostrophe (’) is folded to the ASCII one before any
    lookup, so "doesn’t" is recognized as the negator "doesn't" and "it’s" is
    dropped as the stop word "it's" instead of leaking in as content stems.
    """
    strong: Set[str] = set()
    weak: Set[str] = set()
    for i, tok in enumerate(_TOKEN.findall(text or "")):
        low = tok.lower().replace("’", "'")
        if any(c.isdigit() for c in tok):
            strong.add(low)
        elif low in NEGATORS:
            strong.add(low)
        elif i > 0 and tok[0].isupper():
            strong.add(_stem(low))
        elif len(low) >= 3 and low not in _STOP:
            # drop apostrophes BEFORE stemming: "lot's" → lots → lot (a
            # trailing-strip left "lot'" and the filler floor never matched)
            weak.add(_stem(low.replace("'", "")))
    return strong, weak


def anchor_tokens(text: str) -> Set[str]:
    """All identity-bearing tokens of a fact (strong ∪ weak)."""
    strong, weak = _strong_weak(text)
    return strong | weak


def exclusive_anchors(a: str, b: str) -> bool:
    """True when the two near-identical texts are DIFFERENT facts.

    Two signals, both deterministic:
    1. strong anchors differ on both sides — "payments deploys Tuesday" vs
       "... Thursday", "p99 450ms" vs a different number owner, one side
       negated per anchor asymmetry;
    2. a 1-for-1 content-stem substitution in an otherwise shared frame —
       "Bob owns payments" vs "Bob owns refunds". Paraphrases differ by
       ADDITIVE wording (several extra stems on one or both sides), never by
       a clean substitution, so genuine rewordings still merge/collapse.
    """
    sa, wa = _strong_weak(a)
    sb, wb = _strong_weak(b)
    if (sa - sb) and (sb - sa):
        return True
    # weak stems unique to each side, ignoring ones the other side holds as strong
    ea = wa - wb - sb
    eb = wb - wa - sa
    return len(ea) == 1 and len(eb) == 1
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
"""Keep each memory small, so it costs little to store and few tokens to return.
|
|
2
|
+
|
|
3
|
+
- `compact_key` → a short one-line label (the thing we embed + the handle).
|
|
4
|
+
- `cap_value` → the full text, trimmed to a guardrail length.
|
|
5
|
+
|
|
6
|
+
This is the cheap, offline version of "summarize the memory." A real LLM summary
|
|
7
|
+
could fill the same role (MEMORY_LIFECYCLE §2/§6); here we just normalize + trim.
|
|
8
|
+
"""
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import re
|
|
12
|
+
|
|
13
|
+
from .. import config
|
|
14
|
+
from . import anchors
|
|
15
|
+
|
|
16
|
+
# first-sentence boundary with the SAME digit + abbreviation guards extraction
|
|
17
|
+
# uses: without them "Priorities: 1." and "Dr." became the embedded key and the
|
|
18
|
+
# consolidation handle — matching nothing at recall and colliding across every
|
|
19
|
+
# "Dr. X" fact (loss census P1)
|
|
20
|
+
_FIRST_SENTENCE = re.compile(
|
|
21
|
+
r"(?<=[^\d][.!?])"
|
|
22
|
+
r"(?<!\b[A-Z]\.)(?<!Dr\.)(?<!Mr\.)(?<!Mrs\.)(?<!Ms\.)(?<!St\.)(?<!Ave\.)"
|
|
23
|
+
r"(?<!Blvd\.)(?<!Prof\.)(?<!Jr\.)(?<!Sr\.)(?<!vs\.)(?<!e\.g\.)(?<!i\.e\.)(?<!etc\.)"
|
|
24
|
+
r"\s"
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def compact_key(text: str, cap: int = config.KEY_CHAR_CAP) -> str:
    """Build the short one-line label for a memory.

    Whitespace is collapsed and the first sentence is taken. If that sentence
    fits within *cap* it is returned as-is. Otherwise the text is cut at the
    last space before *cap*, and up to four anchor tokens (names/numbers) that
    the cut removed are appended in sorted order, so the fact stays findable
    (loss census P1). The result is then bounded to ``cap + 48`` characters.
    """
    flat = " ".join(text.split())
    sentence = _FIRST_SENTENCE.split(flat, maxsplit=1)[0]
    if len(sentence) <= cap:
        return sentence

    head = flat[:cap].rsplit(" ", 1)[0]
    lost = anchors.anchor_tokens(flat) - anchors.anchor_tokens(head)
    if not lost:
        return head
    rescued = " ".join(sorted(lost)[:4])
    return f"{head} {rescued}"[: cap + 48]
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def cap_value(text: str, cap: int = config.VALUE_CHAR_CAP) -> str:
    """Return the stripped memory text, truncated when it exceeds *cap*.

    Over-long text is cut to *cap* characters, trimmed back to the last space,
    and marked with a trailing " …" so a stray long paste can't bloat the store.
    """
    stripped = text.strip()
    if len(stripped) > cap:
        clipped = stripped[:cap].rsplit(" ", 1)[0]
        return clipped + " …"
    return stripped
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
"""The "is this new or already known?" decision (MEMORY_LIFECYCLE §5).
|
|
2
|
+
|
|
3
|
+
Given a new fact's vector + key and its block, look at the nearest existing fact
|
|
4
|
+
*in that block* and classify the write into a four-way operation set:
|
|
5
|
+
|
|
6
|
+
REINFORCE near-identical (≥ REINFORCE_SIM) → bump confidence, no new row
|
|
7
|
+
DELETE same topic but the new fact NEGATES the old → retire the stale belief, add new
|
|
8
|
+
UPDATE same topic, refined/changed value → supersede the old version
|
|
9
|
+
ADD nothing close enough → brand-new fact
|
|
10
|
+
|
|
11
|
+
(The "already known, do nothing" case is often called NOOP; here it becomes REINFORCE
|
|
12
|
+
so repetition strengthens a memory instead of being silently dropped.)
|
|
13
|
+
|
|
14
|
+
Pure decision logic — it does not write; `memory.py` applies the result via the
|
|
15
|
+
repository. Contradiction is detected deterministically (negation polarity); a
|
|
16
|
+
semantic LLM resolver could be slotted in behind this same contract.
|
|
17
|
+
"""
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import re
|
|
21
|
+
from typing import Optional, Tuple
|
|
22
|
+
|
|
23
|
+
from .. import config
|
|
24
|
+
from ..storage import repository
|
|
25
|
+
from . import anchors
|
|
26
|
+
|
|
27
|
+
# words/markers that flip a statement's polarity ("Bob no longer owns payments").
|
|
28
|
+
# Motion/completion verbs (dropped/stopped/ended/removed) are NOT here: "Bob
|
|
29
|
+
# dropped the kids at school" is an event, not a negation — with them in the
|
|
30
|
+
# list it invalidated Bob's standing routine (loss census 2026-07-05).
|
|
31
|
+
_NEGATIONS = (
|
|
32
|
+
"not", "no longer", "never", "isn't", "aren't", "wasn't", "weren't", "don't",
|
|
33
|
+
"doesn't", "didn't", "won't", "can't", "cannot",
|
|
34
|
+
"deprecated", "cancelled", "canceled", "discontinued", "no more",
|
|
35
|
+
)
|
|
36
|
+
_NEG_RE = re.compile(r"\b(" + "|".join(re.escape(n) for n in _NEGATIONS) + r")\b", re.IGNORECASE)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _cosine_from_l2(distance: float) -> float:
|
|
40
|
+
"""For unit vectors, euclidean^2 = 2 - 2cos ⇒ cos = 1 - d^2/2."""
|
|
41
|
+
return 1.0 - (distance * distance) / 2.0
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _has_negation(text: str) -> bool:
|
|
45
|
+
return bool(_NEG_RE.search(text or ""))
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _contradicts(old_key: str, new_key: str) -> bool:
|
|
49
|
+
"""True when one statement negates the other (opposite polarity)."""
|
|
50
|
+
return _has_negation(old_key) != _has_negation(new_key)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _nearest_in_block(conn, block_id: str, vec) -> Optional[Tuple[str, float, str]]:
    """Find the closest live fact to *vec* that belongs to *block_id*.

    Walks the 10 nearest neighbours returned by ``repository.nearest`` in the
    order given and returns the first one that is not superseded, has
    ``validation_status == "fresh"`` and sits in the requested block.

    Returns:
        ``(fact_id, cosine_similarity, key)`` for that fact, or ``None`` when
        none of the neighbours qualifies.
    """
    for fact_id, distance in repository.nearest(conn, vec, k=10):
        row = repository.get(conn, fact_id)
        if row is None:
            continue
        is_live = row["superseded_by"] is None and row["validation_status"] == "fresh"
        if is_live and row["block_id"] == block_id:
            return fact_id, _cosine_from_l2(distance), row["key"]
    return None
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def decide(conn, block_id: str, vec, key: str = "") -> Tuple[str, Optional[str], str]:
    """Classify a write against the nearest live fact in the same block.

    Returns:
        ``(operation, target_fact_id, note)``. ``operation`` is one of
        ``"ADD"``, ``"DELETE"``, ``"REINFORCE"`` or ``"UPDATE"``; the target
        is ``None`` for ADD. ``note`` is non-empty only when a merge was
        DEMOTED to ADD by the anchor guard — the caller records it in the
        decision ledger so demotions are auditable.
    """
    match = _nearest_in_block(conn, block_id, vec)
    if match is None:
        return "ADD", None, ""
    target_id, similarity, existing_key = match

    # below the UPDATE band nothing is close enough: brand-new fact
    if not (similarity >= config.UPDATE_SIM):
        return "ADD", None, ""

    # anchor guard: when each statement carries an identity token the other
    # lacks (orders vs refunds, Tuesday vs Thursday) they are two TRUE sibling
    # facts however similar the wording — keep both (loss census 2026-07-05)
    if anchors.exclusive_anchors(existing_key, key):
        return "ADD", None, f"merge demoted: exclusive anchors vs {target_id} (cos {similarity:.2f})"

    # same topic with opposite polarity: the new statement negates the old one
    if _contradicts(existing_key, key):
        return "DELETE", target_id, ""

    operation = "REINFORCE" if similarity >= config.REINFORCE_SIM else "UPDATE"
    return operation, target_id, ""
|
engram/core/dates.py
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
"""Deterministic relative-date resolution at capture time.
|
|
2
|
+
|
|
3
|
+
People state facts in relative time — "I went last Friday", "we're moving next
|
|
4
|
+
month" — and the absolute date exists only in the conversation's own timestamp.
|
|
5
|
+
LLM-extraction systems resolve this during ingestion with a model call;
|
|
6
|
+
we resolve it with calendar arithmetic: zero LLM, same information.
|
|
7
|
+
|
|
8
|
+
`resolve_relatives(text, anchor)` annotates each relative expression inline
|
|
9
|
+
with its computed absolute form: "I went last Friday [= Friday, 14 July 2023]".
|
|
10
|
+
Inline (not appended) so keyword/vector retrieval keeps the date next to the
|
|
11
|
+
event it dates. Text inside existing [= ...] annotations is never re-annotated.
|
|
12
|
+
|
|
13
|
+
Measured motivation (LoCoMo track B): 58% of temporal failures had the
|
|
14
|
+
evidence SERVED but relative — the answering model can't do the arithmetic
|
|
15
|
+
reliably; the store should carry it resolved.
|
|
16
|
+
"""
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import datetime as _dt
|
|
20
|
+
import re
|
|
21
|
+
from typing import Optional
|
|
22
|
+
|
|
23
|
+
_MONTHS = "january february march april may june july august september october november december".split()
_WEEKDAYS = "monday tuesday wednesday thursday friday saturday sunday".split()
_WORD_NUMS = {"a": 1, "an": 1, "one": 1, "two": 2, "three": 3, "four": 4, "five": 5,
              "six": 6, "seven": 7, "eight": 8, "nine": 9, "ten": 10, "couple of": 2,
              "few": 3}

_ANCHOR_PATTERNS = [
    "%d %B, %Y", "%d %B %Y", "%B %d, %Y", "%B %d %Y", "%Y-%m-%d", "%d/%m/%Y",
]


def parse_anchor(s: Optional[str]) -> Optional[_dt.date]:
    """Parse a conversation timestamp like '8 May, 2023' into a date.

    The string is tried against each format in ``_ANCHOR_PATTERNS``; if none
    fits, the first "<day> <month name> <year>" found anywhere in it is used.

    Args:
        s: Timestamp text; may carry a leading time ("1:56 pm on 8 May, 2023").

    Returns:
        The parsed date, or ``None`` when *s* is empty, unrecognized, or names
        a day that does not exist in the calendar (e.g. "31 February, 2023").
    """
    if not s:
        return None
    s = s.strip()
    # tolerate a leading time ("1:56 pm on 8 May, 2023")
    if " on " in s:
        s = s.split(" on ")[-1].strip()
    for fmt in _ANCHOR_PATTERNS:
        try:
            return _dt.datetime.strptime(s, fmt).date()
        except ValueError:
            continue
    m = re.search(r"(\d{1,2})\s+([A-Za-z]+),?\s+(\d{4})", s)
    if m and m.group(2).lower() in _MONTHS:
        try:
            return _dt.date(int(m.group(3)), _MONTHS.index(m.group(2).lower()) + 1,
                            int(m.group(1)))
        except ValueError:
            # well-formed text but an impossible date (day 31 in a short
            # month, year 0000): unparseable, same as any other bad input
            return None
    return None
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _fmt_date(d: _dt.date) -> str:
    """Render *d* as 'Weekday, D Month YYYY' using English names."""
    weekday = _WEEKDAYS[d.weekday()].capitalize()
    month = _MONTHS[d.month - 1].capitalize()
    return f"{weekday}, {d.day} {month} {d.year}"
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _fmt_month(d: _dt.date) -> str:
    """Render *d* as 'Month YYYY' using the English month name."""
    month = _MONTHS[d.month - 1].capitalize()
    return f"{month} {d.year}"
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _month_shift(d: _dt.date, months: int) -> _dt.date:
|
|
63
|
+
m = d.month - 1 + months
|
|
64
|
+
return _dt.date(d.year + m // 12, m % 12 + 1, 1)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def resolve_relatives(text: str, anchor: Optional[_dt.date],
|
|
68
|
+
max_annotations: int = 3) -> str:
|
|
69
|
+
"""Annotate relative time expressions with computed absolutes, inline."""
|
|
70
|
+
if anchor is None or "[=" in text:
|
|
71
|
+
return text
|
|
72
|
+
count = 0
|
|
73
|
+
|
|
74
|
+
def ann(match_text: str, resolved: str) -> str:
|
|
75
|
+
nonlocal count
|
|
76
|
+
count += 1
|
|
77
|
+
return f"{match_text} [= {resolved}]"
|
|
78
|
+
|
|
79
|
+
def sub(pattern: str, resolver) -> None:
|
|
80
|
+
nonlocal text
|
|
81
|
+
def repl(m: re.Match) -> str:
|
|
82
|
+
if count >= max_annotations:
|
|
83
|
+
return m.group(0)
|
|
84
|
+
r = resolver(m)
|
|
85
|
+
return ann(m.group(0), r) if r else m.group(0)
|
|
86
|
+
text = re.sub(pattern, repl, text, flags=re.IGNORECASE)
|
|
87
|
+
|
|
88
|
+
# exact-day expressions
|
|
89
|
+
sub(r"\byesterday\b", lambda m: _fmt_date(anchor - _dt.timedelta(days=1)))
|
|
90
|
+
sub(r"\blast night\b", lambda m: _fmt_date(anchor - _dt.timedelta(days=1)))
|
|
91
|
+
sub(r"\b(today|tonight|this morning|this evening|this afternoon)\b",
|
|
92
|
+
lambda m: _fmt_date(anchor))
|
|
93
|
+
sub(r"\btomorrow\b", lambda m: _fmt_date(anchor + _dt.timedelta(days=1)))
|
|
94
|
+
# last/next <weekday>: the most recent such day strictly before (after) anchor
|
|
95
|
+
sub(r"\blast\s+(monday|tuesday|wednesday|thursday|friday|saturday|sunday)\b",
|
|
96
|
+
lambda m: _fmt_date(anchor - _dt.timedelta(
|
|
97
|
+
days=((anchor.weekday() - _WEEKDAYS.index(m.group(1).lower())) % 7) or 7)))
|
|
98
|
+
sub(r"\bnext\s+(monday|tuesday|wednesday|thursday|friday|saturday|sunday)\b",
|
|
99
|
+
lambda m: _fmt_date(anchor + _dt.timedelta(
|
|
100
|
+
days=((_WEEKDAYS.index(m.group(1).lower()) - anchor.weekday()) % 7) or 7)))
|
|
101
|
+
# periods — phrased the way LoCoMo golds phrase them
|
|
102
|
+
sub(r"\blast\s+week(end)?\b",
|
|
103
|
+
lambda m: f"the week{'end' if m.group(1) else ''} before {anchor.day} "
|
|
104
|
+
f"{_MONTHS[anchor.month - 1].capitalize()} {anchor.year}")
|
|
105
|
+
sub(r"\bthis\s+week(end)?\b",
|
|
106
|
+
lambda m: f"the week{'end' if m.group(1) else ''} of {anchor.day} "
|
|
107
|
+
f"{_MONTHS[anchor.month - 1].capitalize()} {anchor.year}")
|
|
108
|
+
sub(r"\blast\s+month\b", lambda m: _fmt_month(_month_shift(anchor, -1)))
|
|
109
|
+
sub(r"\bnext\s+month\b", lambda m: _fmt_month(_month_shift(anchor, 1)))
|
|
110
|
+
sub(r"\bthis\s+month\b", lambda m: _fmt_month(anchor))
|
|
111
|
+
sub(r"\blast\s+year\b", lambda m: str(anchor.year - 1))
|
|
112
|
+
sub(r"\bnext\s+year\b", lambda m: str(anchor.year + 1))
|
|
113
|
+
# "N days/weeks/months/years ago"
|
|
114
|
+
def ago(m: re.Match) -> Optional[str]:
|
|
115
|
+
raw = m.group(1).lower()
|
|
116
|
+
n = _WORD_NUMS.get(raw) or (int(raw) if raw.isdigit() else None)
|
|
117
|
+
if n is None:
|
|
118
|
+
return None
|
|
119
|
+
unit = m.group(2).lower()
|
|
120
|
+
if unit == "day":
|
|
121
|
+
return _fmt_date(anchor - _dt.timedelta(days=n))
|
|
122
|
+
if unit == "week":
|
|
123
|
+
return _fmt_date(anchor - _dt.timedelta(weeks=n))
|
|
124
|
+
if unit == "month":
|
|
125
|
+
return _fmt_month(_month_shift(anchor, -n))
|
|
126
|
+
return str(anchor.year - n)
|
|
127
|
+
sub(r"\b(\d+|a|an|one|two|three|four|five|six|seven|eight|nine|ten|few|couple of)"
|
|
128
|
+
r"\s+(day|week|month|year)s?\s+ago\b", ago)
|
|
129
|
+
return text
|