merken 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- merken-0.1.0/.claude/settings.local.json +57 -0
- merken-0.1.0/.gitignore +39 -0
- merken-0.1.0/CLAUDE.md +136 -0
- merken-0.1.0/CONSTITUTION.md +235 -0
- merken-0.1.0/LICENSE +21 -0
- merken-0.1.0/PKG-INFO +349 -0
- merken-0.1.0/README.md +321 -0
- merken-0.1.0/docs/architecture.md +461 -0
- merken-0.1.0/docs/cli.md +539 -0
- merken-0.1.0/docs/extending.md +921 -0
- merken-0.1.0/docs/mcp-server.md +518 -0
- merken-0.1.0/docs/primitives.md +606 -0
- merken-0.1.0/experiments/BENCHMARK_STRATEGY.md +500 -0
- merken-0.1.0/experiments/README.md +62 -0
- merken-0.1.0/experiments/__init__.py +1 -0
- merken-0.1.0/experiments/loop_quality/README.md +66 -0
- merken-0.1.0/experiments/loop_quality/RESULTS.md +589 -0
- merken-0.1.0/experiments/loop_quality/__init__.py +14 -0
- merken-0.1.0/experiments/loop_quality/runner.py +454 -0
- merken-0.1.0/experiments/loop_quality/scenario.py +117 -0
- merken-0.1.0/experiments/loop_quality/scenarios/__init__.py +1 -0
- merken-0.1.0/experiments/loop_quality/scenarios/analytics_project.json +88 -0
- merken-0.1.0/experiments/loop_quality/scenarios/jay_vstash_2026_04_09_snapshot.json +132 -0
- merken-0.1.0/experiments/loop_quality/scenarios/knowledge_update.json +128 -0
- merken-0.1.0/experiments/loop_quality/scenarios/noisy_agent_stream.json +172 -0
- merken-0.1.0/experiments/loop_quality/scenarios/session_2026_04_09.json +88 -0
- merken-0.1.0/experiments/loop_quality/smoke_real_vstash.py +184 -0
- merken-0.1.0/experiments/retrieval/README.md +28 -0
- merken-0.1.0/experiments/retrieval/__init__.py +9 -0
- merken-0.1.0/experiments/retrieval/lmeb/RESULTS.md +92 -0
- merken-0.1.0/experiments/retrieval/lmeb/__init__.py +0 -0
- merken-0.1.0/experiments/retrieval/lmeb/runner.py +427 -0
- merken-0.1.0/experiments/retrieval/longmemeval/README.md +56 -0
- merken-0.1.0/experiments/retrieval/longmemeval/RESULTS.md +150 -0
- merken-0.1.0/experiments/retrieval/longmemeval/__init__.py +5 -0
- merken-0.1.0/experiments/retrieval/longmemeval/dataset.py +180 -0
- merken-0.1.0/experiments/retrieval/longmemeval/fixtures/tiny.json +65 -0
- merken-0.1.0/experiments/retrieval/longmemeval/run_colab.ipynb +146 -0
- merken-0.1.0/experiments/retrieval/longmemeval/run_overnight.sh +66 -0
- merken-0.1.0/experiments/retrieval/longmemeval/runner.py +427 -0
- merken-0.1.0/merken/__init__.py +70 -0
- merken-0.1.0/merken/audit.py +161 -0
- merken-0.1.0/merken/classification.py +193 -0
- merken-0.1.0/merken/cli.py +479 -0
- merken-0.1.0/merken/consolidation.py +500 -0
- merken-0.1.0/merken/mcp_server.py +368 -0
- merken-0.1.0/merken/memory.py +823 -0
- merken-0.1.0/merken/policies/__init__.py +66 -0
- merken-0.1.0/merken/policies/should_consolidate.py +114 -0
- merken-0.1.0/merken/policies/should_forget.py +226 -0
- merken-0.1.0/merken/policies/should_recall.py +135 -0
- merken-0.1.0/merken/policies/should_remember.py +252 -0
- merken-0.1.0/merken/policies/types.py +74 -0
- merken-0.1.0/notes/prior-art.md +274 -0
- merken-0.1.0/notes/research-2026-04-09.md +537 -0
- merken-0.1.0/notes/silt.md +144 -0
- merken-0.1.0/notes/vstash-issue-collection-default-filter.md +91 -0
- merken-0.1.0/pyproject.toml +67 -0
- merken-0.1.0/tests/test_cli.py +373 -0
- merken-0.1.0/tests/test_embed_model_resolution.py +103 -0
- merken-0.1.0/tests/test_longmemeval_runner.py +118 -0
- merken-0.1.0/tests/test_loop_quality.py +299 -0
- merken-0.1.0/tests/test_mcp_server.py +337 -0
- merken-0.1.0/tests/test_recall_interleave.py +54 -0
- merken-0.1.0/tests/test_should_consolidate.py +589 -0
- merken-0.1.0/tests/test_should_forget.py +236 -0
- merken-0.1.0/tests/test_should_recall.py +150 -0
- merken-0.1.0/tests/test_should_remember.py +211 -0
- merken-0.1.0/tests/test_smoke.py +53 -0
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
{
|
|
2
|
+
"permissions": {
|
|
3
|
+
"allow": [
|
|
4
|
+
"WebFetch(domain:www.mempalace.tech)",
|
|
5
|
+
"Bash(gh repo:*)",
|
|
6
|
+
"Read(//Users/jaysonsteffens/Desktop/Personal/Projects/**)",
|
|
7
|
+
"Bash(python3 -c \"import vstash; print\\(vstash.__version__\\); print\\(dir\\(vstash\\)\\)\")",
|
|
8
|
+
"Bash(python3 -c \"from vstash import Memory; help\\(Memory.__init__\\)\")",
|
|
9
|
+
"Bash(python3 -c \"from vstash import Memory; print\\([m for m in dir\\(Memory\\) if not m.startswith\\('_'\\)]\\)\")",
|
|
10
|
+
"Bash(python3 -c \"from vstash import Memory; help\\(Memory.remember\\)\")",
|
|
11
|
+
"Bash(python3 -c \"from vstash import Memory; help\\(Memory.add\\)\")",
|
|
12
|
+
"Bash(python3 -c \"from vstash import Memory; help\\(Memory.search\\)\")",
|
|
13
|
+
"Bash(python3 -m pytest tests/test_smoke.py -v)",
|
|
14
|
+
"Bash(git init:*)",
|
|
15
|
+
"Bash(python3 -m pytest tests/ -v)",
|
|
16
|
+
"Bash(python3 -c \"import datasets; print\\('datasets', datasets.__version__\\)\")",
|
|
17
|
+
"Bash(python3 -c \"import urllib.request; print\\('urllib ok'\\)\")",
|
|
18
|
+
"Bash(python3 -c \"from vstash import SearchResult; import dataclasses; print\\([f.name for f in dataclasses.fields\\(SearchResult\\)]\\)\")",
|
|
19
|
+
"Bash(python3 -m experiments.longmemeval.runner)",
|
|
20
|
+
"Bash(gh*)",
|
|
21
|
+
"Bash(curl -fsSL -o experiments/longmemeval/.cache/longmemeval_oracle.json \"https://huggingface.co/datasets/xiaowu0162/longmemeval-cleaned/resolve/main/longmemeval_oracle.json\")",
|
|
22
|
+
"Bash(curl -fsSL -o experiments/longmemeval/.cache/longmemeval_s_cleaned.json \"https://huggingface.co/datasets/xiaowu0162/longmemeval-cleaned/resolve/main/longmemeval_s_cleaned.json\")",
|
|
23
|
+
"Bash(python3:*)",
|
|
24
|
+
"Bash(kill 14987)",
|
|
25
|
+
"Bash(pkill -f \"experiments.longmemeval.runner\")",
|
|
26
|
+
"Bash(git mv:*)",
|
|
27
|
+
"Bash(grep -rn \"experiments\\\\.longmemeval\\\\|experiments/longmemeval\" merken/ experiments/ tests/ CLAUDE.md *.md)",
|
|
28
|
+
"Bash(pip install:*)",
|
|
29
|
+
"Bash(merken --help)",
|
|
30
|
+
"Bash(rm -rf /tmp/merken_smoke)",
|
|
31
|
+
"Bash(merken --db /tmp/merken_smoke/test.db --project smoke_test remember \"the first real CLI test event about merken deployment via pip install -e dot\")",
|
|
32
|
+
"Bash(merken --db /tmp/merken_smoke/test.db --project smoke_test remember \"the second event about layered recall and consolidation working end to end\")",
|
|
33
|
+
"Bash(merken --db /tmp/merken_smoke/test.db --project smoke_test status)",
|
|
34
|
+
"Bash(merken --db /tmp/merken_smoke/test.db --project smoke_test recall \"real CLI test\")",
|
|
35
|
+
"Bash(merken --db /tmp/merken_smoke/test.db --project smoke_test consolidate --force)",
|
|
36
|
+
"Bash(chmod +x experiments/retrieval/longmemeval/run_overnight.sh)",
|
|
37
|
+
"Bash(timeout 2 merken-mcp)",
|
|
38
|
+
"WebFetch(domain:arxiv.org)",
|
|
39
|
+
"WebFetch(domain:github.com)",
|
|
40
|
+
"Bash(bash:*)",
|
|
41
|
+
"Bash(python:*)",
|
|
42
|
+
"Bash(huggingface-cli download:*)",
|
|
43
|
+
"WebFetch(domain:raw.githubusercontent.com)",
|
|
44
|
+
"Bash(experiments/retrieval/longmemeval/run_overnight.sh:*)",
|
|
45
|
+
"Bash(merken remember:*)",
|
|
46
|
+
"Bash(merken recall:*)",
|
|
47
|
+
"Bash(chmod +x:*)",
|
|
48
|
+
"Bash(merken --project merken recall \"recent decisions\" --top-k 3 --json)",
|
|
49
|
+
"Bash(merken --json --project merken recall \"recent decisions\" --top-k 3)",
|
|
50
|
+
"WebFetch(domain:pypi.org)",
|
|
51
|
+
"WebFetch(domain:tmsearch.uspto.gov)",
|
|
52
|
+
"WebFetch(domain:www.tmdn.org)",
|
|
53
|
+
"Bash(grep:*)",
|
|
54
|
+
"Bash(ls:*)"
|
|
55
|
+
]
|
|
56
|
+
}
|
|
57
|
+
}
|
merken-0.1.0/.gitignore
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.so
|
|
6
|
+
.Python
|
|
7
|
+
build/
|
|
8
|
+
dist/
|
|
9
|
+
*.egg-info/
|
|
10
|
+
*.egg
|
|
11
|
+
.eggs/
|
|
12
|
+
|
|
13
|
+
# Virtualenvs
|
|
14
|
+
.venv/
|
|
15
|
+
venv/
|
|
16
|
+
env/
|
|
17
|
+
|
|
18
|
+
# Tooling
|
|
19
|
+
.pytest_cache/
|
|
20
|
+
.mypy_cache/
|
|
21
|
+
.ruff_cache/
|
|
22
|
+
.coverage
|
|
23
|
+
htmlcov/
|
|
24
|
+
|
|
25
|
+
# Editors
|
|
26
|
+
.vscode/
|
|
27
|
+
.idea/
|
|
28
|
+
*.swp
|
|
29
|
+
.DS_Store
|
|
30
|
+
|
|
31
|
+
# vstash / engram local data
|
|
32
|
+
*.db
|
|
33
|
+
*.db-journal
|
|
34
|
+
*.db-wal
|
|
35
|
+
*.db-shm
|
|
36
|
+
.engram/
|
|
37
|
+
|
|
38
|
+
# Benchmark dataset cache
|
|
39
|
+
experiments/**/.cache/
|
merken-0.1.0/CLAUDE.md
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
# CLAUDE.md — merken
|
|
2
|
+
|
|
3
|
+
This file is the entry point for any Claude session opened in the
|
|
4
|
+
merken repo. It should be cheap to read and keep a current Claude
|
|
5
|
+
session aligned with the state of the repo.
|
|
6
|
+
|
|
7
|
+
## Read these first, in order
|
|
8
|
+
|
|
9
|
+
1. [`CONSTITUTION.md`](CONSTITUTION.md) — what merken is, what it
|
|
10
|
+
isn't, and the principles. Disagreements get edits to that file
|
|
11
|
+
*before* any code.
|
|
12
|
+
2. The vstash repo's `CLAUDE.md` — merken is a strict consumer of
|
|
13
|
+
vstash's public API. Understand the substrate before touching
|
|
14
|
+
the loop.
|
|
15
|
+
3. [`experiments/BENCHMARK_STRATEGY.md`](experiments/BENCHMARK_STRATEGY.md)
|
|
16
|
+
— how merken measures itself, which benchmarks are load-bearing
|
|
17
|
+
and which are noise.
|
|
18
|
+
4. [`notes/silt.md`](notes/silt.md) — working notes on the patterns
|
|
19
|
+
Silt caught that became hard rules.
|
|
20
|
+
|
|
21
|
+
## Current state (as of 2026-04-09)
|
|
22
|
+
|
|
23
|
+
All four decision primitives from CONSTITUTION §2 are implemented:
|
|
24
|
+
|
|
25
|
+
- **`should_remember`** — `HeuristicWriteDecider` (default) and
|
|
26
|
+
`AlwaysWrite` (baseline). The heuristic decider now hydrates its
|
|
27
|
+
dedup set from vstash on first use, so cross-invocation dedup
|
|
28
|
+
works for CLI and MCP.
|
|
29
|
+
- **`should_consolidate`** — `PeriodicConsolidator` (default) and
|
|
30
|
+
`NeverConsolidate`. Uses `cluster_by_embedding` with complete
|
|
31
|
+
linkage and threshold 0.70 (picked via grid search on three
|
|
32
|
+
loop_quality scenarios, 2026-04-09).
|
|
33
|
+
- **`should_recall`** — `LayeredRecaller` (default, semantic-first
|
|
34
|
+
with episodic fallback) and `SemanticOnlyRecaller` baseline.
|
|
35
|
+
`Memory.recall` does round-robin interleave across layers with
|
|
36
|
+
dedup-by-path.
|
|
37
|
+
- **`should_forget`** — `NeverForget` (default, safe) and
|
|
38
|
+
`ForgetConsolidated`. Tombstone-not-delete: full text preserved
|
|
39
|
+
in `merken_tombstones`, reversible.
|
|
40
|
+
|
|
41
|
+
Deployed surfaces:
|
|
42
|
+
|
|
43
|
+
- **Python SDK** — `from merken import Memory`. Four primitives
|
|
44
|
+
accessible as `Memory` methods.
|
|
45
|
+
- **CLI** — `merken` on `$PATH` after `pip install -e .`.
|
|
46
|
+
Eight subcommands map 1:1 to `Memory` methods:
|
|
47
|
+
`remember | recall | consolidate | forget | audit | tombstones | status | stats`.
|
|
48
|
+
See `merken --help`.
|
|
49
|
+
- **MCP server** — `merken-mcp` on `$PATH`, `python -m merken.mcp_server`,
|
|
50
|
+
or `claude mcp add merken -- python -m merken.mcp_server`. Eight
|
|
51
|
+
tools, one per CLI subcommand. Default DB is `~/.merken/<project>.db`,
|
|
52
|
+
deliberately isolated from `~/.vstash/memory.db`.
|
|
53
|
+
- **Claude Code hooks** — live in `~/.claude/settings.json`.
|
|
54
|
+
Three hooks: `SessionStart` (recall context), `PreCompact`
|
|
55
|
+
(save to memory), `UserPromptSubmit` (search memory).
|
|
56
|
+
Scripts at `~/.claude/hooks/merken-*.sh`.
|
|
57
|
+
|
|
58
|
+
Safety net (`experiments/loop_quality/`):
|
|
59
|
+
|
|
60
|
+
- Three scenarios, all running under `pytest tests/test_loop_quality.py`
|
|
61
|
+
via parametrized `test_runner_completes_on_every_scenario`:
|
|
62
|
+
1. `analytics_project` (synthetic control, 100%/100%/100%)
|
|
63
|
+
2. `session_2026_04_09` (synthetic borderline, 100%/100%/33%)
|
|
64
|
+
3. `jay_vstash_2026_04_09_snapshot` (real organic, 100%/100%/80%)
|
|
65
|
+
- A new decider or policy change that drops any scenario below its
|
|
66
|
+
current pass_rate is a regression. Investigate before merging.
|
|
67
|
+
|
|
68
|
+
## Hard rules
|
|
69
|
+
|
|
70
|
+
These survive across sessions. Breaking any of them requires an
|
|
71
|
+
explicit case in the PR description.
|
|
72
|
+
|
|
73
|
+
- **vstash is a hard dependency.** Never reach into
|
|
74
|
+
`vstash._private`. Never read SQLite tables directly *in production
|
|
75
|
+
code* (probes in `notes/` or one-off investigations are fine).
|
|
76
|
+
If the public API is missing something, the fix is a vstash PR.
|
|
77
|
+
(CONSTITUTION §4.4, §6.)
|
|
78
|
+
- **Glass box.** Every decision the loop makes writes an audit row
|
|
79
|
+
to the `merken_audit` collection. No exceptions — even skipped
|
|
80
|
+
writes and never-forgotten events produce audit trails. Tombstoned
|
|
81
|
+
events additionally write to `merken_tombstones` so they're
|
|
82
|
+
recoverable.
|
|
83
|
+
- **No new vector storage.** No ChromaDB, no second store, no FTS
|
|
84
|
+
reimplementation. (CONSTITUTION §3, §8.)
|
|
85
|
+
- **No bespoke compression dialect.** Measure with a real tokenizer
|
|
86
|
+
before claiming any compression result. See `notes/prior-art.md`
|
|
87
|
+
for the cautionary tale.
|
|
88
|
+
- **Empirical first.** Every default-policy change cites a benchmark
|
|
89
|
+
in `experiments/`. "I think it's better" does not ship.
|
|
90
|
+
(CONSTITUTION §9, operationalized in `experiments/BENCHMARK_STRATEGY.md`.)
|
|
91
|
+
- **Silt's rule: "before proposing an algorithm, look at the
|
|
92
|
+
distribution of the data."** Every time we reached for a new
|
|
93
|
+
decider or a smarter policy without first measuring the
|
|
94
|
+
distribution we were working against, we overengineered and had
|
|
95
|
+
to retract later. When a number looks bad, the first move is to
|
|
96
|
+
grid-search the knobs you already have against the bar you
|
|
97
|
+
already built. Only after that exhausts the simple moves is a
|
|
98
|
+
new algorithm justified. See `notes/silt.md` for the specific
|
|
99
|
+
interventions that produced this rule.
|
|
100
|
+
- **Test fixtures are not ground truth.** In two separate sessions,
|
|
101
|
+
tests passed green while the real behavior on organic content
|
|
102
|
+
was broken. Every new decider gets run against at least one
|
|
103
|
+
real-content scenario (the `jay_vstash_*_snapshot` family) before
|
|
104
|
+
landing. Synthetic tests are necessary but not sufficient.
|
|
105
|
+
- **Cross-model test independence.** Test text pairs used in
|
|
106
|
+
consolidation tests must cluster above threshold in both
|
|
107
|
+
`BAAI/bge-small-en-v1.5` AND
|
|
108
|
+
`sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2`.
|
|
109
|
+
vstash 0.27.0 can resolve to either depending on the environment.
|
|
110
|
+
See `tests/test_mcp_server.py::test_consolidate_force_builds_fact`
|
|
111
|
+
for the pattern.
|
|
112
|
+
|
|
113
|
+
## What's NOT next
|
|
114
|
+
|
|
115
|
+
- A fifth decision primitive. Four are enough.
|
|
116
|
+
- LLM-based consolidation in the hot path. Gated on a scenario
|
|
117
|
+
where the non-LLM loop leaves real value on the table.
|
|
118
|
+
- A knowledge graph (CONSTITUTION §6, optional, gated).
|
|
119
|
+
- Shell completion, colors, a web UI. All noise for the scope.
|
|
120
|
+
|
|
121
|
+
## What IS next (approximately)
|
|
122
|
+
|
|
123
|
+
- Phase A of `experiments/BENCHMARK_STRATEGY.md` — the overnight
|
|
124
|
+
LongMemEval full n=500 run, script at
|
|
125
|
+
`experiments/retrieval/longmemeval/run_overnight.sh`.
|
|
126
|
+
- Claude Code hooks hardening: error handling, threshold tuning,
|
|
127
|
+
integration tests for the hook scripts.
|
|
128
|
+
- Additional `loop_quality/` scenarios from Jay's real work:
|
|
129
|
+
perf migration notes, MedLocal hackathon logs, Kafka meeting
|
|
130
|
+
threads, daily reviews.
|
|
131
|
+
- LoCoMo runner under `experiments/retrieval/locomo/`. Needs a
|
|
132
|
+
judge model choice first (open question in BENCHMARK_STRATEGY.md).
|
|
133
|
+
|
|
134
|
+
## Branching
|
|
135
|
+
|
|
136
|
+
`feature/*` → `develop` → `main` via release PR. Mirrors vstash.
|
|
@@ -0,0 +1,235 @@
|
|
|
1
|
+
# merken — Constitution
|
|
2
|
+
|
|
3
|
+
> *merken (n.): the physical trace a memory leaves in the brain.*
|
|
4
|
+
|
|
5
|
+
**Status:** draft v0.1 — written 2026-04-07. This document defines what merken is, what it is *not*, and the principles that should outlive any specific implementation. Everything below is meant to be negotiated; nothing below is meant to be ignored without a reason.
|
|
6
|
+
|
|
7
|
+
**Working name:** `merken`. Renaming is cheap right now and expensive later — decide before the first commit lands code.
|
|
8
|
+
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
## 1. Why this exists
|
|
12
|
+
|
|
13
|
+
vstash is a **substrate**: a glass-box retrieval engine with vector + FTS5 + RRF, observability, integrity checks, explicit limits, and an explicit API contract (#132 → #135 in the vstash repo). It is small, honest, and opinionated about *retrieval*.
|
|
14
|
+
|
|
15
|
+
It is deliberately **not** opinionated about the agent loop on top: when to write a memory, when to recall, when to consolidate raw events into facts, when to let things decay. Those decisions are the difference between "vector database" and "memory system", and they belong in their own project.
|
|
16
|
+
|
|
17
|
+
Frameworks like Mem0, Zep, and LangChain memory bundle the substrate and the loop into one black box. The substrate inside them is usually an off-the-shelf vector DB, the loop is hard to inspect, and the moment you want to change either you're rewriting half the framework.
|
|
18
|
+
|
|
19
|
+
merken unbundles these. **vstash is the substrate. merken is the loop.** They talk through vstash's stable Python API; merken never reaches into vstash internals.
|
|
20
|
+
|
|
21
|
+
If you wanted a one-line pitch:
|
|
22
|
+
|
|
23
|
+
> merken turns vstash from a *vector store you query* into *a memory you live with*.
|
|
24
|
+
|
|
25
|
+
---
|
|
26
|
+
|
|
27
|
+
## 2. What merken is
|
|
28
|
+
|
|
29
|
+
- A small Python library that wraps vstash with an agent-loop policy layer.
|
|
30
|
+
- A set of **decision primitives**: `should_remember`, `should_recall`, `should_consolidate`, `should_forget`. Each one is a function with explicit inputs and outputs that you can test, override, or replace.
|
|
31
|
+
- A **consolidation pipeline** that turns episodic events (raw conversations, tool calls, observations) into semantic facts (compressed, deduplicated, named).
|
|
32
|
+
- An **inspection surface**: every decision is logged with its inputs, the policy that fired, and the resulting write. The agent's memory should be as auditable as vstash's queries already are (the glass-box principle).
|
|
33
|
+
- A **single Python process by default**, the same way vstash is. Distribution comes later, if at all.
|
|
34
|
+
|
|
35
|
+
It is built **on top of** vstash, not instead of it. The pyproject says `vstash >= 0.25.0` and that's a hard dep, not optional.
|
|
36
|
+
|
|
37
|
+
---
|
|
38
|
+
|
|
39
|
+
## 3. What merken is NOT
|
|
40
|
+
|
|
41
|
+
- **Not a vector database.** vstash already is one. We do not store embeddings, run ANN, or maintain an FTS index. If we ever feel the urge to, we are wrong.
|
|
42
|
+
- **Not a framework.** No plugin registry, no DSL, no YAML config that compiles into a graph. Functions and classes that you import.
|
|
43
|
+
- **Not an LLM provider abstraction.** merken calls *one* LLM (or none, for the rule-based decisions). The user picks the model the same way they would for vstash's chat module — through a function argument or a config field, not a 40-class adapter hierarchy.
|
|
44
|
+
- **Not RAG-as-a-service.** RAG is a special case of "recall, then ask." We support it. We do not center the project on it.
|
|
45
|
+
- **Not multi-agent coordination.** One agent, one memory. Multi-agent is a different problem with its own tradeoffs.
|
|
46
|
+
- **Not an autonomous loop.** We do not own `while True: think(); act()`. We provide the *primitives* the loop calls; the loop itself is the user's code (or an integration like Claude Code, an MCP server, a LangGraph node).
|
|
47
|
+
|
|
48
|
+
If a feature request looks like one of those bullets, the answer is "no" by default and "show me the empirical case" by exception.
|
|
49
|
+
|
|
50
|
+
---
|
|
51
|
+
|
|
52
|
+
## 4. Principles (non-negotiables)
|
|
53
|
+
|
|
54
|
+
These come straight from vstash and stay because they earned their place there.
|
|
55
|
+
|
|
56
|
+
1. **Local-first.** No mandatory network calls. The default install runs offline against a local LLM and a local vstash file. Cloud is opt-in, never assumed.
|
|
57
|
+
2. **Glass box.** Every decision the agent makes about memory is inspectable. Inputs, policy that fired, output, side effects. If you can't explain a write, you don't write.
|
|
58
|
+
3. **Single process by default.** No daemons, no message queues, no Redis. The same SQLite-and-a-script ergonomics that make vstash livable.
|
|
59
|
+
4. **vstash is a hard dependency.** Do not reimplement retrieval. Do not reach into vstash internals. If vstash's public API is missing something merken needs, the right move is a vstash PR, not a workaround.
|
|
60
|
+
5. **Empirical first.** Every claim about memory quality needs a benchmark. "It feels better with consolidation on" is not enough. The benchmarks live in `experiments/` the same way vstash's do, and they decide what ships.
|
|
61
|
+
6. **Honest about boundaries.** Same `LimitError` discipline as vstash (#133). When the substrate or the loop can't do what was asked, say so explicitly with a named exception, not a stack trace.
|
|
62
|
+
7. **No premature abstraction.** Three similar lines of code is better than the abstraction we'll regret. If you can't name three concrete callers, you can't ship the helper.
|
|
63
|
+
|
|
64
|
+
A change that breaks any of these needs an explicit case in the PR description. "Just this once" is the start of every framework.
|
|
65
|
+
|
|
66
|
+
---
|
|
67
|
+
|
|
68
|
+
## 5. The memory model
|
|
69
|
+
|
|
70
|
+
merken thinks of memory in three layers, borrowed from cognitive science but kept loose because the brain analogy is suggestive, not normative.
|
|
71
|
+
|
|
72
|
+
### 5.1 Episodic memory
|
|
73
|
+
Raw events the agent saw: a user message, a tool call's output, an observation. High volume, mostly write-once, mostly low information density.
|
|
74
|
+
|
|
75
|
+
**Storage:** vstash. One document per event, tagged with `layer="episodic"` and a stable session/conversation ID. Recall via vstash's hybrid search.
|
|
76
|
+
|
|
77
|
+
**merken's role:** decide whether the event is worth writing at all (`should_remember`), and provide convenience wrappers that resolve the right collection / project / tags from agent context.
|
|
78
|
+
|
|
79
|
+
### 5.2 Semantic memory
|
|
80
|
+
Consolidated facts derived from many episodic events: "the user prefers Spanish", "project X uses PostgreSQL", "the last deploy was 2026-04-05". Lower volume, higher information density, higher reuse.
|
|
81
|
+
|
|
82
|
+
**Storage:** also vstash, but with `layer="semantic"` and a different collection. The same retrieval pipeline, the same MMR dedup, the same RRF — merken does not reinvent any of it.
|
|
83
|
+
|
|
84
|
+
**merken's role:** the **consolidation pipeline**. Periodically (or on-demand) pull recent episodic chunks, ask an LLM to extract atomic facts, deduplicate against existing semantic memory, and write the survivors. This is the part that takes vstash from "search engine" to "agent that remembers."
|
|
85
|
+
|
|
86
|
+
### 5.3 Procedural memory
|
|
87
|
+
How the agent learned to do things: a successful sequence of tool calls, a recovery pattern after an error, a prompt that worked well for a class of question.
|
|
88
|
+
|
|
89
|
+
**Storage:** vstash with `layer="procedural"`. Same shape, different tag.
|
|
90
|
+
|
|
91
|
+
**merken's role:** capture-on-success and recall-on-similar-task. This is the layer most agent frameworks ignore; we treat it as first-class.
|
|
92
|
+
|
|
93
|
+
These three layers share one substrate (vstash) and one query path (vstash's hybrid search). What differentiates them is:
|
|
94
|
+
- the *write policy* (when does merken decide to add a row to this layer?),
|
|
95
|
+
- the *recall policy* (when does merken pull from this layer into the agent's prompt?),
|
|
96
|
+
- and the *consolidation policy* (how does this layer feed the next?).
|
|
97
|
+
|
|
98
|
+
The agent loop calls merken for those policies. merken calls vstash for the actual storage and retrieval.
|
|
99
|
+
|
|
100
|
+
---
|
|
101
|
+
|
|
102
|
+
## 6. Architecture stance
|
|
103
|
+
|
|
104
|
+
```
|
|
105
|
+
┌──────────────────────────────────────────────────────┐
|
|
106
|
+
│ agent loop (your code, Claude Code, LangGraph, MCP) │
|
|
107
|
+
└──────────────────┬───────────────────────────────────┘
|
|
108
|
+
│ remember(event), recall(query), …
|
|
109
|
+
▼
|
|
110
|
+
┌──────────────────────────────────────────────────────┐
|
|
111
|
+
│ merken │
|
|
112
|
+
│ ├─ decision policies │
|
|
113
|
+
│ │ should_remember / should_recall / │
|
|
114
|
+
│ │ should_consolidate / should_forget │
|
|
115
|
+
│ ├─ consolidation pipeline (episodic → semantic) │
|
|
116
|
+
│ ├─ inspection log (every decision, audit-grade) │
|
|
117
|
+
│ └─ Python SDK + (later) MCP server + CLI │
|
|
118
|
+
└──────────────────┬───────────────────────────────────┘
|
|
119
|
+
│ vstash.Memory.add / .search / .remember
|
|
120
|
+
▼
|
|
121
|
+
┌──────────────────────────────────────────────────────┐
|
|
122
|
+
│ vstash (substrate — glass box) │
|
|
123
|
+
│ sqlite-vec + FTS5 + adaptive RRF + MMR dedup │
|
|
124
|
+
│ metrics, limits, integrity, explicit contracts │
|
|
125
|
+
└──────────────────────────────────────────────────────┘
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
**The boundary is sacred.** merken does not import from `vstash._private`. It does not read SQLite tables directly. It does not bypass vstash's validation. If the boundary is in the way, the answer is to move the boundary in vstash, not to drill through it from above.
|
|
129
|
+
|
|
130
|
+
This is the whole reason we did the substrate-strengthening quartet (#132–#135) in vstash first: the boundary is now strong enough to support a serious consumer.
|
|
131
|
+
|
|
132
|
+
---
|
|
133
|
+
|
|
134
|
+
## 7. Public surface principle
|
|
135
|
+
|
|
136
|
+
merken should be embarrassingly small at the top level. The smell test:
|
|
137
|
+
|
|
138
|
+
```python
|
|
139
|
+
from merken import Memory
|
|
140
|
+
|
|
141
|
+
mem = Memory(project="my_agent")
|
|
142
|
+
|
|
143
|
+
# write path
|
|
144
|
+
mem.remember("the user said: …", layer="episodic")
|
|
145
|
+
fact_id = mem.consolidate(window="last_24h") # episodic → semantic
|
|
146
|
+
|
|
147
|
+
# read path
|
|
148
|
+
context = mem.recall("what did the user say about deployment?")
|
|
149
|
+
procedural = mem.recall_skill("recover from a failed migration")
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
If this snippet grows past one screen, we are losing.
|
|
153
|
+
|
|
154
|
+
Underneath, every method is a thin wrapper around a decision policy + a vstash call. The policies are pluggable but the *defaults* must be good enough that no one has to plug.
|
|
155
|
+
|
|
156
|
+
---
|
|
157
|
+
|
|
158
|
+
## 8. What we will NOT build (anti-roadmap)
|
|
159
|
+
|
|
160
|
+
Listed explicitly so we don't drift:
|
|
161
|
+
|
|
162
|
+
- A new vector database, even a small one, even "just for caching."
|
|
163
|
+
- An LLM provider abstraction layer with N adapters.
|
|
164
|
+
- A YAML / TOML / JSON DSL that compiles into a graph.
|
|
165
|
+
- Multi-agent orchestration, agent communication protocols, agent marketplaces.
|
|
166
|
+
- Automatic hyperparameter tuning of the decision policies.
|
|
167
|
+
- A web UI as the primary surface (CLI + Python SDK come first; a UI can come later as a separate optional package).
|
|
168
|
+
- "Memory as a service" with hosted state.
|
|
169
|
+
- A "smart router" that picks an LLM per query.
|
|
170
|
+
- Hooks into proprietary IDEs as a primary surface (MCP server, yes — vendor-specific plugins, no).
|
|
171
|
+
|
|
172
|
+
If something on this list ever lands in merken, it should be because we wrote down why and the document survived a re-read three weeks later.
|
|
173
|
+
|
|
174
|
+
---
|
|
175
|
+
|
|
176
|
+
## 9. Empirical bar
|
|
177
|
+
|
|
178
|
+
Every claim about memory *quality* (not throughput, not ergonomics) needs a benchmark in `experiments/`. The bar is the same as vstash's:
|
|
179
|
+
|
|
180
|
+
- The benchmark uses a real dataset, not a synthetic toy.
|
|
181
|
+
- It runs against vstash + merken and against a baseline (no-merken, just vstash; or a competing framework if comparable).
|
|
182
|
+
- Results are reported with confidence intervals, not point estimates.
|
|
183
|
+
- The dataset is small enough that the benchmark runs in under five minutes on a laptop.
|
|
184
|
+
|
|
185
|
+
Releases that change a default policy must cite the benchmark that justified the change. "I think it's better" does not ship.
|
|
186
|
+
|
|
187
|
+
---
|
|
188
|
+
|
|
189
|
+
## 10. Open questions (decide before / during day 1)
|
|
190
|
+
|
|
191
|
+
These are the calls that block the first commit. None of them are deeply technical; all of them are easier to get right at the start than to undo later.
|
|
192
|
+
|
|
193
|
+
1. **Name.** `merken` is a working title. Alternatives floated: `mneme`, `anamnesis`, `vstash-mind`, `recallable`. Decide before the repo gets a remote.
|
|
194
|
+
2. **License.** vstash is presumably MIT/Apache — match it unless there's a reason not to.
|
|
195
|
+
3. **Repo location.** Sibling of vstash on GitHub (`stffns/merken`) vs. monorepo. Sibling is the default; only consider monorepo if there's a clear coupling reason.
|
|
196
|
+
4. **Python version floor.** Match vstash (3.10+) unless the new project uses syntax that needs higher.
|
|
197
|
+
5. **Default LLM backend.** Local-first means we ship an Ollama default and document Cerebras / OpenAI as opt-in. Same model resolution as vstash's `[inference]` block.
|
|
198
|
+
6. **Decision-policy format.** Plain Python functions vs. small Pydantic models that wrap rules. Lean toward plain Python functions; only escalate to models if we hit a real reuse problem.
|
|
199
|
+
7. **Inspection log storage.** Reuse vstash (a `layer="audit"` collection)? A separate SQLite file? An append-only JSONL file? Pick one and move on; switching later is cheap because the surface is one method.
|
|
200
|
+
8. **MCP server: now or later?** Later. Get the SDK right first.
|
|
201
|
+
|
|
202
|
+
---
|
|
203
|
+
|
|
204
|
+
## 11. Day 1 plan
|
|
205
|
+
|
|
206
|
+
Concrete first moves when the new session opens here. None of this commits to architecture beyond what the constitution already allows.
|
|
207
|
+
|
|
208
|
+
1. **Read this constitution.** Disagreements get edits to this file before any code.
|
|
209
|
+
2. **Decide the open questions in §10**, at least name and license and repo location.
|
|
210
|
+
3. **Bootstrap the repo:**
|
|
211
|
+
- `pyproject.toml` with `vstash >= 0.25.0`, `pydantic >= 2`, `pytest`, `ruff`.
|
|
212
|
+
- `merken/__init__.py` with `__version__ = "0.1.0"` and `from .memory import Memory`.
|
|
213
|
+
- `merken/memory.py` with a `Memory` class that wraps `vstash.Memory` and exposes a single method: `remember(text: str, layer: str = "episodic")`. That's it. No consolidation yet, no decision policies. Just the wiring.
|
|
214
|
+
- `tests/test_smoke.py` that ingests one event, recalls it, asserts it comes back.
|
|
215
|
+
- `CLAUDE.md` that points at this file and at vstash's CLAUDE.md.
|
|
216
|
+
4. **Run the smoke test.** If it passes, we have a project. If not, we have a list.
|
|
217
|
+
5. **Commit on `develop`** (mirror vstash's branching: `feature/*` → `develop` → `main` via release PR).
|
|
218
|
+
6. **Stop.** Resist the urge to also write the consolidation pipeline today. The first day is for the floor and the door; everything else comes after.
|
|
219
|
+
|
|
220
|
+
---
|
|
221
|
+
|
|
222
|
+
## 12. Resume context (load this in the next session)
|
|
223
|
+
|
|
224
|
+
A short block that a fresh Claude session can read to understand where things stand. Update it as the project evolves.
|
|
225
|
+
|
|
226
|
+
> **Project:** merken — agent-loop layer on top of vstash.
|
|
227
|
+
> **Status:** constitution drafted 2026-04-07, no code yet.
|
|
228
|
+
> **Substrate:** vstash 0.25.0 on PyPI, with observability (#132), explicit limits (#133), integrity / `vstash check` (#134), and schema versioning + API stability docs (#135). The substrate-strengthening quartet is done; the substrate is ready for an external consumer.
|
|
229
|
+
> **Why now:** the user wanted to keep vstash narrow as a glass-box retrieval engine and put the agent-loop opinions in their own project.
|
|
230
|
+
> **First task:** read `CONSTITUTION.md` end to end, decide §10 open questions, then execute §11 day 1 plan.
|
|
231
|
+
> **What NOT to do:** rebuild retrieval, add a framework layer, or extend the constitution before reading it.
|
|
232
|
+
|
|
233
|
+
---
|
|
234
|
+
|
|
235
|
+
*Last updated: 2026-04-07. Edits to this document are normal; edits without a paragraph explaining the change in the PR description are not.*
|
merken-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Jayson Steffens
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|