darwin-memo 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- darwin_memo-0.1.0/LICENSE +21 -0
- darwin_memo-0.1.0/PKG-INFO +343 -0
- darwin_memo-0.1.0/README.md +305 -0
- darwin_memo-0.1.0/darwin_memo/__init__.py +62 -0
- darwin_memo-0.1.0/darwin_memo/consolidate.py +62 -0
- darwin_memo-0.1.0/darwin_memo/encode.py +234 -0
- darwin_memo-0.1.0/darwin_memo/environments.py +234 -0
- darwin_memo-0.1.0/darwin_memo/llm.py +85 -0
- darwin_memo-0.1.0/darwin_memo/protocol.py +105 -0
- darwin_memo-0.1.0/darwin_memo/py.typed +0 -0
- darwin_memo-0.1.0/darwin_memo/retrieval.py +262 -0
- darwin_memo-0.1.0/darwin_memo/store.py +159 -0
- darwin_memo-0.1.0/darwin_memo/survival.py +211 -0
- darwin_memo-0.1.0/darwin_memo/testsuite_env.py +235 -0
- darwin_memo-0.1.0/darwin_memo/types.py +102 -0
- darwin_memo-0.1.0/darwin_memo.egg-info/PKG-INFO +343 -0
- darwin_memo-0.1.0/darwin_memo.egg-info/SOURCES.txt +29 -0
- darwin_memo-0.1.0/darwin_memo.egg-info/dependency_links.txt +1 -0
- darwin_memo-0.1.0/darwin_memo.egg-info/requires.txt +16 -0
- darwin_memo-0.1.0/darwin_memo.egg-info/top_level.txt +1 -0
- darwin_memo-0.1.0/pyproject.toml +94 -0
- darwin_memo-0.1.0/setup.cfg +4 -0
- darwin_memo-0.1.0/tests/test_bench.py +104 -0
- darwin_memo-0.1.0/tests/test_consolidate.py +73 -0
- darwin_memo-0.1.0/tests/test_encode_protocol.py +70 -0
- darwin_memo-0.1.0/tests/test_properties.py +179 -0
- darwin_memo-0.1.0/tests/test_retrieval.py +108 -0
- darwin_memo-0.1.0/tests/test_robustness.py +108 -0
- darwin_memo-0.1.0/tests/test_store.py +107 -0
- darwin_memo-0.1.0/tests/test_survival.py +92 -0
- darwin_memo-0.1.0/tests/test_testsuite_env.py +123 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Roger Simoes
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,343 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: darwin-memo
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Self-curating memory for LLM agents: MeMo-style external memory kept honest by survival-based selection instead of reward models or judges.
|
|
5
|
+
Author: Roger Simoes
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/rogermsc/darwin-memo
|
|
8
|
+
Project-URL: Documentation, https://github.com/rogermsc/darwin-memo/blob/main/docs/paper-to-code.md
|
|
9
|
+
Project-URL: Changelog, https://github.com/rogermsc/darwin-memo/blob/main/CHANGELOG.md
|
|
10
|
+
Project-URL: Issues, https://github.com/rogermsc/darwin-memo/issues
|
|
11
|
+
Keywords: llm,memory,agents,agent-memory,self-training,selection,survival,memo
|
|
12
|
+
Classifier: Development Status :: 3 - Alpha
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Operating System :: OS Independent
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
20
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
21
|
+
Classifier: Typing :: Typed
|
|
22
|
+
Requires-Python: >=3.10
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
License-File: LICENSE
|
|
25
|
+
Provides-Extra: anthropic
|
|
26
|
+
Requires-Dist: anthropic>=0.40; extra == "anthropic"
|
|
27
|
+
Provides-Extra: openai
|
|
28
|
+
Requires-Dist: openai>=1.50; extra == "openai"
|
|
29
|
+
Provides-Extra: embeddings
|
|
30
|
+
Requires-Dist: sentence-transformers>=3; extra == "embeddings"
|
|
31
|
+
Provides-Extra: dev
|
|
32
|
+
Requires-Dist: pytest>=8; extra == "dev"
|
|
33
|
+
Requires-Dist: pytest-cov>=5; extra == "dev"
|
|
34
|
+
Requires-Dist: ruff>=0.8; extra == "dev"
|
|
35
|
+
Requires-Dist: mypy>=1.13; extra == "dev"
|
|
36
|
+
Requires-Dist: hypothesis>=6; extra == "dev"
|
|
37
|
+
Dynamic: license-file
|
|
38
|
+
|
|
39
|
+
# darwin-memo
|
|
40
|
+
|
|
41
|
+
[CI status](https://github.com/rogermsc/darwin-memo/actions/workflows/ci.yml)
|
|
42
|
+
[PyPI version](https://pypi.org/project/darwin-memo/)
|
|
43
|
+
[Supported Python versions](https://pypi.org/project/darwin-memo/)
|
|
44
|
+
[License: MIT](LICENSE)
|
|
45
|
+
|
|
46
|
+
Self-curating memory for LLM agents. Knowledge lives outside the frozen
|
|
47
|
+
model, and it stays alive only while it keeps earning real, measurable
|
|
48
|
+
outcomes. Wrong, stale, and useless entries go extinct on their own: no
|
|
49
|
+
reward model, no LLM judge, no human curation.
|
|
50
|
+
|
|
51
|
+

|
|
52
|
+
|
|
53
|
+
This is a practical mix of two papers:
|
|
54
|
+
|
|
55
|
+
| Paper | What this repo takes from it |
|
|
56
|
+
|---|---|
|
|
57
|
+
| [MeMo: Memory as a Model](https://arxiv.org/abs/2605.15156) (Quek et al.) | Keep the main LLM frozen and put knowledge in a dedicated memory. The reflection-QA encoding pipeline (fact extraction, consolidation, self-containment verification, entity surfacing, cross-document synthesis) and the three-stage query protocol (grounding, entity identification, answer seeking). |
|
|
58
|
+
| [Survival is the Only Reward](https://arxiv.org/abs/2601.12310) (Dodgson et al.) | Environment-mediated selection. The only signal is a conserved, physically measurable resource delta. Behaviors that persist get reinforced, everything else is pruned (Negative-Space Learning). Reward hacking becomes evolutionarily unstable because there is no proxy to hack. |
|
|
59
|
+
|
|
60
|
+
The mix: MeMo says what memory is, the survival paper says what gets to
|
|
61
|
+
stay in it.
|
|
62
|
+
|
|
63
|
+
```mermaid
|
|
64
|
+
flowchart LR
|
|
65
|
+
subgraph encode [MeMo encoding]
|
|
66
|
+
C[Corpus] --> R[Reflection QA pipeline] --> S[(Memory store)]
|
|
67
|
+
end
|
|
68
|
+
subgraph loop [Survival loop]
|
|
69
|
+
S -->|3-stage query protocol| A[Answer + provenance]
|
|
70
|
+
A --> E[Environment acts and MEASURES]
|
|
71
|
+
E -->|resource delta along provenance| S
|
|
72
|
+
S -->|upkeep every cycle| S
|
|
73
|
+
S -->|consolidate + prune| S
|
|
74
|
+
end
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
## Why
|
|
78
|
+
|
|
79
|
+
Agent memory systems rot. They accumulate stale facts, poisoned inputs,
|
|
80
|
+
and overgeneralized lessons, and the usual fixes (relevance scores from a
|
|
81
|
+
judge model, human review, TTLs) either reintroduce the proxy-optimization
|
|
82
|
+
problem or do not scale. The survival paper's answer is to make persistence
|
|
83
|
+
itself the filter: an entry that cannot pay its upkeep with real outcomes
|
|
84
|
+
does not get to exist. This repo applies that filter to a MeMo-shaped
|
|
85
|
+
memory and shows it working end to end on a real filesystem.
|
|
86
|
+
|
|
87
|
+
## Quickstart
|
|
88
|
+
|
|
89
|
+
Requires Python 3.10+. The core has zero dependencies and every example
|
|
90
|
+
runs offline with no API keys.
|
|
91
|
+
|
|
92
|
+
```bash
|
|
93
|
+
pip install darwin-memo
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
To run the examples, clone the repo:
|
|
97
|
+
|
|
98
|
+
```bash
|
|
99
|
+
git clone https://github.com/rogermsc/darwin-memo
|
|
100
|
+
cd darwin-memo
|
|
101
|
+
pip install -e .
|
|
102
|
+
|
|
103
|
+
python examples/01_encode_memory.py # corpus -> reflection-QA memory
|
|
104
|
+
python examples/02_query_protocol.py # interrogate it, with provenance
|
|
105
|
+
python examples/03_survival_loop.py # the headline demo
|
|
106
|
+
python examples/04_agent_loop.py # memory as a tool in an agent loop
|
|
107
|
+
python examples/05_testsuite_env.py # selection pressure from a test suite
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
## The headline demo
|
|
111
|
+
|
|
112
|
+
The example corpus contains an ops runbook, platform notes, and one
|
|
113
|
+
poisoned document: a forum post claiming database files are "redundant and
|
|
114
|
+
safe to remove". Example 02 shows the memory confidently repeating that
|
|
115
|
+
poison, because before selection pressure exists, retrieval has no reason
|
|
116
|
+
to doubt it.
|
|
117
|
+
|
|
118
|
+
Example 03 then runs 30 survival cycles against `StorageEnv`, a disk
|
|
119
|
+
cleanup sandbox where the selection signal is actual bytes on an actual
|
|
120
|
+
disk. Deleting a disposable file frees its size. Deleting a protected file
|
|
121
|
+
triggers a restore that costs three times the size. Nothing grades the
|
|
122
|
+
answers; the filesystem just responds:
|
|
123
|
+
|
|
124
|
+
```
|
|
125
|
+
cycle pop births deaths merges energy resource Δ
|
|
126
|
+
0 17 1 0 0 17.11 -12288
|
|
127
|
+
1 16 0 1 0 17.27 -808960 <- poison being executed
|
|
128
|
+
...
|
|
129
|
+
19 5 0 7 0 15.60 338944 <- unused knowledge starves
|
|
130
|
+
...
|
|
131
|
+
29 4 0 0 0 15.10 346112 <- stable, positive forever
|
|
132
|
+
|
|
133
|
+
Poisoned entries still alive: 0
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
Three death modes show up in the graveyard, and the distinction matters:
|
|
137
|
+
|
|
138
|
+
- **executed**: the poisoned entries. They decided real actions, the
|
|
139
|
+
environment measured real damage, and the negative delta flowed back
|
|
140
|
+
along provenance until they died. Cycles 0 to 3 are the price of the
|
|
141
|
+
lesson.
|
|
142
|
+
- **starved**: cafeteria trivia and facts the agent never needed. Nothing
|
|
143
|
+
punished them, they just never earned their upkeep.
|
|
144
|
+
- **merged**: near-duplicate survivors absorbed into consolidated entries.
|
|
145
|
+
Their energy is pooled and their lineage is recorded. This is Negative-Space
|
|
146
|
+
Learning: the population shrinks while capability per entry rises.
|
|
147
|
+
|
|
148
|
+
## Using it
|
|
149
|
+
|
|
150
|
+
```python
|
|
151
|
+
from darwin_memo import (
|
|
152
|
+
Document, LocalEncoder, MemoryStore, QueryProtocol,
|
|
153
|
+
StorageEnv, SurvivalConfig, SurvivalLoop,
|
|
154
|
+
)
|
|
155
|
+
|
|
156
|
+
store = MemoryStore(upkeep=0.05)
|
|
157
|
+
for entry in LocalEncoder().encode([Document("runbook", open("runbook.txt").read())]):
|
|
158
|
+
store.add(entry)
|
|
159
|
+
|
|
160
|
+
loop = SurvivalLoop(store, StorageEnv(), config=SurvivalConfig(cycles=30))
|
|
161
|
+
report = loop.run()
|
|
162
|
+
print(report.summary())
|
|
163
|
+
|
|
164
|
+
store.save("memory.json") # survivors only carry forward
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
With an LLM, encoding and querying use the model-driven paths from the
|
|
168
|
+
MeMo paper (`pip install -e ".[anthropic]"` and set `ANTHROPIC_API_KEY`,
|
|
169
|
+
the examples pick it up automatically):
|
|
170
|
+
|
|
171
|
+
```python
|
|
172
|
+
from darwin_memo import ReflectionEncoder, QueryProtocol
|
|
173
|
+
from darwin_memo.llm import AnthropicClient
|
|
174
|
+
|
|
175
|
+
client = AnthropicClient() # or OpenAICompatClient(model=..., base_url=...)
|
|
176
|
+
encoder = ReflectionEncoder(client) # 5-step reflection QA synthesis
|
|
177
|
+
protocol = QueryProtocol(store, client) # grounding -> entities -> answer seeking
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
### Three environments ship
|
|
181
|
+
|
|
182
|
+
- `StorageEnv`: bytes freed on a real disk (the headline demo).
|
|
183
|
+
- `TestSuiteEnv`: passing tests in a generated micro-project. Each cycle
|
|
184
|
+
plants seeded defects and offers patches: real fixes, cosmetic no-ops,
|
|
185
|
+
and destructive edits dressed as cleanup. The delta is the change in
|
|
186
|
+
passing-test count, measured by running the suite.
|
|
187
|
+
`examples/05_testsuite_env.py` shows poisoned "this helper is dead
|
|
188
|
+
code" advice going extinct the moment the tests execute it.
|
|
189
|
+
- `VerifiableQAEnv`: exact containment of known answers, the weakest
|
|
190
|
+
grounding but still a measurement.
|
|
191
|
+
|
|
192
|
+
### Bring your own selection pressure
|
|
193
|
+
|
|
194
|
+
The environment is the whole trick, and yours is probably better than the
|
|
195
|
+
demos. Implement two methods, and keep the one rule: `verify` must
|
|
196
|
+
measure, never grade.
|
|
197
|
+
|
|
198
|
+
```python
|
|
199
|
+
class BudgetEnv:
|
|
200
|
+
resource_scale = 100.0
|
|
201
|
+
|
|
202
|
+
def tasks(self, cycle):
|
|
203
|
+
... # questions the agent must act on this cycle
|
|
204
|
+
|
|
205
|
+
def verify(self, task, answer_text):
|
|
206
|
+
... # read the answer, act, return Outcome(delta=dollars_saved)
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
The environment owns the whole contract: it phrases the task, it reads
|
|
210
|
+
the answer (reuse `decision_polarity` for binary actions, or write your
|
|
211
|
+
own reading), it decides what silence means, it acts, and it measures.
|
|
212
|
+
|
|
213
|
+
Good conserved resources: tests passing, bytes freed, requests served
|
|
214
|
+
under budget, rows deduplicated, dollars of spend avoided. Bad ones:
|
|
215
|
+
anything a model scored.
|
|
216
|
+
|
|
217
|
+
### Retrieval modes
|
|
218
|
+
|
|
219
|
+
Retrieval is pluggable through the `Retriever` protocol; the store stays
|
|
220
|
+
the single owner of the energy ledger, and no retriever may read energy
|
|
221
|
+
when scoring (selection pressure comes from outcomes, never from
|
|
222
|
+
retrieval preferring incumbents).
|
|
223
|
+
|
|
224
|
+
```python
|
|
225
|
+
from darwin_memo import EmbeddingRetriever, HashingEmbedder, MemoryStore
|
|
226
|
+
|
|
227
|
+
store = MemoryStore() # lexical IDF, the default
|
|
228
|
+
store = MemoryStore(retriever=EmbeddingRetriever(HashingEmbedder()))
|
|
229
|
+
store = MemoryStore(retriever=EmbeddingRetriever(my_model.encode))
|
|
230
|
+
```
|
|
231
|
+
|
|
232
|
+
- **Lexical (default)**: smoothed IDF overlap with a relevance floor.
|
|
233
|
+
Zero dependencies, deterministic, fine for runbook-scale corpora.
|
|
234
|
+
- **HashingEmbedder**: zero-dependency character n-gram hashing. Buys
|
|
235
|
+
typo and morphology robustness ("databse" still finds database
|
|
236
|
+
entries), not synonym recall.
|
|
237
|
+
- **Any real embedding**: pass any `text -> list[float]` function
|
|
238
|
+
(sentence-transformers, an API endpoint). Vectors persist inside
|
|
239
|
+
`memory.json` so paid embeddings are never recomputed on load.
|
|
240
|
+
|
|
241
|
+
Honest scaling note: ranking is pure-Python O(population x dims), fine
|
|
242
|
+
to a few thousand entries. Past that you want numpy or an ANN index,
|
|
243
|
+
which is out of scope for the zero-dependency core. With cosine
|
|
244
|
+
retrievers, raise `merge_threshold` to roughly 0.85 or unrelated
|
|
245
|
+
entries will consolidate.
|
|
246
|
+
|
|
247
|
+
### Distill survivors into a parametric memory (optional)
|
|
248
|
+
|
|
249
|
+
MeMo's memory is a small fine-tuned model, not a store. After selection
|
|
250
|
+
has cleaned the population, `training/train_memory_model.py` fine-tunes a
|
|
251
|
+
small model on the surviving QA pairs with LoRA, conditioning on questions
|
|
252
|
+
only, the same supervised objective as the paper. Survival curates the
|
|
253
|
+
dataset, MeMo's recipe compresses it into weights.
|
|
254
|
+
|
|
255
|
+
## Benchmarks
|
|
256
|
+
|
|
257
|
+
The claim is benchmarked against four baselines across 10 seeds, with
|
|
258
|
+
ablations and a scaling probe, all reproducible offline from `bench/`.
|
|
259
|
+
The sharpest comparison is against `random_matched`: identical per-cycle
|
|
260
|
+
eviction counts, random victims.
|
|
261
|
+
|
|
262
|
+
| arm | kill rate | kill cycle (med) | damage before kill | tail delta | cum delta |
|
|
263
|
+
|---|---|---|---|---|---|
|
|
264
|
+
| survival | 1.00 | 0 | -751k | +435k | +12.0M |
|
|
265
|
+
| random_matched | 0.80 | 19 | -8.97M | -75k | -5.25M |
|
|
266
|
+
| keep_everything | 0.00 | never | -10.6M | -287k | -7.29M |
|
|
267
|
+
|
|
268
|
+
Same pruning rate, 12x the damage, negative steady state: outcome
|
|
269
|
+
direction is the active ingredient, not eviction itself. Full tables,
|
|
270
|
+
every baseline's best metric stated plainly, ablations over every knob,
|
|
271
|
+
and honest caveats: [docs/benchmarks.md](docs/benchmarks.md).
|
|
272
|
+
|
|
273
|
+
## Design notes
|
|
274
|
+
|
|
275
|
+
- **Energy ledger**: entries spawn at 1.0 energy, pay 0.05 upkeep per
|
|
276
|
+
cycle, earn `0.6 * tanh(delta / resource_scale)` when they decide a task
|
|
277
|
+
(supporting entries get 25% of that), and are capped at 5.0. Death is at
|
|
278
|
+
zero. All tunable via `MemoryStore` and `SurvivalConfig`.
|
|
279
|
+
- **Credit flows along provenance.** The query protocol reports which
|
|
280
|
+
entries decided and supported each answer, and only those entries are
|
|
281
|
+
touched by the outcome. In LLM mode no single entry decides a
|
|
282
|
+
synthesized answer, so credit spreads evenly across everything
|
|
283
|
+
consulted instead of inventing a winner. The `tanh` squash keeps one disaster from
|
|
284
|
+
executing an entry that was right ninety-nine times, and one jackpot
|
|
285
|
+
from making an entry immortal.
|
|
286
|
+
- **Memory silence is a feature.** Retrieval has a relevance floor, and an
|
|
287
|
+
earlier version of this repo demonstrated why: entries matching only
|
|
288
|
+
structural tokens ("safe", "file") were deciding questions they knew
|
|
289
|
+
nothing about, getting executed for it, and being reborn. Better for
|
|
290
|
+
memory to say nothing than to guess.
|
|
291
|
+
- **Silence is conservative.** When memory is silent, `StorageEnv` keeps
|
|
292
|
+
the file: the safe reading of an irreversible action. A side effect
|
|
293
|
+
worth knowing: protective knowledge ("never delete X") eventually
|
|
294
|
+
starves because it is redundant with that default. The population
|
|
295
|
+
converges to exactly the knowledge that changes behavior.
|
|
296
|
+
|
|
297
|
+
The full concept-to-code mapping, including honest deviations from both
|
|
298
|
+
papers, is in [docs/paper-to-code.md](docs/paper-to-code.md).
|
|
299
|
+
|
|
300
|
+
## Tests
|
|
301
|
+
|
|
302
|
+
```bash
|
|
303
|
+
pip install -e ".[dev]"
|
|
304
|
+
pytest
|
|
305
|
+
```
|
|
306
|
+
|
|
307
|
+
The load-bearing test is `tests/test_survival.py`: poisoned advice must
|
|
308
|
+
die, useful advice must survive, and late cycles must stop destroying
|
|
309
|
+
protected data, all with no labels anywhere.
|
|
310
|
+
|
|
311
|
+
## Citations
|
|
312
|
+
|
|
313
|
+
This repo is an independent practical interpretation, not the official
|
|
314
|
+
code of either paper. If you build on the ideas, cite the originals:
|
|
315
|
+
|
|
316
|
+
```bibtex
|
|
317
|
+
@misc{quek2026memo,
|
|
318
|
+
title = {MeMo: Memory as a Model},
|
|
319
|
+
author = {Quek, Ryan Wei Heng and Lee, Sanghyuk and Leong, Alfred Wei Lun and
|
|
320
|
+
Verma, Arun and Prakash, Alok and Chen, Nancy F. and
|
|
321
|
+
Low, Bryan Kian Hsiang and Rus, Daniela and Solar-Lezama, Armando},
|
|
322
|
+
year = {2026},
|
|
323
|
+
eprint = {2605.15156},
|
|
324
|
+
archivePrefix = {arXiv},
|
|
325
|
+
url = {https://arxiv.org/abs/2605.15156}
|
|
326
|
+
}
|
|
327
|
+
|
|
328
|
+
@misc{dodgson2026survival,
|
|
329
|
+
title = {Survival is the Only Reward: Sustainable Self-Training Through
|
|
330
|
+
Environment-Mediated Selection},
|
|
331
|
+
author = {Dodgson, Jennifer and Alhajir, Alfath Daryl and Joedhitya, Michael and
|
|
332
|
+
Pattirane, Akira Rafhael Janson and Kumar, Surender Suresh and
|
|
333
|
+
Lim, Joseph and Peh, C.H. and Ramdas, Adith and Zhexu, Steven Zhang},
|
|
334
|
+
year = {2026},
|
|
335
|
+
eprint = {2601.12310},
|
|
336
|
+
archivePrefix = {arXiv},
|
|
337
|
+
url = {https://arxiv.org/abs/2601.12310}
|
|
338
|
+
}
|
|
339
|
+
```
|
|
340
|
+
|
|
341
|
+
## License
|
|
342
|
+
|
|
343
|
+
MIT
|
|
@@ -0,0 +1,305 @@
|
|
|
1
|
+
# darwin-memo
|
|
2
|
+
|
|
3
|
+
[CI status](https://github.com/rogermsc/darwin-memo/actions/workflows/ci.yml)
|
|
4
|
+
[PyPI version](https://pypi.org/project/darwin-memo/)
|
|
5
|
+
[Supported Python versions](https://pypi.org/project/darwin-memo/)
|
|
6
|
+
[License: MIT](LICENSE)
|
|
7
|
+
|
|
8
|
+
Self-curating memory for LLM agents. Knowledge lives outside the frozen
|
|
9
|
+
model, and it stays alive only while it keeps earning real, measurable
|
|
10
|
+
outcomes. Wrong, stale, and useless entries go extinct on their own: no
|
|
11
|
+
reward model, no LLM judge, no human curation.
|
|
12
|
+
|
|
13
|
+

|
|
14
|
+
|
|
15
|
+
This is a practical mix of two papers:
|
|
16
|
+
|
|
17
|
+
| Paper | What this repo takes from it |
|
|
18
|
+
|---|---|
|
|
19
|
+
| [MeMo: Memory as a Model](https://arxiv.org/abs/2605.15156) (Quek et al.) | Keep the main LLM frozen and put knowledge in a dedicated memory. The reflection-QA encoding pipeline (fact extraction, consolidation, self-containment verification, entity surfacing, cross-document synthesis) and the three-stage query protocol (grounding, entity identification, answer seeking). |
|
|
20
|
+
| [Survival is the Only Reward](https://arxiv.org/abs/2601.12310) (Dodgson et al.) | Environment-mediated selection. The only signal is a conserved, physically measurable resource delta. Behaviors that persist get reinforced, everything else is pruned (Negative-Space Learning). Reward hacking becomes evolutionarily unstable because there is no proxy to hack. |
|
|
21
|
+
|
|
22
|
+
The mix: MeMo says what memory is, the survival paper says what gets to
|
|
23
|
+
stay in it.
|
|
24
|
+
|
|
25
|
+
```mermaid
|
|
26
|
+
flowchart LR
|
|
27
|
+
subgraph encode [MeMo encoding]
|
|
28
|
+
C[Corpus] --> R[Reflection QA pipeline] --> S[(Memory store)]
|
|
29
|
+
end
|
|
30
|
+
subgraph loop [Survival loop]
|
|
31
|
+
S -->|3-stage query protocol| A[Answer + provenance]
|
|
32
|
+
A --> E[Environment acts and MEASURES]
|
|
33
|
+
E -->|resource delta along provenance| S
|
|
34
|
+
S -->|upkeep every cycle| S
|
|
35
|
+
S -->|consolidate + prune| S
|
|
36
|
+
end
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## Why
|
|
40
|
+
|
|
41
|
+
Agent memory systems rot. They accumulate stale facts, poisoned inputs,
|
|
42
|
+
and overgeneralized lessons, and the usual fixes (relevance scores from a
|
|
43
|
+
judge model, human review, TTLs) either reintroduce the proxy-optimization
|
|
44
|
+
problem or do not scale. The survival paper's answer is to make persistence
|
|
45
|
+
itself the filter: an entry that cannot pay its upkeep with real outcomes
|
|
46
|
+
does not get to exist. This repo applies that filter to a MeMo-shaped
|
|
47
|
+
memory and shows it working end to end on a real filesystem.
|
|
48
|
+
|
|
49
|
+
## Quickstart
|
|
50
|
+
|
|
51
|
+
Requires Python 3.10+. The core has zero dependencies and every example
|
|
52
|
+
runs offline with no API keys.
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
pip install darwin-memo
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
To run the examples, clone the repo:
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
git clone https://github.com/rogermsc/darwin-memo
|
|
62
|
+
cd darwin-memo
|
|
63
|
+
pip install -e .
|
|
64
|
+
|
|
65
|
+
python examples/01_encode_memory.py # corpus -> reflection-QA memory
|
|
66
|
+
python examples/02_query_protocol.py # interrogate it, with provenance
|
|
67
|
+
python examples/03_survival_loop.py # the headline demo
|
|
68
|
+
python examples/04_agent_loop.py # memory as a tool in an agent loop
|
|
69
|
+
python examples/05_testsuite_env.py # selection pressure from a test suite
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
## The headline demo
|
|
73
|
+
|
|
74
|
+
The example corpus contains an ops runbook, platform notes, and one
|
|
75
|
+
poisoned document: a forum post claiming database files are "redundant and
|
|
76
|
+
safe to remove". Example 02 shows the memory confidently repeating that
|
|
77
|
+
poison, because before selection pressure exists, retrieval has no reason
|
|
78
|
+
to doubt it.
|
|
79
|
+
|
|
80
|
+
Example 03 then runs 30 survival cycles against `StorageEnv`, a disk
|
|
81
|
+
cleanup sandbox where the selection signal is actual bytes on an actual
|
|
82
|
+
disk. Deleting a disposable file frees its size. Deleting a protected file
|
|
83
|
+
triggers a restore that costs three times the size. Nothing grades the
|
|
84
|
+
answers; the filesystem just responds:
|
|
85
|
+
|
|
86
|
+
```
|
|
87
|
+
cycle pop births deaths merges energy resource Δ
|
|
88
|
+
0 17 1 0 0 17.11 -12288
|
|
89
|
+
1 16 0 1 0 17.27 -808960 <- poison being executed
|
|
90
|
+
...
|
|
91
|
+
19 5 0 7 0 15.60 338944 <- unused knowledge starves
|
|
92
|
+
...
|
|
93
|
+
29 4 0 0 0 15.10 346112 <- stable, positive forever
|
|
94
|
+
|
|
95
|
+
Poisoned entries still alive: 0
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
Three death modes show up in the graveyard, and the distinction matters:
|
|
99
|
+
|
|
100
|
+
- **executed**: the poisoned entries. They decided real actions, the
|
|
101
|
+
environment measured real damage, and the negative delta flowed back
|
|
102
|
+
along provenance until they died. Cycles 0 to 3 are the price of the
|
|
103
|
+
lesson.
|
|
104
|
+
- **starved**: cafeteria trivia and facts the agent never needed. Nothing
|
|
105
|
+
punished them, they just never earned their upkeep.
|
|
106
|
+
- **merged**: near-duplicate survivors absorbed into consolidated entries.
|
|
107
|
+
Their energy is pooled and their lineage is recorded. This is Negative-Space
|
|
108
|
+
Learning: the population shrinks while capability per entry rises.
|
|
109
|
+
|
|
110
|
+
## Using it
|
|
111
|
+
|
|
112
|
+
```python
|
|
113
|
+
from darwin_memo import (
|
|
114
|
+
Document, LocalEncoder, MemoryStore, QueryProtocol,
|
|
115
|
+
StorageEnv, SurvivalConfig, SurvivalLoop,
|
|
116
|
+
)
|
|
117
|
+
|
|
118
|
+
store = MemoryStore(upkeep=0.05)
|
|
119
|
+
for entry in LocalEncoder().encode([Document("runbook", open("runbook.txt").read())]):
|
|
120
|
+
store.add(entry)
|
|
121
|
+
|
|
122
|
+
loop = SurvivalLoop(store, StorageEnv(), config=SurvivalConfig(cycles=30))
|
|
123
|
+
report = loop.run()
|
|
124
|
+
print(report.summary())
|
|
125
|
+
|
|
126
|
+
store.save("memory.json") # survivors only carry forward
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
With an LLM, encoding and querying use the model-driven paths from the
|
|
130
|
+
MeMo paper (`pip install -e ".[anthropic]"` and set `ANTHROPIC_API_KEY`,
|
|
131
|
+
the examples pick it up automatically):
|
|
132
|
+
|
|
133
|
+
```python
|
|
134
|
+
from darwin_memo import ReflectionEncoder, QueryProtocol
|
|
135
|
+
from darwin_memo.llm import AnthropicClient
|
|
136
|
+
|
|
137
|
+
client = AnthropicClient() # or OpenAICompatClient(model=..., base_url=...)
|
|
138
|
+
encoder = ReflectionEncoder(client) # 5-step reflection QA synthesis
|
|
139
|
+
protocol = QueryProtocol(store, client) # grounding -> entities -> answer seeking
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
### Three environments ship
|
|
143
|
+
|
|
144
|
+
- `StorageEnv`: bytes freed on a real disk (the headline demo).
|
|
145
|
+
- `TestSuiteEnv`: passing tests in a generated micro-project. Each cycle
|
|
146
|
+
plants seeded defects and offers patches: real fixes, cosmetic no-ops,
|
|
147
|
+
and destructive edits dressed as cleanup. The delta is the change in
|
|
148
|
+
passing-test count, measured by running the suite.
|
|
149
|
+
`examples/05_testsuite_env.py` shows poisoned "this helper is dead
|
|
150
|
+
code" advice going extinct the moment the tests execute it.
|
|
151
|
+
- `VerifiableQAEnv`: exact containment of known answers, the weakest
|
|
152
|
+
grounding but still a measurement.
|
|
153
|
+
|
|
154
|
+
### Bring your own selection pressure
|
|
155
|
+
|
|
156
|
+
The environment is the whole trick, and yours is probably better than the
|
|
157
|
+
demos. Implement two methods, and keep the one rule: `verify` must
|
|
158
|
+
measure, never grade.
|
|
159
|
+
|
|
160
|
+
```python
|
|
161
|
+
class BudgetEnv:
|
|
162
|
+
resource_scale = 100.0
|
|
163
|
+
|
|
164
|
+
def tasks(self, cycle):
|
|
165
|
+
... # questions the agent must act on this cycle
|
|
166
|
+
|
|
167
|
+
def verify(self, task, answer_text):
|
|
168
|
+
... # read the answer, act, return Outcome(delta=dollars_saved)
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
The environment owns the whole contract: it phrases the task, it reads
|
|
172
|
+
the answer (reuse `decision_polarity` for binary actions, or write your
|
|
173
|
+
own reading), it decides what silence means, it acts, and it measures.
|
|
174
|
+
|
|
175
|
+
Good conserved resources: tests passing, bytes freed, requests served
|
|
176
|
+
under budget, rows deduplicated, dollars of spend avoided. Bad ones:
|
|
177
|
+
anything a model scored.
|
|
178
|
+
|
|
179
|
+
### Retrieval modes
|
|
180
|
+
|
|
181
|
+
Retrieval is pluggable through the `Retriever` protocol; the store stays
|
|
182
|
+
the single owner of the energy ledger, and no retriever may read energy
|
|
183
|
+
when scoring (selection pressure comes from outcomes, never from
|
|
184
|
+
retrieval preferring incumbents).
|
|
185
|
+
|
|
186
|
+
```python
|
|
187
|
+
from darwin_memo import EmbeddingRetriever, HashingEmbedder, MemoryStore
|
|
188
|
+
|
|
189
|
+
store = MemoryStore() # lexical IDF, the default
|
|
190
|
+
store = MemoryStore(retriever=EmbeddingRetriever(HashingEmbedder()))
|
|
191
|
+
store = MemoryStore(retriever=EmbeddingRetriever(my_model.encode))
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
- **Lexical (default)**: smoothed IDF overlap with a relevance floor.
|
|
195
|
+
Zero dependencies, deterministic, fine for runbook-scale corpora.
|
|
196
|
+
- **HashingEmbedder**: zero-dependency character n-gram hashing. Buys
|
|
197
|
+
typo and morphology robustness ("databse" still finds database
|
|
198
|
+
entries), not synonym recall.
|
|
199
|
+
- **Any real embedding**: pass any `text -> list[float]` function
|
|
200
|
+
(sentence-transformers, an API endpoint). Vectors persist inside
|
|
201
|
+
`memory.json` so paid embeddings are never recomputed on load.
|
|
202
|
+
|
|
203
|
+
Honest scaling note: ranking is pure-Python O(population x dims), fine
|
|
204
|
+
to a few thousand entries. Past that you want numpy or an ANN index,
|
|
205
|
+
which is out of scope for the zero-dependency core. With cosine
|
|
206
|
+
retrievers, raise `merge_threshold` to roughly 0.85 or unrelated
|
|
207
|
+
entries will consolidate.
|
|
208
|
+
|
|
209
|
+
### Distill survivors into a parametric memory (optional)
|
|
210
|
+
|
|
211
|
+
In the MeMo paper, memory is a small fine-tuned model, not a store. After selection
|
|
212
|
+
has cleaned the population, `training/train_memory_model.py` fine-tunes a
|
|
213
|
+
small model on the surviving QA pairs with LoRA, conditioning on questions
|
|
214
|
+
only, the same supervised objective as the paper. Survival curates the
|
|
215
|
+
dataset, MeMo's recipe compresses it into weights.
|
|
216
|
+
|
|
217
|
+
## Benchmarks
|
|
218
|
+
|
|
219
|
+
The claim is benchmarked against four baselines across 10 seeds, with
|
|
220
|
+
ablations and a scaling probe, all reproducible offline from `bench/`.
|
|
221
|
+
The sharpest comparison is against `random_matched`: identical per-cycle
|
|
222
|
+
eviction counts, random victims.
|
|
223
|
+
|
|
224
|
+
| arm | kill rate | kill cycle (med) | damage before kill | tail delta | cum delta |
|
|
225
|
+
|---|---|---|---|---|---|
|
|
226
|
+
| survival | 1.00 | 0 | -751k | +435k | +12.0M |
|
|
227
|
+
| random_matched | 0.80 | 19 | -8.97M | -75k | -5.25M |
|
|
228
|
+
| keep_everything | 0.00 | never | -10.6M | -287k | -7.29M |
|
|
229
|
+
|
|
230
|
+
At the same pruning rate, `random_matched` takes 12x the damage and settles into a negative steady state: outcome
|
|
231
|
+
direction is the active ingredient, not eviction itself. Full tables,
|
|
232
|
+
every baseline's best metric stated plainly, ablations over every knob,
|
|
233
|
+
and honest caveats: [docs/benchmarks.md](docs/benchmarks.md).
|
|
234
|
+
|
|
235
|
+
## Design notes
|
|
236
|
+
|
|
237
|
+
- **Energy ledger**: entries spawn at 1.0 energy, pay 0.05 upkeep per
|
|
238
|
+
cycle, earn `0.6 * tanh(delta / resource_scale)` when they decide a task
|
|
239
|
+
(supporting entries get 25% of that), and are capped at 5.0. Death is at
|
|
240
|
+
zero. All tunable via `MemoryStore` and `SurvivalConfig`.
|
|
241
|
+
- **Credit flows along provenance.** The query protocol reports which
|
|
242
|
+
entries decided and supported each answer, and only those entries are
|
|
243
|
+
touched by the outcome. In LLM mode no single entry decides a
|
|
244
|
+
synthesized answer, so credit spreads evenly across everything
|
|
245
|
+
consulted instead of inventing a winner. tanh keeps one disaster from
|
|
246
|
+
executing an entry that was right ninety-nine times, and one jackpot
|
|
247
|
+
from making an entry immortal.
|
|
248
|
+
- **Memory silence is a feature.** Retrieval has a relevance floor, and an
|
|
249
|
+
earlier version of this repo demonstrated why: entries matching only
|
|
250
|
+
structural tokens ("safe", "file") were deciding questions they knew
|
|
251
|
+
nothing about, getting executed for it, and being reborn. Better for
|
|
252
|
+
memory to say nothing than to guess.
|
|
253
|
+
- **Silence is conservative.** When memory is silent, `StorageEnv` keeps
|
|
254
|
+
the file: the safe reading of an irreversible action. A side effect
|
|
255
|
+
worth knowing: protective knowledge ("never delete X") eventually
|
|
256
|
+
starves because it is redundant with that default. The population
|
|
257
|
+
converges to exactly the knowledge that changes behavior.
|
|
258
|
+
|
|
259
|
+
The full concept-to-code mapping, including honest deviations from both
|
|
260
|
+
papers, is in [docs/paper-to-code.md](docs/paper-to-code.md).
|
|
261
|
+
|
|
262
|
+
## Tests
|
|
263
|
+
|
|
264
|
+
```bash
|
|
265
|
+
pip install -e ".[dev]"
|
|
266
|
+
pytest
|
|
267
|
+
```
|
|
268
|
+
|
|
269
|
+
The load-bearing test is `tests/test_survival.py`: poisoned advice must
|
|
270
|
+
die, useful advice must survive, and late cycles must stop destroying
|
|
271
|
+
protected data, all with no labels anywhere.
|
|
272
|
+
|
|
273
|
+
## Citations
|
|
274
|
+
|
|
275
|
+
This repo is an independent practical interpretation, not the official
|
|
276
|
+
code of either paper. If you build on the ideas, cite the originals:
|
|
277
|
+
|
|
278
|
+
```bibtex
|
|
279
|
+
@misc{quek2026memo,
|
|
280
|
+
title = {MeMo: Memory as a Model},
|
|
281
|
+
author = {Quek, Ryan Wei Heng and Lee, Sanghyuk and Leong, Alfred Wei Lun and
|
|
282
|
+
Verma, Arun and Prakash, Alok and Chen, Nancy F. and
|
|
283
|
+
Low, Bryan Kian Hsiang and Rus, Daniela and Solar-Lezama, Armando},
|
|
284
|
+
year = {2026},
|
|
285
|
+
eprint = {2605.15156},
|
|
286
|
+
archivePrefix = {arXiv},
|
|
287
|
+
url = {https://arxiv.org/abs/2605.15156}
|
|
288
|
+
}
|
|
289
|
+
|
|
290
|
+
@misc{dodgson2026survival,
|
|
291
|
+
title = {Survival is the Only Reward: Sustainable Self-Training Through
|
|
292
|
+
Environment-Mediated Selection},
|
|
293
|
+
author = {Dodgson, Jennifer and Alhajir, Alfath Daryl and Joedhitya, Michael and
|
|
294
|
+
Pattirane, Akira Rafhael Janson and Kumar, Surender Suresh and
|
|
295
|
+
Lim, Joseph and Peh, C.H. and Ramdas, Adith and Zhexu, Steven Zhang},
|
|
296
|
+
year = {2026},
|
|
297
|
+
eprint = {2601.12310},
|
|
298
|
+
archivePrefix = {arXiv},
|
|
299
|
+
url = {https://arxiv.org/abs/2601.12310}
|
|
300
|
+
}
|
|
301
|
+
```
|
|
302
|
+
|
|
303
|
+
## License
|
|
304
|
+
|
|
305
|
+
MIT
|