nested-learning 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. nested_learning/__init__.py +12 -0
  2. nested_learning/__main__.py +12 -0
  3. nested_learning/assoc_memory.py +23 -0
  4. nested_learning/backbones.py +147 -0
  5. nested_learning/capabilities.py +104 -0
  6. nested_learning/cli.py +253 -0
  7. nested_learning/cms.py +92 -0
  8. nested_learning/config_utils.py +50 -0
  9. nested_learning/configs/ablations/cms_sparse.yaml +46 -0
  10. nested_learning/configs/ablations/selfmod_chunked_8_64.yaml +24 -0
  11. nested_learning/configs/ablations/selfmod_momentum_off.yaml +23 -0
  12. nested_learning/configs/ablations/selfmod_momentum_on.yaml +23 -0
  13. nested_learning/configs/ablations/selfmod_no_alpha.yaml +23 -0
  14. nested_learning/configs/ablations/selfmod_no_cms.yaml +23 -0
  15. nested_learning/configs/ablations/selfmod_rank1_precond_off.yaml +23 -0
  16. nested_learning/configs/data/continual_segments_sample.yaml +9 -0
  17. nested_learning/configs/data/fineweb_edu_longdoc_filtered_sample.yaml +14 -0
  18. nested_learning/configs/data/fineweb_edu_mixture_full.yaml +14 -0
  19. nested_learning/configs/data/fineweb_edu_mixture_sample.yaml +14 -0
  20. nested_learning/configs/data/refinedweb_mixture.yaml +48 -0
  21. nested_learning/configs/data/refinedweb_mixture_filtered.yaml +48 -0
  22. nested_learning/configs/data/refinedweb_mixture_full.yaml +48 -0
  23. nested_learning/configs/data/refinedweb_mixture_sample.yaml +51 -0
  24. nested_learning/configs/deepspeed/zero3.json +25 -0
  25. nested_learning/configs/hope/mid.yaml +118 -0
  26. nested_learning/configs/hope/mid_fsdp.yaml +47 -0
  27. nested_learning/configs/hope/pilot.yaml +2 -0
  28. nested_learning/configs/hope/pilot_attention.yaml +9 -0
  29. nested_learning/configs/hope/pilot_selfmod.yaml +20 -0
  30. nested_learning/configs/hope/pilot_transformer.yaml +9 -0
  31. nested_learning/configs/hope/target.yaml +145 -0
  32. nested_learning/configs/hope/target_fsdp.yaml +47 -0
  33. nested_learning/configs/mid_smoke.yaml +99 -0
  34. nested_learning/configs/mid_stage2.yaml +110 -0
  35. nested_learning/configs/mid_stage2_smoke.yaml +102 -0
  36. nested_learning/configs/mid_titan_baseline.yaml +92 -0
  37. nested_learning/configs/pilot.yaml +127 -0
  38. nested_learning/configs/pilot_paper_faithful.yaml +42 -0
  39. nested_learning/configs/pilot_selfmod_paper_faithful.yaml +18 -0
  40. nested_learning/configs/pilot_smoke.yaml +80 -0
  41. nested_learning/configs/resolved/cms_sparse_eval.yaml +105 -0
  42. nested_learning/configs/resolved/phase2_pilot_attention_eval.yaml +49 -0
  43. nested_learning/configs/resolved/phase2_pilot_transformer_eval.yaml +49 -0
  44. nested_learning/continual_classification.py +136 -0
  45. nested_learning/continual_streaming.py +283 -0
  46. nested_learning/data.py +153 -0
  47. nested_learning/device.py +21 -0
  48. nested_learning/eval_state.py +72 -0
  49. nested_learning/fast_state.py +108 -0
  50. nested_learning/functional.py +69 -0
  51. nested_learning/hope/__init__.py +0 -0
  52. nested_learning/hope/block.py +1973 -0
  53. nested_learning/hope/self_mod.py +40 -0
  54. nested_learning/instrumentation.py +38 -0
  55. nested_learning/levels.py +94 -0
  56. nested_learning/logging_utils.py +64 -0
  57. nested_learning/memorize.py +382 -0
  58. nested_learning/model.py +604 -0
  59. nested_learning/optim/__init__.py +0 -0
  60. nested_learning/optim/deep.py +102 -0
  61. nested_learning/optim/factory.py +13 -0
  62. nested_learning/optim/m3.py +121 -0
  63. nested_learning/optim/manager.py +151 -0
  64. nested_learning/titan/__init__.py +0 -0
  65. nested_learning/titan/memory.py +88 -0
  66. nested_learning/titan/model.py +412 -0
  67. nested_learning/titan/self_modifying.py +724 -0
  68. nested_learning/tokenizer.py +28 -0
  69. nested_learning/tokenizer_coverage.py +77 -0
  70. nested_learning/training.py +1600 -0
  71. nested_learning/transformer.py +104 -0
  72. nested_learning-0.2.0.dist-info/METADATA +390 -0
  73. nested_learning-0.2.0.dist-info/RECORD +76 -0
  74. nested_learning-0.2.0.dist-info/WHEEL +4 -0
  75. nested_learning-0.2.0.dist-info/entry_points.txt +2 -0
  76. nested_learning-0.2.0.dist-info/licenses/LICENSE +201 -0
@@ -0,0 +1,104 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+
5
+ import torch
6
+ from torch import nn
7
+
8
+ from .backbones import AttentionConfig, SelfAttention
9
+ from .fast_state import AttentionKVCache
10
+
11
+
12
+ @dataclass
13
+ class TransformerBlockConfig:
14
+ dim: int
15
+ heads: int
16
+ mlp_hidden_multiplier: int = 4
17
+ activation: str = "gelu"
18
+ qk_l2_norm: bool = False
19
+ local_conv_window: int | None = None
20
+
21
+
22
class FeedForward(nn.Module):
    """Pre-norm residual MLP: ``x + W2(act(W1(LayerNorm(x))))``.

    Both linear projections are bias-free and the hidden width is
    ``dim * hidden_multiplier``.  ``activation`` picks the non-linearity:
    ``"relu"`` and ``"silu"`` map to the matching torch modules, and every
    other string selects GELU.
    """

    def __init__(
        self,
        dim: int,
        *,
        hidden_multiplier: int = 4,
        activation: str = "gelu",
    ) -> None:
        super().__init__()
        # Lookup table in place of an if/elif chain; unknown names use GELU.
        activation_classes: dict[str, type[nn.Module]] = {
            "relu": nn.ReLU,
            "silu": nn.SiLU,
        }
        nonlinearity = activation_classes.get(activation, nn.GELU)()
        inner_dim = dim * hidden_multiplier
        # Attribute names and creation order are kept as-is so state_dict keys
        # and parameter initialisation order do not change.
        self.norm = nn.LayerNorm(dim)
        self.net = nn.Sequential(
            nn.Linear(dim, inner_dim, bias=False),
            nonlinearity,
            nn.Linear(inner_dim, dim, bias=False),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:  # type: ignore[override]
        """Return ``x`` plus the MLP applied to the layer-normalised ``x``."""
        return x + self.net(self.norm(x))
49
+
50
+
51
class TransformerBlock(nn.Module):
    """
    Plain Transformer baseline layer: self-attention followed by a feed-forward MLP.

    No TITAN/CMS learning updates happen here; the block is the standard
    Transformer side of the Phase 2 comparison against HOPE-Attention.  The
    learning-related keyword arguments of ``forward`` and the ``set_*`` methods
    are accepted and ignored (presumably to keep the interface aligned with the
    HOPE blocks -- confirm against the callers).
    """

    def __init__(self, config: TransformerBlockConfig) -> None:
        """Build the attention and MLP sub-layers described by ``config``."""
        super().__init__()
        self.config = config
        attention_config = AttentionConfig(
            dim=config.dim,
            heads=config.heads,
            qk_l2_norm=config.qk_l2_norm,
            local_conv_window=config.local_conv_window,
        )
        self.attn = SelfAttention(attention_config)
        self.mlp = FeedForward(
            config.dim,
            hidden_multiplier=config.mlp_hidden_multiplier,
            activation=config.activation,
        )

    def forward(
        self,
        x: torch.Tensor,
        *,
        teach_signal: torch.Tensor | None = None,
        surprise_value: float | None = None,
        fast_state=None,
        finalize_updates: bool = True,
        attention_cache: AttentionKVCache | None = None,
        return_attention_cache: bool = False,
        differentiable_updates: bool = False,
    ) -> torch.Tensor | tuple[torch.Tensor, AttentionKVCache]:
        """Run attention then the MLP on ``x``.

        ``attention_cache`` is passed to the attention layer as ``kv_cache``.
        When ``return_attention_cache`` is true the result is the pair
        ``(output, next_cache)``; otherwise only the output tensor is returned.
        All remaining keyword arguments are unused by this baseline.
        """
        # These arguments are unused by the baseline; drop them explicitly.
        del teach_signal, surprise_value, fast_state, finalize_updates, differentiable_updates

        if not return_attention_cache:
            attended = self.attn(x, kv_cache=attention_cache)
            return self.mlp(attended)

        attended, updated_cache = self.attn(
            x,
            kv_cache=attention_cache,
            return_kv_cache=True,
        )
        return self.mlp(attended), updated_cache

    def set_surprise_threshold(self, threshold: float | None) -> None:
        """Accept and ignore a surprise threshold (no-op for the baseline)."""
        del threshold

    def set_surprise_metric(self, metric: str) -> None:
        """Accept and ignore a surprise metric name (no-op for the baseline)."""
        del metric

    def set_allowed_levels(self, allowed) -> None:
        """Accept and ignore the allowed update levels (no-op for the baseline)."""
        del allowed
@@ -0,0 +1,390 @@
1
+ Metadata-Version: 2.4
2
+ Name: nested-learning
3
+ Version: 0.2.0
4
+ Summary: Reproduction of Google's Nested Learning (HOPE) architecture
5
+ Author-email: Nested Learning Team <nested-learning@example.com>
6
+ License: Apache-2.0
7
+ License-File: LICENSE
8
+ Requires-Python: >=3.10
9
+ Requires-Dist: datasets<3.0,>=2.19
10
+ Requires-Dist: einops>=0.7.0
11
+ Requires-Dist: huggingface-hub<1.0,>=0.23
12
+ Requires-Dist: hydra-core>=1.3.2
13
+ Requires-Dist: langdetect>=1.0.9
14
+ Requires-Dist: numpy>=1.26
15
+ Requires-Dist: omegaconf>=2.3.0
16
+ Requires-Dist: pyyaml>=6.0
17
+ Requires-Dist: sentencepiece>=0.2.0
18
+ Requires-Dist: torch<3,>=2.9
19
+ Requires-Dist: tqdm>=4.66
20
+ Requires-Dist: typer>=0.12
21
+ Requires-Dist: typing-extensions>=4.9
22
+ Requires-Dist: zstandard>=0.22.0
23
+ Provides-Extra: dev
24
+ Requires-Dist: mypy>=1.11; extra == 'dev'
25
+ Requires-Dist: pytest-cov>=4.1; extra == 'dev'
26
+ Requires-Dist: pytest>=7.4; extra == 'dev'
27
+ Requires-Dist: ruff>=0.6.8; extra == 'dev'
28
+ Requires-Dist: types-pyyaml; extra == 'dev'
29
+ Provides-Extra: gpu
30
+ Requires-Dist: torchaudio<3,>=2.9; extra == 'gpu'
31
+ Requires-Dist: torchvision<1,>=0.24; extra == 'gpu'
32
+ Provides-Extra: logging
33
+ Requires-Dist: wandb>=0.18.0; extra == 'logging'
34
+ Provides-Extra: viz
35
+ Requires-Dist: matplotlib>=3.8; extra == 'viz'
36
+ Description-Content-Type: text/markdown
37
+
38
+ # Nested Learning Reproduction
39
+
40
+ ![CI](https://github.com/kmccleary3301/nested_learning/actions/workflows/ci.yml/badge.svg)
41
+ ![Security](https://github.com/kmccleary3301/nested_learning/actions/workflows/security.yml/badge.svg)
42
+ ![Python](https://img.shields.io/badge/python-3.10%20to%203.12-blue)
43
+ ![PyTorch](https://img.shields.io/badge/pytorch-2.9.0-red)
44
+ ![License](https://img.shields.io/badge/license-Apache--2.0-green)
45
+ ![Status](https://img.shields.io/badge/tests-smoke--ready-lightgrey)
46
+
47
+ Mechanism-level reproduction of Google's Nested Learning (HOPE) architecture (HOPE blocks, CMS, and Self‑Modifying TITANs), matching the quality bar set by lucidrains' TITAN reference while remaining fully open-source and `uv` managed.
48
+
49
+ Faithfulness scope (high level):
50
+ - ✅ HOPE / CMS / Self‑Modifying Titans update rules + wiring (mechanism-level)
51
+ - ✅ Tensor-level invariants covered by unit tests (teach-signal, δℓ, CMS chunking, causality)
52
+ - ✅ Boundary-target online chunking + optional attention-cache carry path are implemented
53
+ - ⚠️ Stable default uses stop-grad online writes; an experimental single-process boundary-state mode supports differentiable write paths
54
+ - ⚠️ Multi‑GPU mechanism-auditing online updates are not supported in this repo (DDP disables some features)
55
+
56
+ Paper reference pin:
57
+ - Source: `google_papers/Nested_Learning_Full_Paper/Nested_Learning_Full_Paper.md`
58
+ - SHA-256: `7524af0724ac8e3bad9163bf0e79c85b490a26bc30b92d96b0bdf17a27f9febc`
59
+
60
+ ## Quickstart
61
+ ```bash
62
+ uv python install 3.12
63
+ uv sync --all-extras
64
+ uv run nl doctor --json > logs/runtime_doctor.json
65
+ uv run bash scripts/data/run_sample.sh
66
+ uv run nl smoke --config-name pilot_smoke --device cpu
67
+ uv run bash scripts/run_smoke.sh pilot # CPU-friendly HOPE block smoke test
68
+ uv run bash scripts/run_e2e_smoke.sh # sync + sample data + smoke train + zeroshot eval
69
+ uv run bash scripts/run_mechanism_audit_smoke.sh
70
+ uv run python scripts/eval/zeroshot.py \
71
+ --config configs/hope/pilot.yaml \
72
+ --checkpoint artifacts/examples/pilot_dummy.pt \
73
+ --tokenizer-path artifacts/tokenizer/refinedweb_mix/spm_32000_unigram.model \
74
+ --tasks piqa --max-samples 32 --device cpu
75
+ ```
76
+
77
+ ## Requirements
78
+ - Python 3.10-3.12
79
+ - PyTorch 2.9 or newer, below 3.0 (`torch>=2.9,<3`; the golden environment in this repo uses 2.9.x)
80
+ - `uv` (recommended for development) or `pip` for package-style usage
81
+
82
+ ## Compatibility
83
+ - Support tiers and OS/runtime matrix: `docs/COMPATIBILITY_MATRIX.md`
84
+ - Versioning/stability policy: `docs/VERSIONING_POLICY.md`
85
+ - Golden repro environment: Python 3.12 + `uv lock` + PyTorch 2.9.x
86
+
87
+ ## Installation (pip-first)
88
+ 1. Create and activate a virtual environment.
89
+ 2. Install Torch first (CPU/CUDA wheel selection is backend-specific).
90
+ 3. Install this project.
91
+
92
+ CPU example:
93
+ ```bash
94
+ python -m venv .venv
95
+ source .venv/bin/activate
96
+ python -m pip install --upgrade pip
97
+ python -m pip install "torch>=2.9,<3" --index-url https://download.pytorch.org/whl/cpu
98
+ python -m pip install -e .
99
+ ```
100
+
101
+ CUDA example (adjust index URL to your CUDA runtime):
102
+ ```bash
103
+ python -m venv .venv
104
+ source .venv/bin/activate
105
+ python -m pip install --upgrade pip
106
+ python -m pip install "torch>=2.9,<3" --index-url https://download.pytorch.org/whl/cu128
107
+ python -m pip install -e .
108
+ ```
109
+
110
+ ## Setup (uv dev workflow)
111
+ ```bash
112
+ uv python install 3.12
113
+ uv sync --all-extras
114
+ ```
115
+
116
+ Developer checks:
117
+ - `uv run ruff check .`
118
+ - `uv run mypy src`
119
+ - `uv run pytest`
120
+ - `uv run bash scripts/checks/run_fidelity_ci_subset.sh`
121
+ - `uv run python scripts/checks/compliance_report.py --config configs/pilot.yaml --output eval/compliance_report.json`
122
+
123
+ ## CLI
124
+ The package ships with `nl` for portable workflows across local/dev/prod environments.
125
+
126
+ ```bash
127
+ # runtime compatibility snapshot
128
+ uv run nl doctor --json
129
+
130
+ # architecture/config smoke on chosen device
131
+ uv run nl smoke --config-name pilot_smoke --device cpu --batch-size 1 --seq-len 8
132
+
133
+ # static fidelity checks for a config
134
+ uv run nl audit --config-name pilot_paper_faithful
135
+
136
+ # train with Hydra overrides
137
+ uv run nl train --config-name pilot --override train.device=cuda:1 --override train.steps=100
138
+ ```
139
+
140
+ `python -m nested_learning ...` is also supported.
141
+
142
+ ## First 30 Minutes
143
+ Use this path for a fast first success on CPU:
144
+
145
+ ```bash
146
+ uv sync --all-extras
147
+ uv run bash scripts/data/run_sample.sh
148
+ uv run bash scripts/run_smoke.sh pilot
149
+ uv run bash scripts/run_mechanism_audit_smoke.sh
150
+ ```
151
+
152
+ This confirms:
153
+ - data/tokenizer pipeline is operational,
154
+ - model/training loop runs end-to-end,
155
+ - cadence checks pass for a mechanism-auditing smoke run.
156
+
157
+ ## Data Pipeline
158
+ 1. **Tokenizer training**
159
+ ```bash
160
+ uv run python scripts/data/train_tokenizer.py \
161
+ --manifest configs/data/refinedweb_mixture.yaml \
162
+ --vocab-size 32000 \
163
+ --output-dir artifacts/tokenizer/refinedweb_mix \
164
+ --log-file data/mixtures/refinedweb_mix_tokenizer.json
165
+ ```
166
+ 2. **Corpus filtering + sharding**
167
+ ```bash
168
+ uv run python scripts/data/process_mixture.py \
169
+ configs/data/refinedweb_mixture_filtered.yaml \
170
+ --tokenizer-path artifacts/tokenizer/refinedweb_mix/spm_32000_unigram.model \
171
+ --log-file data/mixtures/refinedweb_mix_filtered_shards.json
172
+ ```
173
+ 3. **Sample pipeline** (downloads licensed datasets, filters, shards, records stats)
174
+ ```bash
175
+ uv run bash scripts/data/run_sample.sh
176
+ ```
177
+ 4. **Full pipeline** (set env vars like `RW_LIMIT`, `WIKI_LIMIT`, etc. to scale ingestion)
178
+ ```bash
179
+ uv run bash scripts/data/run_full.sh # default ~50k docs per corpus; increase limits as needed
180
+ ```
181
+
182
+ ### Data Troubleshooting
183
+ - If `scripts/data/run_sample.sh` cannot find `artifacts/tokenizer/refinedweb_mix/spm_32000_unigram.model`, rerun:
184
+ ```bash
185
+ uv run bash scripts/data/run_sample.sh
186
+ ```
187
+ The script auto-trains the tokenizer when missing.
188
+ - If `scripts/data/run_full.sh` fails with `Bad split: train. Available splits: ['test']`, use split fallback:
189
+ ```bash
190
+ FALLBACK_SPLIT=test uv run bash scripts/data/run_full.sh
191
+ ```
192
+ You can also override per-corpus splits (for example `RW_SPLIT=test`).
193
+
194
+ ## Training
195
+ - Single GPU / CPU:
196
+ ```bash
197
+ uv run nl train --config-name pilot_smoke
198
+ ```
199
+ - Apple Silicon (MPS, if available):
200
+ ```bash
201
+ uv run nl train --config-name pilot_smoke --override train.device=mps
202
+ ```
203
+ - Script-based entrypoint (legacy-compatible):
204
+ ```bash
205
+ uv run python train.py --config-name pilot_smoke
206
+ ```
207
+ - DDP (torchrun):
208
+ ```bash
209
+ torchrun --nproc_per_node=2 train_dist.py --config-name mid
210
+ ```
211
+ - CPU-only DDP smoke (verifies `gloo` backend and deterministic seeding):
212
+ ```bash
213
+ uv run bash scripts/run_cpu_ddp_smoke.sh
214
+ ```
215
+ - FSDP (see `docs/FSDP_SCALING_GUIDE.md` for VRAM/batch sizing):
216
+ ```bash
217
+ # 760M run
218
+ torchrun --nproc_per_node=2 train_fsdp.py --config-name hope/mid_fsdp
219
+ # 1.3B run
220
+ torchrun --nproc_per_node=2 train_fsdp.py --config-name hope/target_fsdp
221
+ ```
222
+ - DeepSpeed (requires `deepspeed` installed separately):
223
+ ```bash
224
+ deepspeed --num_gpus=2 train_deepspeed.py --config-name target \
225
+ deepspeed.config=configs/deepspeed/zero3.json
226
+ ```
227
+
228
+ ### Mechanism-auditing presets (HOPE / Nested Learning)
229
+
230
+ Use the mechanism-auditing preset configs (single GPU):
231
+
232
+ ```bash
233
+ uv run python train.py --config-name pilot_paper_faithful
234
+ # HOPE self-mod variant:
235
+ uv run python train.py --config-name pilot_selfmod_paper_faithful
236
+ ```
237
+
238
+ Notes:
239
+ - These presets set `data.batch_size=1` to avoid cross-sample fast-memory sharing.
240
+ - Online chunking supports one-token overlap **or** explicit boundary-target mode (`train.online_boundary_targets=true`).
241
+ - Optional attention-state carry across chunks is available in training via `train.online_carry_attention_cache=true`.
242
+ - The exact sequence/segment/chunk/buffer semantics are documented in `docs/STREAMING_CONTRACT.md`.
243
+
244
+ Overrides:
245
+ - `optim.type=m3` (paper optimizer option)
246
+ - `train.steps=...` / `train.device=...`
247
+
248
+ See `docs/PAPER_COMPLIANCE.md` for full fidelity notes.
249
+ See `docs/STREAMING_CONTRACT.md` for the precise streaming/update contract used by this repo.
250
+
251
+ ## Scope Boundaries (Current)
252
+ - This repo targets mechanism-auditing fidelity, not full paper-scale results parity.
253
+ - Boundary-state gradient-through-write exists as an experimental constrained path; it is not yet treated as production/full-scale paper reproduction.
254
+ - Distributed mechanism-auditing path for boundary-target + attention-cache carry is not implemented.
255
+
256
+ ### Pilot (3 B tokens) workflow
257
+ 1. Start a tmux session:
258
+ ```bash
259
+ tmux new -s pilot_train
260
+ ```
261
+ 2. Launch the long run on `cuda:1` (≈52 h wall clock):
262
+ ```bash
263
+ set -a && source git.env && set +a
264
+ export UV_CACHE_DIR=/tmp/uv-cache UV_LINK_MODE=copy
265
+ uv run python train.py --config-name pilot \
266
+ logging.enabled=true logging.backend=wandb \
267
+ logging.project=nested-learning logging.run_name=pilot-main-$(date +%Y%m%d%H%M%S) \
268
+ train.device=cuda:1
269
+ ```
270
+ 3. Checkpoints appear in `artifacts/checkpoints/pilot/step_*.pt` every 1 000 steps; the accompanying W&B run captures full telemetry.
271
+ 4. Copy the final checkpoint, config, logs, and eval JSON/CSV into `artifacts/pilot_release/` for distribution.
272
+
273
+ ## Logging
274
+ Set `logging.enabled=true` in Hydra configs (or override via CLI) to send metrics to W&B (default). For local JSON logs, use `logging.backend=json logging.path=logs/run.json`. Sample outputs reside in `logs/` and `artifacts/examples/`.
275
+
276
+ ## Evaluation
277
+ - Zero-shot:
278
+ ```bash
279
+ uv run python scripts/eval/zeroshot.py \
280
+ --config configs/hope/mid.yaml \
281
+ --checkpoint checkpoints/mid/step_000100.pt \
282
+ --tokenizer-path artifacts/tokenizer/refinedweb_mix/spm_32000_unigram.model \
283
+ --tasks all --max-samples 200 --device cuda:0
284
+ ```
285
+ Use `uv run python scripts/eval/zeroshot.py --list-tasks` to display the full benchmark roster (PIQA, HellaSwag, WinoGrande, ARC-E/C, BoolQ, SIQA, CommonsenseQA, OpenBookQA). See `docs/zeroshot_eval.md` for details.
286
+ - Needle-in-a-Haystack:
287
+ ```bash
288
+ uv run python scripts/eval/niah.py \
289
+ --config configs/hope/mid.yaml \
290
+ --checkpoint checkpoints/mid/step_000100.pt \
291
+ --tokenizer-path artifacts/tokenizer/refinedweb_mix/spm_32000_unigram.model \
292
+ --context-lengths 2048 4096 8192 --samples-per-length 20
293
+ ```
294
+ - Continual-learning forgetting:
295
+ ```bash
296
+ uv run python scripts/eval/continual.py \
297
+ --config configs/hope/mid.yaml \
298
+ --checkpoints checkpoints/mid/step_000050.pt checkpoints/mid/step_000100.pt \
299
+ --segments-yaml configs/data/continual_segments_sample.yaml \
300
+ --batch-size 4 --max-batches 10 --memorize --memorize-steps 2
301
+ ```
302
+ Plot forgetting curves via `uv run python scripts/eval/plot_forgetting.py --continual-json eval/continual_mid.json`.
303
+ - Long-context diagnostics:
304
+ ```bash
305
+ uv run python scripts/eval/passkey.py --config configs/hope/pilot.yaml --checkpoint artifacts/checkpoints/pilot/step_230000.pt \
306
+ --tokenizer-path artifacts/tokenizer/refinedweb_mix/spm_32000_unigram.model --samples 64 --memorize
307
+
308
+ uv run python scripts/eval/pg19_perplexity.py --config configs/hope/pilot.yaml --checkpoint artifacts/checkpoints/pilot/step_230000.pt \
309
+ --tokenizer-path artifacts/tokenizer/refinedweb_mix/spm_32000_unigram.model --max-samples 64
310
+ ```
311
+
312
+ Evaluation summaries are written to `eval/` alongside per-task JSON metrics.
313
+
314
+ ### Test-time memorization toggles
315
+ Every evaluator supports TITAN-style memorization so you can reproduce test-time adaptation:
316
+ ```bash
317
+ uv run python scripts/eval/zeroshot.py \
318
+ ... \
319
+ --memorize \
320
+ --memorize-steps 2 \
321
+ --memorize-use-correct-answer \
322
+ --memorize-paths titan,cms_fast \
323
+ --memorize-surprise-threshold 0.01 \
324
+ --memorize-no-reset # optional: retain updates across samples
325
+ ```
326
+ - `--memorize` turns on the learner with one LMS step per example by default.
327
+ - `--memorize-steps` controls the number of adaptation passes per prompt.
328
+ - `--memorize-use-correct-answer` injects ground-truth text during memorization for ablations.
329
+ - `--memorize-no-reset` carries memories across samples; omit it to reset every question.
330
+ - `--memorize-paths` restricts which levels receive teach-signal updates (`titan`, `cms_fast`, or `all`).
331
+ - `--memorize-surprise-threshold` gates updates on average teach-signal norm, matching the paper’s surprise trigger.
332
+
333
+ Memorization metrics (baseline vs adaptive) are emitted alongside task accuracy for easy comparisons.
334
+
335
+ ## Architecture variants
336
+ Select the paper-defined variant via `model.block_variant` in Hydra configs:
337
+ - `hope_attention` (paper HOPE-Attention): `Attention → CMS` (paper-defined).
338
+ - `hope_selfmod` (paper HOPE scaffold): `Self-modifying Titans (Eqs. 83–93; Eq. 91 residual MLP memories) → CMS` with (by default) **fixed q** and **local conv window=4**, plus chunked updates via `model.self_mod_chunk_size` (all memories other than M_memory) and `model.self_mod_chunk_size_memory` (M_memory). See `docs/PAPER_COMPLIANCE.md` for the “differentiable read / update-pass writes” semantics.
339
+ - `hope_hybrid` (legacy): `Attention + TitanMemory + CMS` (exploratory; not paper-defined).
340
+ - `transformer` (baseline): `Attention → MLP` (no TITAN/CMS learning updates; useful for Phase 2 comparisons).
341
+
342
+ Self-modifying Titans knobs (ablation-friendly, paper-aligned):
343
+ - `model.self_mod_objective` (`l2` vs `dot`), `model.self_mod_use_rank1_precond` (DGD-like preconditioner), `model.self_mod_use_alpha` (weight-decay/retention gate), `model.self_mod_stopgrad_vhat`, `model.self_mod_momentum`, `model.self_mod_adaptive_q`, `model.self_mod_local_conv_window`.
344
+
345
+ ## Fast state (Nested Learning semantics)
346
+ In-context updates can run against a per-context fast state so meta parameters never change:
347
+ - `HOPEModel.init_fast_state()` / `TitanOnlyModel.init_fast_state()` returns a `ModelFastState`.
348
+ - `MemorizeConfig.use_fast_state=true` (default) requires passing `fast_state` into `memorize_tokens()` / `memorize_sequence()`; evaluation scripts handle this automatically.
349
+ - Training can also run update passes against a per-batch fast state via `train.use_fast_state=true` (meta+delta fast state: meta params are learnable; online updates write deltas only). If `data.batch_size>1`, CMS/TITAN fast state is shared across the batch; use `data.batch_size=1` for strict per-context semantics. See `docs/PAPER_COMPLIANCE.md`.
350
+
351
+ ## Releases
352
+ Before tagging or announcing a new checkpoint, work through:
353
+ - `docs/release_checklist.md` (model/eval artifact release bundle)
354
+ - `docs/PACKAGE_RELEASE_CHECKLIST.md` (package/GitHub/PyPI release flow)
355
+ - `docs/PYPI_TRUSTED_PUBLISHING.md` (one-time OIDC setup for TestPyPI/PyPI)
356
+
357
+ For versioning semantics and breaking-change expectations, see `docs/VERSIONING_POLICY.md`.
358
+
359
+ For reproducibility bug reports, use `docs/BUG_REPORT_CHECKLIST.md`.
360
+
361
+ ## Performance & optimizer options
362
+ - **Mixed precision:** enable bf16 autocast via `train.mixed_precision.enabled=true train.mixed_precision.dtype=bf16` (already enabled in pilot/mid/target configs).
363
+ - **`torch.compile`:** accelerate attention/core loops by toggling `train.compile.enable=true train.compile.mode=max-autotune`; failure falls back to eager unless `train.compile.strict=true`.
364
+ - **Muon hybrid (default):** all HOPE configs now set `optim.type=muon`, routing ≥2D tensors through PyTorch 2.9's Muon optimizer while embeddings/norms stay on AdamW. Training logs emit `optim.muon_param_elems` / `optim.adamw_param_elems` so you can confirm the split.
365
+ - **Fused AdamW fallback:** override with `optim.type=adamw optim.fused=auto` if Muon is unavailable or if you want to compare against the AdamW ablation in `reports/ablations.md`.
366
+ - **Surprise gating:** set `model.surprise_threshold=<float>` to gate all inner updates. By default the surprise metric is the average L2 norm of the (scaled/clipped) teach signal (`model.surprise_metric=l2`); you can also use `loss` or `logit_entropy` for ablations. Evaluation CLIs expose `--memorize-surprise-threshold` for ad-hoc gating.
367
+
368
+ All Hydra knobs can be overridden from the CLI or composed via config groups (`configs/hope/*.yaml`). Use these flags in tandem with `scripts/run_e2e_smoke.sh` (automation) or `scripts/run_cpu_ddp_smoke.sh` (CPU-only determinism check) to validate releases quickly.
369
+
370
+ ## Documentation & References
371
+ - `docs/IMPLEMENTATION_STATUS.md` – current mechanism-level status matrix.
372
+ - `docs/PAPER_COMPLIANCE.md` – equation-to-code fidelity notes and explicit boundaries.
373
+ - `docs/STREAMING_CONTRACT.md` – exact sequence/segment/chunk/update semantics.
374
+ - `docs/release_checklist.md` – release readiness checklist.
375
+ - `docs/data_pipeline.md` – large-scale sharding/tokenizer workflow.
376
+ - `docs/scaling_guidance.md` – roadmap for expanding data + compute footprints.
377
+ - `docs/stage2_plan.md` – Stage 2 architecture + experiment roadmap.
378
+ - `docs/PHASE_2_PLAN.md` – detailed Phase 2 execution plan.
379
+ - `docs/PLAN_PROGRESS_P7.md` – progress tracker for the latest faithfulness remediation sprint.
380
+ - `docs/experiments_report.md` – draft paper covering completed experiments.
381
+ - `docs/future_directions.md` – prioritized roadmap after the initial release.
382
+ - `reports/stage2_smoke.md` – exact commands/artifacts for the release-ready smoke workflow.
383
+ - `docs/FSDP_SCALING_GUIDE.md` – dual-RTX 6000 Ada instructions for the mid/target FSDP configs.
384
+ - `google_papers/` – PDFs/markdown of Nested Learning & TITAN papers.
385
+ - `CHANGELOG.md` – user-facing changes per release.
386
+
387
+ ## Contributing
388
+ 1. Run lint and tests (`uv run ruff check .`, `uv run pytest`).
389
+ 2. Document new configs or scripts in the relevant docs under `docs/` and update `CHANGELOG.md`.
390
+ 3. Open a PR referencing the relevant NL/TITAN spec sections and tests.
@@ -0,0 +1,76 @@
1
+ nested_learning/__init__.py,sha256=mCyCGTd8lICAuNgITh_BtLVoVFjg8mTNjtuQRUxNtgE,379
2
+ nested_learning/__main__.py,sha256=lT_tvQZf-ZPcGVSjIm7_KL1WBYwkz4KkL7_WDcMud-Y,130
3
+ nested_learning/assoc_memory.py,sha256=nXbuSlH41J5PoENb9xCDAysdwqvW4l9x9bQ4gpAByqc,584
4
+ nested_learning/backbones.py,sha256=7bLMdbeEaodl_nTtkpQDb37TwAMqYGdOEUCXwdz6jZA,5651
5
+ nested_learning/capabilities.py,sha256=Yu0ojfpGq7uvAj2TfXTU03K2i767iL4L9m6R9Ak7ERI,3596
6
+ nested_learning/cli.py,sha256=61DK4Yb9B1dPBuIgxw_pZUd7z8N5R_DtVoLbbwewMkM,8294
7
+ nested_learning/cms.py,sha256=uP67svtGbjH-DW0d2HK48U14W0cnyBTo93-W4O6ZwIE,2725
8
+ nested_learning/config_utils.py,sha256=CNYWrXLavcnE19b0t6QSyznexVGaYFgiRFZrzeMKXBk,1542
9
+ nested_learning/continual_classification.py,sha256=8XQJw8jhNvxs-lM7F0fYcSk8CalRkSUj28QBwyie6GA,3692
10
+ nested_learning/continual_streaming.py,sha256=suIgKzBkDrYEKgToHHsklt8wIEaZQ7Zc9ad1R26QJnA,9837
11
+ nested_learning/data.py,sha256=zyTTBJW9JBahB-Zf6PYH5xbGe1VGJnBsABm85-vA9R0,5259
12
+ nested_learning/device.py,sha256=PynH0ptNapWoCCg9SXStajkSfR19Qk8HKHofw2PwKa4,742
13
+ nested_learning/eval_state.py,sha256=XHdRpY4ZlOOsScva5V9B6xK2HEM4JpxJhaK6E915_D0,2226
14
+ nested_learning/fast_state.py,sha256=ccbHWFiAJaPzjIhHC61EqCpgoHGLVml9vkYu3a9CDiY,3216
15
+ nested_learning/functional.py,sha256=VnissTh-E2IzGDEyy_p-4jbr6Uytgk8H7LYSRfykqto,1844
16
+ nested_learning/instrumentation.py,sha256=bMiJ_WDRZ5jtQfiXlhHoLXKjBHREH-6_RmLrSggn8XA,1176
17
+ nested_learning/levels.py,sha256=Cb3hpeuQlKK4PEvaPo5qSLc64iBCkpS8lsGGBiUy9S8,2966
18
+ nested_learning/logging_utils.py,sha256=WiEgc8Hp7I-g2pgS-V5Xaeq_qtNo_z5m5A55EdEwAFg,1961
19
+ nested_learning/memorize.py,sha256=mhNKrzOk_gvViumppi_U9UhxkW2BdoHeZDzbIw0y_-Q,15863
20
+ nested_learning/model.py,sha256=6xQHQcasBqtMk8LxGlm6BI8QOIW6JafHai9LvQc7wSo,25855
21
+ nested_learning/tokenizer.py,sha256=v4KZEXMMBirHFe1rbNU-_GbQo4tXDw5V015XNZlHeko,881
22
+ nested_learning/tokenizer_coverage.py,sha256=zFWAUFoaaugG_iTL43i23bDrgzuS8viwcZ8MVDrIK2U,2723
23
+ nested_learning/training.py,sha256=Afkmc4ku0XAG-THnbf-M3XPvTa5RdjKY1rPdc9NrJnk,63618
24
+ nested_learning/transformer.py,sha256=cQUQ-cxbTRWSDN1GV3nyzlAm6zeAekRHFVFzmkLRzCU,3047
25
+ nested_learning/hope/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
26
+ nested_learning/hope/block.py,sha256=9L--naAZJW5K9h5c_V3STlJQA5LAq4YpB5Orjl2D0y0,79373
27
+ nested_learning/hope/self_mod.py,sha256=0m6P3SvrWdXqG1Fm3BlRI98VJGyKifueHfbIIDzqHNU,1384
28
+ nested_learning/optim/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
29
+ nested_learning/optim/deep.py,sha256=xVz4qzTtv_RI5VpMxpc8ND-qfhVxigyLbmjMrGAqbXw,3507
30
+ nested_learning/optim/factory.py,sha256=933VZ_8aSUeg-QStqDHQDs6v9d9iKh0FUNUJxozIglc,394
31
+ nested_learning/optim/m3.py,sha256=bWlsOyU8UPyElNG9Jg-8olWJVIci7B2Wvfe1hlm8C90,4067
32
+ nested_learning/optim/manager.py,sha256=_q6uvc_g4DvKJeVulr9bQeI2yDWg405WUHlLYHdaiTw,5278
33
+ nested_learning/titan/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
34
+ nested_learning/titan/memory.py,sha256=_KokLWLqpTtuRPFrwvAIjNYrMm90EV3gyoA2Y6z_i8U,2913
35
+ nested_learning/titan/model.py,sha256=ZcxGYtxaanRPvnR9qAhYRvLb12tuc9ucwnMuvqGY_iU,16408
36
+ nested_learning/titan/self_modifying.py,sha256=IMN_kirJAyQupSS9FuVX3Q4KZq18SlsIJULJgpFhPo4,27665
37
+ nested_learning/configs/mid_smoke.yaml,sha256=kQ6zJQsJraHjJbt4AD-yvOT5kaRC8-zFOSOo_V1bSTQ,1926
38
+ nested_learning/configs/mid_stage2.yaml,sha256=POhkK2qWjVjlmMNvGzu2EUHOfgJSyGw0Aa7dvOs6dYk,2146
39
+ nested_learning/configs/mid_stage2_smoke.yaml,sha256=Tqmx7qnOLq2XT-BrosdX6iHBepF-alTTPzxkEjR-xOo,1990
40
+ nested_learning/configs/mid_titan_baseline.yaml,sha256=KIYC8Ix-ecKxaBLonvORh6FJz9VhATX4EWAVsH7DLa4,1773
41
+ nested_learning/configs/pilot.yaml,sha256=jpqzfqV51zbPXGKiEP1SuaffewXi-iObsLWtchlNARY,2632
42
+ nested_learning/configs/pilot_paper_faithful.yaml,sha256=atghlgOe3DrqdUgD8rKx8pvJzjFmeQNdseXjmjnB1sM,1578
43
+ nested_learning/configs/pilot_selfmod_paper_faithful.yaml,sha256=uA9jPgguqxknql_EYSn58T7MCh-az8npaYAXmLpQz1Y,446
44
+ nested_learning/configs/pilot_smoke.yaml,sha256=X_r4kvSrAWk4129n7XjVANA9Czsx3kIhKE61Ye9geq4,1393
45
+ nested_learning/configs/ablations/cms_sparse.yaml,sha256=HisciXA82J8accmLYnQRCN-zblRuVF5Aq86nFV1QQ2Q,858
46
+ nested_learning/configs/ablations/selfmod_chunked_8_64.yaml,sha256=J7MTx125eaJWaTpZdodudokBbiCLDLYa7Y0ZYvPQUPw,476
47
+ nested_learning/configs/ablations/selfmod_momentum_off.yaml,sha256=ip8pClpLLm99WHoCyb_oxHzsxik5rY-6W8nA0F6Xytk,443
48
+ nested_learning/configs/ablations/selfmod_momentum_on.yaml,sha256=f36QxMltp1-q5mZ9EvE54UKwzxL0HIV4ObWW-EbAfT0,440
49
+ nested_learning/configs/ablations/selfmod_no_alpha.yaml,sha256=QlUELjfSBTCWBIpN2pHI8RpVsC8R7ESzJ8Nj6FG54PU,434
50
+ nested_learning/configs/ablations/selfmod_no_cms.yaml,sha256=VGO0MVhq_Axih9ekdCmpHEXZv74RNwE0PpmZ87NnTgg,417
51
+ nested_learning/configs/ablations/selfmod_rank1_precond_off.yaml,sha256=q06eSRFtyVDtihx7VRmPhJ9ZdW7h_TXaQcayxfEcimA,445
52
+ nested_learning/configs/data/continual_segments_sample.yaml,sha256=j-oODM2UDKK5gj6RsRhJRuwb39wpzFo2yFlU_WA9CV8,284
53
+ nested_learning/configs/data/fineweb_edu_longdoc_filtered_sample.yaml,sha256=fyAez4AkhCcsBTZbKZUKrNTxkpIyAuc99jClOqZ2C6U,414
54
+ nested_learning/configs/data/fineweb_edu_mixture_full.yaml,sha256=uLHDZObs0Lewail63dUnk9IP1u7NRyqaz46zr3IozOU,353
55
+ nested_learning/configs/data/fineweb_edu_mixture_sample.yaml,sha256=T7f7TUhgul48CCD75dc8gqqnAd48qeTui2Zn7D2LPEc,355
56
+ nested_learning/configs/data/refinedweb_mixture.yaml,sha256=51QeWc-MCYp1Hvvd-4euHhNLHaybfC7Y3rO-0YhvNbE,1239
57
+ nested_learning/configs/data/refinedweb_mixture_filtered.yaml,sha256=0EhG1iL9IGp9a-NNZ0um0v1Sy8j6b_cjYCedFXuGAr4,1294
58
+ nested_learning/configs/data/refinedweb_mixture_full.yaml,sha256=KvsofwVgjVaJ2wIX11ZgGBCTBCmEBJoamClQ8D1ZuSY,1270
59
+ nested_learning/configs/data/refinedweb_mixture_sample.yaml,sha256=XnUeAkiihw9hGOoh68MS7MH4nCNQKre4SnROkfQlBUk,1288
60
+ nested_learning/configs/deepspeed/zero3.json,sha256=QbqP_AVNFqNsC5D2D7DxynFQmfDxFRSUrIG0RJVpc-o,462
61
+ nested_learning/configs/hope/mid.yaml,sha256=Snhuay5AJ0gR763cW2s0L_2vlYhA-Lt5daUZFCqv0Ko,2284
62
+ nested_learning/configs/hope/mid_fsdp.yaml,sha256=aPT_BdqGN8WcedIXYWUX7DjVtdmY3tvat1L4kVwKl00,887
63
+ nested_learning/configs/hope/pilot.yaml,sha256=6CnWGt2kY0O7iG7J09h_JsVmblUbhQxZURd-Jmspw2w,21
64
+ nested_learning/configs/hope/pilot_attention.yaml,sha256=8JsAXyULT2t3FV7WvHOgcl-idTGrrekeBYjtKcrByGI,115
65
+ nested_learning/configs/hope/pilot_selfmod.yaml,sha256=A5mFZrI1AkJzO041AVCWYjgDtKXwTvHVIN9YBrBWync,436
66
+ nested_learning/configs/hope/pilot_transformer.yaml,sha256=0u9MayjjXlgLODkqWQ1cRZ8msaJ-8aokw4pwBGrIUpY,112
67
+ nested_learning/configs/hope/target.yaml,sha256=Wm4PGyjT8TswW6fguM4yh8_0YNM63JqIJxv-9RCluGI,2912
68
+ nested_learning/configs/hope/target_fsdp.yaml,sha256=mDPj3827bOZm43ZD976Q3Pa4uwM-LSSQLS0IsQziveI,878
69
+ nested_learning/configs/resolved/cms_sparse_eval.yaml,sha256=h4PrYtINvTs3-cKF_rRwrJAktHeVfqXQaQAtoUujfAo,2110
70
+ nested_learning/configs/resolved/phase2_pilot_attention_eval.yaml,sha256=n2zewCxfVgNjlQV9zEZ7QBLhdzZda8E_MMYYIsqvx-4,1008
71
+ nested_learning/configs/resolved/phase2_pilot_transformer_eval.yaml,sha256=LSfw7NQHT9qWMoXA2hRUcfjm4XfFYDlH814_4t9EAVg,1005
72
+ nested_learning-0.2.0.dist-info/METADATA,sha256=N4Q8ikRIJFXzcXGzbK25YgDT69Qs__KNhHOXXuByYV0,18727
73
+ nested_learning-0.2.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
74
+ nested_learning-0.2.0.dist-info/entry_points.txt,sha256=kHVkNi_IXL_a8JTBGuqp86jzflHXUFoO14dLEgNM3_Q,47
75
+ nested_learning-0.2.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
76
+ nested_learning-0.2.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.29.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ nl = nested_learning.cli:app