PyPI - nested-learning - Versions diffs - 0.2.0__py3-none-any.whl - Mend

nested-learning 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (76) hide show

nested_learning/__init__.py +12 -0
nested_learning/__main__.py +12 -0
nested_learning/assoc_memory.py +23 -0
nested_learning/backbones.py +147 -0
nested_learning/capabilities.py +104 -0
nested_learning/cli.py +253 -0
nested_learning/cms.py +92 -0
nested_learning/config_utils.py +50 -0
nested_learning/configs/ablations/cms_sparse.yaml +46 -0
nested_learning/configs/ablations/selfmod_chunked_8_64.yaml +24 -0
nested_learning/configs/ablations/selfmod_momentum_off.yaml +23 -0
nested_learning/configs/ablations/selfmod_momentum_on.yaml +23 -0
nested_learning/configs/ablations/selfmod_no_alpha.yaml +23 -0
nested_learning/configs/ablations/selfmod_no_cms.yaml +23 -0
nested_learning/configs/ablations/selfmod_rank1_precond_off.yaml +23 -0
nested_learning/configs/data/continual_segments_sample.yaml +9 -0
nested_learning/configs/data/fineweb_edu_longdoc_filtered_sample.yaml +14 -0
nested_learning/configs/data/fineweb_edu_mixture_full.yaml +14 -0
nested_learning/configs/data/fineweb_edu_mixture_sample.yaml +14 -0
nested_learning/configs/data/refinedweb_mixture.yaml +48 -0
nested_learning/configs/data/refinedweb_mixture_filtered.yaml +48 -0
nested_learning/configs/data/refinedweb_mixture_full.yaml +48 -0
nested_learning/configs/data/refinedweb_mixture_sample.yaml +51 -0
nested_learning/configs/deepspeed/zero3.json +25 -0
nested_learning/configs/hope/mid.yaml +118 -0
nested_learning/configs/hope/mid_fsdp.yaml +47 -0
nested_learning/configs/hope/pilot.yaml +2 -0
nested_learning/configs/hope/pilot_attention.yaml +9 -0
nested_learning/configs/hope/pilot_selfmod.yaml +20 -0
nested_learning/configs/hope/pilot_transformer.yaml +9 -0
nested_learning/configs/hope/target.yaml +145 -0
nested_learning/configs/hope/target_fsdp.yaml +47 -0
nested_learning/configs/mid_smoke.yaml +99 -0
nested_learning/configs/mid_stage2.yaml +110 -0
nested_learning/configs/mid_stage2_smoke.yaml +102 -0
nested_learning/configs/mid_titan_baseline.yaml +92 -0
nested_learning/configs/pilot.yaml +127 -0
nested_learning/configs/pilot_paper_faithful.yaml +42 -0
nested_learning/configs/pilot_selfmod_paper_faithful.yaml +18 -0
nested_learning/configs/pilot_smoke.yaml +80 -0
nested_learning/configs/resolved/cms_sparse_eval.yaml +105 -0
nested_learning/configs/resolved/phase2_pilot_attention_eval.yaml +49 -0
nested_learning/configs/resolved/phase2_pilot_transformer_eval.yaml +49 -0
nested_learning/continual_classification.py +136 -0
nested_learning/continual_streaming.py +283 -0
nested_learning/data.py +153 -0
nested_learning/device.py +21 -0
nested_learning/eval_state.py +72 -0
nested_learning/fast_state.py +108 -0
nested_learning/functional.py +69 -0
nested_learning/hope/__init__.py +0 -0
nested_learning/hope/block.py +1973 -0
nested_learning/hope/self_mod.py +40 -0
nested_learning/instrumentation.py +38 -0
nested_learning/levels.py +94 -0
nested_learning/logging_utils.py +64 -0
nested_learning/memorize.py +382 -0
nested_learning/model.py +604 -0
nested_learning/optim/__init__.py +0 -0
nested_learning/optim/deep.py +102 -0
nested_learning/optim/factory.py +13 -0
nested_learning/optim/m3.py +121 -0
nested_learning/optim/manager.py +151 -0
nested_learning/titan/__init__.py +0 -0
nested_learning/titan/memory.py +88 -0
nested_learning/titan/model.py +412 -0
nested_learning/titan/self_modifying.py +724 -0
nested_learning/tokenizer.py +28 -0
nested_learning/tokenizer_coverage.py +77 -0
nested_learning/training.py +1600 -0
nested_learning/transformer.py +104 -0
nested_learning-0.2.0.dist-info/METADATA +390 -0
nested_learning-0.2.0.dist-info/RECORD +76 -0
nested_learning-0.2.0.dist-info/WHEEL +4 -0
nested_learning-0.2.0.dist-info/entry_points.txt +2 -0
nested_learning-0.2.0.dist-info/licenses/LICENSE +201 -0

nested_learning/transformer.py ADDED Viewed

@@ -0,0 +1,104 @@
+from __future__ import annotations
+from dataclasses import dataclass
+import torch
+from torch import nn
+from .backbones import AttentionConfig, SelfAttention
+from .fast_state import AttentionKVCache
+@dataclass
+class TransformerBlockConfig:  # settings consumed by TransformerBlock.__init__
+    dim: int  # passed to AttentionConfig(dim=...) and as FeedForward's dim
+    heads: int  # passed to AttentionConfig(heads=...)
+    mlp_hidden_multiplier: int = 4  # FeedForward hidden width is dim * this value
+    activation: str = "gelu"  # FeedForward recognizes "relu" and "silu"; any other value selects GELU
+    qk_l2_norm: bool = False  # forwarded unchanged to AttentionConfig
+    local_conv_window: int | None = None  # forwarded unchanged to AttentionConfig
+class FeedForward(nn.Module):  # LayerNorm -> Linear -> activation -> Linear, added back onto the input
+    def __init__(
+        self,
+        dim: int,
+        *,
+        hidden_multiplier: int = 4,
+        activation: str = "gelu",
+    ) -> None:
+        super().__init__()
+        hidden = dim * hidden_multiplier  # width of the inner projection
+        if activation == "relu":
+            act: nn.Module = nn.ReLU()
+        elif activation == "silu":
+            act = nn.SiLU()
+        else:  # NOTE: any unrecognized activation name silently falls back to GELU (no error is raised)
+            act = nn.GELU()
+        self.norm = nn.LayerNorm(dim)  # applied to the input before the MLP
+        self.net = nn.Sequential(
+            nn.Linear(dim, hidden, bias=False),  # up-projection, no bias term
+            act,
+            nn.Linear(hidden, dim, bias=False),  # down-projection back to dim, no bias term
+        )
+    def forward(self, x: torch.Tensor) -> torch.Tensor:  # type: ignore[override]
+        residual = x  # keep the un-normalized input for the skip connection
+        x = self.norm(x)
+        return residual + self.net(x)  # skip connection around norm + MLP
+class TransformerBlock(nn.Module):
+    """
+    Baseline Transformer block: Attention -> MLP (no TITAN/CMS learning updates).
+    This is used for Phase 2 comparisons (HOPE-Attention vs standard Transformer).
+    """
+    def __init__(self, config: TransformerBlockConfig) -> None:
+        super().__init__()
+        self.config = config  # kept on the module for later inspection
+        self.attn = SelfAttention(
+            AttentionConfig(
+                dim=config.dim,
+                heads=config.heads,
+                qk_l2_norm=config.qk_l2_norm,
+                local_conv_window=config.local_conv_window,
+            )
+        )
+        self.mlp = FeedForward(
+            config.dim,
+            hidden_multiplier=config.mlp_hidden_multiplier,
+            activation=config.activation,
+        )
+    def forward(
+        self,
+        x: torch.Tensor,
+        *,
+        teach_signal: torch.Tensor | None = None,  # ignored by this block
+        surprise_value: float | None = None,  # ignored by this block
+        fast_state=None,  # ignored by this block
+        finalize_updates: bool = True,  # ignored by this block
+        attention_cache: AttentionKVCache | None = None,  # handed to self.attn as kv_cache
+        return_attention_cache: bool = False,  # when True, a (output, cache) tuple is returned
+        differentiable_updates: bool = False,  # ignored by this block
+    ) -> torch.Tensor | tuple[torch.Tensor, AttentionKVCache]:
+        _ = (teach_signal, surprise_value, fast_state, finalize_updates, differentiable_updates)  # explicitly discarded; presumably accepted for signature parity with the HOPE blocks - TODO confirm
+        if return_attention_cache:  # caller wants the updated cache alongside the output
+            attn_out, next_cache = self.attn(
+                x,
+                kv_cache=attention_cache,
+                return_kv_cache=True,
+            )
+            return self.mlp(attn_out), next_cache
+        return self.mlp(self.attn(x, kv_cache=attention_cache))  # output tensor only
+    def set_surprise_threshold(self, threshold: float | None) -> None:  # no-op: the argument is discarded
+        _ = threshold
+    def set_surprise_metric(self, metric: str) -> None:  # no-op: the argument is discarded
+        _ = metric
+    def set_allowed_levels(self, allowed) -> None:  # no-op: the argument is discarded
+        _ = allowed

nested_learning-0.2.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,390 @@
+Metadata-Version: 2.4
+Name: nested-learning
+Version: 0.2.0
+Summary: Reproduction of Google's Nested Learning (HOPE) architecture
+Author-email: Nested Learning Team <nested-learning@example.com>
+License: Apache-2.0
+License-File: LICENSE
+Requires-Python: >=3.10
+Requires-Dist: datasets<3.0,>=2.19
+Requires-Dist: einops>=0.7.0
+Requires-Dist: huggingface-hub<1.0,>=0.23
+Requires-Dist: hydra-core>=1.3.2
+Requires-Dist: langdetect>=1.0.9
+Requires-Dist: numpy>=1.26
+Requires-Dist: omegaconf>=2.3.0
+Requires-Dist: pyyaml>=6.0
+Requires-Dist: sentencepiece>=0.2.0
+Requires-Dist: torch<3,>=2.9
+Requires-Dist: tqdm>=4.66
+Requires-Dist: typer>=0.12
+Requires-Dist: typing-extensions>=4.9
+Requires-Dist: zstandard>=0.22.0
+Provides-Extra: dev
+Requires-Dist: mypy>=1.11; extra == 'dev'
+Requires-Dist: pytest-cov>=4.1; extra == 'dev'
+Requires-Dist: pytest>=7.4; extra == 'dev'
+Requires-Dist: ruff>=0.6.8; extra == 'dev'
+Requires-Dist: types-pyyaml; extra == 'dev'
+Provides-Extra: gpu
+Requires-Dist: torchaudio<3,>=2.9; extra == 'gpu'
+Requires-Dist: torchvision<1,>=0.24; extra == 'gpu'
+Provides-Extra: logging
+Requires-Dist: wandb>=0.18.0; extra == 'logging'
+Provides-Extra: viz
+Requires-Dist: matplotlib>=3.8; extra == 'viz'
+Description-Content-Type: text/markdown
+# Nested Learning Reproduction
+![CI](https://github.com/kmccleary3301/nested_learning/actions/workflows/ci.yml/badge.svg)
+![Security](https://github.com/kmccleary3301/nested_learning/actions/workflows/security.yml/badge.svg)
+![Python](https://img.shields.io/badge/python-3.10%20to%203.12-blue)
+![PyTorch](https://img.shields.io/badge/pytorch-2.9.0-red)
+![License](https://img.shields.io/badge/license-Apache--2.0-green)
+![Status](https://img.shields.io/badge/tests-smoke--ready-lightgrey)
+Mechanism-level reproduction of Google's Nested Learning (HOPE) architecture (HOPE blocks, CMS, and Self‑Modifying TITANs), matching the quality bar set by lucidrains' TITAN reference while remaining fully open-source and `uv` managed.
+Faithfulness scope (high level):
+- ✅ HOPE / CMS / Self‑Modifying Titans update rules + wiring (mechanism-level)
+- ✅ Tensor-level invariants covered by unit tests (teach-signal, δℓ, CMS chunking, causality)
+- ✅ Boundary-target online chunking + optional attention-cache carry path are implemented
+- ⚠️ Stable default uses stop-grad online writes; an experimental single-process boundary-state mode supports differentiable write paths
+- ⚠️ Multi‑GPU mechanism-auditing online updates are not supported in this repo (DDP disables some features)
+Paper reference pin:
+- Source: `google_papers/Nested_Learning_Full_Paper/Nested_Learning_Full_Paper.md`
+- SHA-256: `7524af0724ac8e3bad9163bf0e79c85b490a26bc30b92d96b0bdf17a27f9febc`
+## Quickstart
+```bash
+uv python install 3.12
+uv sync --all-extras
+uv run nl doctor --json > logs/runtime_doctor.json
+uv run bash scripts/data/run_sample.sh
+uv run nl smoke --config-name pilot_smoke --device cpu
+uv run bash scripts/run_smoke.sh pilot  # CPU-friendly HOPE block smoke test
+uv run bash scripts/run_e2e_smoke.sh    # sync + sample data + smoke train + zeroshot eval
+uv run bash scripts/run_mechanism_audit_smoke.sh
+uv run python scripts/eval/zeroshot.py \
+  --config configs/hope/pilot.yaml \
+  --checkpoint artifacts/examples/pilot_dummy.pt \
+  --tokenizer-path artifacts/tokenizer/refinedweb_mix/spm_32000_unigram.model \
+  --tasks piqa --max-samples 32 --device cpu
+```
+## Requirements
+- Python 3.10-3.12
+- PyTorch 2.9 or newer, below 3.0 (golden environment in this repo uses 2.9.x)
+- `uv` (recommended for development) or `pip` for package-style usage
+## Compatibility
+- Support tiers and OS/runtime matrix: `docs/COMPATIBILITY_MATRIX.md`
+- Versioning/stability policy: `docs/VERSIONING_POLICY.md`
+- Golden repro environment: Python 3.12 + `uv lock` + PyTorch 2.9.x
+## Installation (pip-first)
+1. Create and activate a virtual environment.
+2. Install Torch first (CPU/CUDA wheel selection is backend-specific).
+3. Install this project.
+CPU example:
+```bash
+python -m venv .venv
+source .venv/bin/activate
+python -m pip install --upgrade pip
+python -m pip install "torch>=2.9,<3" --index-url https://download.pytorch.org/whl/cpu
+python -m pip install -e .
+```
+CUDA example (adjust index URL to your CUDA runtime):
+```bash
+python -m venv .venv
+source .venv/bin/activate
+python -m pip install --upgrade pip
+python -m pip install "torch>=2.9,<3" --index-url https://download.pytorch.org/whl/cu128
+python -m pip install -e .
+```
+## Setup (uv dev workflow)
+```bash
+uv python install 3.12
+uv sync --all-extras
+```
+Developer checks:
+- `uv run ruff check .`
+- `uv run mypy src`
+- `uv run pytest`
+- `uv run bash scripts/checks/run_fidelity_ci_subset.sh`
+- `uv run python scripts/checks/compliance_report.py --config configs/pilot.yaml --output eval/compliance_report.json`
+## CLI
+The package ships with `nl` for portable workflows across local/dev/prod environments.
+```bash
+# runtime compatibility snapshot
+uv run nl doctor --json
+# architecture/config smoke on chosen device
+uv run nl smoke --config-name pilot_smoke --device cpu --batch-size 1 --seq-len 8
+# static fidelity checks for a config
+uv run nl audit --config-name pilot_paper_faithful
+# train with Hydra overrides
+uv run nl train --config-name pilot --override train.device=cuda:1 --override train.steps=100
+```
+`python -m nested_learning ...` is also supported.
+## First 30 Minutes
+Use this path for a fast first success on CPU:
+```bash
+uv sync --all-extras
+uv run bash scripts/data/run_sample.sh
+uv run bash scripts/run_smoke.sh pilot
+uv run bash scripts/run_mechanism_audit_smoke.sh
+```
+This confirms:
+- data/tokenizer pipeline is operational,
+- model/training loop runs end-to-end,
+- cadence checks pass for a mechanism-auditing smoke run.
+## Data Pipeline
+1. **Tokenizer training**
+   ```bash
+   uv run python scripts/data/train_tokenizer.py \
+     --manifest configs/data/refinedweb_mixture.yaml \
+     --vocab-size 32000 \
+     --output-dir artifacts/tokenizer/refinedweb_mix \
+     --log-file data/mixtures/refinedweb_mix_tokenizer.json
+   ```
+2. **Corpus filtering + sharding**
+   ```bash
+   uv run python scripts/data/process_mixture.py \
+     configs/data/refinedweb_mixture_filtered.yaml \
+     --tokenizer-path artifacts/tokenizer/refinedweb_mix/spm_32000_unigram.model \
+     --log-file data/mixtures/refinedweb_mix_filtered_shards.json
+   ```
+3. **Sample pipeline** (downloads licensed datasets, filters, shards, and records stats)
+   ```bash
+   uv run bash scripts/data/run_sample.sh
+   ```
+4. **Full pipeline** (set env vars like `RW_LIMIT`, `WIKI_LIMIT`, etc. to scale ingestion)
+   ```bash
+   uv run bash scripts/data/run_full.sh  # default ~50k docs per corpus; increase limits as needed
+   ```
+### Data Troubleshooting
+- If `scripts/data/run_sample.sh` cannot find `artifacts/tokenizer/refinedweb_mix/spm_32000_unigram.model`, rerun:
+  ```bash
+  uv run bash scripts/data/run_sample.sh
+  ```
+  The script auto-trains the tokenizer when missing.
+- If `scripts/data/run_full.sh` fails with `Bad split: train. Available splits: ['test']`, use split fallback:
+  ```bash
+  FALLBACK_SPLIT=test uv run bash scripts/data/run_full.sh
+  ```
+  You can also override per-corpus splits (for example `RW_SPLIT=test`).
+## Training
+- Single GPU / CPU:
+  ```bash
+  uv run nl train --config-name pilot_smoke
+  ```
+- Apple Silicon (MPS, if available):
+  ```bash
+  uv run nl train --config-name pilot_smoke --override train.device=mps
+  ```
+- Script-based entrypoint (legacy-compatible):
+  ```bash
+  uv run python train.py --config-name pilot_smoke
+  ```
+- DDP (torchrun):
+  ```bash
+  torchrun --nproc_per_node=2 train_dist.py --config-name mid
+  ```
+- CPU-only DDP smoke (verifies `gloo` backend and deterministic seeding):
+  ```bash
+  uv run bash scripts/run_cpu_ddp_smoke.sh
+  ```
+- FSDP (see `docs/FSDP_SCALING_GUIDE.md` for VRAM/batch sizing):
+  ```bash
+  # 760M run
+  torchrun --nproc_per_node=2 train_fsdp.py --config-name hope/mid_fsdp
+  # 1.3B run
+  torchrun --nproc_per_node=2 train_fsdp.py --config-name hope/target_fsdp
+  ```
+- DeepSpeed (requires `deepspeed` installed separately):
+  ```bash
+  deepspeed --num_gpus=2 train_deepspeed.py --config-name target \
+    deepspeed.config=configs/deepspeed/zero3.json
+  ```
+### Mechanism-auditing presets (HOPE / Nested Learning)
+Use the mechanism-auditing preset configs (single GPU):
+```bash
+uv run python train.py --config-name pilot_paper_faithful
+# HOPE self-mod variant:
+uv run python train.py --config-name pilot_selfmod_paper_faithful
+```
+Notes:
+- These presets set `data.batch_size=1` to avoid cross-sample fast-memory sharing.
+- Online chunking supports one-token overlap **or** explicit boundary-target mode (`train.online_boundary_targets=true`).
+- Optional attention-state carry across chunks is available in training via `train.online_carry_attention_cache=true`.
+- The exact sequence/segment/chunk/buffer semantics are documented in `docs/STREAMING_CONTRACT.md`.
+Overrides:
+- `optim.type=m3` (paper optimizer option)
+- `train.steps=...` / `train.device=...`
+See `docs/PAPER_COMPLIANCE.md` for full fidelity notes.
+See `docs/STREAMING_CONTRACT.md` for the precise streaming/update contract used by this repo.
+## Scope Boundaries (Current)
+- This repo targets mechanism-auditing fidelity, not full paper-scale results parity.
+- Boundary-state gradient-through-write exists as an experimental constrained path; it is not yet treated as production/full-scale paper reproduction.
+- Distributed mechanism-auditing path for boundary-target + attention-cache carry is not implemented.
+### Pilot (3 B tokens) workflow
+1. Start a tmux session:
+   ```bash
+   tmux new -s pilot_train
+   ```
+2. Launch the long run on `cuda:1` (≈52 h wall clock):
+   ```bash
+   set -a && source git.env && set +a
+   export UV_CACHE_DIR=/tmp/uv-cache UV_LINK_MODE=copy
+   uv run python train.py --config-name pilot \
+     logging.enabled=true logging.backend=wandb \
+     logging.project=nested-learning logging.run_name=pilot-main-$(date +%Y%m%d%H%M%S) \
+     train.device=cuda:1
+   ```
+3. Checkpoints appear in `artifacts/checkpoints/pilot/step_*.pt` every 1 000 steps; the accompanying W&B run captures full telemetry.
+4. Copy the final checkpoint, config, logs, and eval JSON/CSV into `artifacts/pilot_release/` for distribution.
+## Logging
+Set `logging.enabled=true` in Hydra configs (or override via CLI) to send metrics to W&B (default). For local JSON logs, use `logging.backend=json logging.path=logs/run.json`. Sample outputs reside in `logs/` and `artifacts/examples/`.
+## Evaluation
+- Zero-shot:
+  ```bash
+  uv run python scripts/eval/zeroshot.py \
+  --config configs/hope/mid.yaml \
+  --checkpoint checkpoints/mid/step_000100.pt \
+  --tokenizer-path artifacts/tokenizer/refinedweb_mix/spm_32000_unigram.model \
+  --tasks all --max-samples 200 --device cuda:0
+  ```
+  Use `uv run python scripts/eval/zeroshot.py --list-tasks` to display the full benchmark roster (PIQA, HellaSwag, WinoGrande, ARC-E/C, BoolQ, SIQA, CommonsenseQA, OpenBookQA). See `docs/zeroshot_eval.md` for details.
+- Needle-in-a-Haystack:
+  ```bash
+  uv run python scripts/eval/niah.py \
+    --config configs/hope/mid.yaml \
+    --checkpoint checkpoints/mid/step_000100.pt \
+    --tokenizer-path artifacts/tokenizer/refinedweb_mix/spm_32000_unigram.model \
+    --context-lengths 2048 4096 8192 --samples-per-length 20
+  ```
+- Continual-learning forgetting:
+  ```bash
+  uv run python scripts/eval/continual.py \
+    --config configs/hope/mid.yaml \
+    --checkpoints checkpoints/mid/step_000050.pt checkpoints/mid/step_000100.pt \
+    --segments-yaml configs/data/continual_segments_sample.yaml \
+    --batch-size 4 --max-batches 10 --memorize --memorize-steps 2
+  ```
+  Plot forgetting curves via `uv run python scripts/eval/plot_forgetting.py --continual-json eval/continual_mid.json`.
+- Long-context diagnostics:
+  ```bash
+  uv run python scripts/eval/passkey.py --config configs/hope/pilot.yaml --checkpoint artifacts/checkpoints/pilot/step_230000.pt \
+    --tokenizer-path artifacts/tokenizer/refinedweb_mix/spm_32000_unigram.model --samples 64 --memorize
+  uv run python scripts/eval/pg19_perplexity.py --config configs/hope/pilot.yaml --checkpoint artifacts/checkpoints/pilot/step_230000.pt \
+    --tokenizer-path artifacts/tokenizer/refinedweb_mix/spm_32000_unigram.model --max-samples 64
+  ```
+Evaluation summaries are written to `eval/` alongside per-task JSON metrics.
+### Test-time memorization toggles
+Every evaluator supports TITAN-style memorization so you can reproduce test-time adaptation:
+```bash
+uv run python scripts/eval/zeroshot.py \
+  ... \
+  --memorize \
+  --memorize-steps 2 \
+  --memorize-use-correct-answer \
+  --memorize-paths titan,cms_fast \
+  --memorize-surprise-threshold 0.01 \
+  --memorize-no-reset  # optional: retain updates across samples
+```
+- `--memorize` turns on the learner with one LMS step per example by default.
+- `--memorize-steps` controls the number of adaptation passes per prompt.
+- `--memorize-use-correct-answer` injects ground-truth text during memorization for ablations.
+- `--memorize-no-reset` carries memories across samples; omit it to reset every question.
+- `--memorize-paths` restricts which levels receive teach-signal updates (`titan`, `cms_fast`, or `all`).
+- `--memorize-surprise-threshold` gates updates on average teach-signal norm, matching the paper’s surprise trigger.
+Memorization metrics (baseline vs adaptive) are emitted alongside task accuracy for easy comparisons.
+## Architecture variants
+Select the paper-defined variant via `model.block_variant` in Hydra configs:
+- `hope_attention` (paper HOPE-Attention): `Attention → CMS` (paper-defined).
+- `hope_selfmod` (paper HOPE scaffold): `Self-modifying Titans (Eqs. 83–93; Eq. 91 residual MLP memories) → CMS` with (by default) **fixed q** and **local conv window=4**, plus chunked updates via `model.self_mod_chunk_size` (others) and `model.self_mod_chunk_size_memory` (M_memory). See `docs/PAPER_COMPLIANCE.md` for the “differentiable read / update-pass writes” semantics.
+- `hope_hybrid` (legacy): `Attention + TitanMemory + CMS` (exploratory; not paper-defined).
+- `transformer` (baseline): `Attention → MLP` (no TITAN/CMS learning updates; useful for Phase 2 comparisons).
+Self-modifying Titans knobs (ablation-friendly, paper-aligned):
+- `model.self_mod_objective` (`l2` vs `dot`), `model.self_mod_use_rank1_precond` (DGD-like preconditioner), `model.self_mod_use_alpha` (weight-decay/retention gate), `model.self_mod_stopgrad_vhat`, `model.self_mod_momentum`, `model.self_mod_adaptive_q`, `model.self_mod_local_conv_window`.
+## Fast state (Nested Learning semantics)
+In-context updates can run against a per-context fast state so meta parameters never change:
+- `HOPEModel.init_fast_state()` / `TitanOnlyModel.init_fast_state()` returns a `ModelFastState`.
+- `MemorizeConfig.use_fast_state=true` (default) requires passing `fast_state` into `memorize_tokens()` / `memorize_sequence()`; evaluation scripts handle this automatically.
+- Training can also run update passes against a per-batch fast state via `train.use_fast_state=true` (meta+delta fast state: meta params are learnable; online updates write deltas only). If `data.batch_size>1`, CMS/TITAN fast state is shared across the batch; use `data.batch_size=1` for strict per-context semantics. See `docs/PAPER_COMPLIANCE.md`.
+## Releases
+Before tagging or announcing a new checkpoint, work through:
+- `docs/release_checklist.md` (model/eval artifact release bundle)
+- `docs/PACKAGE_RELEASE_CHECKLIST.md` (package/GitHub/PyPI release flow)
+- `docs/PYPI_TRUSTED_PUBLISHING.md` (one-time OIDC setup for TestPyPI/PyPI)
+For versioning semantics and breaking-change expectations, see `docs/VERSIONING_POLICY.md`.
+For reproducibility bug reports, use `docs/BUG_REPORT_CHECKLIST.md`.
+## Performance & optimizer options
+- **Mixed precision:** enable bf16 autocast via `train.mixed_precision.enabled=true train.mixed_precision.dtype=bf16` (already enabled in pilot/mid/target configs).
+- **`torch.compile`:** accelerate attention/core loops by toggling `train.compile.enable=true train.compile.mode=max-autotune`; failure falls back to eager unless `train.compile.strict=true`.
+- **Muon hybrid (default):** all HOPE configs now set `optim.type=muon`, routing ≥2D tensors through PyTorch 2.9's Muon optimizer while embeddings/norms stay on AdamW. Training logs emit `optim.muon_param_elems` / `optim.adamw_param_elems` so you can confirm the split.
+- **Fused AdamW fallback:** override with `optim.type=adamw optim.fused=auto` if Muon is unavailable or if you want to compare against the AdamW ablation in `reports/ablations.md`.
+- **Surprise gating:** set `model.surprise_threshold=<float>` to gate all inner updates. By default the surprise metric is the average L2 norm of the (scaled/clipped) teach signal (`model.surprise_metric=l2`); you can also use `loss` or `logit_entropy` for ablations. Evaluation CLIs expose `--memorize-surprise-threshold` for ad-hoc gating.
+All Hydra knobs can be overridden from the CLI or composed via config groups (`configs/hope/*.yaml`). Use these flags in tandem with `scripts/run_e2e_smoke.sh` (automation) or `scripts/run_cpu_ddp_smoke.sh` (CPU-only determinism check) to validate releases quickly.
+## Documentation & References
+- `docs/IMPLEMENTATION_STATUS.md` – current mechanism-level status matrix.
+- `docs/PAPER_COMPLIANCE.md` – equation-to-code fidelity notes and explicit boundaries.
+- `docs/STREAMING_CONTRACT.md` – exact sequence/segment/chunk/update semantics.
+- `docs/release_checklist.md` – release readiness checklist.
+- `docs/data_pipeline.md` – large-scale sharding/tokenizer workflow.
+- `docs/scaling_guidance.md` – roadmap for expanding data + compute footprints.
+- `docs/stage2_plan.md` – Stage 2 architecture + experiment roadmap.
+- `docs/PHASE_2_PLAN.md` – detailed Phase 2 execution plan.
+- `docs/PLAN_PROGRESS_P7.md` – progress tracker for the latest faithfulness remediation sprint.
+- `docs/experiments_report.md` – draft paper covering completed experiments.
+- `docs/future_directions.md` – prioritized roadmap after the initial release.
+- `reports/stage2_smoke.md` – exact commands/artifacts for the release-ready smoke workflow.
+- `docs/FSDP_SCALING_GUIDE.md` – dual-RTX 6000 Ada instructions for the mid/target FSDP configs.
+- `google_papers/` – PDFs/markdown of Nested Learning & TITAN papers.
+- `CHANGELOG.md` – user-facing changes per release.
+## Contributing
+1. Run formatting/tests (`uv run ruff check .`, `uv run pytest`).
+2. Document new configs or scripts in the relevant docs under `docs/` and update `CHANGELOG.md`.
+3. Open a PR referencing the relevant NL/TITAN spec sections and tests.

nested_learning-0.2.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,76 @@
+nested_learning/__init__.py,sha256=mCyCGTd8lICAuNgITh_BtLVoVFjg8mTNjtuQRUxNtgE,379
+nested_learning/__main__.py,sha256=lT_tvQZf-ZPcGVSjIm7_KL1WBYwkz4KkL7_WDcMud-Y,130
+nested_learning/assoc_memory.py,sha256=nXbuSlH41J5PoENb9xCDAysdwqvW4l9x9bQ4gpAByqc,584
+nested_learning/backbones.py,sha256=7bLMdbeEaodl_nTtkpQDb37TwAMqYGdOEUCXwdz6jZA,5651
+nested_learning/capabilities.py,sha256=Yu0ojfpGq7uvAj2TfXTU03K2i767iL4L9m6R9Ak7ERI,3596
+nested_learning/cli.py,sha256=61DK4Yb9B1dPBuIgxw_pZUd7z8N5R_DtVoLbbwewMkM,8294
+nested_learning/cms.py,sha256=uP67svtGbjH-DW0d2HK48U14W0cnyBTo93-W4O6ZwIE,2725
+nested_learning/config_utils.py,sha256=CNYWrXLavcnE19b0t6QSyznexVGaYFgiRFZrzeMKXBk,1542
+nested_learning/continual_classification.py,sha256=8XQJw8jhNvxs-lM7F0fYcSk8CalRkSUj28QBwyie6GA,3692
+nested_learning/continual_streaming.py,sha256=suIgKzBkDrYEKgToHHsklt8wIEaZQ7Zc9ad1R26QJnA,9837
+nested_learning/data.py,sha256=zyTTBJW9JBahB-Zf6PYH5xbGe1VGJnBsABm85-vA9R0,5259
+nested_learning/device.py,sha256=PynH0ptNapWoCCg9SXStajkSfR19Qk8HKHofw2PwKa4,742
+nested_learning/eval_state.py,sha256=XHdRpY4ZlOOsScva5V9B6xK2HEM4JpxJhaK6E915_D0,2226
+nested_learning/fast_state.py,sha256=ccbHWFiAJaPzjIhHC61EqCpgoHGLVml9vkYu3a9CDiY,3216
+nested_learning/functional.py,sha256=VnissTh-E2IzGDEyy_p-4jbr6Uytgk8H7LYSRfykqto,1844
+nested_learning/instrumentation.py,sha256=bMiJ_WDRZ5jtQfiXlhHoLXKjBHREH-6_RmLrSggn8XA,1176
+nested_learning/levels.py,sha256=Cb3hpeuQlKK4PEvaPo5qSLc64iBCkpS8lsGGBiUy9S8,2966
+nested_learning/logging_utils.py,sha256=WiEgc8Hp7I-g2pgS-V5Xaeq_qtNo_z5m5A55EdEwAFg,1961
+nested_learning/memorize.py,sha256=mhNKrzOk_gvViumppi_U9UhxkW2BdoHeZDzbIw0y_-Q,15863
+nested_learning/model.py,sha256=6xQHQcasBqtMk8LxGlm6BI8QOIW6JafHai9LvQc7wSo,25855
+nested_learning/tokenizer.py,sha256=v4KZEXMMBirHFe1rbNU-_GbQo4tXDw5V015XNZlHeko,881
+nested_learning/tokenizer_coverage.py,sha256=zFWAUFoaaugG_iTL43i23bDrgzuS8viwcZ8MVDrIK2U,2723
+nested_learning/training.py,sha256=Afkmc4ku0XAG-THnbf-M3XPvTa5RdjKY1rPdc9NrJnk,63618
+nested_learning/transformer.py,sha256=cQUQ-cxbTRWSDN1GV3nyzlAm6zeAekRHFVFzmkLRzCU,3047
+nested_learning/hope/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+nested_learning/hope/block.py,sha256=9L--naAZJW5K9h5c_V3STlJQA5LAq4YpB5Orjl2D0y0,79373
+nested_learning/hope/self_mod.py,sha256=0m6P3SvrWdXqG1Fm3BlRI98VJGyKifueHfbIIDzqHNU,1384
+nested_learning/optim/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+nested_learning/optim/deep.py,sha256=xVz4qzTtv_RI5VpMxpc8ND-qfhVxigyLbmjMrGAqbXw,3507
+nested_learning/optim/factory.py,sha256=933VZ_8aSUeg-QStqDHQDs6v9d9iKh0FUNUJxozIglc,394
+nested_learning/optim/m3.py,sha256=bWlsOyU8UPyElNG9Jg-8olWJVIci7B2Wvfe1hlm8C90,4067
+nested_learning/optim/manager.py,sha256=_q6uvc_g4DvKJeVulr9bQeI2yDWg405WUHlLYHdaiTw,5278
+nested_learning/titan/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+nested_learning/titan/memory.py,sha256=_KokLWLqpTtuRPFrwvAIjNYrMm90EV3gyoA2Y6z_i8U,2913
+nested_learning/titan/model.py,sha256=ZcxGYtxaanRPvnR9qAhYRvLb12tuc9ucwnMuvqGY_iU,16408
+nested_learning/titan/self_modifying.py,sha256=IMN_kirJAyQupSS9FuVX3Q4KZq18SlsIJULJgpFhPo4,27665
+nested_learning/configs/mid_smoke.yaml,sha256=kQ6zJQsJraHjJbt4AD-yvOT5kaRC8-zFOSOo_V1bSTQ,1926
+nested_learning/configs/mid_stage2.yaml,sha256=POhkK2qWjVjlmMNvGzu2EUHOfgJSyGw0Aa7dvOs6dYk,2146
+nested_learning/configs/mid_stage2_smoke.yaml,sha256=Tqmx7qnOLq2XT-BrosdX6iHBepF-alTTPzxkEjR-xOo,1990
+nested_learning/configs/mid_titan_baseline.yaml,sha256=KIYC8Ix-ecKxaBLonvORh6FJz9VhATX4EWAVsH7DLa4,1773
+nested_learning/configs/pilot.yaml,sha256=jpqzfqV51zbPXGKiEP1SuaffewXi-iObsLWtchlNARY,2632
+nested_learning/configs/pilot_paper_faithful.yaml,sha256=atghlgOe3DrqdUgD8rKx8pvJzjFmeQNdseXjmjnB1sM,1578
+nested_learning/configs/pilot_selfmod_paper_faithful.yaml,sha256=uA9jPgguqxknql_EYSn58T7MCh-az8npaYAXmLpQz1Y,446
+nested_learning/configs/pilot_smoke.yaml,sha256=X_r4kvSrAWk4129n7XjVANA9Czsx3kIhKE61Ye9geq4,1393
+nested_learning/configs/ablations/cms_sparse.yaml,sha256=HisciXA82J8accmLYnQRCN-zblRuVF5Aq86nFV1QQ2Q,858
+nested_learning/configs/ablations/selfmod_chunked_8_64.yaml,sha256=J7MTx125eaJWaTpZdodudokBbiCLDLYa7Y0ZYvPQUPw,476
+nested_learning/configs/ablations/selfmod_momentum_off.yaml,sha256=ip8pClpLLm99WHoCyb_oxHzsxik5rY-6W8nA0F6Xytk,443
+nested_learning/configs/ablations/selfmod_momentum_on.yaml,sha256=f36QxMltp1-q5mZ9EvE54UKwzxL0HIV4ObWW-EbAfT0,440
+nested_learning/configs/ablations/selfmod_no_alpha.yaml,sha256=QlUELjfSBTCWBIpN2pHI8RpVsC8R7ESzJ8Nj6FG54PU,434
+nested_learning/configs/ablations/selfmod_no_cms.yaml,sha256=VGO0MVhq_Axih9ekdCmpHEXZv74RNwE0PpmZ87NnTgg,417
+nested_learning/configs/ablations/selfmod_rank1_precond_off.yaml,sha256=q06eSRFtyVDtihx7VRmPhJ9ZdW7h_TXaQcayxfEcimA,445
+nested_learning/configs/data/continual_segments_sample.yaml,sha256=j-oODM2UDKK5gj6RsRhJRuwb39wpzFo2yFlU_WA9CV8,284
+nested_learning/configs/data/fineweb_edu_longdoc_filtered_sample.yaml,sha256=fyAez4AkhCcsBTZbKZUKrNTxkpIyAuc99jClOqZ2C6U,414
+nested_learning/configs/data/fineweb_edu_mixture_full.yaml,sha256=uLHDZObs0Lewail63dUnk9IP1u7NRyqaz46zr3IozOU,353
+nested_learning/configs/data/fineweb_edu_mixture_sample.yaml,sha256=T7f7TUhgul48CCD75dc8gqqnAd48qeTui2Zn7D2LPEc,355
+nested_learning/configs/data/refinedweb_mixture.yaml,sha256=51QeWc-MCYp1Hvvd-4euHhNLHaybfC7Y3rO-0YhvNbE,1239
+nested_learning/configs/data/refinedweb_mixture_filtered.yaml,sha256=0EhG1iL9IGp9a-NNZ0um0v1Sy8j6b_cjYCedFXuGAr4,1294
+nested_learning/configs/data/refinedweb_mixture_full.yaml,sha256=KvsofwVgjVaJ2wIX11ZgGBCTBCmEBJoamClQ8D1ZuSY,1270
+nested_learning/configs/data/refinedweb_mixture_sample.yaml,sha256=XnUeAkiihw9hGOoh68MS7MH4nCNQKre4SnROkfQlBUk,1288
+nested_learning/configs/deepspeed/zero3.json,sha256=QbqP_AVNFqNsC5D2D7DxynFQmfDxFRSUrIG0RJVpc-o,462
+nested_learning/configs/hope/mid.yaml,sha256=Snhuay5AJ0gR763cW2s0L_2vlYhA-Lt5daUZFCqv0Ko,2284
+nested_learning/configs/hope/mid_fsdp.yaml,sha256=aPT_BdqGN8WcedIXYWUX7DjVtdmY3tvat1L4kVwKl00,887
+nested_learning/configs/hope/pilot.yaml,sha256=6CnWGt2kY0O7iG7J09h_JsVmblUbhQxZURd-Jmspw2w,21
+nested_learning/configs/hope/pilot_attention.yaml,sha256=8JsAXyULT2t3FV7WvHOgcl-idTGrrekeBYjtKcrByGI,115
+nested_learning/configs/hope/pilot_selfmod.yaml,sha256=A5mFZrI1AkJzO041AVCWYjgDtKXwTvHVIN9YBrBWync,436
+nested_learning/configs/hope/pilot_transformer.yaml,sha256=0u9MayjjXlgLODkqWQ1cRZ8msaJ-8aokw4pwBGrIUpY,112
+nested_learning/configs/hope/target.yaml,sha256=Wm4PGyjT8TswW6fguM4yh8_0YNM63JqIJxv-9RCluGI,2912
+nested_learning/configs/hope/target_fsdp.yaml,sha256=mDPj3827bOZm43ZD976Q3Pa4uwM-LSSQLS0IsQziveI,878
+nested_learning/configs/resolved/cms_sparse_eval.yaml,sha256=h4PrYtINvTs3-cKF_rRwrJAktHeVfqXQaQAtoUujfAo,2110
+nested_learning/configs/resolved/phase2_pilot_attention_eval.yaml,sha256=n2zewCxfVgNjlQV9zEZ7QBLhdzZda8E_MMYYIsqvx-4,1008
+nested_learning/configs/resolved/phase2_pilot_transformer_eval.yaml,sha256=LSfw7NQHT9qWMoXA2hRUcfjm4XfFYDlH814_4t9EAVg,1005
+nested_learning-0.2.0.dist-info/METADATA,sha256=N4Q8ikRIJFXzcXGzbK25YgDT69Qs__KNhHOXXuByYV0,18727
+nested_learning-0.2.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
+nested_learning-0.2.0.dist-info/entry_points.txt,sha256=kHVkNi_IXL_a8JTBGuqp86jzflHXUFoO14dLEgNM3_Q,47
+nested_learning-0.2.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+nested_learning-0.2.0.dist-info/RECORD,,

nested_learning-0.2.0.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,4 @@
+Wheel-Version: 1.0
+Generator: hatchling 1.29.0
+Root-Is-Purelib: true
+Tag: py3-none-any

nested_learning-0.2.0.dist-info/entry_points.txt ADDED Viewed

@@ -0,0 +1,2 @@
+[console_scripts]
+nl = nested_learning.cli:app