PyPI - nested-learning - Versions diffs - 0.2.0__py3-none-any.whl - Mend

nested-learning 0.2.0__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (76)

nested_learning/__init__.py +12 -0
nested_learning/__main__.py +12 -0
nested_learning/assoc_memory.py +23 -0
nested_learning/backbones.py +147 -0
nested_learning/capabilities.py +104 -0
nested_learning/cli.py +253 -0
nested_learning/cms.py +92 -0
nested_learning/config_utils.py +50 -0
nested_learning/configs/ablations/cms_sparse.yaml +46 -0
nested_learning/configs/ablations/selfmod_chunked_8_64.yaml +24 -0
nested_learning/configs/ablations/selfmod_momentum_off.yaml +23 -0
nested_learning/configs/ablations/selfmod_momentum_on.yaml +23 -0
nested_learning/configs/ablations/selfmod_no_alpha.yaml +23 -0
nested_learning/configs/ablations/selfmod_no_cms.yaml +23 -0
nested_learning/configs/ablations/selfmod_rank1_precond_off.yaml +23 -0
nested_learning/configs/data/continual_segments_sample.yaml +9 -0
nested_learning/configs/data/fineweb_edu_longdoc_filtered_sample.yaml +14 -0
nested_learning/configs/data/fineweb_edu_mixture_full.yaml +14 -0
nested_learning/configs/data/fineweb_edu_mixture_sample.yaml +14 -0
nested_learning/configs/data/refinedweb_mixture.yaml +48 -0
nested_learning/configs/data/refinedweb_mixture_filtered.yaml +48 -0
nested_learning/configs/data/refinedweb_mixture_full.yaml +48 -0
nested_learning/configs/data/refinedweb_mixture_sample.yaml +51 -0
nested_learning/configs/deepspeed/zero3.json +25 -0
nested_learning/configs/hope/mid.yaml +118 -0
nested_learning/configs/hope/mid_fsdp.yaml +47 -0
nested_learning/configs/hope/pilot.yaml +2 -0
nested_learning/configs/hope/pilot_attention.yaml +9 -0
nested_learning/configs/hope/pilot_selfmod.yaml +20 -0
nested_learning/configs/hope/pilot_transformer.yaml +9 -0
nested_learning/configs/hope/target.yaml +145 -0
nested_learning/configs/hope/target_fsdp.yaml +47 -0
nested_learning/configs/mid_smoke.yaml +99 -0
nested_learning/configs/mid_stage2.yaml +110 -0
nested_learning/configs/mid_stage2_smoke.yaml +102 -0
nested_learning/configs/mid_titan_baseline.yaml +92 -0
nested_learning/configs/pilot.yaml +127 -0
nested_learning/configs/pilot_paper_faithful.yaml +42 -0
nested_learning/configs/pilot_selfmod_paper_faithful.yaml +18 -0
nested_learning/configs/pilot_smoke.yaml +80 -0
nested_learning/configs/resolved/cms_sparse_eval.yaml +105 -0
nested_learning/configs/resolved/phase2_pilot_attention_eval.yaml +49 -0
nested_learning/configs/resolved/phase2_pilot_transformer_eval.yaml +49 -0
nested_learning/continual_classification.py +136 -0
nested_learning/continual_streaming.py +283 -0
nested_learning/data.py +153 -0
nested_learning/device.py +21 -0
nested_learning/eval_state.py +72 -0
nested_learning/fast_state.py +108 -0
nested_learning/functional.py +69 -0
nested_learning/hope/__init__.py +0 -0
nested_learning/hope/block.py +1973 -0
nested_learning/hope/self_mod.py +40 -0
nested_learning/instrumentation.py +38 -0
nested_learning/levels.py +94 -0
nested_learning/logging_utils.py +64 -0
nested_learning/memorize.py +382 -0
nested_learning/model.py +604 -0
nested_learning/optim/__init__.py +0 -0
nested_learning/optim/deep.py +102 -0
nested_learning/optim/factory.py +13 -0
nested_learning/optim/m3.py +121 -0
nested_learning/optim/manager.py +151 -0
nested_learning/titan/__init__.py +0 -0
nested_learning/titan/memory.py +88 -0
nested_learning/titan/model.py +412 -0
nested_learning/titan/self_modifying.py +724 -0
nested_learning/tokenizer.py +28 -0
nested_learning/tokenizer_coverage.py +77 -0
nested_learning/training.py +1600 -0
nested_learning/transformer.py +104 -0
nested_learning-0.2.0.dist-info/METADATA +390 -0
nested_learning-0.2.0.dist-info/RECORD +76 -0
nested_learning-0.2.0.dist-info/WHEEL +4 -0
nested_learning-0.2.0.dist-info/entry_points.txt +2 -0
nested_learning-0.2.0.dist-info/licenses/LICENSE +201 -0

nested_learning/configs/hope/target.yaml ADDED Viewed

@@ -0,0 +1,145 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: .
+  output_subdir: null
+  job:
+    chdir: false
+model:
+  vocab_size: 32000
+  dim: 1536
+  num_layers: 32
+  heads: 24
+  surprise_threshold: null
+  freeze_backbone: false
+  titan_level:
+    name: titan
+    update_period: 32
+    optimizer_key: titan_opt
+  cms_levels:
+    - name: cms_fast
+      update_period: 1
+      optimizer_key: cms_fast_opt
+    - name: cms_mid
+      update_period: 4
+      optimizer_key: cms_mid_opt
+    - name: cms_slow
+      update_period: 32
+      optimizer_key: cms_slow_opt
+    - name: cms_ultra
+      update_period: 128
+      optimizer_key: cms_slow_opt
+    - name: cms_anchor
+      update_period: 512
+      optimizer_key: cms_anchor_opt
+  optimizers:
+    titan_opt:
+      type: deep_momentum
+      lr: 6.0e-4
+      params:
+        beta: 0.9
+        beta2: 0.999
+        variant: nl_l2_precond
+    cms_fast_opt:
+      type: deep_momentum
+      lr: 3.0e-4
+      params:
+        beta: 0.9
+        beta2: 0.999
+        variant: nl_l2_precond
+    cms_mid_opt:
+      type: deep_momentum
+      lr: 2.5e-4
+      params:
+        beta: 0.9
+        beta2: 0.999
+        variant: nl_l2_precond
+    cms_slow_opt:
+      type: deep_momentum
+      lr: 2.0e-4
+      params:
+        beta: 0.9
+        beta2: 0.999
+        variant: nl_l2_precond
+    cms_anchor_opt:
+      type: deep_momentum
+      lr: 1.5e-4
+      params:
+        beta: 0.9
+        beta2: 0.999
+        variant: nl_l2_precond
+data:
+  source: mixture
+  batch_size: 32
+  num_workers: 8
+  mixture:
+    samples_per_epoch: 32768
+    seed: 123
+    sources:
+      - name: refinedweb
+        shards_dir: data/shards/refinedweb_filtered
+        weight: 0.35
+      - name: wikipedia
+        shards_dir: data/shards/wikipedia_filtered
+        weight: 0.2
+      - name: c4
+        shards_dir: data/shards/c4_filtered
+        weight: 0.15
+      - name: redpajama
+        shards_dir: data/shards/redpajama_filtered
+        weight: 0.2
+      - name: code
+        shards_dir: data/shards/code_filtered
+        weight: 0.1
+train:
+  strict_streaming_contract: false
+  online_updates: true
+  online_chunk_size: 0
+  online_boundary_targets: false
+  online_carry_attention_cache: false
+  per_layer_teach_signal: true
+  steps: 200
+  log_interval: 10
+  device: "cuda:1"
+  seed: 9001
+  deterministic: false
+  step_offset: 0
+  mixed_precision:
+    enabled: true
+    dtype: bf16
+  compile:
+    enable: true
+    mode: max-autotune
+  fsdp:
+    auto_wrap_min_params: 2000000
+    cpu_offload: false
+  checkpoint:
+    enable: true
+    dir: checkpoints/target
+    save_interval: 100
+    resume_path: null
+    resume_tag: null
+optim:
+  type: muon
+  lr: 1.5e-4
+  weight_decay: 0.02
+  momentum: 0.95
+  betas:
+    - 0.9
+    - 0.999
+logging:
+  enabled: false
+  backend: wandb
+  project: nested-learning
+  run_name: target-${now:%Y%m%d%H%M%S}
+  path: logs/target_metrics.json
+deepspeed:
+  config: configs/deepspeed/zero3.json

nested_learning/configs/hope/target_fsdp.yaml ADDED Viewed

@@ -0,0 +1,47 @@
+defaults:
+  - target
+  - _self_
+model:
+  gradient_checkpointing: true
+data:
+  batch_size: 4  # per-rank micro-batch
+  num_workers: 8
+train:
+  strict_streaming_contract: false
+  online_updates: true
+  online_chunk_size: 0
+  online_boundary_targets: false
+  online_carry_attention_cache: false
+  per_layer_teach_signal: true
+  steps: 300000
+  log_interval: 20
+  device: "cuda"
+  mixed_precision:
+    enabled: true
+    dtype: bf16
+  compile:
+    enable: false
+  fsdp:
+    auto_wrap_min_params: 2500000
+    cpu_offload: false
+  checkpoint:
+    enable: true
+    dir: artifacts/checkpoints/target_fsdp
+    save_interval: 1000
+    resume_path: null
+    resume_tag: null
+optim:
+  type: muon
+  lr: 1.5e-4
+  weight_decay: 0.01
+logging:
+  enabled: true
+  backend: wandb
+  project: nested-learning
+  run_name: hope-target-fsdp-${now:%Y%m%d%H%M%S}
+  path: logs/target_fsdp_metrics.json

nested_learning/configs/mid_smoke.yaml ADDED Viewed

@@ -0,0 +1,99 @@
+hydra:
+  run:
+    dir: .
+  output_subdir: null
+  job:
+    chdir: false
+model:
+  vocab_size: 32000
+  dim: 256
+  num_layers: 4
+  heads: 8
+  titan_level:
+    name: titan
+    update_period: 16
+    optimizer_key: titan_opt
+  cms_levels:
+    - name: cms_fast
+      update_period: 1
+      optimizer_key: cms_opt
+    - name: cms_mid
+      update_period: 4
+      optimizer_key: cms_opt
+    - name: cms_slow
+      update_period: 16
+      optimizer_key: cms_opt
+    - name: cms_ultra
+      update_period: 64
+      optimizer_key: cms_opt
+  optimizers:
+    titan_opt:
+      type: deep_momentum
+      lr: 8.0e-4
+      params:
+        beta: 0.9
+        beta2: 0.999
+    cms_opt:
+      type: deep_momentum
+      lr: 4.0e-4
+      params:
+        beta: 0.9
+        beta2: 0.999
+data:
+  source: mixture
+  batch_size: 4
+  num_workers: 0
+  mixture:
+    samples_per_epoch: 128
+    seed: 0
+    sources:
+      - name: refinedweb
+        shards_dir: data/shards/refinedweb_filtered
+        weight: 0.4
+      - name: wikipedia
+        shards_dir: data/shards/wikipedia_filtered
+        weight: 0.2
+      - name: c4
+        shards_dir: data/shards/c4_filtered
+        weight: 0.15
+      - name: redpajama
+        shards_dir: data/shards/redpajama_filtered
+        weight: 0.15
+      - name: code
+        shards_dir: data/shards/code_filtered
+        weight: 0.1
+train:
+  strict_streaming_contract: false
+  online_updates: true
+  online_chunk_size: 0
+  online_boundary_targets: false
+  online_carry_attention_cache: false
+  per_layer_teach_signal: true
+  steps: 10
+  log_interval: 1
+  device: "cpu"
+  seed: 2024
+  deterministic: true
+  mixed_precision:
+    enabled: false
+    dtype: bf16
+  compile:
+    enable: false
+  checkpoint:
+    enable: true
+    dir: artifacts/checkpoints/mid_smoke
+    save_interval: 10
+    save_last: true
+optim:
+  type: adamw
+  lr: 2.0e-4
+  fused: false
+logging:
+  enabled: true
+  backend: json
+  path: logs/mid_smoke.json

nested_learning/configs/mid_stage2.yaml ADDED Viewed

@@ -0,0 +1,110 @@
+hydra:
+  run:
+    dir: .
+  output_subdir: null
+  job:
+    chdir: false
+model:
+  vocab_size: 32000
+  dim: 768
+  num_layers: 18
+  heads: 12
+  teach_scale: 0.05
+  teach_clip: 5.0
+  teach_schedule:
+    warmup_steps: 20
+    decay_start: 80
+    decay_duration: 40
+  titan_level:
+    name: titan
+    update_period: 16
+    optimizer_key: titan_opt
+  cms_levels:
+    - name: cms_fast
+      update_period: 1
+      optimizer_key: cms_opt
+    - name: cms_mid
+      update_period: 4
+      optimizer_key: cms_opt
+    - name: cms_slow
+      update_period: 32
+      optimizer_key: cms_opt
+    - name: cms_ultra
+      update_period: 128
+      optimizer_key: cms_opt
+  optimizers:
+    titan_opt:
+      type: deep_momentum
+      lr: 8.0e-4
+      params:
+        beta: 0.9
+        beta2: 0.999
+    cms_opt:
+      type: deep_momentum
+      lr: 4.0e-4
+      params:
+        beta: 0.9
+        beta2: 0.999
+data:
+  source: mixture
+  batch_size: 8
+  num_workers: 2
+  mixture:
+    samples_per_epoch: 1024
+    seed: 42
+    sources:
+      - name: refinedweb
+        shards_dir: data/shards/refinedweb_full
+        weight: 0.4
+      - name: wikipedia
+        shards_dir: data/shards/wikipedia_full
+        weight: 0.2
+      - name: c4
+        shards_dir: data/shards/c4_full
+        weight: 0.15
+      - name: redpajama
+        shards_dir: data/shards/redpajama_full
+        weight: 0.15
+      - name: code
+        shards_dir: data/shards/code_full
+        weight: 0.1
+train:
+  strict_streaming_contract: false
+  online_updates: true
+  online_chunk_size: 0
+  online_boundary_targets: false
+  online_carry_attention_cache: false
+  per_layer_teach_signal: true
+  steps: 100
+  log_interval: 10
+  device: "cuda"
+  seed: 3401
+  deterministic: false
+  mixed_precision:
+    enabled: true
+    dtype: bf16
+  compile:
+    enable: true
+    mode: max-autotune
+  fsdp:
+    auto_wrap_min_params: 2000000
+    cpu_offload: false
+  checkpoint:
+    enable: true
+    dir: artifacts/checkpoints/mid_stage2
+    save_interval: 100
+    resume_path: null
+    resume_tag: null
+optim:
+  type: adamw
+  lr: 3.0e-5
+  fused: auto
+logging:
+  enabled: true
+  backend: json
+  path: logs/mid_stage2.json

nested_learning/configs/mid_stage2_smoke.yaml ADDED Viewed

@@ -0,0 +1,102 @@
+hydra:
+  run:
+    dir: .
+  output_subdir: null
+  job:
+    chdir: false
+model:
+  vocab_size: 32000
+  dim: 512
+  num_layers: 12
+  heads: 8
+  teach_scale: 0.2
+  teach_clip: 2.0
+  titan_level:
+    name: titan
+    update_period: 16
+    optimizer_key: titan_opt
+  cms_levels:
+    - name: cms_fast
+      update_period: 1
+      optimizer_key: cms_opt
+    - name: cms_mid
+      update_period: 4
+      optimizer_key: cms_opt
+    - name: cms_slow
+      update_period: 16
+      optimizer_key: cms_opt
+  optimizers:
+    titan_opt:
+      type: deep_momentum
+      lr: 6.0e-4
+      params:
+        beta: 0.9
+        beta2: 0.999
+    cms_opt:
+      type: deep_momentum
+      lr: 3.0e-4
+      params:
+        beta: 0.9
+        beta2: 0.999
+data:
+  source: mixture
+  batch_size: 8
+  num_workers: 2
+  mixture:
+    samples_per_epoch: 512
+    seed: 0
+    sources:
+      - name: refinedweb
+        shards_dir: data/shards/refinedweb_filtered
+        weight: 0.4
+      - name: wikipedia
+        shards_dir: data/shards/wikipedia_filtered
+        weight: 0.2
+      - name: c4
+        shards_dir: data/shards/c4_filtered
+        weight: 0.15
+      - name: redpajama
+        shards_dir: data/shards/redpajama_filtered
+        weight: 0.15
+      - name: code
+        shards_dir: data/shards/code_filtered
+        weight: 0.1
+train:
+  strict_streaming_contract: false
+  online_updates: true
+  online_chunk_size: 0
+  online_boundary_targets: false
+  online_carry_attention_cache: false
+  per_layer_teach_signal: true
+  steps: 60
+  log_interval: 5
+  device: "cuda"
+  seed: 777
+  deterministic: false
+  mixed_precision:
+    enabled: true
+    dtype: bf16
+  compile:
+    enable: false
+  fsdp:
+    auto_wrap_min_params: 1500000
+    cpu_offload: false
+  checkpoint:
+    enable: true
+    dir: artifacts/checkpoints/mid_stage2_smoke
+    save_interval: 60
+    resume_path: null
+    resume_tag: null
+optim:
+  type: adamw
+  lr: 1.0e-4
+  fused: auto
+logging:
+  enabled: true
+  backend: json
+  path: logs/mid_stage2_smoke.json

nested_learning/configs/mid_titan_baseline.yaml ADDED Viewed

@@ -0,0 +1,92 @@
+hydra:
+  run:
+    dir: .
+  output_subdir: null
+  job:
+    chdir: false
+model:
+  type: titan
+  vocab_size: 32000
+  dim: 768
+  num_layers: 18
+  heads: 12
+  surprise_threshold: 0.02
+  freeze_backbone: false
+  titan_level:
+    name: titan
+    update_period: 16
+    optimizer_key: titan_opt
+  optimizers:
+    titan_opt:
+      type: deep_momentum
+      lr: 8.0e-4
+      params:
+        beta: 0.9
+        beta2: 0.999
+  teach_scale: 0.10
+  teach_clip: 4.0
+  teach_schedule:
+    warmup_steps: 60
+    decay_start: 140
+    decay_duration: 80
+data:
+  source: mixture
+  batch_size: 4
+  num_workers: 2
+  mixture:
+    samples_per_epoch: 1024
+    seed: 42
+    sources:
+      - name: refinedweb
+        shards_dir: data/shards/refinedweb_full
+        weight: 0.4
+      - name: wikipedia
+        shards_dir: data/shards/wikipedia_full
+        weight: 0.2
+      - name: c4
+        shards_dir: data/shards/c4_full
+        weight: 0.15
+      - name: redpajama
+        shards_dir: data/shards/redpajama_full
+        weight: 0.15
+      - name: code
+        shards_dir: data/shards/code_full
+        weight: 0.1
+train:
+  strict_streaming_contract: false
+  online_updates: true
+  online_chunk_size: 0
+  online_boundary_targets: false
+  online_carry_attention_cache: false
+  per_layer_teach_signal: true
+  steps: 220
+  log_interval: 20
+  device: "cuda:1"
+  seed: 451
+  deterministic: false
+  step_offset: 0
+  mixed_precision:
+    enabled: true
+    dtype: bf16
+  compile:
+    enable: false
+  checkpoint:
+    enable: true
+    dir: artifacts/checkpoints/mid_titan_baseline
+    save_interval: 100
+    resume_path: null
+    resume_tag: null
+optim:
+  type: adamw
+  lr: 1.0e-5
+  fused: auto
+logging:
+  enabled: true
+  backend: json
+  path: logs/mid_titan_baseline.json
+  run_name: mid_titan_baseline

nested_learning/configs/pilot.yaml ADDED Viewed

@@ -0,0 +1,127 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: .
+  output_subdir: null
+  job:
+    chdir: false
+model:
+  vocab_size: 32000
+  dim: 512
+  num_layers: 12
+  heads: 8
+  teach_scale: 0.10
+  teach_clip: 5.0
+  surprise_threshold: 0.02
+  freeze_backbone: false
+  self_mod_lr: 0.001
+  teach_schedule:
+    warmup_steps: 2000
+    decay_start: 120000
+    decay_duration: 20000
+  titan_level:
+    name: titan
+    update_period: 8
+    optimizer_key: titan_opt
+  cms_levels:
+    - name: cms_fast
+      update_period: 1
+      optimizer_key: cms_opt
+    - name: cms_mid
+      update_period: 4
+      optimizer_key: cms_opt
+    - name: cms_slow
+      update_period: 32
+      optimizer_key: cms_opt
+    - name: cms_ultra
+      update_period: 128
+      optimizer_key: cms_opt
+  optimizers:
+    titan_opt:
+      type: deep_momentum
+      lr: 6.0e-4
+      params:
+        beta: 0.9
+        beta2: 0.999
+        # Best-effort paper mapping: rank-1 context projection preconditioner.
+        variant: nl_l2_precond
+    cms_opt:
+      type: deep_momentum
+      lr: 3.0e-4
+      params:
+        beta: 0.9
+        beta2: 0.999
+        # Best-effort paper mapping: rank-1 context projection preconditioner.
+        variant: nl_l2_precond
+data:
+  source: mixture
+  seq_len: 2048
+  batch_size: 6
+  num_workers: 4
+  mixture:
+    samples_per_epoch: 65536
+    seed: 1337
+    sources:
+      - name: refinedweb
+        shards_dir: data/shards/refinedweb_filtered
+        weight: 0.4
+      - name: wikipedia
+        shards_dir: data/shards/wikipedia_filtered
+        weight: 0.2
+      - name: c4
+        shards_dir: data/shards/c4_filtered
+        weight: 0.15
+      - name: redpajama
+        shards_dir: data/shards/redpajama_filtered
+        weight: 0.15
+      - name: code
+        shards_dir: data/shards/code_filtered
+        weight: 0.1
+train:
+  algorithm_mode: two_pass_stopgrad_updates
+  strict_streaming_contract: false
+  online_updates: true
+  online_chunk_size: 0
+  online_boundary_targets: false
+  online_carry_attention_cache: false
+  per_layer_teach_signal: true
+  steps: 246667
+  log_interval: 50
+  device: "cuda:1"
+  seed: 1337
+  deterministic: false
+  step_offset: 0
+  mixed_precision:
+    enabled: true
+    dtype: bf16
+  compile:
+    enable: false
+    mode: max-autotune
+  checkpoint:
+    enable: true
+    dir: artifacts/checkpoints/pilot
+    save_interval: 1000
+    save_last: true
+    resume_path: null
+    resume_tag: null
+optim:
+  type: muon
+  lr: 2.5e-4
+  weight_decay: 0.02
+  momentum: 0.95
+  betas:
+    - 0.9
+    - 0.999
+logging:
+  enabled: true
+  backend: json
+  path: logs/pilot_metrics.json
+  project: nested-learning
+  run_name: pilot-main