PyPI - nested-learning - Versions diffs - 0.2.0__py3-none-any.whl - Mend

nested-learning 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (76)

nested_learning/__init__.py +12 -0
nested_learning/__main__.py +12 -0
nested_learning/assoc_memory.py +23 -0
nested_learning/backbones.py +147 -0
nested_learning/capabilities.py +104 -0
nested_learning/cli.py +253 -0
nested_learning/cms.py +92 -0
nested_learning/config_utils.py +50 -0
nested_learning/configs/ablations/cms_sparse.yaml +46 -0
nested_learning/configs/ablations/selfmod_chunked_8_64.yaml +24 -0
nested_learning/configs/ablations/selfmod_momentum_off.yaml +23 -0
nested_learning/configs/ablations/selfmod_momentum_on.yaml +23 -0
nested_learning/configs/ablations/selfmod_no_alpha.yaml +23 -0
nested_learning/configs/ablations/selfmod_no_cms.yaml +23 -0
nested_learning/configs/ablations/selfmod_rank1_precond_off.yaml +23 -0
nested_learning/configs/data/continual_segments_sample.yaml +9 -0
nested_learning/configs/data/fineweb_edu_longdoc_filtered_sample.yaml +14 -0
nested_learning/configs/data/fineweb_edu_mixture_full.yaml +14 -0
nested_learning/configs/data/fineweb_edu_mixture_sample.yaml +14 -0
nested_learning/configs/data/refinedweb_mixture.yaml +48 -0
nested_learning/configs/data/refinedweb_mixture_filtered.yaml +48 -0
nested_learning/configs/data/refinedweb_mixture_full.yaml +48 -0
nested_learning/configs/data/refinedweb_mixture_sample.yaml +51 -0
nested_learning/configs/deepspeed/zero3.json +25 -0
nested_learning/configs/hope/mid.yaml +118 -0
nested_learning/configs/hope/mid_fsdp.yaml +47 -0
nested_learning/configs/hope/pilot.yaml +2 -0
nested_learning/configs/hope/pilot_attention.yaml +9 -0
nested_learning/configs/hope/pilot_selfmod.yaml +20 -0
nested_learning/configs/hope/pilot_transformer.yaml +9 -0
nested_learning/configs/hope/target.yaml +145 -0
nested_learning/configs/hope/target_fsdp.yaml +47 -0
nested_learning/configs/mid_smoke.yaml +99 -0
nested_learning/configs/mid_stage2.yaml +110 -0
nested_learning/configs/mid_stage2_smoke.yaml +102 -0
nested_learning/configs/mid_titan_baseline.yaml +92 -0
nested_learning/configs/pilot.yaml +127 -0
nested_learning/configs/pilot_paper_faithful.yaml +42 -0
nested_learning/configs/pilot_selfmod_paper_faithful.yaml +18 -0
nested_learning/configs/pilot_smoke.yaml +80 -0
nested_learning/configs/resolved/cms_sparse_eval.yaml +105 -0
nested_learning/configs/resolved/phase2_pilot_attention_eval.yaml +49 -0
nested_learning/configs/resolved/phase2_pilot_transformer_eval.yaml +49 -0
nested_learning/continual_classification.py +136 -0
nested_learning/continual_streaming.py +283 -0
nested_learning/data.py +153 -0
nested_learning/device.py +21 -0
nested_learning/eval_state.py +72 -0
nested_learning/fast_state.py +108 -0
nested_learning/functional.py +69 -0
nested_learning/hope/__init__.py +0 -0
nested_learning/hope/block.py +1973 -0
nested_learning/hope/self_mod.py +40 -0
nested_learning/instrumentation.py +38 -0
nested_learning/levels.py +94 -0
nested_learning/logging_utils.py +64 -0
nested_learning/memorize.py +382 -0
nested_learning/model.py +604 -0
nested_learning/optim/__init__.py +0 -0
nested_learning/optim/deep.py +102 -0
nested_learning/optim/factory.py +13 -0
nested_learning/optim/m3.py +121 -0
nested_learning/optim/manager.py +151 -0
nested_learning/titan/__init__.py +0 -0
nested_learning/titan/memory.py +88 -0
nested_learning/titan/model.py +412 -0
nested_learning/titan/self_modifying.py +724 -0
nested_learning/tokenizer.py +28 -0
nested_learning/tokenizer_coverage.py +77 -0
nested_learning/training.py +1600 -0
nested_learning/transformer.py +104 -0
nested_learning-0.2.0.dist-info/METADATA +390 -0
nested_learning-0.2.0.dist-info/RECORD +76 -0
nested_learning-0.2.0.dist-info/WHEEL +4 -0
nested_learning-0.2.0.dist-info/entry_points.txt +2 -0
nested_learning-0.2.0.dist-info/licenses/LICENSE +201 -0

nested_learning/configs/ablations/cms_sparse.yaml ADDED Viewed

@@ -0,0 +1,46 @@
+defaults:
+  - /pilot
+  - _self_
+model:
+  dim: 384
+  num_layers: 8
+  heads: 6
+  titan_level:
+    name: titan
+    update_period: 8
+    optimizer_key: titan_opt
+  cms_hidden_multiplier: 2
+  cms_levels:
+    - name: cms_fast
+      update_period: 8
+      optimizer_key: cms_opt
+    - name: cms_mid
+      update_period: 32
+      optimizer_key: cms_opt
+    - name: cms_slow
+      update_period: 128
+      optimizer_key: cms_opt
+    - name: cms_ultra
+      update_period: 512
+      optimizer_key: cms_opt
+data:
+  seq_len: 1024
+  batch_size: 2
+  num_workers: 2
+train:
+  online_updates: true
+  online_chunk_size: 0
+  per_layer_teach_signal: true
+  steps: 5000
+  device: "cuda:1"
+  checkpoint:
+    dir: artifacts/checkpoints/pilot_cms_sparse
+    save_interval: 1000
+  log_interval: 25
+logging:
+  path: logs/pilot_cms_sparse_metrics.json
+  run_name: pilot-cms-sparse

nested_learning/configs/ablations/selfmod_chunked_8_64.yaml ADDED Viewed

@@ -0,0 +1,24 @@
+defaults:
+  - /pilot
+  - _self_
+model:
+  block_variant: hope_selfmod
+  self_mod_chunk_size: 8
+  self_mod_chunk_size_memory: 64
+train:
+  online_updates: true
+  online_chunk_size: 0
+  per_layer_teach_signal: true
+  steps: 5000
+  device: "cuda:1"
+  checkpoint:
+    dir: artifacts/checkpoints/pilot_selfmod_chunked_8_64
+    save_interval: 1000
+logging:
+  enabled: true
+  backend: json
+  path: logs/pilot_selfmod_chunked_8_64_metrics.json
+  run_name: pilot-selfmod-chunked-8-64

nested_learning/configs/ablations/selfmod_momentum_off.yaml ADDED Viewed

@@ -0,0 +1,23 @@
+defaults:
+  - /pilot
+  - _self_
+model:
+  block_variant: hope_selfmod
+  self_mod_momentum: 0.0
+train:
+  online_updates: true
+  online_chunk_size: 0
+  per_layer_teach_signal: true
+  steps: 5000
+  device: "cuda:1"
+  checkpoint:
+    dir: artifacts/checkpoints/pilot_selfmod_momentum_off
+    save_interval: 1000
+logging:
+  enabled: true
+  backend: json
+  path: logs/pilot_selfmod_momentum_off_metrics.json
+  run_name: pilot-selfmod-momentum-off

nested_learning/configs/ablations/selfmod_momentum_on.yaml ADDED Viewed

@@ -0,0 +1,23 @@
+defaults:
+  - /pilot
+  - _self_
+model:
+  block_variant: hope_selfmod
+  self_mod_momentum: 0.9
+train:
+  online_updates: true
+  online_chunk_size: 0
+  per_layer_teach_signal: true
+  steps: 5000
+  device: "cuda:1"
+  checkpoint:
+    dir: artifacts/checkpoints/pilot_selfmod_momentum_on
+    save_interval: 1000
+logging:
+  enabled: true
+  backend: json
+  path: logs/pilot_selfmod_momentum_on_metrics.json
+  run_name: pilot-selfmod-momentum-on

nested_learning/configs/ablations/selfmod_no_alpha.yaml ADDED Viewed

@@ -0,0 +1,23 @@
+defaults:
+  - /pilot
+  - _self_
+model:
+  block_variant: hope_selfmod
+  self_mod_use_alpha: false
+train:
+  online_updates: true
+  online_chunk_size: 0
+  per_layer_teach_signal: true
+  steps: 5000
+  device: "cuda:1"
+  checkpoint:
+    dir: artifacts/checkpoints/pilot_selfmod_no_alpha
+    save_interval: 1000
+logging:
+  enabled: true
+  backend: json
+  path: logs/pilot_selfmod_no_alpha_metrics.json
+  run_name: pilot-selfmod-no-alpha

nested_learning/configs/ablations/selfmod_no_cms.yaml ADDED Viewed

@@ -0,0 +1,23 @@
+defaults:
+  - /pilot
+  - _self_
+model:
+  block_variant: hope_selfmod
+  cms_levels: []
+train:
+  online_updates: true
+  online_chunk_size: 0
+  per_layer_teach_signal: true
+  steps: 5000
+  device: "cuda:1"
+  checkpoint:
+    dir: artifacts/checkpoints/pilot_selfmod_no_cms
+    save_interval: 1000
+logging:
+  enabled: true
+  backend: json
+  path: logs/pilot_selfmod_no_cms_metrics.json
+  run_name: pilot-selfmod-no-cms

nested_learning/configs/ablations/selfmod_rank1_precond_off.yaml ADDED Viewed

@@ -0,0 +1,23 @@
+defaults:
+  - /pilot
+  - _self_
+model:
+  block_variant: hope_selfmod
+  self_mod_use_rank1_precond: false
+train:
+  online_updates: true
+  online_chunk_size: 0
+  per_layer_teach_signal: true
+  steps: 5000
+  device: "cuda:1"
+  checkpoint:
+    dir: artifacts/checkpoints/pilot_selfmod_rank1_off
+    save_interval: 1000
+logging:
+  enabled: true
+  backend: json
+  path: logs/pilot_selfmod_rank1_off_metrics.json
+  run_name: pilot-selfmod-rank1-off

nested_learning/configs/data/continual_segments_sample.yaml ADDED Viewed

@@ -0,0 +1,9 @@
+segments:
+  - name: refinedweb_2018
+    shards_dir: data/shards/refinedweb_sample
+  - name: wikipedia_sample
+    shards_dir: data/shards/wikipedia_sample
+  - name: c4_sample
+    shards_dir: data/shards/c4_sample
+  - name: redpajama_sample
+    shards_dir: data/shards/redpajama_sample

nested_learning/configs/data/fineweb_edu_longdoc_filtered_sample.yaml ADDED Viewed

@@ -0,0 +1,14 @@
+name: fineweb_edu_longdoc_filtered_sample
+tokenizer_output_dir: artifacts/tokenizer/fineweb_edu_longdoc
+datasets:
+  - name: fineweb_edu_longdoc
+    dataset: text
+    split: train
+    text_column: text
+    data_files: data/filtered/fineweb_edu_longdoc_en_sample.txt
+    sample_limit: 5000
+    seq_len: 4096
+    sequences_per_shard: 1024
+    output_dir: data/shards/fineweb_edu_longdoc_sample
+    max_records: null

nested_learning/configs/data/fineweb_edu_mixture_full.yaml ADDED Viewed

@@ -0,0 +1,14 @@
+name: fineweb_edu_full
+tokenizer_output_dir: artifacts/tokenizer/fineweb_edu
+datasets:
+  - name: fineweb_edu
+    dataset: HuggingFaceFW/fineweb-edu
+    subset: sample-100BT
+    split: train
+    text_column: text
+    sample_limit: 100000
+    seq_len: 4096
+    sequences_per_shard: 1024
+    output_dir: data/shards/fineweb_edu_full
+    max_records: null

nested_learning/configs/data/fineweb_edu_mixture_sample.yaml ADDED Viewed

@@ -0,0 +1,14 @@
+name: fineweb_edu_sample
+tokenizer_output_dir: artifacts/tokenizer/fineweb_edu
+datasets:
+  - name: fineweb_edu
+    dataset: HuggingFaceFW/fineweb-edu
+    subset: sample-10BT
+    split: train
+    text_column: text
+    sample_limit: 5000
+    seq_len: 2048
+    sequences_per_shard: 1024
+    output_dir: data/shards/fineweb_edu_sample
+    max_records: 10000

nested_learning/configs/data/refinedweb_mixture.yaml ADDED Viewed

@@ -0,0 +1,48 @@
+name: refinedweb_mix_v1
+tokenizer_output_dir: artifacts/tokenizer/refinedweb_mix
+datasets:
+  - name: refinedweb
+    dataset: text
+    split: train
+    text_column: text
+    data_files: data/filtered/refinedweb_en_full.txt
+    seq_len: 2048
+    sequences_per_shard: 2048
+    output_dir: data/shards/refinedweb
+    max_records: null
+  - name: books
+    dataset: text
+    split: train
+    text_column: text
+    data_files: data/filtered/wikipedia_en_full.txt
+    seq_len: 2048
+    sequences_per_shard: 2048
+    output_dir: data/shards/wikipedia
+    max_records: null
+  - name: c4
+    dataset: text
+    split: train
+    text_column: text
+    data_files: data/filtered/c4_en_full.txt
+    seq_len: 2048
+    sequences_per_shard: 2048
+    output_dir: data/shards/c4
+    max_records: null
+  - name: redpajama
+    dataset: text
+    split: train
+    text_column: text
+    data_files: data/filtered/redpajama_en_full.txt
+    seq_len: 2048
+    sequences_per_shard: 2048
+    output_dir: data/shards/redpajama
+    max_records: null
+  - name: code
+    dataset: text
+    split: train
+    text_column: text
+    data_files: data/filtered/code_en_full.txt
+    seq_len: 2048
+    sequences_per_shard: 2048
+    output_dir: data/shards/code
+    max_records: null

nested_learning/configs/data/refinedweb_mixture_filtered.yaml ADDED Viewed

@@ -0,0 +1,48 @@
+name: refinedweb_mix_filtered
+tokenizer_output_dir: artifacts/tokenizer/refinedweb_mix
+datasets:
+  - name: refinedweb
+    dataset: text
+    split: train
+    text_column: text
+    data_files: data/filtered/refinedweb_en_sample.txt
+    seq_len: 512
+    sequences_per_shard: 512
+    output_dir: data/shards/refinedweb_filtered
+    max_records: null
+  - name: wikipedia
+    dataset: text
+    split: train
+    text_column: text
+    data_files: data/filtered/wikipedia_en_sample.txt
+    seq_len: 512
+    sequences_per_shard: 512
+    output_dir: data/shards/wikipedia_filtered
+    max_records: null
+  - name: c4
+    dataset: text
+    split: train
+    text_column: text
+    data_files: data/filtered/c4_en_sample.txt
+    seq_len: 512
+    sequences_per_shard: 512
+    output_dir: data/shards/c4_filtered
+    max_records: null
+  - name: redpajama
+    dataset: text
+    split: train
+    text_column: text
+    data_files: data/filtered/redpajama_en_sample.txt
+    seq_len: 512
+    sequences_per_shard: 512
+    output_dir: data/shards/redpajama_filtered
+    max_records: null
+  - name: code
+    dataset: text
+    split: train
+    text_column: text
+    data_files: data/filtered/code_en_sample.txt
+    seq_len: 512
+    sequences_per_shard: 512
+    output_dir: data/shards/code_filtered
+    max_records: null

nested_learning/configs/data/refinedweb_mixture_full.yaml ADDED Viewed

@@ -0,0 +1,48 @@
+name: refinedweb_mix_full
+tokenizer_output_dir: artifacts/tokenizer/refinedweb_mix
+datasets:
+  - name: refinedweb
+    dataset: text
+    split: train
+    text_column: text
+    data_files: data/filtered/refinedweb_en_full.txt
+    seq_len: 2048
+    sequences_per_shard: 1024
+    output_dir: data/shards/refinedweb_full
+    max_records: null
+  - name: wikipedia
+    dataset: text
+    split: train
+    text_column: text
+    data_files: data/filtered/wikipedia_en_full.txt
+    seq_len: 2048
+    sequences_per_shard: 1024
+    output_dir: data/shards/wikipedia_full
+    max_records: null
+  - name: c4
+    dataset: text
+    split: train
+    text_column: text
+    data_files: data/filtered/c4_en_full.txt
+    seq_len: 2048
+    sequences_per_shard: 1024
+    output_dir: data/shards/c4_full
+    max_records: null
+  - name: redpajama
+    dataset: text
+    split: train
+    text_column: text
+    data_files: data/filtered/redpajama_en_full.txt
+    seq_len: 2048
+    sequences_per_shard: 1024
+    output_dir: data/shards/redpajama_full
+    max_records: null
+  - name: code
+    dataset: text
+    split: train
+    text_column: text
+    data_files: data/filtered/code_en_full.txt
+    seq_len: 2048
+    sequences_per_shard: 1024
+    output_dir: data/shards/code_full
+    max_records: null

nested_learning/configs/data/refinedweb_mixture_sample.yaml ADDED Viewed

@@ -0,0 +1,51 @@
+name: refinedweb_mix_sample
+tokenizer_output_dir: artifacts/tokenizer/refinedweb_mix
+datasets:
+  - name: refinedweb
+    dataset: HuggingFaceFW/fineweb
+    subset: sample-10BT
+    split: train
+    text_column: text
+    sample_limit: 5000
+    seq_len: 512
+    sequences_per_shard: 512
+    output_dir: data/shards/refinedweb_sample
+    max_records: 10000
+  - name: books
+    dataset: wikimedia/wikipedia
+    subset: 20231101.en
+    split: train
+    text_column: text
+    sample_limit: 2000
+    seq_len: 512
+    sequences_per_shard: 512
+    output_dir: data/shards/wikipedia_sample
+    max_records: 5000
+  - name: c4
+    dataset: allenai/c4
+    subset: en
+    split: train
+    text_column: text
+    sample_limit: 2000
+    seq_len: 512
+    sequences_per_shard: 512
+    output_dir: data/shards/c4_sample
+    max_records: 4000
+  - name: redpajama
+    dataset: cerebras/SlimPajama-627B
+    split: train
+    text_column: text
+    sample_limit: 2000
+    seq_len: 512
+    sequences_per_shard: 512
+    output_dir: data/shards/redpajama_sample
+    max_records: 4000
+  - name: code
+    dataset: codeparrot/codeparrot-clean-train
+    split: train
+    text_column: content
+    sample_limit: 2000
+    seq_len: 512
+    sequences_per_shard: 512
+    output_dir: data/shards/code_sample
+    max_records: 4000

nested_learning/configs/deepspeed/zero3.json ADDED Viewed

@@ -0,0 +1,25 @@
+{
+  "bf16": {
+    "enabled": true
+  },
+  "train_batch_size": 64,
+  "gradient_accumulation_steps": 1,
+  "zero_optimization": {
+    "stage": 3,
+    "reduce_bucket_size": 50000000,
+    "stage3_param_persistence_threshold": 100000,
+    "stage3_prefetch_bucket_size": 50000000
+  },
+  "optimizer": {
+    "type": "AdamW",
+    "params": {
+      "lr": 0.0002,
+      "betas": [
+        0.9,
+        0.95
+      ],
+      "eps": 1e-08,
+      "weight_decay": 0.01
+    }
+  }
+}

nested_learning/configs/hope/mid.yaml ADDED Viewed

@@ -0,0 +1,118 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: .
+  output_subdir: null
+  job:
+    chdir: false
+model:
+  vocab_size: 32000
+  dim: 1024
+  num_layers: 24
+  heads: 16
+  surprise_threshold: null
+  freeze_backbone: false
+  titan_level:
+    name: titan
+    update_period: 16
+    optimizer_key: titan_opt
+  cms_levels:
+    - name: cms_fast
+      update_period: 1
+      optimizer_key: cms_opt
+    - name: cms_mid
+      update_period: 4
+      optimizer_key: cms_opt
+    - name: cms_slow
+      update_period: 32
+      optimizer_key: cms_opt
+    - name: cms_ultra
+      update_period: 128
+      optimizer_key: cms_opt
+  optimizers:
+    titan_opt:
+      type: deep_momentum
+      lr: 8.0e-4
+      params:
+        beta: 0.9
+        beta2: 0.999
+        variant: nl_l2_precond
+    cms_opt:
+      type: deep_momentum
+      lr: 4.0e-4
+      params:
+        beta: 0.9
+        beta2: 0.999
+        variant: nl_l2_precond
+data:
+  source: mixture
+  batch_size: 16
+  num_workers: 4
+  mixture:
+    samples_per_epoch: 8192
+    seed: 42
+    sources:
+      - name: refinedweb
+        shards_dir: data/shards/refinedweb_full
+        weight: 0.4
+      - name: wikipedia
+        shards_dir: data/shards/wikipedia_full
+        weight: 0.2
+      - name: c4
+        shards_dir: data/shards/c4_full
+        weight: 0.15
+      - name: redpajama
+        shards_dir: data/shards/redpajama_full
+        weight: 0.15
+      - name: code
+        shards_dir: data/shards/code_full
+        weight: 0.1
+train:
+  strict_streaming_contract: false
+  online_updates: true
+  online_chunk_size: 0
+  online_boundary_targets: false
+  online_carry_attention_cache: false
+  per_layer_teach_signal: true
+  steps: 100
+  log_interval: 10
+  device: "cuda:1"
+  seed: 808
+  deterministic: false
+  step_offset: 0
+  mixed_precision:
+    enabled: true
+    dtype: bf16
+  compile:
+    enable: true
+    mode: max-autotune
+  fsdp:
+    auto_wrap_min_params: 2000000
+    cpu_offload: false
+  checkpoint:
+    enable: true
+    dir: checkpoints/mid
+    save_interval: 50
+    resume_path: null
+    resume_tag: null
+optim:
+  type: muon
+  lr: 2.0e-4
+  weight_decay: 0.02
+  momentum: 0.95
+  betas:
+    - 0.9
+    - 0.999
+logging:
+  enabled: false
+  backend: wandb
+  project: nested-learning
+  run_name: mid-${now:%Y%m%d%H%M%S}
+  path: logs/mid_metrics.json

nested_learning/configs/hope/mid_fsdp.yaml ADDED Viewed

@@ -0,0 +1,47 @@
+defaults:
+  - mid
+  - _self_
+model:
+  gradient_checkpointing: true
+data:
+  batch_size: 8  # per-rank micro-batch for 2× RTX 6000 Ada
+  num_workers: 6
+train:
+  strict_streaming_contract: false
+  online_updates: true
+  online_chunk_size: 0
+  online_boundary_targets: false
+  online_carry_attention_cache: false
+  per_layer_teach_signal: true
+  steps: 250000
+  log_interval: 20
+  device: "cuda"
+  mixed_precision:
+    enabled: true
+    dtype: bf16
+  compile:
+    enable: false
+  fsdp:
+    auto_wrap_min_params: 2000000
+    cpu_offload: false
+  checkpoint:
+    enable: true
+    dir: artifacts/checkpoints/mid_fsdp
+    save_interval: 1000
+    resume_path: null
+    resume_tag: null
+optim:
+  type: muon
+  lr: 2.0e-4
+  weight_decay: 0.01
+logging:
+  enabled: true
+  backend: wandb
+  project: nested-learning
+  run_name: hope-mid-fsdp-${now:%Y%m%d%H%M%S}
+  path: logs/mid_fsdp_metrics.json

nested_learning/configs/hope/pilot.yaml ADDED Viewed

@@ -0,0 +1,2 @@
+defaults:
+  - /pilot

nested_learning/configs/hope/pilot_attention.yaml ADDED Viewed

@@ -0,0 +1,9 @@
+defaults:
+  - /pilot
+  - _self_
+model:
+  block_variant: hope_attention
+  qk_l2_norm: true
+  local_conv_window: 4

nested_learning/configs/hope/pilot_selfmod.yaml ADDED Viewed

@@ -0,0 +1,20 @@
+defaults:
+  - /pilot
+  - _self_
+model:
+  block_variant: hope_selfmod
+  # Chunk update cadence (paper §8.2): other memories update more often than M_memory.
+  self_mod_chunk_size: 8
+  self_mod_chunk_size_memory: 64
+train:
+  online_updates: true
+  online_chunk_size: 0
+  per_layer_teach_signal: true
+  checkpoint:
+    dir: artifacts/checkpoints/pilot_selfmod
+logging:
+  run_name: pilot-selfmod
+  path: logs/pilot_selfmod_metrics.json

nested_learning/configs/hope/pilot_transformer.yaml ADDED Viewed

@@ -0,0 +1,9 @@
+defaults:
+  - /pilot
+  - _self_
+model:
+  block_variant: transformer
+  qk_l2_norm: true
+  local_conv_window: 4