nested_learning-0.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. nested_learning/__init__.py +12 -0
  2. nested_learning/__main__.py +12 -0
  3. nested_learning/assoc_memory.py +23 -0
  4. nested_learning/backbones.py +147 -0
  5. nested_learning/capabilities.py +104 -0
  6. nested_learning/cli.py +253 -0
  7. nested_learning/cms.py +92 -0
  8. nested_learning/config_utils.py +50 -0
  9. nested_learning/configs/ablations/cms_sparse.yaml +46 -0
  10. nested_learning/configs/ablations/selfmod_chunked_8_64.yaml +24 -0
  11. nested_learning/configs/ablations/selfmod_momentum_off.yaml +23 -0
  12. nested_learning/configs/ablations/selfmod_momentum_on.yaml +23 -0
  13. nested_learning/configs/ablations/selfmod_no_alpha.yaml +23 -0
  14. nested_learning/configs/ablations/selfmod_no_cms.yaml +23 -0
  15. nested_learning/configs/ablations/selfmod_rank1_precond_off.yaml +23 -0
  16. nested_learning/configs/data/continual_segments_sample.yaml +9 -0
  17. nested_learning/configs/data/fineweb_edu_longdoc_filtered_sample.yaml +14 -0
  18. nested_learning/configs/data/fineweb_edu_mixture_full.yaml +14 -0
  19. nested_learning/configs/data/fineweb_edu_mixture_sample.yaml +14 -0
  20. nested_learning/configs/data/refinedweb_mixture.yaml +48 -0
  21. nested_learning/configs/data/refinedweb_mixture_filtered.yaml +48 -0
  22. nested_learning/configs/data/refinedweb_mixture_full.yaml +48 -0
  23. nested_learning/configs/data/refinedweb_mixture_sample.yaml +51 -0
  24. nested_learning/configs/deepspeed/zero3.json +25 -0
  25. nested_learning/configs/hope/mid.yaml +118 -0
  26. nested_learning/configs/hope/mid_fsdp.yaml +47 -0
  27. nested_learning/configs/hope/pilot.yaml +2 -0
  28. nested_learning/configs/hope/pilot_attention.yaml +9 -0
  29. nested_learning/configs/hope/pilot_selfmod.yaml +20 -0
  30. nested_learning/configs/hope/pilot_transformer.yaml +9 -0
  31. nested_learning/configs/hope/target.yaml +145 -0
  32. nested_learning/configs/hope/target_fsdp.yaml +47 -0
  33. nested_learning/configs/mid_smoke.yaml +99 -0
  34. nested_learning/configs/mid_stage2.yaml +110 -0
  35. nested_learning/configs/mid_stage2_smoke.yaml +102 -0
  36. nested_learning/configs/mid_titan_baseline.yaml +92 -0
  37. nested_learning/configs/pilot.yaml +127 -0
  38. nested_learning/configs/pilot_paper_faithful.yaml +42 -0
  39. nested_learning/configs/pilot_selfmod_paper_faithful.yaml +18 -0
  40. nested_learning/configs/pilot_smoke.yaml +80 -0
  41. nested_learning/configs/resolved/cms_sparse_eval.yaml +105 -0
  42. nested_learning/configs/resolved/phase2_pilot_attention_eval.yaml +49 -0
  43. nested_learning/configs/resolved/phase2_pilot_transformer_eval.yaml +49 -0
  44. nested_learning/continual_classification.py +136 -0
  45. nested_learning/continual_streaming.py +283 -0
  46. nested_learning/data.py +153 -0
  47. nested_learning/device.py +21 -0
  48. nested_learning/eval_state.py +72 -0
  49. nested_learning/fast_state.py +108 -0
  50. nested_learning/functional.py +69 -0
  51. nested_learning/hope/__init__.py +0 -0
  52. nested_learning/hope/block.py +1973 -0
  53. nested_learning/hope/self_mod.py +40 -0
  54. nested_learning/instrumentation.py +38 -0
  55. nested_learning/levels.py +94 -0
  56. nested_learning/logging_utils.py +64 -0
  57. nested_learning/memorize.py +382 -0
  58. nested_learning/model.py +604 -0
  59. nested_learning/optim/__init__.py +0 -0
  60. nested_learning/optim/deep.py +102 -0
  61. nested_learning/optim/factory.py +13 -0
  62. nested_learning/optim/m3.py +121 -0
  63. nested_learning/optim/manager.py +151 -0
  64. nested_learning/titan/__init__.py +0 -0
  65. nested_learning/titan/memory.py +88 -0
  66. nested_learning/titan/model.py +412 -0
  67. nested_learning/titan/self_modifying.py +724 -0
  68. nested_learning/tokenizer.py +28 -0
  69. nested_learning/tokenizer_coverage.py +77 -0
  70. nested_learning/training.py +1600 -0
  71. nested_learning/transformer.py +104 -0
  72. nested_learning-0.2.0.dist-info/METADATA +390 -0
  73. nested_learning-0.2.0.dist-info/RECORD +76 -0
  74. nested_learning-0.2.0.dist-info/WHEEL +4 -0
  75. nested_learning-0.2.0.dist-info/entry_points.txt +2 -0
  76. nested_learning-0.2.0.dist-info/licenses/LICENSE +201 -0
nested_learning/configs/pilot_paper_faithful.yaml
@@ -0,0 +1,42 @@
+ defaults:
+   - /pilot
+   - _self_
+
+ model:
+   # Explicit paper-defined variant (avoid inheriting repo default `hope_hybrid`).
+   block_variant: hope_attention
+   # Paper-faithful: treat "surprise" as the (scaled) teach signal itself, without threshold gating.
+   surprise_threshold: null
+   # Paper updates on the last (possibly partial) chunk; enable flush for non-multiple seq lengths.
+   cms_flush_partial_at_end: true
+   # Paper: q is non-adaptive and uses a fixed projection.
+   self_mod_adaptive_q: false
+   # Paper: local causal conv in the HOPE self-mod module.
+   self_mod_local_conv_window: 4
+
+ data:
+   # Paper-faithful semantics: CMS/TITAN fast state is per-context; this repo currently treats
+   # each *batch* as a single shared context when batch_size>1.
+   batch_size: 1
+
+ train:
+   algorithm_mode: two_pass_stopgrad_updates
+   # Keep this explicit (instead of inherited) so paper-faithful behavior is visible in one file.
+   online_updates: true
+   # Paper: re-initialize fast memories per context (sequence).
+   use_fast_state: true
+   strict_streaming_contract: true
+   # Use explicit boundary-token supervision (no overlap approximation).
+   online_boundary_targets: true
+   # Carry attention state across chunks during online updates.
+   online_carry_attention_cache: true
+   # Fail fast if DDP would silently disable paper-critical features.
+   fail_if_paper_faithful_disabled: true
+
+ optim:
+   # Ensure meta-learning updates include memory module initial states (paper §8.2).
+   param_policy: all
+
+ logging:
+   run_name: pilot-paper-faithful
+   path: logs/pilot_paper_faithful_metrics.json
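The `defaults` list above is Hydra composition: the base `/pilot` config is loaded first, then `_self_` applies this file's keys on top. A minimal sketch of that layering using plain OmegaConf merging (Hydra's own resolver handles the `/pilot` lookup and `_self_` ordering; the file paths are taken from the listing above, and the `defaults` key itself would be carried along by a raw merge):

    from omegaconf import OmegaConf

    base = OmegaConf.load("nested_learning/configs/pilot.yaml")
    overrides = OmegaConf.load("nested_learning/configs/pilot_paper_faithful.yaml")
    cfg = OmegaConf.merge(base, overrides)  # later values win, as `_self_` last implies

    print(cfg.model.block_variant)       # hope_attention (overridden here)
    print(cfg.model.surprise_threshold)  # None (threshold gating disabled)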
nested_learning/configs/pilot_selfmod_paper_faithful.yaml
@@ -0,0 +1,18 @@
+ defaults:
+   - /pilot_paper_faithful
+   - _self_
+
+ model:
+   block_variant: hope_selfmod
+   # Chunk update cadence (paper §8.2): other memories update more often than M_memory.
+   self_mod_chunk_size: 8
+   self_mod_chunk_size_memory: 64
+   self_mod_use_skip: false
+
+ train:
+   checkpoint:
+     dir: artifacts/checkpoints/pilot_selfmod_paper_faithful
+
+ logging:
+   run_name: pilot-selfmod-paper-faithful
+   path: logs/pilot_selfmod_paper_faithful_metrics.json
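An illustration of the cadence these two chunk sizes encode: with `self_mod_chunk_size: 8` and `self_mod_chunk_size_memory: 64`, the faster self-modifying memories would update every 8 tokens while M_memory updates every 64, the relationship the comment above attributes to paper §8.2. The actual trigger logic lives in nested_learning/hope/self_mod.py and may differ; this only counts update events:

    # Illustrative only: update events per 256-token context under the two cadences.
    seq_len = 256
    fast_updates = [t for t in range(1, seq_len + 1) if t % 8 == 0]
    memory_updates = [t for t in range(1, seq_len + 1) if t % 64 == 0]
    print(len(fast_updates), len(memory_updates))  # 32 fast updates vs. 4 memory updates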
nested_learning/configs/pilot_smoke.yaml
@@ -0,0 +1,80 @@
+ hydra:
+   run:
+     dir: .
+   output_subdir: null
+   job:
+     chdir: false
+
+ model:
+   vocab_size: 32000
+   dim: 128
+   num_layers: 2
+   heads: 4
+   titan_level:
+     name: titan
+     update_period: 8
+     optimizer_key: titan_opt
+   cms_levels:
+     - name: cms_fast
+       update_period: 1
+       optimizer_key: cms_opt
+     - name: cms_mid
+       update_period: 4
+       optimizer_key: cms_opt
+     - name: cms_slow
+       update_period: 16
+       optimizer_key: cms_opt
+   optimizers:
+     titan_opt:
+       type: deep_momentum
+       lr: 1.0e-3
+       params:
+         beta: 0.9
+         beta2: 0.999
+     cms_opt:
+       type: deep_momentum
+       lr: 5.0e-4
+       params:
+         beta: 0.9
+         beta2: 0.999
+
+ data:
+   source: synthetic
+   vocab_size: 32000
+   seq_len: 64
+   dataset_size: 1024
+   batch_size: 4
+   num_workers: 0
+
+ train:
+   strict_streaming_contract: false
+   online_updates: true
+   online_chunk_size: 0
+   online_boundary_targets: false
+   online_carry_attention_cache: false
+   per_layer_teach_signal: true
+   steps: 10
+   log_interval: 1
+   device: "cpu"
+   seed: 1234
+   deterministic: true
+   mixed_precision:
+     enabled: false
+     dtype: bf16
+   compile:
+     enable: false
+   checkpoint:
+     enable: true
+     dir: artifacts/checkpoints/pilot_smoke
+     save_interval: 10
+     save_last: true
+
+ optim:
+   type: adamw
+   lr: 3.0e-4
+   fused: false
+
+ logging:
+   enabled: true
+   backend: json
+   path: logs/pilot_smoke.json
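A minimal sketch of the multi-timescale cadence the level configs above encode: a level with `update_period` N applies its optimizer every N steps, so `cms_fast` fires every step while `cms_slow` fires every 16th. Names and periods are taken from this config; the real scheduling lives in nested_learning/levels.py and nested_learning/optim/manager.py and may differ:

    LEVELS = {"cms_fast": 1, "cms_mid": 4, "titan": 8, "cms_slow": 16}

    def levels_due(step: int) -> list[str]:
        """Return the levels whose update fires on this (1-indexed) step."""
        return [name for name, period in LEVELS.items() if step % period == 0]

    for step in (1, 4, 8, 16):
        print(step, levels_due(step))
    # 1 ['cms_fast']
    # 4 ['cms_fast', 'cms_mid']
    # 8 ['cms_fast', 'cms_mid', 'titan']
    # 16 ['cms_fast', 'cms_mid', 'titan', 'cms_slow']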
nested_learning/configs/resolved/cms_sparse_eval.yaml
@@ -0,0 +1,105 @@
+ hydra:
+   run:
+     dir: .
+   output_subdir: null
+   job:
+     chdir: false
+ model:
+   vocab_size: 32000
+   dim: 384
+   num_layers: 8
+   heads: 6
+   teach_scale: 0.1
+   teach_clip: 5.0
+   self_mod_lr: 0.001
+   teach_schedule:
+     warmup_steps: 2000
+     decay_start: 120000
+     decay_duration: 20000
+   titan_level:
+     name: titan
+     update_period: 8
+     optimizer_key: titan_opt
+   cms_levels:
+     - name: cms_fast
+       update_period: 8
+       optimizer_key: cms_opt
+     - name: cms_mid
+       update_period: 32
+       optimizer_key: cms_opt
+     - name: cms_slow
+       update_period: 128
+       optimizer_key: cms_opt
+     - name: cms_ultra
+       update_period: 512
+       optimizer_key: cms_opt
+   optimizers:
+     titan_opt:
+       type: deep_momentum
+       lr: 0.0006
+       params:
+         beta: 0.9
+         beta2: 0.999
+     cms_opt:
+       type: deep_momentum
+       lr: 0.0003
+       params:
+         beta: 0.9
+         beta2: 0.999
+   cms_hidden_multiplier: 2
+ data:
+   source: mixture
+   seq_len: 1024
+   batch_size: 2
+   num_workers: 2
+   mixture:
+     samples_per_epoch: 65536
+     seed: 1337
+     sources:
+       - name: refinedweb
+         shards_dir: data/shards/refinedweb_filtered
+         weight: 0.4
+       - name: wikipedia
+         shards_dir: data/shards/wikipedia_filtered
+         weight: 0.2
+       - name: c4
+         shards_dir: data/shards/c4_filtered
+         weight: 0.15
+       - name: redpajama
+         shards_dir: data/shards/redpajama_filtered
+         weight: 0.15
+       - name: code
+         shards_dir: data/shards/code_filtered
+         weight: 0.1
+ train:
+   online_updates: true
+   online_chunk_size: 0
+   per_layer_teach_signal: true
+   steps: 5000
+   log_interval: 25
+   device: cuda:1
+   seed: 1337
+   deterministic: false
+   mixed_precision:
+     enabled: true
+     dtype: bf16
+   compile:
+     enable: false
+     mode: max-autotune
+   checkpoint:
+     enable: true
+     dir: artifacts/checkpoints/pilot_cms_sparse
+     save_interval: 1000
+     save_last: true
+     resume_path: null
+     resume_tag: null
+ optim:
+   type: adamw
+   lr: 0.00025
+   fused: auto
+ logging:
+   enabled: true
+   backend: json
+   path: logs/pilot_cms_sparse_metrics.json
+   project: nested-learning
+   run_name: pilot-cms-sparse
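A sketch of how the mixture weights above could drive source sampling: each example is drawn from a source with probability proportional to its weight (0.4 + 0.2 + 0.15 + 0.15 + 0.1 = 1.0). The package's actual shard-level mixing lives in nested_learning/data.py and may work per-shard rather than per-example; this only illustrates the weighting:

    import random

    SOURCES = {"refinedweb": 0.4, "wikipedia": 0.2, "c4": 0.15,
               "redpajama": 0.15, "code": 0.1}

    rng = random.Random(1337)  # seed from the mixture config above
    names, weights = zip(*SOURCES.items())
    draws = rng.choices(names, weights=weights, k=65536)  # samples_per_epoch
    # Empirical fractions land close to the configured weights.
    print({n: round(draws.count(n) / len(draws), 3) for n in names})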
nested_learning/configs/resolved/phase2_pilot_attention_eval.yaml
@@ -0,0 +1,49 @@
+ model:
+   vocab_size: 32000
+   dim: 512
+   num_layers: 12
+   heads: 8
+   teach_scale: 0.10
+   teach_clip: 5.0
+   surprise_threshold: 0.02
+   freeze_backbone: false
+   qk_l2_norm: true
+   local_conv_window: 4
+   block_variant: hope_attention
+   teach_schedule:
+     warmup_steps: 2000
+     decay_start: 120000
+     decay_duration: 20000
+   titan_level:
+     name: titan
+     update_period: 8
+     optimizer_key: titan_opt
+   cms_levels:
+     - name: cms_fast
+       update_period: 1
+       optimizer_key: cms_opt
+     - name: cms_mid
+       update_period: 4
+       optimizer_key: cms_opt
+     - name: cms_slow
+       update_period: 32
+       optimizer_key: cms_opt
+     - name: cms_ultra
+       update_period: 128
+       optimizer_key: cms_opt
+   optimizers:
+     titan_opt:
+       type: deep_momentum
+       lr: 6.0e-4
+       params:
+         beta: 0.9
+         beta2: 0.999
+       variant: nl_l2_precond
+     cms_opt:
+       type: deep_momentum
+       lr: 3.0e-4
+       params:
+         beta: 0.9
+         beta2: 0.999
+       variant: nl_l2_precond
+
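A hand-wavy sketch of what a momentum step with an L2-style preconditioner could look like given the hyperparameters above: `beta` as a momentum EMA and `beta2` as a second-moment EMA used to precondition the update. The package's real `deep_momentum` / `nl_l2_precond` update is in nested_learning/optim/deep.py and is almost certainly richer (the "deep" part learns the update rule); treat everything here as an assumption:

    import torch

    def precond_momentum_step(p, grad, m, v, lr=6e-4, beta=0.9, beta2=0.999, eps=1e-8):
        # m: momentum EMA of gradients; v: EMA of squared gradients (preconditioner).
        m.mul_(beta).add_(grad, alpha=1 - beta)
        v.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
        p.sub_(lr * m / (v.sqrt() + eps))  # preconditioned momentum step
        return p, m, v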
nested_learning/configs/resolved/phase2_pilot_transformer_eval.yaml
@@ -0,0 +1,49 @@
+ model:
+   vocab_size: 32000
+   dim: 512
+   num_layers: 12
+   heads: 8
+   teach_scale: 0.10
+   teach_clip: 5.0
+   surprise_threshold: 0.02
+   freeze_backbone: false
+   qk_l2_norm: true
+   local_conv_window: 4
+   block_variant: transformer
+   teach_schedule:
+     warmup_steps: 2000
+     decay_start: 120000
+     decay_duration: 20000
+   titan_level:
+     name: titan
+     update_period: 8
+     optimizer_key: titan_opt
+   cms_levels:
+     - name: cms_fast
+       update_period: 1
+       optimizer_key: cms_opt
+     - name: cms_mid
+       update_period: 4
+       optimizer_key: cms_opt
+     - name: cms_slow
+       update_period: 32
+       optimizer_key: cms_opt
+     - name: cms_ultra
+       update_period: 128
+       optimizer_key: cms_opt
+   optimizers:
+     titan_opt:
+       type: deep_momentum
+       lr: 6.0e-4
+       params:
+         beta: 0.9
+         beta2: 0.999
+       variant: nl_l2_precond
+     cms_opt:
+       type: deep_momentum
+       lr: 3.0e-4
+       params:
+         beta: 0.9
+         beta2: 0.999
+       variant: nl_l2_precond
+
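Both eval configs share the same `teach_schedule`. One plausible reading of its three fields (warmup_steps: 2000, decay_start: 120000, decay_duration: 20000) is a trapezoid: the teach scale ramps linearly from 0 to 1 over warmup, holds at 1, then decays linearly to 0 over the decay window. The authoritative schedule is implemented in the package itself; this reading is an assumption:

    def teach_scale_multiplier(step, warmup=2000, decay_start=120000, decay_len=20000):
        # Linear warmup, flat plateau, then linear decay to zero.
        if step < warmup:
            return step / warmup
        if step < decay_start:
            return 1.0
        return max(0.0, 1.0 - (step - decay_start) / decay_len)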
nested_learning/continual_classification.py
@@ -0,0 +1,136 @@
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+ from typing import Iterable, List, Sequence
+
+
+ @dataclass(frozen=True)
+ class ClassificationExample:
+     text: str
+     label: str
+
+
+ @dataclass(frozen=True)
+ class LoadedClassificationDataset:
+     name: str
+     split: str
+     examples: List[ClassificationExample]
+     label_names: List[str]
+
+
+ def load_hf_classification_dataset(
+     dataset: str,
+     *,
+     split: str,
+     text_field: str,
+     label_field: str,
+     name: str | None = None,
+     max_samples: int | None = None,
+ ) -> LoadedClassificationDataset:
+     """
+     Load a HuggingFace `datasets` text classification dataset into a simple in-memory format.
+
+     This is used by the Phase 4 continual-learning harness (CLINC/Banking/DBpedia).
+     """
+     try:
+         from datasets import load_dataset  # type: ignore[import-not-found]
+     except Exception as exc:  # pragma: no cover
+         raise RuntimeError(
+             "`datasets` dependency is required for continual classification."
+         ) from exc
+
+     ds = load_dataset(dataset, name=name, split=split)
+     features = getattr(ds, "features", None)
+     label_names: List[str] = []
+     if features is not None and label_field in features:
+         feature = features[label_field]
+         if getattr(feature, "names", None) is not None:
+             label_names = list(feature.names)
+
+     examples: List[ClassificationExample] = []
+     count = 0
+     for row in ds:
+         if max_samples is not None and count >= max_samples:
+             break
+         text = str(row[text_field])
+         raw_label = row[label_field]
+         if isinstance(raw_label, int) and label_names:
+             label = label_names[raw_label]
+         else:
+             label = str(raw_label)
+         examples.append(ClassificationExample(text=text, label=label))
+         count += 1
+
+     if not label_names:
+         label_names = sorted({ex.label for ex in examples})
+
+     return LoadedClassificationDataset(
+         name=dataset if name is None else f"{dataset}:{name}",
+         split=split,
+         examples=examples,
+         label_names=label_names,
+     )
+
+
+ def load_clinc_oos(
+     *,
+     split: str = "test",
+     max_samples: int | None = None,
+ ) -> LoadedClassificationDataset:
+     # HF dataset: "clinc_oos" with fields {"text", "intent"}.
+     return load_hf_classification_dataset(
+         "clinc_oos",
+         split=split,
+         text_field="text",
+         label_field="intent",
+         max_samples=max_samples,
+     )
+
+
+ def load_banking77(
+     *,
+     split: str = "test",
+     max_samples: int | None = None,
+ ) -> LoadedClassificationDataset:
+     # HF dataset: "banking77" with fields {"text", "label"}.
+     return load_hf_classification_dataset(
+         "banking77",
+         split=split,
+         text_field="text",
+         label_field="label",
+         max_samples=max_samples,
+     )
+
+
+ def load_dbpedia14(
+     *,
+     split: str = "test",
+     max_samples: int | None = None,
+ ) -> LoadedClassificationDataset:
+     # HF dataset: "dbpedia_14" with fields {"content", "label"}.
+     return load_hf_classification_dataset(
+         "dbpedia_14",
+         split=split,
+         text_field="content",
+         label_field="label",
+         max_samples=max_samples,
+     )
+
+
+ def unique_labels(examples: Iterable[ClassificationExample]) -> List[str]:
+     seen = set()
+     ordered: List[str] = []
+     for ex in examples:
+         if ex.label in seen:
+             continue
+         seen.add(ex.label)
+         ordered.append(ex.label)
+     return ordered
+
+
+ def filter_examples_by_labels(
+     examples: Sequence[ClassificationExample],
+     *,
+     allowed: set[str],
+ ) -> List[ClassificationExample]:
+     return [ex for ex in examples if ex.label in allowed]
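Example use of the loaders above, splitting one dataset into a first "task" by label, as a continual-learning harness would (requires network access and the `datasets` package; dataset and field names are exactly those hard-coded in this module):

    from nested_learning.continual_classification import (
        filter_examples_by_labels, load_banking77, unique_labels,
    )

    ds = load_banking77(split="test", max_samples=500)
    labels = unique_labels(ds.examples)  # labels in first-seen order
    first_task = filter_examples_by_labels(ds.examples, allowed=set(labels[:10]))
    print(ds.name, len(ds.examples), len(first_task))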