libthx 0.2.1__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {libthx-0.2.1 → libthx-0.3.0}/PKG-INFO +3 -2
- {libthx-0.2.1 → libthx-0.3.0}/libthx.egg-info/PKG-INFO +3 -2
- {libthx-0.2.1 → libthx-0.3.0}/libthx.egg-info/SOURCES.txt +22 -0
- {libthx-0.2.1 → libthx-0.3.0}/libthx.egg-info/requires.txt +1 -0
- {libthx-0.2.1 → libthx-0.3.0}/pyproject.toml +12 -3
- {libthx-0.2.1 → libthx-0.3.0}/tests/test_kv_cache.py +54 -1
- libthx-0.3.0/tests/test_lact.py +625 -0
- libthx-0.3.0/tests/test_mamba.py +651 -0
- libthx-0.3.0/tests/test_vsubmit_log_fetcher.py +201 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/cli.py +105 -35
- {libthx-0.2.1 → libthx-0.3.0}/theseus/config.py +11 -5
- {libthx-0.2.1 → libthx-0.3.0}/theseus/data/datasets/__init__.py +2 -0
- libthx-0.3.0/theseus/data/datasets/flan.py +32 -0
- libthx-0.3.0/theseus/data/datasets/openr1_math.py +47 -0
- libthx-0.3.0/theseus/data/datasets/pes2o.py +59 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/dispatch/bootstrap.py +1 -1
- {libthx-0.2.1 → libthx-0.3.0}/theseus/dispatch/config.py +8 -1
- {libthx-0.2.1 → libthx-0.3.0}/theseus/dispatch/dispatch.py +108 -55
- {libthx-0.2.1 → libthx-0.3.0}/theseus/dispatch/slurm.py +5 -2
- {libthx-0.2.1 → libthx-0.3.0}/theseus/dispatch/volcano.py +291 -203
- {libthx-0.2.1 → libthx-0.3.0}/theseus/evaluation/base.py +29 -8
- {libthx-0.2.1 → libthx-0.3.0}/theseus/evaluation/datasets/__init__.py +5 -0
- libthx-0.3.0/theseus/evaluation/datasets/arc_challenge.py +82 -0
- libthx-0.3.0/theseus/evaluation/datasets/bbh.py +174 -0
- libthx-0.3.0/theseus/evaluation/datasets/gsm8k.py +102 -0
- libthx-0.3.0/theseus/evaluation/datasets/hellaswag.py +74 -0
- libthx-0.3.0/theseus/evaluation/datasets/math.py +91 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/evaluation/datasets/perplexity_evals.py +29 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/experiments/__init__.py +9 -0
- libthx-0.3.0/theseus/experiments/benchmark.py +125 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/experiments/continual/abcd.py +12 -5
- {libthx-0.2.1 → libthx-0.3.0}/theseus/experiments/continual/benchmark.py +80 -4
- libthx-0.3.0/theseus/experiments/models/lact.py +41 -0
- libthx-0.3.0/theseus/experiments/models/qwen_3_5.py +55 -0
- libthx-0.3.0/theseus/inference/__init__.py +4 -0
- libthx-0.3.0/theseus/inference/ttt.py +77 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/job.py +22 -11
- {libthx-0.2.1 → libthx-0.3.0}/theseus/model/attention/__init__.py +2 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/model/attention/base.py +51 -3
- libthx-0.3.0/theseus/model/attention/gated_delta.py +458 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/model/attention/grouped.py +55 -6
- {libthx-0.2.1 → libthx-0.3.0}/theseus/model/axes.py +2 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/model/block/__init__.py +5 -0
- libthx-0.3.0/theseus/model/block/lact.py +271 -0
- libthx-0.3.0/theseus/model/block/mamba.py +438 -0
- libthx-0.3.0/theseus/model/block/qwen_3_5.py +83 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/model/layers/__init__.py +4 -1
- libthx-0.3.0/theseus/model/layers/lact.py +320 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/model/layers/mlp.py +1 -1
- libthx-0.3.0/theseus/model/layers/mrope.py +82 -0
- libthx-0.3.0/theseus/model/layers/rmsnorm.py +75 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/model/masks.py +9 -2
- {libthx-0.2.1 → libthx-0.3.0}/theseus/model/models/__init__.py +6 -0
- libthx-0.3.0/theseus/model/models/contrib/__init__.py +8 -0
- libthx-0.3.0/theseus/model/models/contrib/qwen_3_5.py +496 -0
- libthx-0.3.0/theseus/model/models/contrib/qwen_3_5_moe.py +302 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/model/models/hybrid.py +0 -7
- libthx-0.3.0/theseus/model/models/lact.py +79 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/model/models/mamba.py +35 -14
- {libthx-0.2.1 → libthx-0.3.0}/theseus/model/models/moe.py +6 -1
- libthx-0.3.0/theseus/model/models/scratchbubbles.py +338 -0
- libthx-0.3.0/theseus/model/moe/__init__.py +5 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/model/moe/base.py +57 -6
- libthx-0.3.0/theseus/model/moe/shared.py +114 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/plot.py +53 -2
- {libthx-0.2.1 → libthx-0.3.0}/theseus/training/backbone.py +54 -3
- {libthx-0.2.1 → libthx-0.3.0}/theseus/training/base.py +87 -19
- {libthx-0.2.1 → libthx-0.3.0}/theseus/training/kl_divergence.py +2 -3
- {libthx-0.2.1 → libthx-0.3.0}/theseus/training/ppo.py +2 -1
- libthx-0.2.1/tests/test_mamba.py +0 -307
- libthx-0.2.1/theseus/data/datasets/pes2o.py +0 -38
- libthx-0.2.1/theseus/inference/__init__.py +0 -3
- libthx-0.2.1/theseus/model/block/mamba.py +0 -309
- libthx-0.2.1/theseus/model/layers/rmsnorm.py +0 -31
- libthx-0.2.1/theseus/model/models/contrib/__init__.py +0 -6
- libthx-0.2.1/theseus/model/models/scratchbubbles.py +0 -154
- libthx-0.2.1/theseus/model/moe/__init__.py +0 -4
- {libthx-0.2.1 → libthx-0.3.0}/LICENSE +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/README.md +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/libthx.egg-info/dependency_links.txt +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/libthx.egg-info/entry_points.txt +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/libthx.egg-info/top_level.txt +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/setup.cfg +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/tests/test_contrastive_roundtrip.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/tests/test_datasets.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/tests/test_eval_padding.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/tests/test_gpu_availability.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/tests/test_hardware_dispatch.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/tests/test_lora.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/tests/test_registries.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/__init__.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/base/__init__.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/base/axis.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/base/chip.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/base/hardware.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/base/job.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/base/topology.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/data/__init__.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/data/datasets/alpaca.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/data/datasets/bbq.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/data/datasets/ccaligned.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/data/datasets/cfq.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/data/datasets/clutrr.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/data/datasets/dataset.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/data/datasets/dictlearn.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/data/datasets/fever.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/data/datasets/fineweb.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/data/datasets/harmfulqa.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/data/datasets/longbench.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/data/datasets/longhealth.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/data/datasets/mmlu.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/data/datasets/mnli.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/data/datasets/mtob.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/data/datasets/pg19.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/data/datasets/pile.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/data/datasets/pile_detoxify.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/data/datasets/pile_injected.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/data/datasets/qqp.py +0 -0
- {libthx-0.2.1/theseus/training/flywheel → libthx-0.3.0/theseus/data/datasets/redcodegen}/__init__.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/data/datasets/redcodegen/hardening.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/data/datasets/siqa.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/data/datasets/squad.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/data/datasets/sst2.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/data/datasets/winogrande.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/data/tokenize.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/data/tokenizer.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/dispatch/__init__.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/dispatch/mailbox/__init__.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/dispatch/mailbox/mailbox.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/dispatch/mailbox/sidecar.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/dispatch/solve.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/dispatch/ssh.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/dispatch/sync.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/dispatch/tpu.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/evaluation/__init__.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/evaluation/datasets/alpaca.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/evaluation/datasets/arithmetic.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/evaluation/datasets/bbq.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/evaluation/datasets/blimp.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/evaluation/datasets/ccaligned.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/evaluation/datasets/cfq.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/evaluation/datasets/clutrr.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/evaluation/datasets/dictlearn.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/evaluation/datasets/fever.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/evaluation/datasets/longbench.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/evaluation/datasets/longhealth.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/evaluation/datasets/mmlu.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/evaluation/datasets/mnli.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/evaluation/datasets/mtob.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/evaluation/datasets/pes2o.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/evaluation/datasets/pg19.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/evaluation/datasets/pg19_lengthgen.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/evaluation/datasets/pile.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/evaluation/datasets/pile_injected.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/evaluation/datasets/qqp.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/evaluation/datasets/siqa.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/evaluation/datasets/squad.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/evaluation/datasets/sst2.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/evaluation/datasets/tinystories.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/evaluation/datasets/winogrande.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/evaluation/huggingface.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/experiments/continual/__init__.py +0 -0
- {libthx-0.2.1/theseus/model → libthx-0.3.0/theseus/experiments/models}/__init__.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/experiments/models/forking.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/experiments/models/gpt.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/experiments/models/gpt_neox.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/experiments/models/llama.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/experiments/models/moe.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/experiments/models/qwen.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/experiments/mok/__init__.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/experiments/mok/reward.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/experiments/mok/smoke.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/experiments/redcodegen/__init__.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/experiments/redcodegen/hardening.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/inference/base.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/inference/huggingface.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/mock.py +0 -0
- {libthx-0.2.1/theseus/experiments/models → libthx-0.3.0/theseus/model}/__init__.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/model/activations/__init__.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/model/activations/swiglu.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/model/attention/forking.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/model/attention/rope.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/model/attention/scratching.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/model/block/block.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/model/block/forking.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/model/block/gpt_neox.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/model/block/llama.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/model/block/moe.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/model/block/qwen.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/model/block/scratching.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/model/huggingface.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/model/layers/layernorm.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/model/layers/rope.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/model/models/base.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/model/models/contrib/gpt_neox.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/model/models/contrib/llama.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/model/models/contrib/marin.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/model/models/contrib/qwen.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/model/models/thoughtbubbles.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/model/module.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/model/moe/bias_balanced.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/quick.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/registry.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/training/__init__.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/training/contrastive.py +0 -0
- {libthx-0.2.1/theseus/data/datasets/redcodegen → libthx-0.3.0/theseus/training/flywheel}/__init__.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/training/flywheel/contrastive.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/training/flywheel/padded.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/training/flywheel/pmd.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/training/flywheel/strategy.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/training/grpo.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/training/huggingface.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/training/lora.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/training/optimizers/__init__.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/training/optimizers/adamw.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/training/optimizers/muon.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/training/schedules/__init__.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/training/schedules/cosine_rewarm.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/training/schedules/wsd.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/training/schedules/wsds.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/training/utils.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/web/__init__.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/web/app.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/web/auth.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/web/generate_password_hash.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/web/models.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/web/routes/__init__.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/web/routes/api.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/web/routes/auth.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/web/routes/views.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/web/services/__init__.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/web/services/cache.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/web/services/checkpoints.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/web/services/logs.py +0 -0
- {libthx-0.2.1 → libthx-0.3.0}/theseus/web/services/status.py +0 -0
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: libthx
|
|
3
|
-
Version: 0.2.1
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: Architecture experimentation and training infrastructure.
|
|
5
|
-
Requires-Python: >=3.11
|
|
5
|
+
Requires-Python: <3.14,>=3.11
|
|
6
6
|
Description-Content-Type: text/markdown
|
|
7
7
|
License-File: LICENSE
|
|
8
8
|
Requires-Dist: click<=8.2.1
|
|
@@ -20,6 +20,7 @@ Requires-Dist: tiktoken>=0.12.0
|
|
|
20
20
|
Requires-Dist: torchax>=0.0.11
|
|
21
21
|
Requires-Dist: wandb>=0.24.1
|
|
22
22
|
Requires-Dist: datasets>=4.5.0
|
|
23
|
+
Requires-Dist: accelerate>=1.13.0
|
|
23
24
|
Provides-Extra: fever
|
|
24
25
|
Requires-Dist: wikipedia>=1.4.0; extra == "fever"
|
|
25
26
|
Provides-Extra: huggingface
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: libthx
|
|
3
|
-
Version: 0.2.1
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: Architecture experimentation and training infrastructure.
|
|
5
|
-
Requires-Python: >=3.11
|
|
5
|
+
Requires-Python: <3.14,>=3.11
|
|
6
6
|
Description-Content-Type: text/markdown
|
|
7
7
|
License-File: LICENSE
|
|
8
8
|
Requires-Dist: click<=8.2.1
|
|
@@ -20,6 +20,7 @@ Requires-Dist: tiktoken>=0.12.0
|
|
|
20
20
|
Requires-Dist: torchax>=0.0.11
|
|
21
21
|
Requires-Dist: wandb>=0.24.1
|
|
22
22
|
Requires-Dist: datasets>=4.5.0
|
|
23
|
+
Requires-Dist: accelerate>=1.13.0
|
|
23
24
|
Provides-Extra: fever
|
|
24
25
|
Requires-Dist: wikipedia>=1.4.0; extra == "fever"
|
|
25
26
|
Provides-Extra: huggingface
|
|
@@ -13,9 +13,11 @@ tests/test_eval_padding.py
|
|
|
13
13
|
tests/test_gpu_availability.py
|
|
14
14
|
tests/test_hardware_dispatch.py
|
|
15
15
|
tests/test_kv_cache.py
|
|
16
|
+
tests/test_lact.py
|
|
16
17
|
tests/test_lora.py
|
|
17
18
|
tests/test_mamba.py
|
|
18
19
|
tests/test_registries.py
|
|
20
|
+
tests/test_vsubmit_log_fetcher.py
|
|
19
21
|
theseus/__init__.py
|
|
20
22
|
theseus/cli.py
|
|
21
23
|
theseus/config.py
|
|
@@ -43,12 +45,14 @@ theseus/data/datasets/dataset.py
|
|
|
43
45
|
theseus/data/datasets/dictlearn.py
|
|
44
46
|
theseus/data/datasets/fever.py
|
|
45
47
|
theseus/data/datasets/fineweb.py
|
|
48
|
+
theseus/data/datasets/flan.py
|
|
46
49
|
theseus/data/datasets/harmfulqa.py
|
|
47
50
|
theseus/data/datasets/longbench.py
|
|
48
51
|
theseus/data/datasets/longhealth.py
|
|
49
52
|
theseus/data/datasets/mmlu.py
|
|
50
53
|
theseus/data/datasets/mnli.py
|
|
51
54
|
theseus/data/datasets/mtob.py
|
|
55
|
+
theseus/data/datasets/openr1_math.py
|
|
52
56
|
theseus/data/datasets/pes2o.py
|
|
53
57
|
theseus/data/datasets/pg19.py
|
|
54
58
|
theseus/data/datasets/pile.py
|
|
@@ -79,7 +83,9 @@ theseus/evaluation/base.py
|
|
|
79
83
|
theseus/evaluation/huggingface.py
|
|
80
84
|
theseus/evaluation/datasets/__init__.py
|
|
81
85
|
theseus/evaluation/datasets/alpaca.py
|
|
86
|
+
theseus/evaluation/datasets/arc_challenge.py
|
|
82
87
|
theseus/evaluation/datasets/arithmetic.py
|
|
88
|
+
theseus/evaluation/datasets/bbh.py
|
|
83
89
|
theseus/evaluation/datasets/bbq.py
|
|
84
90
|
theseus/evaluation/datasets/blimp.py
|
|
85
91
|
theseus/evaluation/datasets/ccaligned.py
|
|
@@ -87,8 +93,11 @@ theseus/evaluation/datasets/cfq.py
|
|
|
87
93
|
theseus/evaluation/datasets/clutrr.py
|
|
88
94
|
theseus/evaluation/datasets/dictlearn.py
|
|
89
95
|
theseus/evaluation/datasets/fever.py
|
|
96
|
+
theseus/evaluation/datasets/gsm8k.py
|
|
97
|
+
theseus/evaluation/datasets/hellaswag.py
|
|
90
98
|
theseus/evaluation/datasets/longbench.py
|
|
91
99
|
theseus/evaluation/datasets/longhealth.py
|
|
100
|
+
theseus/evaluation/datasets/math.py
|
|
92
101
|
theseus/evaluation/datasets/mmlu.py
|
|
93
102
|
theseus/evaluation/datasets/mnli.py
|
|
94
103
|
theseus/evaluation/datasets/mtob.py
|
|
@@ -105,6 +114,7 @@ theseus/evaluation/datasets/sst2.py
|
|
|
105
114
|
theseus/evaluation/datasets/tinystories.py
|
|
106
115
|
theseus/evaluation/datasets/winogrande.py
|
|
107
116
|
theseus/experiments/__init__.py
|
|
117
|
+
theseus/experiments/benchmark.py
|
|
108
118
|
theseus/experiments/continual/__init__.py
|
|
109
119
|
theseus/experiments/continual/abcd.py
|
|
110
120
|
theseus/experiments/continual/benchmark.py
|
|
@@ -112,9 +122,11 @@ theseus/experiments/models/__init__.py
|
|
|
112
122
|
theseus/experiments/models/forking.py
|
|
113
123
|
theseus/experiments/models/gpt.py
|
|
114
124
|
theseus/experiments/models/gpt_neox.py
|
|
125
|
+
theseus/experiments/models/lact.py
|
|
115
126
|
theseus/experiments/models/llama.py
|
|
116
127
|
theseus/experiments/models/moe.py
|
|
117
128
|
theseus/experiments/models/qwen.py
|
|
129
|
+
theseus/experiments/models/qwen_3_5.py
|
|
118
130
|
theseus/experiments/mok/__init__.py
|
|
119
131
|
theseus/experiments/mok/reward.py
|
|
120
132
|
theseus/experiments/mok/smoke.py
|
|
@@ -123,6 +135,7 @@ theseus/experiments/redcodegen/hardening.py
|
|
|
123
135
|
theseus/inference/__init__.py
|
|
124
136
|
theseus/inference/base.py
|
|
125
137
|
theseus/inference/huggingface.py
|
|
138
|
+
theseus/inference/ttt.py
|
|
126
139
|
theseus/model/__init__.py
|
|
127
140
|
theseus/model/axes.py
|
|
128
141
|
theseus/model/huggingface.py
|
|
@@ -133,6 +146,7 @@ theseus/model/activations/swiglu.py
|
|
|
133
146
|
theseus/model/attention/__init__.py
|
|
134
147
|
theseus/model/attention/base.py
|
|
135
148
|
theseus/model/attention/forking.py
|
|
149
|
+
theseus/model/attention/gated_delta.py
|
|
136
150
|
theseus/model/attention/grouped.py
|
|
137
151
|
theseus/model/attention/rope.py
|
|
138
152
|
theseus/model/attention/scratching.py
|
|
@@ -140,19 +154,24 @@ theseus/model/block/__init__.py
|
|
|
140
154
|
theseus/model/block/block.py
|
|
141
155
|
theseus/model/block/forking.py
|
|
142
156
|
theseus/model/block/gpt_neox.py
|
|
157
|
+
theseus/model/block/lact.py
|
|
143
158
|
theseus/model/block/llama.py
|
|
144
159
|
theseus/model/block/mamba.py
|
|
145
160
|
theseus/model/block/moe.py
|
|
146
161
|
theseus/model/block/qwen.py
|
|
162
|
+
theseus/model/block/qwen_3_5.py
|
|
147
163
|
theseus/model/block/scratching.py
|
|
148
164
|
theseus/model/layers/__init__.py
|
|
165
|
+
theseus/model/layers/lact.py
|
|
149
166
|
theseus/model/layers/layernorm.py
|
|
150
167
|
theseus/model/layers/mlp.py
|
|
168
|
+
theseus/model/layers/mrope.py
|
|
151
169
|
theseus/model/layers/rmsnorm.py
|
|
152
170
|
theseus/model/layers/rope.py
|
|
153
171
|
theseus/model/models/__init__.py
|
|
154
172
|
theseus/model/models/base.py
|
|
155
173
|
theseus/model/models/hybrid.py
|
|
174
|
+
theseus/model/models/lact.py
|
|
156
175
|
theseus/model/models/mamba.py
|
|
157
176
|
theseus/model/models/moe.py
|
|
158
177
|
theseus/model/models/scratchbubbles.py
|
|
@@ -162,9 +181,12 @@ theseus/model/models/contrib/gpt_neox.py
|
|
|
162
181
|
theseus/model/models/contrib/llama.py
|
|
163
182
|
theseus/model/models/contrib/marin.py
|
|
164
183
|
theseus/model/models/contrib/qwen.py
|
|
184
|
+
theseus/model/models/contrib/qwen_3_5.py
|
|
185
|
+
theseus/model/models/contrib/qwen_3_5_moe.py
|
|
165
186
|
theseus/model/moe/__init__.py
|
|
166
187
|
theseus/model/moe/base.py
|
|
167
188
|
theseus/model/moe/bias_balanced.py
|
|
189
|
+
theseus/model/moe/shared.py
|
|
168
190
|
theseus/training/__init__.py
|
|
169
191
|
theseus/training/backbone.py
|
|
170
192
|
theseus/training/base.py
|
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "libthx"
|
|
3
|
-
version = "0.2.1"
|
|
3
|
+
version = "0.3.0"
|
|
4
4
|
description = "Architecture experimentation and training infrastructure."
|
|
5
5
|
readme = "README.md"
|
|
6
|
-
requires-python = ">=3.11"
|
|
6
|
+
requires-python = ">=3.11,<3.14"
|
|
7
7
|
dependencies = [
|
|
8
8
|
"click<=8.2.1", # mkdocs breaks otherwise...
|
|
9
9
|
"flax>=0.12.2",
|
|
@@ -19,7 +19,8 @@ dependencies = [
|
|
|
19
19
|
"tiktoken>=0.12.0",
|
|
20
20
|
"torchax>=0.0.11",
|
|
21
21
|
"wandb>=0.24.1",
|
|
22
|
-
"datasets>=4.5.0"
|
|
22
|
+
"datasets>=4.5.0",
|
|
23
|
+
"accelerate>=1.13.0",
|
|
23
24
|
]
|
|
24
25
|
|
|
25
26
|
|
|
@@ -224,6 +225,14 @@ indent-style = "space"
|
|
|
224
225
|
docstring-code-format = true
|
|
225
226
|
docstring-code-line-length = 20
|
|
226
227
|
|
|
228
|
+
[tool.pytest.ini_options]
|
|
229
|
+
# ``contrib`` is not installed as a package by setuptools (only
|
|
230
|
+
# ``theseus*`` is — see ``[tool.setuptools.packages.find]``), but tests
|
|
231
|
+
# can still import from it because uv runs everything with the repo
|
|
232
|
+
# root as cwd. Pytest, however, doesn't add the rootdir to ``sys.path``
|
|
233
|
+
# by default, so we do it here.
|
|
234
|
+
pythonpath = ["."]
|
|
235
|
+
|
|
227
236
|
[tool.mypy]
|
|
228
237
|
plugins = ['pydantic.mypy']
|
|
229
238
|
python_version = "3.12"
|
|
@@ -12,7 +12,6 @@ from typing import Any
|
|
|
12
12
|
import numpy as np
|
|
13
13
|
import jax
|
|
14
14
|
import jax.numpy as jnp
|
|
15
|
-
import pytest
|
|
16
15
|
from omegaconf import OmegaConf
|
|
17
16
|
|
|
18
17
|
from theseus.config import build, configuration
|
|
@@ -123,3 +122,57 @@ class TestKVCacheGPTNeoX:
|
|
|
123
122
|
)
|
|
124
123
|
with _build_config_ctx(GPTNeoX, kwargs):
|
|
125
124
|
_kv_cache_parity(GPTNeoX, kwargs)
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
# Hybrid (full + linear/gated-delta attention). Exercises both the GQA KV
|
|
128
|
+
# cache and the GatedDeltaNet recurrent/conv decode cache in one model.
|
|
129
|
+
_QWEN35_KWARGS = dict(
|
|
130
|
+
n_layers=4,
|
|
131
|
+
n_embd=64,
|
|
132
|
+
n_head=4,
|
|
133
|
+
n_kv_head=2,
|
|
134
|
+
head_dim=16,
|
|
135
|
+
intermediate_size=128,
|
|
136
|
+
rope_theta=1e6,
|
|
137
|
+
partial_rotary_factor=0.25,
|
|
138
|
+
rms_norm_eps=1e-6,
|
|
139
|
+
block_size=32,
|
|
140
|
+
vocab_size=128,
|
|
141
|
+
dropout=0.0,
|
|
142
|
+
attn_dropout=0.0,
|
|
143
|
+
bias=False,
|
|
144
|
+
attention_bias=False,
|
|
145
|
+
layer_types=[
|
|
146
|
+
"linear_attention",
|
|
147
|
+
"full_attention",
|
|
148
|
+
"linear_attention",
|
|
149
|
+
"full_attention",
|
|
150
|
+
],
|
|
151
|
+
linear_num_value_heads=4,
|
|
152
|
+
linear_num_key_heads=2,
|
|
153
|
+
linear_key_head_dim=16,
|
|
154
|
+
linear_value_head_dim=16,
|
|
155
|
+
linear_conv_kernel_dim=4,
|
|
156
|
+
)
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
class TestKVCacheQwen35:
|
|
160
|
+
def test_qwen_3_5_dense(self):
|
|
161
|
+
from theseus.model.models.contrib.qwen_3_5 import Qwen3_5
|
|
162
|
+
|
|
163
|
+
kwargs = dict(_QWEN35_KWARGS)
|
|
164
|
+
with _build_config_ctx(Qwen3_5, kwargs):
|
|
165
|
+
_kv_cache_parity(Qwen3_5, kwargs, atol=2e-3)
|
|
166
|
+
|
|
167
|
+
def test_qwen_3_5_moe(self):
|
|
168
|
+
from theseus.model.models.contrib.qwen_3_5_moe import Qwen3_5MoE
|
|
169
|
+
|
|
170
|
+
kwargs = dict(
|
|
171
|
+
_QWEN35_KWARGS,
|
|
172
|
+
num_experts=4,
|
|
173
|
+
num_experts_per_tok=2,
|
|
174
|
+
moe_intermediate_size=32,
|
|
175
|
+
shared_expert_intermediate_size=32,
|
|
176
|
+
)
|
|
177
|
+
with _build_config_ctx(Qwen3_5MoE, kwargs):
|
|
178
|
+
_kv_cache_parity(Qwen3_5MoE, kwargs, atol=2e-3)
|