lalamo 0.2.7__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lalamo/__init__.py +1 -1
- lalamo/common.py +79 -29
- lalamo/language_model.py +106 -83
- lalamo/main.py +91 -18
- lalamo/message_processor.py +170 -0
- lalamo/model_import/common.py +159 -43
- lalamo/model_import/{configs → decoder_configs}/__init__.py +0 -1
- lalamo/model_import/{configs → decoder_configs}/common.py +11 -10
- lalamo/model_import/{configs → decoder_configs}/huggingface/common.py +9 -4
- lalamo/model_import/{configs → decoder_configs}/huggingface/gemma3.py +2 -2
- lalamo/model_import/{configs → decoder_configs}/huggingface/llama.py +2 -2
- lalamo/model_import/{configs → decoder_configs}/huggingface/mistral.py +1 -1
- lalamo/model_import/{configs → decoder_configs}/huggingface/qwen2.py +1 -1
- lalamo/model_import/{configs → decoder_configs}/huggingface/qwen3.py +1 -1
- lalamo/model_import/huggingface_generation_config.py +44 -0
- lalamo/model_import/huggingface_tokenizer_config.py +85 -0
- lalamo/model_import/loaders/common.py +2 -1
- lalamo/model_import/loaders/huggingface.py +12 -10
- lalamo/model_import/model_specs/__init__.py +3 -2
- lalamo/model_import/model_specs/common.py +32 -34
- lalamo/model_import/model_specs/deepseek.py +1 -10
- lalamo/model_import/model_specs/gemma.py +2 -25
- lalamo/model_import/model_specs/huggingface.py +2 -12
- lalamo/model_import/model_specs/llama.py +2 -58
- lalamo/model_import/model_specs/mistral.py +9 -19
- lalamo/model_import/model_specs/pleias.py +3 -13
- lalamo/model_import/model_specs/polaris.py +5 -7
- lalamo/model_import/model_specs/qwen.py +12 -111
- lalamo/model_import/model_specs/reka.py +4 -13
- lalamo/modules/__init__.py +2 -1
- lalamo/modules/attention.py +90 -10
- lalamo/modules/common.py +51 -4
- lalamo/modules/decoder.py +90 -8
- lalamo/modules/decoder_layer.py +85 -8
- lalamo/modules/embedding.py +95 -29
- lalamo/modules/kv_cache.py +3 -3
- lalamo/modules/linear.py +170 -130
- lalamo/modules/mlp.py +40 -7
- lalamo/modules/normalization.py +24 -6
- lalamo/modules/rope.py +24 -6
- lalamo/sampling.py +99 -0
- lalamo/utils.py +86 -1
- {lalamo-0.2.7.dist-info → lalamo-0.3.0.dist-info}/METADATA +6 -6
- lalamo-0.3.0.dist-info/RECORD +58 -0
- lalamo-0.2.7.dist-info/RECORD +0 -54
- /lalamo/model_import/{configs → decoder_configs}/executorch.py +0 -0
- /lalamo/model_import/{configs → decoder_configs}/huggingface/__init__.py +0 -0
- /lalamo/model_import/{configs → decoder_configs}/huggingface/gemma2.py +0 -0
- {lalamo-0.2.7.dist-info → lalamo-0.3.0.dist-info}/WHEEL +0 -0
- {lalamo-0.2.7.dist-info → lalamo-0.3.0.dist-info}/entry_points.txt +0 -0
- {lalamo-0.2.7.dist-info → lalamo-0.3.0.dist-info}/licenses/LICENSE +0 -0
- {lalamo-0.2.7.dist-info → lalamo-0.3.0.dist-info}/top_level.txt +0 -0
lalamo/model_import/model_specs/qwen.py
CHANGED

```diff
@@ -1,14 +1,7 @@
-from lalamo.model_import.configs import HFQwen2Config, HFQwen3Config
+from lalamo.model_import.decoder_configs import HFQwen2Config, HFQwen3Config
 from lalamo.quantization import QuantizationMode
 
-from .common import (
-    HUGGINFACE_GENERATION_CONFIG_FILE,
-    HUGGINGFACE_TOKENIZER_FILES,
-    ModelSpec,
-    UseCase,
-    WeightsType,
-    huggingface_weight_files,
-)
+from .common import ModelSpec, UseCase, WeightsType
 
 __all__ = ["QWEN_MODELS"]
 
@@ -22,11 +15,6 @@ QWEN25 = [
         quantization=None,
         repo="Qwen/Qwen2.5-0.5B-Instruct",
         config_type=HFQwen2Config,
-        config_file_name="config.json",
-        weights_file_names=huggingface_weight_files(1),
-        weights_type=WeightsType.SAFETENSORS,
-        tokenizer_files=(*HUGGINGFACE_TOKENIZER_FILES, HUGGINFACE_GENERATION_CONFIG_FILE),
-        use_cases=tuple(),
     ),
     ModelSpec(
         vendor="Alibaba",
@@ -36,11 +24,6 @@ QWEN25 = [
         quantization=None,
         repo="Qwen/Qwen2.5-1.5B-Instruct",
         config_type=HFQwen2Config,
-        config_file_name="config.json",
-        weights_file_names=huggingface_weight_files(1),
-        weights_type=WeightsType.SAFETENSORS,
-        tokenizer_files=(*HUGGINGFACE_TOKENIZER_FILES, HUGGINFACE_GENERATION_CONFIG_FILE),
-        use_cases=tuple(),
     ),
     ModelSpec(
         vendor="Alibaba",
@@ -50,11 +33,6 @@ QWEN25 = [
         quantization=None,
         repo="Qwen/Qwen2.5-3B-Instruct",
         config_type=HFQwen2Config,
-        config_file_name="config.json",
-        weights_file_names=huggingface_weight_files(2),
-        weights_type=WeightsType.SAFETENSORS,
-        tokenizer_files=(*HUGGINGFACE_TOKENIZER_FILES, HUGGINFACE_GENERATION_CONFIG_FILE),
-        use_cases=tuple(),
     ),
     ModelSpec(
         vendor="Alibaba",
@@ -64,11 +42,6 @@ QWEN25 = [
         quantization=None,
         repo="Qwen/Qwen2.5-7B-Instruct",
         config_type=HFQwen2Config,
-        config_file_name="config.json",
-        weights_file_names=huggingface_weight_files(4),
-        weights_type=WeightsType.SAFETENSORS,
-        tokenizer_files=(*HUGGINGFACE_TOKENIZER_FILES, HUGGINFACE_GENERATION_CONFIG_FILE),
-        use_cases=tuple(),
     ),
     ModelSpec(
         vendor="Alibaba",
@@ -78,11 +51,6 @@ QWEN25 = [
         quantization=None,
         repo="Qwen/Qwen2.5-14B-Instruct",
         config_type=HFQwen2Config,
-        config_file_name="config.json",
-        weights_file_names=huggingface_weight_files(8),
-        weights_type=WeightsType.SAFETENSORS,
-        tokenizer_files=(*HUGGINGFACE_TOKENIZER_FILES, HUGGINFACE_GENERATION_CONFIG_FILE),
-        use_cases=tuple(),
     ),
     ModelSpec(
         vendor="Alibaba",
@@ -92,11 +60,6 @@ QWEN25 = [
         quantization=None,
         repo="Qwen/Qwen2.5-32B-Instruct",
         config_type=HFQwen2Config,
-        config_file_name="config.json",
-        weights_file_names=huggingface_weight_files(17),
-        weights_type=WeightsType.SAFETENSORS,
-        tokenizer_files=(*HUGGINGFACE_TOKENIZER_FILES, HUGGINFACE_GENERATION_CONFIG_FILE),
-        use_cases=tuple(),
     ),
 ]
 
@@ -110,10 +73,6 @@ QWEN25_CODER = [
         quantization=None,
         repo="Qwen/Qwen2.5-Coder-0.5B-Instruct",
         config_type=HFQwen2Config,
-        config_file_name="config.json",
-        weights_file_names=huggingface_weight_files(1),
-        weights_type=WeightsType.SAFETENSORS,
-        tokenizer_files=(*HUGGINGFACE_TOKENIZER_FILES, HUGGINFACE_GENERATION_CONFIG_FILE),
         use_cases=(UseCase.CODE,),
     ),
     ModelSpec(
@@ -124,10 +83,6 @@ QWEN25_CODER = [
         quantization=None,
         repo="Qwen/Qwen2.5-Coder-1.5B-Instruct",
         config_type=HFQwen2Config,
-        config_file_name="config.json",
-        weights_file_names=huggingface_weight_files(1),
-        weights_type=WeightsType.SAFETENSORS,
-        tokenizer_files=(*HUGGINGFACE_TOKENIZER_FILES, HUGGINFACE_GENERATION_CONFIG_FILE),
         use_cases=(UseCase.CODE,),
     ),
     ModelSpec(
@@ -138,10 +93,6 @@ QWEN25_CODER = [
         quantization=None,
         repo="Qwen/Qwen2.5-Coder-3B-Instruct",
         config_type=HFQwen2Config,
-        config_file_name="config.json",
-        weights_file_names=huggingface_weight_files(2),
-        weights_type=WeightsType.SAFETENSORS,
-        tokenizer_files=(*HUGGINGFACE_TOKENIZER_FILES, HUGGINFACE_GENERATION_CONFIG_FILE),
         use_cases=(UseCase.CODE,),
     ),
     ModelSpec(
@@ -152,10 +103,6 @@ QWEN25_CODER = [
         quantization=None,
         repo="Qwen/Qwen2.5-Coder-7B-Instruct",
         config_type=HFQwen2Config,
-        config_file_name="config.json",
-        weights_file_names=huggingface_weight_files(4),
-        weights_type=WeightsType.SAFETENSORS,
-        tokenizer_files=(*HUGGINGFACE_TOKENIZER_FILES, HUGGINFACE_GENERATION_CONFIG_FILE),
         use_cases=(UseCase.CODE,),
     ),
     ModelSpec(
@@ -166,10 +113,6 @@ QWEN25_CODER = [
         quantization=None,
         repo="Qwen/Qwen2.5-Coder-14B-Instruct",
         config_type=HFQwen2Config,
-        config_file_name="config.json",
-        weights_file_names=huggingface_weight_files(6),
-        weights_type=WeightsType.SAFETENSORS,
-        tokenizer_files=(*HUGGINGFACE_TOKENIZER_FILES, HUGGINFACE_GENERATION_CONFIG_FILE),
         use_cases=(UseCase.CODE,),
     ),
     ModelSpec(
@@ -180,10 +123,16 @@ QWEN25_CODER = [
         quantization=None,
         repo="Qwen/Qwen2.5-Coder-32B-Instruct",
         config_type=HFQwen2Config,
-
-
-
-
+        use_cases=(UseCase.CODE,),
+    ),
+    ModelSpec(
+        vendor="Alibaba",
+        family="Qwen2.5-Coder",
+        name="Qwen2.5-Coder-32B-Instruct",
+        size="32B",
+        quantization=None,
+        repo="Qwen/Qwen2.5-Coder-32B-Instruct",
+        config_type=HFQwen2Config,
         use_cases=(UseCase.CODE,),
     ),
 ]
 
@@ -198,11 +147,6 @@ QWEN3 = [
         quantization=None,
         repo="Qwen/Qwen3-0.6B",
         config_type=HFQwen3Config,
-        config_file_name="config.json",
-        weights_file_names=huggingface_weight_files(1),
-        weights_type=WeightsType.SAFETENSORS,
-        tokenizer_files=(*HUGGINGFACE_TOKENIZER_FILES, HUGGINFACE_GENERATION_CONFIG_FILE),
-        use_cases=tuple(),
     ),
     ModelSpec(
         vendor="Alibaba",
@@ -212,10 +156,7 @@ QWEN3 = [
         quantization=None,
         repo="Qwen/Qwen3-1.7B",
         config_type=HFQwen3Config,
-        config_file_name="config.json",
-        weights_file_names=huggingface_weight_files(2),
         weights_type=WeightsType.SAFETENSORS,
-        tokenizer_files=(*HUGGINGFACE_TOKENIZER_FILES, HUGGINFACE_GENERATION_CONFIG_FILE),
         use_cases=tuple(),
     ),
     ModelSpec(
@@ -226,11 +167,6 @@ QWEN3 = [
         quantization=None,
         repo="Qwen/Qwen3-4B",
         config_type=HFQwen3Config,
-        config_file_name="config.json",
-        weights_file_names=huggingface_weight_files(3),
-        weights_type=WeightsType.SAFETENSORS,
-        tokenizer_files=(*HUGGINGFACE_TOKENIZER_FILES, HUGGINFACE_GENERATION_CONFIG_FILE),
-        use_cases=tuple(),
     ),
     ModelSpec(
         vendor="Alibaba",
@@ -240,11 +176,6 @@ QWEN3 = [
         quantization=QuantizationMode.UINT4,
         repo="Qwen/Qwen3-4B-AWQ",
         config_type=HFQwen3Config,
-        config_file_name="config.json",
-        weights_file_names=huggingface_weight_files(1),
-        weights_type=WeightsType.SAFETENSORS,
-        tokenizer_files=(*HUGGINGFACE_TOKENIZER_FILES, HUGGINFACE_GENERATION_CONFIG_FILE),
-        use_cases=tuple(),
     ),
     ModelSpec(
         vendor="Alibaba",
@@ -254,11 +185,6 @@ QWEN3 = [
         quantization=None,
         repo="Qwen/Qwen3-8B",
         config_type=HFQwen3Config,
-        config_file_name="config.json",
-        weights_file_names=huggingface_weight_files(5),
-        weights_type=WeightsType.SAFETENSORS,
-        tokenizer_files=(*HUGGINGFACE_TOKENIZER_FILES, HUGGINFACE_GENERATION_CONFIG_FILE),
-        use_cases=tuple(),
     ),
     ModelSpec(
         vendor="Alibaba",
@@ -268,11 +194,6 @@ QWEN3 = [
         quantization=QuantizationMode.UINT4,
         repo="Qwen/Qwen3-8B-AWQ",
         config_type=HFQwen3Config,
-        config_file_name="config.json",
-        weights_file_names=huggingface_weight_files(2),
-        weights_type=WeightsType.SAFETENSORS,
-        tokenizer_files=(*HUGGINGFACE_TOKENIZER_FILES, HUGGINFACE_GENERATION_CONFIG_FILE),
-        use_cases=tuple(),
     ),
     ModelSpec(
         vendor="Alibaba",
@@ -282,11 +203,6 @@ QWEN3 = [
         quantization=None,
         repo="Qwen/Qwen3-14B",
         config_type=HFQwen3Config,
-        config_file_name="config.json",
-        weights_file_names=huggingface_weight_files(8),
-        weights_type=WeightsType.SAFETENSORS,
-        tokenizer_files=(*HUGGINGFACE_TOKENIZER_FILES, HUGGINFACE_GENERATION_CONFIG_FILE),
-        use_cases=tuple(),
    ),
     ModelSpec(
         vendor="Alibaba",
@@ -296,11 +212,6 @@ QWEN3 = [
         quantization=None,
         repo="Qwen/Qwen3-14B-AWQ",
         config_type=HFQwen3Config,
-        config_file_name="config.json",
-        weights_file_names=huggingface_weight_files(2),
-        weights_type=WeightsType.SAFETENSORS,
-        tokenizer_files=(*HUGGINGFACE_TOKENIZER_FILES, HUGGINFACE_GENERATION_CONFIG_FILE),
-        use_cases=tuple(),
     ),
     ModelSpec(
         vendor="Alibaba",
@@ -310,11 +221,6 @@ QWEN3 = [
         quantization=None,
         repo="Qwen/Qwen3-32B",
         config_type=HFQwen3Config,
-        config_file_name="config.json",
-        weights_file_names=huggingface_weight_files(17),
-        weights_type=WeightsType.SAFETENSORS,
-        tokenizer_files=(*HUGGINGFACE_TOKENIZER_FILES, HUGGINFACE_GENERATION_CONFIG_FILE),
-        use_cases=tuple(),
     ),
     ModelSpec(
         vendor="Alibaba",
@@ -324,11 +230,6 @@ QWEN3 = [
         quantization=QuantizationMode.UINT4,
         repo="Qwen/Qwen3-32B-AWQ",
         config_type=HFQwen3Config,
-        config_file_name="config.json",
-        weights_file_names=huggingface_weight_files(4),
-        weights_type=WeightsType.SAFETENSORS,
-        tokenizer_files=(*HUGGINGFACE_TOKENIZER_FILES, HUGGINFACE_GENERATION_CONFIG_FILE),
-        use_cases=tuple(),
     ),
 ]
 
```
lalamo/model_import/model_specs/reka.py
CHANGED

```diff
@@ -1,12 +1,6 @@
-from lalamo.model_import.configs import HFLlamaConfig
+from lalamo.model_import.decoder_configs import HFLlamaConfig
 
-from .common import (
-    HUGGINFACE_GENERATION_CONFIG_FILE,
-    HUGGINGFACE_TOKENIZER_FILES,
-    ModelSpec,
-    WeightsType,
-    huggingface_weight_files,
-)
+from .common import ModelSpec
 
 __all__ = ["REKA_MODELS"]
 
@@ -19,10 +13,7 @@ REKA_MODELS = [
         quantization=None,
         repo="RekaAI/reka-flash-3.1",
         config_type=HFLlamaConfig,
-        config_file_name="config.json",
-        weights_file_names=huggingface_weight_files(9),  # Model has 9 shards
-        weights_type=WeightsType.SAFETENSORS,
-        tokenizer_files=(*HUGGINGFACE_TOKENIZER_FILES, HUGGINFACE_GENERATION_CONFIG_FILE),
+        user_role_name="human",
         use_cases=tuple(),
     ),
-]
+]
```
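
Across the `model_specs/*.py` hunks, the per-model boilerplate (`config_file_name`, explicit weight shard lists, `weights_type`, tokenizer file tuples, empty `use_cases`) disappears from every entry, presumably becoming defaults on `ModelSpec` itself (note the `model_specs/common.py` change of +32 -34 in the file list), while specs gain per-model options such as `user_role_name` for Reka. A rough sketch of a 0.3.0-style entry; the list name and the `model_specs.common` import path are assumptions, and the field values simply mirror one of the entries above:

```python
# Hypothetical 0.3.0-style registry entry: only model-specific fields are
# spelled out; config file name, shard list, weights type, and tokenizer
# files presumably fall back to ModelSpec defaults.
from lalamo.model_import.decoder_configs import HFQwen2Config
from lalamo.model_import.model_specs.common import ModelSpec, UseCase  # assumed path

MY_MODELS = [
    ModelSpec(
        vendor="Alibaba",
        family="Qwen2.5-Coder",
        name="Qwen2.5-Coder-0.5B-Instruct",
        size="0.5B",
        quantization=None,
        repo="Qwen/Qwen2.5-Coder-0.5B-Instruct",
        config_type=HFQwen2Config,
        use_cases=(UseCase.CODE,),
    ),
]
```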
lalamo/modules/__init__.py
CHANGED
```diff
@@ -1,6 +1,6 @@
 from .activations import Activation
 from .attention import Attention, AttentionConfig
-from .common import WeightLayout, config_converter
+from .common import LalamoModule, WeightLayout, config_converter
 from .decoder import Decoder, DecoderActivationTrace, DecoderConfig, DecoderResult
 from .decoder_layer import DecoderLayer, DecoderLayerActivationTrace, DecoderLayerConfig, DecoderLayerResult
 from .embedding import (
@@ -58,6 +58,7 @@ __all__ = [
     "GroupQuantizedLinearConfig",
     "KVCache",
     "KVCacheLayer",
+    "LalamoModule",
     "LinearBase",
     "LinearConfig",
     "LinearScalingRoPEConfig",
```
lalamo/modules/attention.py
CHANGED
```diff
@@ -1,5 +1,6 @@
-from dataclasses import dataclass
-from typing import NamedTuple
+from collections.abc import Mapping
+from dataclasses import dataclass, replace
+from typing import NamedTuple, Self
 
 import equinox as eqx
 import jax
@@ -8,10 +9,9 @@ from jax import numpy as jnp
 from jax import vmap
 from jaxtyping import Array, Bool, DTypeLike, Float, Int, PRNGKeyArray
 
-from lalamo.common import ParameterDict
 from lalamo.modules.normalization import RMSNorm, RMSNormConfig
 
-from .common import AttentionType, LalamoModule, WeightLayout
+from .common import AttentionType, LalamoModule, ParameterTree, WeightLayout
 from .kv_cache import DynamicKVCacheLayer, KVCacheLayer, StaticKVCacheLayer
 from .linear import LinearBase, LinearConfig
 from .rope import PositionalEmbeddings
@@ -42,8 +42,8 @@ def _soft_capped_attention_kernel(
     scale: float | None,
     logit_soft_cap: float,
 ) -> Float[Array, "dst_tokens heads head_channels"]:
-
-
+    _, num_heads, head_dim = queries.shape
+    _, num_groups, _ = keys.shape
     if scale is None:
         scale = head_dim**-0.5
     group_size = num_heads // num_groups
@@ -118,14 +118,67 @@ class AttentionConfig:
 
         if self.query_norm_config is not None:
             query_norm = self.query_norm_config.init(
-
+                input_dim=head_dim,
             )
         else:
             query_norm = None
 
         if self.key_norm_config is not None:
             key_norm = self.key_norm_config.init(
-
+                input_dim=head_dim,
+            )
+        else:
+            key_norm = None
+
+        return Attention(
+            self,
+            qkv_projection=qkv_projection,
+            out_projection=out_projection,
+            query_norm=query_norm,
+            key_norm=key_norm,
+            num_heads=num_heads,
+            num_groups=num_groups,
+            head_dim=head_dim,
+            is_causal=is_causal,
+            scale=scale,
+            sliding_window_size=sliding_window_size,
+        )
+
+    def empty(
+        self,
+        model_dim: int,
+        num_heads: int,
+        num_groups: int,
+        head_dim: int,
+        is_causal: bool,
+        scale: float | None,
+        sliding_window_size: int | None,
+    ) -> "Attention":
+        qkv_projection = self.qkv_projection_config.empty(
+            input_dim=model_dim,
+            output_dims=(
+                num_heads * head_dim,
+                num_groups * head_dim,
+                num_groups * head_dim,
+            ),
+            has_biases=self.has_qkv_biases,
+        )
+        out_projection = self.out_projection_config.empty(
+            num_heads * head_dim,
+            (model_dim,),
+            has_biases=self.has_out_biases,
+        )
+
+        if self.query_norm_config is not None:
+            query_norm = self.query_norm_config.empty(
+                input_dim=head_dim,
+            )
+        else:
+            query_norm = None
+
+        if self.key_norm_config is not None:
+            key_norm = self.key_norm_config.empty(
+                input_dim=head_dim,
             )
         else:
             key_norm = None
@@ -233,6 +286,7 @@ class Attention(LalamoModule[AttentionConfig]):
                 f" got {v_output_dim}",
             )
 
+    @eqx.filter_jit
     def __call__(
         self,
         inputs: Float[Array, "suffix_tokens channels"],
@@ -314,8 +368,8 @@ class Attention(LalamoModule[AttentionConfig]):
     def init_static_kv_cache(self, capacity: int) -> StaticKVCacheLayer:
         return StaticKVCacheLayer.empty(capacity, self.num_groups, self.head_dim, self.activation_precision)
 
-    def export_weights(self, weight_layout: WeightLayout = WeightLayout.AUTO) -> ParameterDict:
-        result = ParameterDict(
+    def export_weights(self, weight_layout: WeightLayout = WeightLayout.AUTO) -> ParameterTree:
+        result = dict(
            qkv_projection=self.qkv_projection.export_weights(weight_layout),
            out_projection=self.out_projection.export_weights(weight_layout),
         )
@@ -324,3 +378,29 @@ class Attention(LalamoModule[AttentionConfig]):
         if self.key_norm is not None:
             result["key_norm"] = self.key_norm.export_weights(weight_layout)
         return result
+
+    def import_weights(
+        self,
+        weights: ParameterTree[Array],
+        weight_layout: WeightLayout = WeightLayout.AUTO,
+    ) -> Self:
+        assert isinstance(weights, Mapping)
+        assert isinstance(weights["qkv_projection"], Mapping)
+        assert isinstance(weights["out_projection"], Mapping)
+        if self.query_norm is not None:
+            assert isinstance(weights["query_norm"], Mapping)
+            query_norm = self.query_norm.import_weights(weights["query_norm"], weight_layout)
+        else:
+            query_norm = None
+        if self.key_norm is not None:
+            assert isinstance(weights["key_norm"], Mapping)
+            key_norm = self.key_norm.import_weights(weights["key_norm"], weight_layout)
+        else:
+            key_norm = None
+        return replace(
+            self,
+            qkv_projection=self.qkv_projection.import_weights(weights["qkv_projection"], weight_layout),
+            out_projection=self.out_projection.import_weights(weights["out_projection"], weight_layout),
+            query_norm=query_norm,
+            key_norm=key_norm,
+        )
```
lalamo/modules/common.py
CHANGED
```diff
@@ -2,19 +2,23 @@ from abc import abstractmethod
 from dataclasses import dataclass
 from enum import Enum
 from types import UnionType
+from typing import Self
 
 import equinox as eqx
 from cattrs import Converter
+from einops import rearrange
 from jax import numpy as jnp
-from jaxtyping import DTypeLike
+from jaxtyping import Array, DTypeLike, Float
 
-from lalamo.common import ParameterDict
+from lalamo.common import ParameterTree
 
 __all__ = [
     "AttentionType",
     "DummyUnionMember",
     "LalamoModule",
     "config_converter",
+    "from_layout",
+    "into_layout",
     "register_config_union",
 ]
 
@@ -34,6 +38,42 @@ class WeightLayout(Enum):
         return "(output, input)"
 
 
+_DEFAULT_WEIGHT_LAYOUT = WeightLayout.INPUT_OUTPUT
+
+
+def into_layout(
+    weights: Float[Array, "in_channels out_channels"],
+    layout: WeightLayout,
+) -> Float[Array, "in_channels out_channels"] | Float[Array, "out_channels in_channels"]:
+    if layout == WeightLayout.AUTO:
+        layout = _DEFAULT_WEIGHT_LAYOUT
+    match layout:
+        case WeightLayout.OUTPUT_INPUT:
+            return weights
+        case WeightLayout.INPUT_OUTPUT:
+            return rearrange(
+                weights,
+                "total_out_channels in_channels -> in_channels total_out_channels",
+            )
+
+
+def from_layout(
+    weights: ParameterTree | Array,
+    layout: WeightLayout,
+) -> Array:
+    assert isinstance(weights, Array)
+    if layout == WeightLayout.AUTO:
+        layout = _DEFAULT_WEIGHT_LAYOUT
+    match layout:
+        case WeightLayout.OUTPUT_INPUT:
+            return weights
+        case WeightLayout.INPUT_OUTPUT:
+            return rearrange(
+                weights,
+                "in_channels total_out_channels -> total_out_channels in_channels",
+            )
+
+
 class AttentionType(Enum):
     GLOBAL = "global"
     SLIDING_WINDOW = "sliding_window"
```
```diff
@@ -47,7 +87,14 @@ class LalamoModule[ConfigT](eqx.Module):
     def activation_precision(self) -> DTypeLike: ...
 
     @abstractmethod
-    def export_weights(self, weight_layout: WeightLayout = WeightLayout.AUTO) -> ParameterDict: ...
+    def export_weights(self, weight_layout: WeightLayout = WeightLayout.AUTO) -> ParameterTree[Array]: ...
+
+    @abstractmethod
+    def import_weights(
+        self,
+        weights: ParameterTree[Array],
+        weight_layout: WeightLayout = WeightLayout.AUTO,
+    ) -> Self: ...
 
 
 def _dtype_to_str(dtype: DTypeLike) -> str:
@@ -115,7 +162,7 @@ def register_config_union(union_type: UnionType) -> None:
         new_config = dict(config)
         type_name = new_config.pop("type")
         target_type = name_to_type[type_name]
-        return
+        return config_converter.structure(new_config, target_type)
 
     config_converter.register_structure_hook(
         union_type,
```