lalamo 0.5.12__tar.gz → 0.5.13__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {lalamo-0.5.12 → lalamo-0.5.13}/PKG-INFO +1 -1
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/__init__.py +1 -1
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/decoder_configs/huggingface/gemma3.py +47 -8
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/decoder_configs/huggingface/llama.py +27 -6
- lalamo-0.5.13/lalamo/model_import/model_specs/gemma.py +124 -0
- lalamo-0.5.13/lalamo/model_import/model_specs/llama.py +100 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo.egg-info/PKG-INFO +1 -1
- lalamo-0.5.12/lalamo/model_import/model_specs/gemma.py +0 -57
- lalamo-0.5.12/lalamo/model_import/model_specs/llama.py +0 -44
- {lalamo-0.5.12 → lalamo-0.5.13}/LICENSE +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/README.md +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/common.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/data/__init__.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/data/huggingface_message.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/data/lalamo_completions.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/data/utils.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/main.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/message_processor.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/__init__.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/common.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/decoder_configs/__init__.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/decoder_configs/common.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/decoder_configs/executorch.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/decoder_configs/huggingface/__init__.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/decoder_configs/huggingface/common.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/decoder_configs/huggingface/gemma2.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/decoder_configs/huggingface/gpt_oss.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/decoder_configs/huggingface/lfm2.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/decoder_configs/huggingface/llamba.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/decoder_configs/huggingface/mistral.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/decoder_configs/huggingface/modern_bert.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/decoder_configs/huggingface/qwen2.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/decoder_configs/huggingface/qwen3.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/huggingface_generation_config.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/huggingface_tokenizer_config.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/loaders/__init__.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/loaders/common.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/loaders/executorch.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/loaders/huggingface.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/loaders/utils.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/model_specs/__init__.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/model_specs/common.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/model_specs/deepseek.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/model_specs/essential_ai.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/model_specs/gpt_oss.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/model_specs/huggingface.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/model_specs/lfm2.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/model_specs/llamba.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/model_specs/mirai.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/model_specs/mistral.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/model_specs/pleias.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/model_specs/polaris.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/model_specs/qwen.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/model_specs/reka.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/models/__init__.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/models/classifier.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/models/common.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/models/language_model.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/modules/__init__.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/modules/activations.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/modules/classifier.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/modules/common.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/modules/decoder.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/modules/embedding.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/modules/linear.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/modules/mlp.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/modules/mlx_interop.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/modules/normalization.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/modules/rope.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/modules/token_mixers/__init__.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/modules/token_mixers/attention.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/modules/token_mixers/common.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/modules/token_mixers/mamba.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/modules/token_mixers/short_conv.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/modules/token_mixers/state/__init__.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/modules/token_mixers/state/common.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/modules/token_mixers/state/kv_cache.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/modules/token_mixers/state/mamba_state.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/modules/token_mixers/state/short_conv_state.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/modules/torch_interop.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/modules/transformer.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/modules/transformer_layer.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/modules/utils.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/quantization.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/registry_abc.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/sampling.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/speculator/__init__.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/speculator/common.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/speculator/estimator.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/speculator/inference.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/speculator/ngram.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/speculator/utils.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/utils.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo.egg-info/SOURCES.txt +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo.egg-info/dependency_links.txt +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo.egg-info/entry_points.txt +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo.egg-info/requires.txt +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/lalamo.egg-info/top_level.txt +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/pyproject.toml +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/setup.cfg +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/tests/test_cartesia_mlx_models.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/tests/test_chat_template.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/tests/test_generation.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/tests/test_huggingface_model_conversion.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/tests/test_huggingface_models.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/tests/test_lfm2_models.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/tests/test_mlx_models.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/tests/test_model_spec.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/tests/test_models.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/tests/test_moe.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/tests/test_parameter_tree.py +0 -0
- {lalamo-0.5.12 → lalamo-0.5.13}/tests/test_registry_abc.py +0 -0
{lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/decoder_configs/huggingface/gemma3.py

@@ -5,7 +5,13 @@ from typing import Literal
 import jax.numpy as jnp
 from jaxtyping import DTypeLike
 
-from lalamo.modules import
+from lalamo.modules import (
+    DecoderConfig,
+    MLXQuantizedLinearConfig,
+    MLXQuantizedTiedEmbeddingConfig,
+    TiedEmbeddingConfig,
+    TransformerConfig,
+)
 from lalamo.modules.activations import GELU
 from lalamo.modules.linear import FullPrecisionLinearConfig
 from lalamo.modules.mlp import DenseMLPConfig
@@ -13,8 +19,9 @@ from lalamo.modules.normalization import NormalizationConfig, UpcastMode
 from lalamo.modules.rope import LinearScalingRoPEConfig, UnscaledRoPEConfig, YARNRoPEConfig
 from lalamo.modules.token_mixers.attention import AttentionConfig
 from lalamo.modules.transformer_layer import TransformerLayerConfig
+from lalamo.quantization import QuantizationMode
 
-from .common import HuggingFaceLMConfig
+from .common import HuggingFaceLMConfig, MLXQuantizationConfig, QuantizationConfigType
 
 __all__ = ["HFGemma3Config", "HFGemma3TextConfig"]
 
@@ -61,6 +68,9 @@ class HFGemma3TextConfigRaw:
     final_logit_softcapping: float | None = None
     vocab_size: int = 262208
 
+    quantization: QuantizationConfigType = None
+    quantization_config: QuantizationConfigType = None
+
     @property
     def sliding_window_sizes(self) -> list[int | None]:
         result = []
@@ -77,14 +87,28 @@ class HFGemma3TextConfigRaw:
         activation_precision: DTypeLike,
         accumulation_precision: DTypeLike,
         metadata_dict: Mapping[str, str],  # noqa: ARG002
+        fallback_quantization: QuantizationConfigType | None = None,
     ) -> DecoderConfig:
+        quantization = self.quantization or self.quantization_config or fallback_quantization
         input_scale = _round_to_bfloat16(self.hidden_size**0.5)
         attention_scale = self.query_pre_attn_scalar**-0.5
-
-
-
-
-
+        if quantization is None:
+            embedding_config = TiedEmbeddingConfig(
+                input_scale=input_scale,
+                logit_soft_cap=self.final_logit_softcapping,
+                precision=activation_precision,
+            )
+        elif isinstance(quantization, MLXQuantizationConfig):
+            embedding_config = MLXQuantizedTiedEmbeddingConfig(
+                input_scale=input_scale,
+                logit_soft_cap=self.final_logit_softcapping,
+                group_size=quantization.group_size,
+                embedding_quantization_mode=QuantizationMode.from_num_bits(quantization.bits),
+                activation_quantization_mode=None,
+                activation_precision=activation_precision,
+            )
+        else:
+            raise RuntimeError(f"Unsupported quantization format: {type(quantization)}")
         rms_norm_config = NormalizationConfig(
             scale_precision=activation_precision,
             accumulation_precision=accumulation_precision,
@@ -127,7 +151,17 @@ class HFGemma3TextConfigRaw:
             max_sequence_length=context_length or self.max_position_embeddings,
         )
 
-
+        if quantization is None:
+            linear_config = FullPrecisionLinearConfig(precision=activation_precision)
+        elif isinstance(quantization, MLXQuantizationConfig):
+            linear_config = MLXQuantizedLinearConfig(
+                group_size=quantization.group_size,
+                weight_quantization_mode=QuantizationMode.from_num_bits(quantization.bits),
+                activation_quantization_mode=None,
+                activation_precision=activation_precision,
+            )
+        else:
+            raise RuntimeError(f"Unsupported quantization format: {type(quantization)}")
         mlp_config = DenseMLPConfig(
             linear_config=linear_config,
             activation=GELU(),
@@ -214,6 +248,9 @@ class HFGemma3Config(HuggingFaceLMConfig):
     transformers_version: str
     vision_config: HFGemma3VisionConfig
 
+    quantization: QuantizationConfigType = None
+    quantization_config: QuantizationConfigType = None
+
     def to_decoder_config(
         self,
         context_length: int | None,
@@ -221,9 +258,11 @@ class HFGemma3Config(HuggingFaceLMConfig):
         accumulation_precision: DTypeLike,
         metadata_dict: Mapping[str, str],
     ) -> DecoderConfig:
+        quantization = self.quantization or self.quantization_config
         return self.text_config.to_decoder_config(
             context_length=context_length,
             activation_precision=activation_precision,
             accumulation_precision=accumulation_precision,
             metadata_dict=metadata_dict,
+            fallback_quantization=quantization,
         )
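
The gemma3.py changes above add MLX-style quantization support to the Gemma 3 importer: the converter resolves a quantization setting from the config's `quantization` field, then `quantization_config`, then a `fallback_quantization` passed down from the wrapping multimodal config, and dispatches to either full-precision or MLX-quantized embedding and linear configs. The following minimal sketch illustrates that resolution-and-dispatch pattern; the dataclasses are hypothetical stand-ins, not the real lalamo types, and only the field names (group_size, bits) mirror what the diff shows.

# Hedged sketch of the field → config → fallback resolution pattern.
from dataclasses import dataclass


@dataclass
class MLXQuantization:  # stand-in for MLXQuantizationConfig
    group_size: int
    bits: int


@dataclass
class TextConfig:  # stand-in for HFGemma3TextConfigRaw
    quantization: MLXQuantization | None = None
    quantization_config: MLXQuantization | None = None

    def resolve_quantization(self, fallback: MLXQuantization | None = None) -> MLXQuantization | None:
        # Same precedence as the diff: own field, then config key, then the
        # value handed down from the wrapping multimodal config.
        return self.quantization or self.quantization_config or fallback

    def describe_linear(self, fallback: MLXQuantization | None = None) -> str:
        quantization = self.resolve_quantization(fallback)
        if quantization is None:
            return "full-precision linear layers"
        if isinstance(quantization, MLXQuantization):
            return f"MLX-quantized linear layers ({quantization.bits}-bit, group size {quantization.group_size})"
        raise RuntimeError(f"Unsupported quantization format: {type(quantization)}")


# The text config carries no quantization info of its own, so the fallback
# provided by the outer config decides the layer type.
print(TextConfig().describe_linear(fallback=MLXQuantization(group_size=64, bits=4)))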
{lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/decoder_configs/huggingface/llama.py

@@ -11,6 +11,8 @@ from lalamo.modules import (
     FullPrecisionLinearConfig,
     GroupQuantizedLinearConfig,
     LlamaRoPEConfig,
+    MLXQuantizedLinearConfig,
+    MLXQuantizedTiedEmbeddingConfig,
     NormalizationConfig,
     SiLU,
     TiedEmbeddingConfig,
@@ -23,7 +25,7 @@ from lalamo.modules import (
 )
 from lalamo.quantization import QuantizationMode
 
-from .common import
+from .common import HuggingFaceLMConfig, MLXQuantizationConfig, QuantizationConfigType
 
 __all__ = ["HFLlamaConfig"]
 
@@ -75,7 +77,8 @@ class HFLlamaConfig(HuggingFaceLMConfig):
     vocab_size: int
     head_dim: int | None = None
 
-
+    quantization: QuantizationConfigType = None
+    quantization_config: QuantizationConfigType = None
 
     def to_decoder_config(
         self,
@@ -84,7 +87,18 @@ class HFLlamaConfig(HuggingFaceLMConfig):
         accumulation_precision: DTypeLike,
         metadata_dict: Mapping[str, str],  # noqa: ARG002
     ) -> DecoderConfig:
-
+        quantization = self.quantization or self.quantization_config
+        if isinstance(quantization, MLXQuantizationConfig):
+            assert self.tie_word_embeddings, "only tied embeddings are supported"
+            embedding_config = MLXQuantizedTiedEmbeddingConfig(
+                input_scale=None,
+                logit_soft_cap=None,
+                group_size=quantization.group_size,
+                embedding_quantization_mode=QuantizationMode.from_num_bits(quantization.bits),
+                activation_quantization_mode=None,
+                activation_precision=activation_precision,
+            )
+        elif self.tie_word_embeddings:
             embedding_config = TiedEmbeddingConfig(
                 input_scale=None,
                 logit_soft_cap=None,
@@ -133,14 +147,21 @@ class HFLlamaConfig(HuggingFaceLMConfig):
             upcast_mode=UpcastMode.ONLY_NORMALIZATION,
             subtract_mean=False,
         )
-        if
+        if quantization is None:
             linear_config = FullPrecisionLinearConfig(
                 precision=activation_precision,
             )
+        elif isinstance(quantization, MLXQuantizationConfig):
+            linear_config = MLXQuantizedLinearConfig(
+                group_size=quantization.group_size,
+                weight_quantization_mode=QuantizationMode.from_num_bits(quantization.bits),
+                activation_quantization_mode=None,
+                activation_precision=activation_precision,
+            )
         else:
             linear_config = GroupQuantizedLinearConfig(
-                group_size=
-                weight_quantization_mode=QuantizationMode.from_num_bits(
+                group_size=quantization.group_size,
+                weight_quantization_mode=QuantizationMode.from_num_bits(quantization.bits),
                 activation_quantization_mode=None,
                 activation_precision=activation_precision,
             )
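
The llama.py changes follow the same pattern and route MLX-quantized checkpoints through QuantizationMode.from_num_bits(quantization.bits). A rough usage sketch follows; it assumes lalamo is installed, and the exact enum members returned for each bit width are an assumption based on the UINT4/UINT8 members used in the new model spec files below, not something verified here.

# Hedged sketch: map MLX bit widths to lalamo's QuantizationMode the way
# the importer above does. The expected UINT4/UINT8 results are assumed.
from lalamo.quantization import QuantizationMode

for bits in (4, 8):
    mode = QuantizationMode.from_num_bits(bits)
    print(bits, mode)  # expected: QuantizationMode.UINT4 for 4, QuantizationMode.UINT8 for 8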
lalamo-0.5.13/lalamo/model_import/model_specs/gemma.py (new file)

@@ -0,0 +1,124 @@
+from lalamo.model_import.decoder_configs import (
+    HFGemma2Config,
+    HFGemma3Config,
+    HFGemma3TextConfig,
+)
+from lalamo.quantization import QuantizationMode
+
+from .common import ConfigMap, FileSpec, ModelSpec, WeightsType
+
+__all__ = ["GEMMA_MODELS"]
+
+GEMMA2 = [
+    ModelSpec(
+        vendor="Google",
+        family="Gemma-2",
+        name="Gemma-2-2B-Instruct",
+        size="2B",
+        quantization=None,
+        repo="google/gemma-2-2b-it",
+        config_type=HFGemma2Config,
+    ),
+]
+
+GEMMA3 = [
+    ModelSpec(
+        vendor="Google",
+        family="Gemma-3",
+        name="Gemma-3-1B-Instruct",
+        size="1B",
+        quantization=None,
+        repo="google/gemma-3-1b-it",
+        config_type=HFGemma3TextConfig,
+        weights_type=WeightsType.SAFETENSORS,
+    ),
+    ModelSpec(
+        vendor="Google",
+        family="Gemma-3",
+        name="Gemma-3-1B-Instruct-4bit",
+        size="1B",
+        quantization=QuantizationMode.UINT4,
+        repo="mlx-community/gemma-3-1b-it-4bit",
+        config_type=HFGemma3TextConfig,
+        weights_type=WeightsType.SAFETENSORS,
+        configs=ConfigMap(generation_config=FileSpec("generation_config.json", "google/gemma-3-1b-it")),
+    ),
+    ModelSpec(
+        vendor="Google",
+        family="Gemma-3",
+        name="Gemma-3-1B-Instruct-8bit",
+        size="1B",
+        quantization=QuantizationMode.UINT8,
+        repo="mlx-community/gemma-3-1b-it-8bit",
+        config_type=HFGemma3TextConfig,
+        weights_type=WeightsType.SAFETENSORS,
+        configs=ConfigMap(generation_config=FileSpec("generation_config.json", "google/gemma-3-1b-it")),
+    ),
+    ModelSpec(
+        vendor="Google",
+        family="Gemma-3",
+        name="Gemma-3-4B-Instruct",
+        size="4B",
+        quantization=None,
+        repo="google/gemma-3-4b-it",
+        config_type=HFGemma3Config,
+        weights_type=WeightsType.SAFETENSORS,
+    ),
+    ModelSpec(
+        vendor="Google",
+        family="Gemma-3",
+        name="Gemma-3-4B-Instruct-4bit",
+        size="4B",
+        quantization=QuantizationMode.UINT4,
+        repo="mlx-community/gemma-3-4b-it-4bit",
+        config_type=HFGemma3Config,
+        weights_type=WeightsType.SAFETENSORS,
+        configs=ConfigMap(generation_config=FileSpec("generation_config.json", "google/gemma-3-4b-it")),
+    ),
+    ModelSpec(
+        vendor="Google",
+        family="Gemma-3",
+        name="Gemma-3-4B-Instruct-8bit",
+        size="4B",
+        quantization=QuantizationMode.UINT8,
+        repo="mlx-community/gemma-3-4b-it-8bit",
+        config_type=HFGemma3Config,
+        weights_type=WeightsType.SAFETENSORS,
+        configs=ConfigMap(generation_config=FileSpec("generation_config.json", "google/gemma-3-4b-it")),
+    ),
+    ModelSpec(
+        vendor="Google",
+        family="Gemma-3",
+        name="Gemma-3-27B-Instruct",
+        size="27B",
+        quantization=None,
+        repo="google/gemma-3-27b-it",
+        config_type=HFGemma3Config,
+        weights_type=WeightsType.SAFETENSORS,
+    ),
+    ModelSpec(
+        vendor="Google",
+        family="Gemma-3",
+        name="Gemma-3-27B-Instruct-4bit",
+        size="27B",
+        quantization=QuantizationMode.UINT4,
+        repo="mlx-community/gemma-3-27b-it-4bit",
+        config_type=HFGemma3Config,
+        weights_type=WeightsType.SAFETENSORS,
+        configs=ConfigMap(generation_config=FileSpec("generation_config.json", "google/gemma-3-27b-it")),
+    ),
+    ModelSpec(
+        vendor="Google",
+        family="Gemma-3",
+        name="Gemma-3-27B-Instruct-8bit",
+        size="27B",
+        quantization=QuantizationMode.UINT8,
+        repo="mlx-community/gemma-3-27b-it-8bit",
+        config_type=HFGemma3Config,
+        weights_type=WeightsType.SAFETENSORS,
+        configs=ConfigMap(generation_config=FileSpec("generation_config.json", "google/gemma-3-27b-it")),
+    ),
+]
+
+
+GEMMA_MODELS = GEMMA2 + GEMMA3
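
The new spec file registers quantized mlx-community variants alongside the original Google repos, pulling generation_config.json from the upstream repo for each quantized entry. A quick way to list which quantized Gemma variants are now registered, assuming lalamo is installed and exposes the module path shown above:

# List the quantized Gemma entries added in this release.
from lalamo.model_import.model_specs.gemma import GEMMA_MODELS

for spec in GEMMA_MODELS:
    if spec.quantization is not None:
        print(f"{spec.name}: {spec.repo} ({spec.quantization})")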
lalamo-0.5.13/lalamo/model_import/model_specs/llama.py (new file)

@@ -0,0 +1,100 @@
+from lalamo.model_import.decoder_configs import HFLlamaConfig
+from lalamo.quantization import QuantizationMode
+
+from .common import ConfigMap, FileSpec, ModelSpec
+
+__all__ = ["LLAMA_MODELS"]
+
+LLAMA31 = [
+    ModelSpec(
+        vendor="Meta",
+        family="Llama-3.1",
+        name="Llama-3.1-8B-Instruct",
+        size="8B",
+        quantization=None,
+        repo="meta-llama/Llama-3.1-8B-Instruct",
+        config_type=HFLlamaConfig,
+        use_cases=tuple(),
+    ),
+    ModelSpec(
+        vendor="Meta",
+        family="Llama-3.1",
+        name="Llama-3.1-8B-Instruct-4bit",
+        size="8B",
+        quantization=QuantizationMode.UINT4,
+        repo="mlx-community/Llama-3.1-8B-Instruct-4bit",
+        config_type=HFLlamaConfig,
+        use_cases=tuple(),
+        configs=ConfigMap(generation_config=FileSpec("generation_config.json", "meta-llama/Llama-3.1-8B-Instruct")),
+    ),
+]
+
+
+LLAMA32 = [
+    ModelSpec(
+        vendor="Meta",
+        family="Llama-3.2",
+        name="Llama-3.2-1B-Instruct",
+        size="1B",
+        quantization=None,
+        repo="meta-llama/Llama-3.2-1B-Instruct",
+        config_type=HFLlamaConfig,
+        use_cases=tuple(),
+    ),
+    ModelSpec(
+        vendor="Meta",
+        family="Llama-3.2",
+        name="Llama-3.2-1B-Instruct-4bit",
+        size="1B",
+        quantization=QuantizationMode.UINT4,
+        repo="mlx-community/Llama-3.2-1B-Instruct-4bit",
+        config_type=HFLlamaConfig,
+        configs=ConfigMap(generation_config=FileSpec("generation_config.json", "meta-llama/Llama-3.2-1B-Instruct")),
+        use_cases=tuple(),
+    ),
+    ModelSpec(
+        vendor="Meta",
+        family="Llama-3.2",
+        name="Llama-3.2-1B-Instruct-8bit",
+        size="1B",
+        quantization=QuantizationMode.UINT8,
+        repo="mlx-community/Llama-3.2-1B-Instruct-8bit",
+        config_type=HFLlamaConfig,
+        configs=ConfigMap(generation_config=FileSpec("generation_config.json", "meta-llama/Llama-3.2-1B-Instruct")),
+        use_cases=tuple(),
+    ),
+    ModelSpec(
+        vendor="Meta",
+        family="Llama-3.2",
+        name="Llama-3.2-3B-Instruct",
+        size="3B",
+        quantization=None,
+        repo="meta-llama/Llama-3.2-3B-Instruct",
+        config_type=HFLlamaConfig,
+        use_cases=tuple(),
+    ),
+    ModelSpec(
+        vendor="Meta",
+        family="Llama-3.2",
+        name="Llama-3.2-3B-Instruct-4bit",
+        size="3B",
+        quantization=QuantizationMode.UINT4,
+        repo="mlx-community/Llama-3.2-3B-Instruct-4bit",
+        config_type=HFLlamaConfig,
+        configs=ConfigMap(generation_config=FileSpec("generation_config.json", "meta-llama/Llama-3.2-3B-Instruct")),
+        use_cases=tuple(),
+    ),
+    ModelSpec(
+        vendor="Meta",
+        family="Llama-3.2",
+        name="Llama-3.2-3B-Instruct-8bit",
+        size="3B",
+        quantization=QuantizationMode.UINT8,
+        repo="mlx-community/Llama-3.2-3B-Instruct-8bit",
+        config_type=HFLlamaConfig,
+        configs=ConfigMap(generation_config=FileSpec("generation_config.json", "meta-llama/Llama-3.2-3B-Instruct")),
+        use_cases=tuple(),
+    ),
+]
+
+LLAMA_MODELS = LLAMA31 + LLAMA32
lalamo-0.5.12/lalamo/model_import/model_specs/gemma.py (removed)

@@ -1,57 +0,0 @@
-from lalamo.model_import.decoder_configs import (
-    HFGemma2Config,
-    HFGemma3Config,
-    HFGemma3TextConfig,
-)
-
-from .common import ModelSpec, WeightsType
-
-__all__ = ["GEMMA_MODELS"]
-
-GEMMA2 = [
-    ModelSpec(
-        vendor="Google",
-        family="Gemma-2",
-        name="Gemma-2-2B-Instruct",
-        size="2B",
-        quantization=None,
-        repo="google/gemma-2-2b-it",
-        config_type=HFGemma2Config,
-    ),
-]
-
-GEMMA3 = [
-    ModelSpec(
-        vendor="Google",
-        family="Gemma-3",
-        name="Gemma-3-1B-Instruct",
-        size="1B",
-        quantization=None,
-        repo="google/gemma-3-1b-it",
-        config_type=HFGemma3TextConfig,
-        weights_type=WeightsType.SAFETENSORS,
-    ),
-    ModelSpec(
-        vendor="Google",
-        family="Gemma-3",
-        name="Gemma-3-4B-Instruct",
-        size="4B",
-        quantization=None,
-        repo="google/gemma-3-4b-it",
-        config_type=HFGemma3Config,
-        weights_type=WeightsType.SAFETENSORS,
-    ),
-    ModelSpec(
-        vendor="Google",
-        family="Gemma-3",
-        name="Gemma-3-27B-Instruct",
-        size="27B",
-        quantization=None,
-        repo="google/gemma-3-27b-it",
-        config_type=HFGemma3Config,
-        weights_type=WeightsType.SAFETENSORS,
-    ),
-]
-
-
-GEMMA_MODELS = GEMMA2 + GEMMA3
lalamo-0.5.12/lalamo/model_import/model_specs/llama.py (removed)

@@ -1,44 +0,0 @@
-from lalamo.model_import.decoder_configs import HFLlamaConfig
-
-from .common import ModelSpec
-
-__all__ = ["LLAMA_MODELS"]
-
-LLAMA31 = [
-    ModelSpec(
-        vendor="Meta",
-        family="Llama-3.1",
-        name="Llama-3.1-8B-Instruct",
-        size="8B",
-        quantization=None,
-        repo="meta-llama/Llama-3.1-8B-Instruct",
-        config_type=HFLlamaConfig,
-        use_cases=tuple(),
-    ),
-]
-
-
-LLAMA32 = [
-    ModelSpec(
-        vendor="Meta",
-        family="Llama-3.2",
-        name="Llama-3.2-1B-Instruct",
-        size="1B",
-        quantization=None,
-        repo="meta-llama/Llama-3.2-1B-Instruct",
-        config_type=HFLlamaConfig,
-        use_cases=tuple(),
-    ),
-    ModelSpec(
-        vendor="Meta",
-        family="Llama-3.2",
-        name="Llama-3.2-3B-Instruct",
-        size="3B",
-        quantization=None,
-        repo="meta-llama/Llama-3.2-3B-Instruct",
-        config_type=HFLlamaConfig,
-        use_cases=tuple(),
-    ),
-]
-
-LLAMA_MODELS = LLAMA31 + LLAMA32