lalamo 0.5.12__tar.gz → 0.5.13__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112)
  1. {lalamo-0.5.12 → lalamo-0.5.13}/PKG-INFO +1 -1
  2. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/__init__.py +1 -1
  3. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/decoder_configs/huggingface/gemma3.py +47 -8
  4. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/decoder_configs/huggingface/llama.py +27 -6
  5. lalamo-0.5.13/lalamo/model_import/model_specs/gemma.py +124 -0
  6. lalamo-0.5.13/lalamo/model_import/model_specs/llama.py +100 -0
  7. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo.egg-info/PKG-INFO +1 -1
  8. lalamo-0.5.12/lalamo/model_import/model_specs/gemma.py +0 -57
  9. lalamo-0.5.12/lalamo/model_import/model_specs/llama.py +0 -44
  10. {lalamo-0.5.12 → lalamo-0.5.13}/LICENSE +0 -0
  11. {lalamo-0.5.12 → lalamo-0.5.13}/README.md +0 -0
  12. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/common.py +0 -0
  13. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/data/__init__.py +0 -0
  14. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/data/huggingface_message.py +0 -0
  15. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/data/lalamo_completions.py +0 -0
  16. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/data/utils.py +0 -0
  17. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/main.py +0 -0
  18. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/message_processor.py +0 -0
  19. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/__init__.py +0 -0
  20. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/common.py +0 -0
  21. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/decoder_configs/__init__.py +0 -0
  22. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/decoder_configs/common.py +0 -0
  23. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/decoder_configs/executorch.py +0 -0
  24. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/decoder_configs/huggingface/__init__.py +0 -0
  25. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/decoder_configs/huggingface/common.py +0 -0
  26. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/decoder_configs/huggingface/gemma2.py +0 -0
  27. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/decoder_configs/huggingface/gpt_oss.py +0 -0
  28. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/decoder_configs/huggingface/lfm2.py +0 -0
  29. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/decoder_configs/huggingface/llamba.py +0 -0
  30. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/decoder_configs/huggingface/mistral.py +0 -0
  31. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/decoder_configs/huggingface/modern_bert.py +0 -0
  32. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/decoder_configs/huggingface/qwen2.py +0 -0
  33. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/decoder_configs/huggingface/qwen3.py +0 -0
  34. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/huggingface_generation_config.py +0 -0
  35. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/huggingface_tokenizer_config.py +0 -0
  36. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/loaders/__init__.py +0 -0
  37. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/loaders/common.py +0 -0
  38. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/loaders/executorch.py +0 -0
  39. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/loaders/huggingface.py +0 -0
  40. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/loaders/utils.py +0 -0
  41. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/model_specs/__init__.py +0 -0
  42. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/model_specs/common.py +0 -0
  43. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/model_specs/deepseek.py +0 -0
  44. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/model_specs/essential_ai.py +0 -0
  45. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/model_specs/gpt_oss.py +0 -0
  46. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/model_specs/huggingface.py +0 -0
  47. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/model_specs/lfm2.py +0 -0
  48. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/model_specs/llamba.py +0 -0
  49. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/model_specs/mirai.py +0 -0
  50. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/model_specs/mistral.py +0 -0
  51. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/model_specs/pleias.py +0 -0
  52. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/model_specs/polaris.py +0 -0
  53. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/model_specs/qwen.py +0 -0
  54. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/model_import/model_specs/reka.py +0 -0
  55. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/models/__init__.py +0 -0
  56. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/models/classifier.py +0 -0
  57. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/models/common.py +0 -0
  58. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/models/language_model.py +0 -0
  59. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/modules/__init__.py +0 -0
  60. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/modules/activations.py +0 -0
  61. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/modules/classifier.py +0 -0
  62. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/modules/common.py +0 -0
  63. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/modules/decoder.py +0 -0
  64. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/modules/embedding.py +0 -0
  65. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/modules/linear.py +0 -0
  66. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/modules/mlp.py +0 -0
  67. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/modules/mlx_interop.py +0 -0
  68. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/modules/normalization.py +0 -0
  69. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/modules/rope.py +0 -0
  70. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/modules/token_mixers/__init__.py +0 -0
  71. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/modules/token_mixers/attention.py +0 -0
  72. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/modules/token_mixers/common.py +0 -0
  73. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/modules/token_mixers/mamba.py +0 -0
  74. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/modules/token_mixers/short_conv.py +0 -0
  75. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/modules/token_mixers/state/__init__.py +0 -0
  76. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/modules/token_mixers/state/common.py +0 -0
  77. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/modules/token_mixers/state/kv_cache.py +0 -0
  78. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/modules/token_mixers/state/mamba_state.py +0 -0
  79. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/modules/token_mixers/state/short_conv_state.py +0 -0
  80. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/modules/torch_interop.py +0 -0
  81. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/modules/transformer.py +0 -0
  82. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/modules/transformer_layer.py +0 -0
  83. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/modules/utils.py +0 -0
  84. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/quantization.py +0 -0
  85. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/registry_abc.py +0 -0
  86. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/sampling.py +0 -0
  87. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/speculator/__init__.py +0 -0
  88. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/speculator/common.py +0 -0
  89. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/speculator/estimator.py +0 -0
  90. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/speculator/inference.py +0 -0
  91. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/speculator/ngram.py +0 -0
  92. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/speculator/utils.py +0 -0
  93. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo/utils.py +0 -0
  94. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo.egg-info/SOURCES.txt +0 -0
  95. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo.egg-info/dependency_links.txt +0 -0
  96. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo.egg-info/entry_points.txt +0 -0
  97. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo.egg-info/requires.txt +0 -0
  98. {lalamo-0.5.12 → lalamo-0.5.13}/lalamo.egg-info/top_level.txt +0 -0
  99. {lalamo-0.5.12 → lalamo-0.5.13}/pyproject.toml +0 -0
  100. {lalamo-0.5.12 → lalamo-0.5.13}/setup.cfg +0 -0
  101. {lalamo-0.5.12 → lalamo-0.5.13}/tests/test_cartesia_mlx_models.py +0 -0
  102. {lalamo-0.5.12 → lalamo-0.5.13}/tests/test_chat_template.py +0 -0
  103. {lalamo-0.5.12 → lalamo-0.5.13}/tests/test_generation.py +0 -0
  104. {lalamo-0.5.12 → lalamo-0.5.13}/tests/test_huggingface_model_conversion.py +0 -0
  105. {lalamo-0.5.12 → lalamo-0.5.13}/tests/test_huggingface_models.py +0 -0
  106. {lalamo-0.5.12 → lalamo-0.5.13}/tests/test_lfm2_models.py +0 -0
  107. {lalamo-0.5.12 → lalamo-0.5.13}/tests/test_mlx_models.py +0 -0
  108. {lalamo-0.5.12 → lalamo-0.5.13}/tests/test_model_spec.py +0 -0
  109. {lalamo-0.5.12 → lalamo-0.5.13}/tests/test_models.py +0 -0
  110. {lalamo-0.5.12 → lalamo-0.5.13}/tests/test_moe.py +0 -0
  111. {lalamo-0.5.12 → lalamo-0.5.13}/tests/test_parameter_tree.py +0 -0
  112. {lalamo-0.5.12 → lalamo-0.5.13}/tests/test_registry_abc.py +0 -0
--- lalamo-0.5.12/PKG-INFO
+++ lalamo-0.5.13/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: lalamo
-Version: 0.5.12
+Version: 0.5.13
 Summary: JAX library for optimization and export of models for use with the UZU inference engine.
 Requires-Python: <4,>=3.12
 Description-Content-Type: text/markdown
--- lalamo-0.5.12/lalamo/__init__.py
+++ lalamo-0.5.13/lalamo/__init__.py
@@ -15,7 +15,7 @@ from lalamo.speculator import (
     SpeculatorTrainingEvent,
 )
 
-__version__ = "0.5.12"
+__version__ = "0.5.13"
 
 __all__ = [
     "AssistantMessage",
--- lalamo-0.5.12/lalamo/model_import/decoder_configs/huggingface/gemma3.py
+++ lalamo-0.5.13/lalamo/model_import/decoder_configs/huggingface/gemma3.py
@@ -5,7 +5,13 @@ from typing import Literal
 import jax.numpy as jnp
 from jaxtyping import DTypeLike
 
-from lalamo.modules import DecoderConfig, TiedEmbeddingConfig, TransformerConfig
+from lalamo.modules import (
+    DecoderConfig,
+    MLXQuantizedLinearConfig,
+    MLXQuantizedTiedEmbeddingConfig,
+    TiedEmbeddingConfig,
+    TransformerConfig,
+)
 from lalamo.modules.activations import GELU
 from lalamo.modules.linear import FullPrecisionLinearConfig
 from lalamo.modules.mlp import DenseMLPConfig
@@ -13,8 +19,9 @@ from lalamo.modules.normalization import NormalizationConfig, UpcastMode
 from lalamo.modules.rope import LinearScalingRoPEConfig, UnscaledRoPEConfig, YARNRoPEConfig
 from lalamo.modules.token_mixers.attention import AttentionConfig
 from lalamo.modules.transformer_layer import TransformerLayerConfig
+from lalamo.quantization import QuantizationMode
 
-from .common import HuggingFaceLMConfig
+from .common import HuggingFaceLMConfig, MLXQuantizationConfig, QuantizationConfigType
 
 __all__ = ["HFGemma3Config", "HFGemma3TextConfig"]
 
@@ -61,6 +68,9 @@ class HFGemma3TextConfigRaw:
     final_logit_softcapping: float | None = None
     vocab_size: int = 262208
 
+    quantization: QuantizationConfigType = None
+    quantization_config: QuantizationConfigType = None
+
     @property
     def sliding_window_sizes(self) -> list[int | None]:
         result = []
@@ -77,14 +87,28 @@
         activation_precision: DTypeLike,
         accumulation_precision: DTypeLike,
         metadata_dict: Mapping[str, str],  # noqa: ARG002
+        fallback_quantization: QuantizationConfigType | None = None,
     ) -> DecoderConfig:
+        quantization = self.quantization or self.quantization_config or fallback_quantization
        input_scale = _round_to_bfloat16(self.hidden_size**0.5)
         attention_scale = self.query_pre_attn_scalar**-0.5
-        embedding_config = TiedEmbeddingConfig(
-            input_scale=input_scale,
-            logit_soft_cap=self.final_logit_softcapping,
-            precision=activation_precision,
-        )
+        if quantization is None:
+            embedding_config = TiedEmbeddingConfig(
+                input_scale=input_scale,
+                logit_soft_cap=self.final_logit_softcapping,
+                precision=activation_precision,
+            )
+        elif isinstance(quantization, MLXQuantizationConfig):
+            embedding_config = MLXQuantizedTiedEmbeddingConfig(
+                input_scale=input_scale,
+                logit_soft_cap=self.final_logit_softcapping,
+                group_size=quantization.group_size,
+                embedding_quantization_mode=QuantizationMode.from_num_bits(quantization.bits),
+                activation_quantization_mode=None,
+                activation_precision=activation_precision,
+            )
+        else:
+            raise RuntimeError(f"Unsupported quantization format: {type(quantization)}")
         rms_norm_config = NormalizationConfig(
             scale_precision=activation_precision,
             accumulation_precision=accumulation_precision,
@@ -127,7 +151,17 @@
             max_sequence_length=context_length or self.max_position_embeddings,
         )
 
-        linear_config = FullPrecisionLinearConfig(precision=activation_precision)
+        if quantization is None:
+            linear_config = FullPrecisionLinearConfig(precision=activation_precision)
+        elif isinstance(quantization, MLXQuantizationConfig):
+            linear_config = MLXQuantizedLinearConfig(
+                group_size=quantization.group_size,
+                weight_quantization_mode=QuantizationMode.from_num_bits(quantization.bits),
+                activation_quantization_mode=None,
+                activation_precision=activation_precision,
+            )
+        else:
+            raise RuntimeError(f"Unsupported quantization format: {type(quantization)}")
         mlp_config = DenseMLPConfig(
             linear_config=linear_config,
             activation=GELU(),
@@ -214,6 +248,9 @@ class HFGemma3Config(HuggingFaceLMConfig):
     transformers_version: str
     vision_config: HFGemma3VisionConfig
 
+    quantization: QuantizationConfigType = None
+    quantization_config: QuantizationConfigType = None
+
     def to_decoder_config(
         self,
         context_length: int | None,
@@ -221,9 +258,11 @@
         accumulation_precision: DTypeLike,
         metadata_dict: Mapping[str, str],
     ) -> DecoderConfig:
+        quantization = self.quantization or self.quantization_config
         return self.text_config.to_decoder_config(
             context_length=context_length,
             activation_precision=activation_precision,
             accumulation_precision=accumulation_precision,
             metadata_dict=metadata_dict,
+            fallback_quantization=quantization,
         )
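
Note: taken together, the gemma3.py hunks resolve quantization metadata in a fixed order (the config's own `quantization` field, then `quantization_config`, then the fallback passed down from the outer multimodal config) and dispatch on its type. The sketch below is a minimal, self-contained illustration of that pattern; the dataclasses are stand-ins for lalamo's real config types, not the actual API.

# Illustrative sketch only: stand-in types approximating the dispatch
# added in gemma3.py. The real configs live in lalamo.modules.
from dataclasses import dataclass


@dataclass
class MLXQuantizationConfig:
    group_size: int
    bits: int


@dataclass
class FullPrecisionLinearConfig:
    precision: str


@dataclass
class MLXQuantizedLinearConfig:
    group_size: int
    bits: int
    activation_precision: str


def select_linear_config(explicit, legacy, fallback, precision):
    # Resolution order from the diff: `quantization` wins over
    # `quantization_config`, which wins over `fallback_quantization`.
    quantization = explicit or legacy or fallback
    if quantization is None:
        return FullPrecisionLinearConfig(precision=precision)
    if isinstance(quantization, MLXQuantizationConfig):
        return MLXQuantizedLinearConfig(
            group_size=quantization.group_size,
            bits=quantization.bits,
            activation_precision=precision,
        )
    raise RuntimeError(f"Unsupported quantization format: {type(quantization)}")


# A 4-bit MLX checkpoint exercises the new branch:
print(select_linear_config(None, MLXQuantizationConfig(64, 4), None, "bfloat16"))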
@@ -11,6 +11,8 @@ from lalamo.modules import (
11
11
  FullPrecisionLinearConfig,
12
12
  GroupQuantizedLinearConfig,
13
13
  LlamaRoPEConfig,
14
+ MLXQuantizedLinearConfig,
15
+ MLXQuantizedTiedEmbeddingConfig,
14
16
  NormalizationConfig,
15
17
  SiLU,
16
18
  TiedEmbeddingConfig,
@@ -23,7 +25,7 @@ from lalamo.modules import (
23
25
  )
24
26
  from lalamo.quantization import QuantizationMode
25
27
 
26
- from .common import AWQQuantizationConfig, GPTQQuantizationConfig, HuggingFaceLMConfig
28
+ from .common import HuggingFaceLMConfig, MLXQuantizationConfig, QuantizationConfigType
27
29
 
28
30
  __all__ = ["HFLlamaConfig"]
29
31
 
@@ -75,7 +77,8 @@ class HFLlamaConfig(HuggingFaceLMConfig):
75
77
  vocab_size: int
76
78
  head_dim: int | None = None
77
79
 
78
- quantization_config: AWQQuantizationConfig | GPTQQuantizationConfig | None = None
80
+ quantization: QuantizationConfigType = None
81
+ quantization_config: QuantizationConfigType = None
79
82
 
80
83
  def to_decoder_config(
81
84
  self,
@@ -84,7 +87,18 @@ class HFLlamaConfig(HuggingFaceLMConfig):
84
87
  accumulation_precision: DTypeLike,
85
88
  metadata_dict: Mapping[str, str], # noqa: ARG002
86
89
  ) -> DecoderConfig:
87
- if self.tie_word_embeddings:
90
+ quantization = self.quantization or self.quantization_config
91
+ if isinstance(quantization, MLXQuantizationConfig):
92
+ assert self.tie_word_embeddings, "only tied embeddings are supported"
93
+ embedding_config = MLXQuantizedTiedEmbeddingConfig(
94
+ input_scale=None,
95
+ logit_soft_cap=None,
96
+ group_size=quantization.group_size,
97
+ embedding_quantization_mode=QuantizationMode.from_num_bits(quantization.bits),
98
+ activation_quantization_mode=None,
99
+ activation_precision=activation_precision,
100
+ )
101
+ elif self.tie_word_embeddings:
88
102
  embedding_config = TiedEmbeddingConfig(
89
103
  input_scale=None,
90
104
  logit_soft_cap=None,
@@ -133,14 +147,21 @@ class HFLlamaConfig(HuggingFaceLMConfig):
133
147
  upcast_mode=UpcastMode.ONLY_NORMALIZATION,
134
148
  subtract_mean=False,
135
149
  )
136
- if self.quantization_config is None:
150
+ if quantization is None:
137
151
  linear_config = FullPrecisionLinearConfig(
138
152
  precision=activation_precision,
139
153
  )
154
+ elif isinstance(quantization, MLXQuantizationConfig):
155
+ linear_config = MLXQuantizedLinearConfig(
156
+ group_size=quantization.group_size,
157
+ weight_quantization_mode=QuantizationMode.from_num_bits(quantization.bits),
158
+ activation_quantization_mode=None,
159
+ activation_precision=activation_precision,
160
+ )
140
161
  else:
141
162
  linear_config = GroupQuantizedLinearConfig(
142
- group_size=self.quantization_config.group_size,
143
- weight_quantization_mode=QuantizationMode.from_num_bits(self.quantization_config.bits),
163
+ group_size=quantization.group_size,
164
+ weight_quantization_mode=QuantizationMode.from_num_bits(quantization.bits),
144
165
  activation_quantization_mode=None,
145
166
  activation_precision=activation_precision,
146
167
  )
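
Note: llama.py now splits linear layers three ways: no quantization keeps FullPrecisionLinearConfig, an MLX config selects the new MLXQuantizedLinearConfig, and any other config (the pre-existing AWQ/GPTQ path) falls through to GroupQuantizedLinearConfig; the MLX embedding path additionally asserts tied embeddings. The snippet below sketches where the two config fields plausibly come from; the JSON shape is an assumption based on the mlx-lm convention of recording quantization metadata in config.json, not something this diff shows.

import json

# Assumed config.json shape for an mlx-community checkpoint (mlx-lm
# appears to write both "quantization" and "quantization_config").
raw = json.loads("""
{
    "tie_word_embeddings": true,
    "quantization": {"group_size": 64, "bits": 4},
    "quantization_config": {"group_size": 64, "bits": 4}
}
""")

# Mirrors `self.quantization or self.quantization_config` in the diff.
quantization = raw.get("quantization") or raw.get("quantization_config")
assert quantization == {"group_size": 64, "bits": 4}

# The MLX embedding path only supports tied embeddings:
assert raw["tie_word_embeddings"], "only tied embeddings are supported"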
--- /dev/null
+++ lalamo-0.5.13/lalamo/model_import/model_specs/gemma.py
@@ -0,0 +1,124 @@
+from lalamo.model_import.decoder_configs import (
+    HFGemma2Config,
+    HFGemma3Config,
+    HFGemma3TextConfig,
+)
+from lalamo.quantization import QuantizationMode
+
+from .common import ConfigMap, FileSpec, ModelSpec, WeightsType
+
+__all__ = ["GEMMA_MODELS"]
+
+GEMMA2 = [
+    ModelSpec(
+        vendor="Google",
+        family="Gemma-2",
+        name="Gemma-2-2B-Instruct",
+        size="2B",
+        quantization=None,
+        repo="google/gemma-2-2b-it",
+        config_type=HFGemma2Config,
+    ),
+]
+
+GEMMA3 = [
+    ModelSpec(
+        vendor="Google",
+        family="Gemma-3",
+        name="Gemma-3-1B-Instruct",
+        size="1B",
+        quantization=None,
+        repo="google/gemma-3-1b-it",
+        config_type=HFGemma3TextConfig,
+        weights_type=WeightsType.SAFETENSORS,
+    ),
+    ModelSpec(
+        vendor="Google",
+        family="Gemma-3",
+        name="Gemma-3-1B-Instruct-4bit",
+        size="1B",
+        quantization=QuantizationMode.UINT4,
+        repo="mlx-community/gemma-3-1b-it-4bit",
+        config_type=HFGemma3TextConfig,
+        weights_type=WeightsType.SAFETENSORS,
+        configs=ConfigMap(generation_config=FileSpec("generation_config.json", "google/gemma-3-1b-it")),
+    ),
+    ModelSpec(
+        vendor="Google",
+        family="Gemma-3",
+        name="Gemma-3-1B-Instruct-8bit",
+        size="1B",
+        quantization=QuantizationMode.UINT8,
+        repo="mlx-community/gemma-3-1b-it-8bit",
+        config_type=HFGemma3TextConfig,
+        weights_type=WeightsType.SAFETENSORS,
+        configs=ConfigMap(generation_config=FileSpec("generation_config.json", "google/gemma-3-1b-it")),
+    ),
+    ModelSpec(
+        vendor="Google",
+        family="Gemma-3",
+        name="Gemma-3-4B-Instruct",
+        size="4B",
+        quantization=None,
+        repo="google/gemma-3-4b-it",
+        config_type=HFGemma3Config,
+        weights_type=WeightsType.SAFETENSORS,
+    ),
+    ModelSpec(
+        vendor="Google",
+        family="Gemma-3",
+        name="Gemma-3-4B-Instruct-4bit",
+        size="4B",
+        quantization=QuantizationMode.UINT4,
+        repo="mlx-community/gemma-3-4b-it-4bit",
+        config_type=HFGemma3Config,
+        weights_type=WeightsType.SAFETENSORS,
+        configs=ConfigMap(generation_config=FileSpec("generation_config.json", "google/gemma-3-4b-it")),
+    ),
+    ModelSpec(
+        vendor="Google",
+        family="Gemma-3",
+        name="Gemma-3-4B-Instruct-8bit",
+        size="4B",
+        quantization=QuantizationMode.UINT8,
+        repo="mlx-community/gemma-3-4b-it-8bit",
+        config_type=HFGemma3Config,
+        weights_type=WeightsType.SAFETENSORS,
+        configs=ConfigMap(generation_config=FileSpec("generation_config.json", "google/gemma-3-4b-it")),
+    ),
+    ModelSpec(
+        vendor="Google",
+        family="Gemma-3",
+        name="Gemma-3-27B-Instruct",
+        size="27B",
+        quantization=None,
+        repo="google/gemma-3-27b-it",
+        config_type=HFGemma3Config,
+        weights_type=WeightsType.SAFETENSORS,
+    ),
+    ModelSpec(
+        vendor="Google",
+        family="Gemma-3",
+        name="Gemma-3-27B-Instruct-4bit",
+        size="27B",
+        quantization=QuantizationMode.UINT4,
+        repo="mlx-community/gemma-3-27b-it-4bit",
+        config_type=HFGemma3Config,
+        weights_type=WeightsType.SAFETENSORS,
+        configs=ConfigMap(generation_config=FileSpec("generation_config.json", "google/gemma-3-27b-it")),
+    ),
+    ModelSpec(
+        vendor="Google",
+        family="Gemma-3",
+        name="Gemma-3-27B-Instruct-8bit",
+        size="27B",
+        quantization=QuantizationMode.UINT8,
+        repo="mlx-community/gemma-3-27b-it-8bit",
+        config_type=HFGemma3Config,
+        weights_type=WeightsType.SAFETENSORS,
+        configs=ConfigMap(generation_config=FileSpec("generation_config.json", "google/gemma-3-27b-it")),
+    ),
+]
+
+
+GEMMA_MODELS = GEMMA2 + GEMMA3
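
Note the recurring ConfigMap(generation_config=FileSpec("generation_config.json", ...)) pattern: every mlx-community entry points its generation config back at the upstream google/ repository, presumably because the converted checkpoints do not ship one. As a hypothetical usage example over the expanded registry (the helper below is not part of lalamo):

from lalamo.model_import.model_specs.gemma import GEMMA_MODELS


def find_spec(specs, name):
    # Hypothetical helper: pick a spec by its display name.
    return next(spec for spec in specs if spec.name == name)


spec = find_spec(GEMMA_MODELS, "Gemma-3-1B-Instruct-4bit")
assert spec.repo == "mlx-community/gemma-3-1b-it-4bit"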
--- /dev/null
+++ lalamo-0.5.13/lalamo/model_import/model_specs/llama.py
@@ -0,0 +1,100 @@
+from lalamo.model_import.decoder_configs import HFLlamaConfig
+from lalamo.quantization import QuantizationMode
+
+from .common import ConfigMap, FileSpec, ModelSpec
+
+__all__ = ["LLAMA_MODELS"]
+
+LLAMA31 = [
+    ModelSpec(
+        vendor="Meta",
+        family="Llama-3.1",
+        name="Llama-3.1-8B-Instruct",
+        size="8B",
+        quantization=None,
+        repo="meta-llama/Llama-3.1-8B-Instruct",
+        config_type=HFLlamaConfig,
+        use_cases=tuple(),
+    ),
+    ModelSpec(
+        vendor="Meta",
+        family="Llama-3.1",
+        name="Llama-3.1-8B-Instruct-4bit",
+        size="8B",
+        quantization=QuantizationMode.UINT4,
+        repo="mlx-community/Llama-3.1-8B-Instruct-4bit",
+        config_type=HFLlamaConfig,
+        use_cases=tuple(),
+        configs=ConfigMap(generation_config=FileSpec("generation_config.json", "meta-llama/Llama-3.1-8B-Instruct")),
+    ),
+]
+
+
+LLAMA32 = [
+    ModelSpec(
+        vendor="Meta",
+        family="Llama-3.2",
+        name="Llama-3.2-1B-Instruct",
+        size="1B",
+        quantization=None,
+        repo="meta-llama/Llama-3.2-1B-Instruct",
+        config_type=HFLlamaConfig,
+        use_cases=tuple(),
+    ),
+    ModelSpec(
+        vendor="Meta",
+        family="Llama-3.2",
+        name="Llama-3.2-1B-Instruct-4bit",
+        size="1B",
+        quantization=QuantizationMode.UINT4,
+        repo="mlx-community/Llama-3.2-1B-Instruct-4bit",
+        config_type=HFLlamaConfig,
+        configs=ConfigMap(generation_config=FileSpec("generation_config.json", "meta-llama/Llama-3.2-1B-Instruct")),
+        use_cases=tuple(),
+    ),
+    ModelSpec(
+        vendor="Meta",
+        family="Llama-3.2",
+        name="Llama-3.2-1B-Instruct-8bit",
+        size="1B",
+        quantization=QuantizationMode.UINT8,
+        repo="mlx-community/Llama-3.2-1B-Instruct-8bit",
+        config_type=HFLlamaConfig,
+        configs=ConfigMap(generation_config=FileSpec("generation_config.json", "meta-llama/Llama-3.2-1B-Instruct")),
+        use_cases=tuple(),
+    ),
+    ModelSpec(
+        vendor="Meta",
+        family="Llama-3.2",
+        name="Llama-3.2-3B-Instruct",
+        size="3B",
+        quantization=None,
+        repo="meta-llama/Llama-3.2-3B-Instruct",
+        config_type=HFLlamaConfig,
+        use_cases=tuple(),
+    ),
+    ModelSpec(
+        vendor="Meta",
+        family="Llama-3.2",
+        name="Llama-3.2-3B-Instruct-4bit",
+        size="3B",
+        quantization=QuantizationMode.UINT4,
+        repo="mlx-community/Llama-3.2-3B-Instruct-4bit",
+        config_type=HFLlamaConfig,
+        configs=ConfigMap(generation_config=FileSpec("generation_config.json", "meta-llama/Llama-3.2-3B-Instruct")),
+        use_cases=tuple(),
+    ),
+    ModelSpec(
+        vendor="Meta",
+        family="Llama-3.2",
+        name="Llama-3.2-3B-Instruct-8bit",
+        size="3B",
+        quantization=QuantizationMode.UINT8,
+        repo="mlx-community/Llama-3.2-3B-Instruct-8bit",
+        config_type=HFLlamaConfig,
+        configs=ConfigMap(generation_config=FileSpec("generation_config.json", "meta-llama/Llama-3.2-3B-Instruct")),
+        use_cases=tuple(),
+    ),
+]
+
+LLAMA_MODELS = LLAMA31 + LLAMA32
--- lalamo-0.5.12/lalamo.egg-info/PKG-INFO
+++ lalamo-0.5.13/lalamo.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: lalamo
-Version: 0.5.12
+Version: 0.5.13
 Summary: JAX library for optimization and export of models for use with the UZU inference engine.
 Requires-Python: <4,>=3.12
 Description-Content-Type: text/markdown
--- lalamo-0.5.12/lalamo/model_import/model_specs/gemma.py
+++ /dev/null
@@ -1,57 +0,0 @@
-from lalamo.model_import.decoder_configs import (
-    HFGemma2Config,
-    HFGemma3Config,
-    HFGemma3TextConfig,
-)
-
-from .common import ModelSpec, WeightsType
-
-__all__ = ["GEMMA_MODELS"]
-
-GEMMA2 = [
-    ModelSpec(
-        vendor="Google",
-        family="Gemma-2",
-        name="Gemma-2-2B-Instruct",
-        size="2B",
-        quantization=None,
-        repo="google/gemma-2-2b-it",
-        config_type=HFGemma2Config,
-    ),
-]
-
-GEMMA3 = [
-    ModelSpec(
-        vendor="Google",
-        family="Gemma-3",
-        name="Gemma-3-1B-Instruct",
-        size="1B",
-        quantization=None,
-        repo="google/gemma-3-1b-it",
-        config_type=HFGemma3TextConfig,
-        weights_type=WeightsType.SAFETENSORS,
-    ),
-    ModelSpec(
-        vendor="Google",
-        family="Gemma-3",
-        name="Gemma-3-4B-Instruct",
-        size="4B",
-        quantization=None,
-        repo="google/gemma-3-4b-it",
-        config_type=HFGemma3Config,
-        weights_type=WeightsType.SAFETENSORS,
-    ),
-    ModelSpec(
-        vendor="Google",
-        family="Gemma-3",
-        name="Gemma-3-27B-Instruct",
-        size="27B",
-        quantization=None,
-        repo="google/gemma-3-27b-it",
-        config_type=HFGemma3Config,
-        weights_type=WeightsType.SAFETENSORS,
-    ),
-]
-
-
-GEMMA_MODELS = GEMMA2 + GEMMA3
--- lalamo-0.5.12/lalamo/model_import/model_specs/llama.py
+++ /dev/null
@@ -1,44 +0,0 @@
-from lalamo.model_import.decoder_configs import HFLlamaConfig
-
-from .common import ModelSpec
-
-__all__ = ["LLAMA_MODELS"]
-
-LLAMA31 = [
-    ModelSpec(
-        vendor="Meta",
-        family="Llama-3.1",
-        name="Llama-3.1-8B-Instruct",
-        size="8B",
-        quantization=None,
-        repo="meta-llama/Llama-3.1-8B-Instruct",
-        config_type=HFLlamaConfig,
-        use_cases=tuple(),
-    ),
-]
-
-
-LLAMA32 = [
-    ModelSpec(
-        vendor="Meta",
-        family="Llama-3.2",
-        name="Llama-3.2-1B-Instruct",
-        size="1B",
-        quantization=None,
-        repo="meta-llama/Llama-3.2-1B-Instruct",
-        config_type=HFLlamaConfig,
-        use_cases=tuple(),
-    ),
-    ModelSpec(
-        vendor="Meta",
-        family="Llama-3.2",
-        name="Llama-3.2-3B-Instruct",
-        size="3B",
-        quantization=None,
-        repo="meta-llama/Llama-3.2-3B-Instruct",
-        config_type=HFLlamaConfig,
-        use_cases=tuple(),
-    ),
-]
-
-LLAMA_MODELS = LLAMA31 + LLAMA32