oracle-ads 2.13.18rc0__py3-none-any.whl → 2.13.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30) hide show
  1. ads/aqua/cli.py +7 -5
  2. ads/aqua/common/entities.py +195 -48
  3. ads/aqua/common/enums.py +6 -0
  4. ads/aqua/common/errors.py +5 -0
  5. ads/aqua/common/utils.py +157 -66
  6. ads/aqua/constants.py +3 -0
  7. ads/aqua/extension/deployment_handler.py +36 -0
  8. ads/aqua/modeldeployment/constants.py +1 -0
  9. ads/aqua/modeldeployment/deployment.py +95 -14
  10. ads/aqua/modeldeployment/entities.py +3 -0
  11. ads/aqua/modeldeployment/model_group_config.py +3 -3
  12. ads/aqua/resources/gpu_shapes_index.json +315 -26
  13. ads/aqua/shaperecommend/__init__.py +6 -0
  14. ads/aqua/shaperecommend/constants.py +116 -0
  15. ads/aqua/shaperecommend/estimator.py +384 -0
  16. ads/aqua/shaperecommend/llm_config.py +283 -0
  17. ads/aqua/shaperecommend/recommend.py +493 -0
  18. ads/aqua/shaperecommend/shape_report.py +233 -0
  19. ads/aqua/version.json +1 -1
  20. ads/cli.py +9 -1
  21. ads/jobs/builders/infrastructure/dsc_job.py +1 -0
  22. ads/jobs/builders/infrastructure/dsc_job_runtime.py +9 -1
  23. ads/model/service/oci_datascience_model_deployment.py +46 -19
  24. ads/opctl/operator/lowcode/forecast/model/forecast_datasets.py +4 -3
  25. ads/pipeline/ads_pipeline.py +13 -9
  26. {oracle_ads-2.13.18rc0.dist-info → oracle_ads-2.13.19.dist-info}/METADATA +1 -1
  27. {oracle_ads-2.13.18rc0.dist-info → oracle_ads-2.13.19.dist-info}/RECORD +30 -24
  28. {oracle_ads-2.13.18rc0.dist-info → oracle_ads-2.13.19.dist-info}/WHEEL +0 -0
  29. {oracle_ads-2.13.18rc0.dist-info → oracle_ads-2.13.19.dist-info}/entry_points.txt +0 -0
  30. {oracle_ads-2.13.18rc0.dist-info → oracle_ads-2.13.19.dist-info}/licenses/LICENSE.txt +0 -0
@@ -0,0 +1,384 @@
1
+ #!/usr/bin/env python
2
+ # Copyright (c) 2025 Oracle and/or its affiliates.
3
+ # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
4
+ from typing import Optional
5
+
6
+ from pydantic import BaseModel, Field
7
+
8
+ from ads.aqua.app import logger
9
+ from ads.aqua.shaperecommend.constants import (
10
+ IN_FLIGHT_QUANTIZATION,
11
+ LLAMA_REQUIRED_FIELDS,
12
+ MOE_REQUIRED_FIELDS,
13
+ NEXT_QUANT,
14
+ QUANT_MAPPING,
15
+ VLLM_PARAMS,
16
+ )
17
+ from ads.aqua.shaperecommend.llm_config import LLMConfig
18
+
19
+
20
class MemoryEstimator(BaseModel):
    """
    The generic estimator for Transformer Architecture models (OPT/ Bloom).
    Used as a fallback estimator if model identified is not a MoE or GQA Architecture Model.
    Has properties to estimate the KV Cache size, Model size, and total footprint (KV Cache + Model size).

    KV cache: Use num_attention_heads (all heads, no GQA)
    Parameter estimation: Standard decoder-only, tied or untied embeddings possible
    """

    llm_config: LLMConfig = Field(
        ...,
        description="The model's config.json file with the necessary parameters for model size and KV cache estimation.",
    )
    batch_size: Optional[int] = (
        1  # we assume that estimation for batch sizes are not supported yet
    )
    seq_len: int = Field(
        ..., description="The max-seq-len to estimate the size of the KV cache."
    )

    @property
    def kv_cache_memory(self) -> float:
        """
        Estimates the KV cache size (in GB, i.e. bytes / 1e9) using the LLM config.json parameters.

        Uses num_attention_heads (assumes no GQA, each attention head has its own query, key, value) for estimation.
        Falls back to the config's max_seq_len when seq_len is 0, and to a batch size of 1
        when batch_size is None.
        """
        c = self.llm_config
        seq_len = self.seq_len or c.max_seq_len
        # batch_size is declared Optional; treat an explicit None as the default of 1
        batch_size = 1 if self.batch_size is None else self.batch_size
        kv_cache_dtype_bytes = QUANT_MAPPING.get(
            c.weight_dtype, 2
        )  # vLLM uses model's weight applied to KV cache

        total_bytes = (
            batch_size
            * c.num_hidden_layers
            * 2  # one K and one V tensor per layer
            * c.num_attention_heads
            * seq_len
            * c.head_dim
            * kv_cache_dtype_bytes
        )
        return total_bytes / 1e9

    @property
    def model_memory(self) -> float:
        """
        Estimates the model size (in GB) based on estimating the model parameter size and model weights.

        Model Parameter estimation: Standard decoder-only, untied/tied embeddings possible.
        Embeddings are assumed tied (counted once) unless tie_word_embeddings is explicitly False.
        """
        c = self.llm_config
        # The field always exists on LLMConfig (default None), so a getattr default is
        # never used; only an explicit False means separate input/output embeddings.
        untied = getattr(c, "tie_word_embeddings", None) is False
        embedding_count = 2 if untied else 1
        embedding_params = embedding_count * c.vocab_size * c.hidden_size
        layer_params = 12 * c.num_hidden_layers * (c.hidden_size**2)  # GPT-style
        num_params = layer_params + embedding_params

        return num_params * c.bytes_per_parameter / 1e9

    @property
    def total_memory(self) -> float:
        """
        Computes the total memory footprint of the model (KV cache & model size from estimated parameters).
        """
        return self.model_memory + self.kv_cache_memory

    def validate_shape(
        self, allowed_gpu_memory: float, gpu_utilization: float = 0.9
    ) -> bool:
        """
        Validates if this estimator fits within the allowed GPU memory budget, using a utilization margin.

        Parameters
        ----------
        allowed_gpu_memory : float
            The maximum allowed GPU memory.
        gpu_utilization : float, optional
            Fraction of allowed_gpu_memory that may be used (default=0.9).

        Returns
        -------
        bool
            True if total_memory is strictly less than allowed_gpu_memory * gpu_utilization, else False.
        """
        return (allowed_gpu_memory * gpu_utilization) > self.total_memory

    def construct_deployment_params(self) -> str:
        """
        Constructs a deployment parameter string for the model.

        This method assembles runtime configuration parameters to be passed
        during model deployment. It:
        - Overrides the max sequence length if a shorter length is provided.
        - Suggests in-flight quantization **only if the model is unquantized**
        and in-flight quantization (such as '4bit') is requested in config.

        Returns
        -------
        str: Space-separated parameter string for model deployment (empty if nothing to override).
        """
        c = self.llm_config
        params = []
        if self.seq_len < c.max_seq_len:
            params.append(VLLM_PARAMS["max_model_len"])
            params.append(str(self.seq_len))

        # Only suggest in-flight quantization for unquantized models when such quantization is requested
        if not c.quantization and c.in_flight_quantization in IN_FLIGHT_QUANTIZATION:
            # vLLM only supports 4bit in-flight quantization
            params.append(VLLM_PARAMS["in_flight_quant"])

        return " ".join(params)

    def suggest_param_advice(self, allowed: float) -> str:
        """
        Suggests parameter modifications to help a model fit within GPU memory limits.

        Parameters
        ----------
        allowed : float
            Allowed GPU memory in GB.

        Returns
        -------
        str
            Numbered advice items followed by a summary of the main limiting factor.
        """
        config = self.llm_config
        kv_gb = self.kv_cache_memory
        wt_gb = self.model_memory
        batch_size = 1 if self.batch_size is None else self.batch_size
        seq_len = self.seq_len
        weight_size = getattr(config, "weight_dtype", "unknown")
        quantization = getattr(config, "quantization", None)

        suggestions = config.suggested_quantizations
        quant_advice = ", ".join(suggestions)

        suggested_quant_msg = None
        if suggestions:
            to_do = f", which is smaller than the current {quantization if quantization in NEXT_QUANT else weight_size} format."
            if "No" in quant_advice:
                suggested_quant_msg = "No smaller quantized version exists. Use a model with fewer parameters."
            elif not quant_advice:
                suggested_quant_msg = (
                    "Use a quantized version of the same model (e.g., INT8 or other)"
                    + to_do
                )
            else:
                suggested_quant_msg = (
                    f"Either use a pre-quantized model at {quant_advice}, or apply in-flight {quant_advice} quantization"
                    + to_do
                )

        kv_advice = [f"Reduce maximum context length (set --max-model-len < {seq_len})"]
        if batch_size != 1:
            kv_advice.append(f"Reduce batch size to less than {batch_size}.")

        wt_advice = ["Use a model with fewer parameters."]
        # Only add the quantization hint when there is one, so the numbered
        # list never contains an empty item.
        if suggested_quant_msg:
            wt_advice.append(suggested_quant_msg)

        if kv_gb > wt_gb and kv_gb > allowed * 0.5:
            main = "KV cache memory usage is the main limiting factor"
            advice = kv_advice
        elif wt_gb > kv_gb and wt_gb > allowed * 0.5:
            main = "Model weights are the main limiting factor"
            advice = wt_advice
        else:
            main = "Both model weights and KV cache are significant contributors to memory use"
            advice = kv_advice + wt_advice

        advice_str = "\n".join(f"{i}. {item}" for i, item in enumerate(advice, 1))

        return (
            f"{advice_str}\n\n{main} (KV cache: {kv_gb:.1f}GB, Weights: {wt_gb:.1f}GB)."
        )

    def limiting_factor(
        self, allowed_gpu_memory: float, warn_delta: float = 0.85
    ) -> str:
        """
        Determines the memory limiting factor for a model deployment and returns advice.

        Parameters
        ----------
        allowed_gpu_memory : float
            The maximum allowed GPU memory (in GBs).
        warn_delta : float, optional
            The threshold (fraction) of allowed GPU memory to trigger a warning (default=0.85).

        Returns
        -------
        str
            Advice message about model fit and limiting factors.
        """
        required = self.total_memory

        if required > allowed_gpu_memory:
            model_params = self.suggest_param_advice(allowed_gpu_memory)
            return (
                "Model does not fit within GPU memory budget. "
                "Consider the following options to reduce memory usage:\n\n"
                f"{model_params.lstrip()}"
            )

        # Warn if required is close to (or exactly at) the allowed limit; checking the
        # over-budget case first keeps required == allowed out of the "fits well" branch.
        if required > allowed_gpu_memory * warn_delta:
            model_params = self.suggest_param_advice(allowed_gpu_memory)
            return (
                "While the selected compute shape is estimated to work "
                f"({required:.1f}GB used / {allowed_gpu_memory:.1f}GB allowed), "
                "the model configuration is close to the GPU memory limit.\n\n"
                "If you encounter issues with this shape, consider the following options to reduce memory usage:\n\n"
                f"{model_params.lstrip()}"
            )

        return (
            "No override PARAMS needed. \n\nModel fits well within the allowed compute shape "
            f"({required:.1f}GB used / {allowed_gpu_memory:.1f}GB allowed)."
        )
253
+
254
+
255
+ # Specialized estimators:
256
class LlamaMemoryEstimator(MemoryEstimator):
    """
    Estimator for GQA-type architectures. Handles tied (memory savings) and untied embeddings,
    and uses grouped attention (GQA) for more efficient KV cache memory estimation.

    KV cache: Use num_key_value_heads (assumes GQA)
    Model Parameter estimation: Standard decoder-only, untied/tied embeddings possible
    """

    @property
    def model_memory(self) -> float:
        """
        Returns estimated model parameter memory (in GB), accounting
        for Llama-style attention and MLP, and tied or untied embeddings.
        """
        c = self.llm_config

        embedding_params, attn_params = self._calc_attn_embed_params()

        # MLP params: gate, up and down projections each hold hidden_size * intermediate_size weights
        gate_proj = c.hidden_size * c.intermediate_size
        up_proj = c.hidden_size * c.intermediate_size
        down_proj = c.intermediate_size * c.hidden_size
        mlp_params = gate_proj + up_proj + down_proj

        # Total per-layer
        layer_params = attn_params + mlp_params
        # Total params
        num_params = c.num_hidden_layers * layer_params + embedding_params

        return num_params * c.bytes_per_parameter / 1e9

    @property
    def kv_cache_memory(self) -> float:
        """
        Returns estimated KV cache memory in GB for GQA models.

        Grouped Query Attention uses num_key_value_heads, in which groups of Q heads share a K and V projection.
        num_key_value_heads < num_attention_heads, which reduces the KV Cache size.
        """
        c = self.llm_config
        seq_len = self.seq_len or getattr(c, "max_seq_len", 2048)
        # batch_size is declared Optional; treat an explicit None as the default of 1
        batch_size = 1 if self.batch_size is None else self.batch_size
        kv_cache_dtype_bytes = QUANT_MAPPING.get(c.weight_dtype, 2)
        kv_heads = c.num_key_value_heads

        total_bytes = (
            batch_size
            * c.num_hidden_layers
            * 2  # one K and one V tensor per layer
            * kv_heads
            * seq_len
            * c.head_dim
            * kv_cache_dtype_bytes
        )
        return total_bytes / 1e9

    def _calc_attn_embed_params(self) -> tuple:
        """
        Returns the embedding parameter count and attention parameter count for Llama-family (GQA) models.

        Returns
        -------
        tuple
            (embedding_params, attn_params), where attn_params is the per-layer count.
        """
        c = self.llm_config

        # Embedding parameters
        # assume tied embeddings unless tie_word_embeddings = False. The field always
        # exists on LLMConfig (default None), so test for an explicit False rather
        # than relying on a getattr default that is never used.
        untied = getattr(c, "tie_word_embeddings", None) is False
        embedding_count = 2 if untied else 1
        embedding_params = embedding_count * c.vocab_size * c.hidden_size

        # Q/O projections map between hidden_size and num_attention_heads * head_dim.
        # This equals hidden_size**2 for most models, but not when head_dim is set
        # independently of hidden_size // num_attention_heads.
        q_width = c.num_attention_heads * c.head_dim
        kv_width = c.num_key_value_heads * c.head_dim
        q_proj = c.hidden_size * q_width
        k_proj = c.hidden_size * kv_width
        v_proj = c.hidden_size * kv_width
        o_proj = q_width * c.hidden_size
        attn_params = q_proj + k_proj + v_proj + o_proj

        return embedding_params, attn_params
330
+
331
+
332
class MixtureMemoryEstimator(LlamaMemoryEstimator):
    """
    Estimator for Mixture-of-Experts (MoE) architectures (e.g., Mixtral, MoE Llama).
    Reuses the LlamaMemoryEstimator attention/embedding counts and replaces the single
    MLP block with one MLP block per local expert.
    """

    @property
    def model_memory(self) -> float:
        """
        Estimated memory size of the MoE model (in GB).

        Each layer is counted as its attention parameters plus num_local_experts
        expert MLPs of 3 * hidden_size * intermediate_size weights each; the
        embedding parameters are added once on top.
        """
        cfg = self.llm_config
        # Embedding and per-layer attention counts (Llama-style)
        embed_count, attn_count = self._calc_attn_embed_params()

        # Expert MLP weights held by a single layer
        experts_per_layer = (
            cfg.num_local_experts * 3 * cfg.hidden_size * cfg.intermediate_size
        )
        per_layer = attn_count + experts_per_layer
        param_count = cfg.num_hidden_layers * per_layer + embed_count

        # Parameters -> bytes -> GB
        return param_count * cfg.bytes_per_parameter / 1e9
360
+
361
+
362
def get_estimator(llm_config, **kwargs) -> MemoryEstimator:
    """
    Picks the estimator class that matches the fields present on the model's config
    (as represented by LLMConfig) and instantiates it.

    The MoE field set is checked first, then the GQA (Llama) field set; see constants.py
    for the fields each one requires. If either set is incomplete, the generic
    MemoryEstimator is returned and a warning is logged.

    Any extra keyword arguments are forwarded to the estimator constructor.
    """

    def _all_present(field_names) -> bool:
        # A field counts as present only when it exists and is not None.
        return all(
            getattr(llm_config, name, None) is not None for name in field_names
        )

    if _all_present(MOE_REQUIRED_FIELDS):
        return MixtureMemoryEstimator(llm_config=llm_config, **kwargs)

    if _all_present(LLAMA_REQUIRED_FIELDS):
        return LlamaMemoryEstimator(llm_config=llm_config, **kwargs)

    logger.warning(
        "Falling back to generic GPT estimator: required fields missing from config.json file in model."
    )
    return MemoryEstimator(llm_config=llm_config, **kwargs)
@@ -0,0 +1,283 @@
1
+ #!/usr/bin/env python
2
+ # Copyright (c) 2025 Oracle and/or its affiliates.
3
+ # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
4
+
5
+ import re
6
+ from typing import Optional
7
+
8
+ from pydantic import BaseModel, Field
9
+
10
+ from ads.aqua.common.errors import AquaRecommendationError
11
+ from ads.aqua.shaperecommend.constants import (
12
+ BITS_AND_BYTES_4BIT,
13
+ BITS_AND_BYTES_8BIT,
14
+ DEFAULT_WEIGHT_SIZE,
15
+ NEXT_QUANT,
16
+ QUANT_MAPPING,
17
+ QUANT_METHODS,
18
+ )
19
+
20
+
21
class LLMConfig(BaseModel):
    """
    Standardized configuration object for evaluating the size of Large Language Models (LLMs)
    based on their architecture and quantization.
    """

    num_hidden_layers: int = Field(
        ...,
        description="Number of transformer blocks (layers) in the model’s neural network stack.",
    )
    hidden_size: int = Field(
        ..., description="Embedding dimension or hidden size of each layer."
    )
    vocab_size: int = Field(..., description="Vocabulary size for input/output tokens.")
    num_attention_heads: int = Field(
        ...,
        description="Number of attention heads (used for queries and to determine head_dim).",
    )

    head_dim: int = Field(
        ...,
        description="Dimension of each attention head. Typically hidden_size // num_attention_heads.",
    )
    max_seq_len: Optional[int] = Field(
        4096, description="Maximum input sequence length (context window)."
    )
    weight_dtype: Optional[str] = Field(
        DEFAULT_WEIGHT_SIZE,
        description="Parameter data type: 'float32', 'float16', etc.",
    )
    quantization: Optional[str] = Field(
        None,
        description="Quantization weight (e.g., '8bit', '4bit') or None if unquantized.",
    )
    quantization_type: Optional[str] = Field(
        None,
        description="Quantization method (e.g., '8bit', '4bit', 'gptq', 'awq') or None if unquantized.",
    )

    in_flight_quantization: Optional[str] = Field(
        None,
        description="By setting this, enables recalculation of model footprint using 4bit in-flight quantization",
    )

    num_key_value_heads: Optional[int] = Field(
        None,
        description="Number of key/value heads (for GQA architectures: Llama, Mistral, Falcon, Qwen, etc.). Used to determine KV cache size",
    )

    num_local_experts: Optional[int] = Field(
        None, description="For MoE architectures, the number of experts per MoE layer"
    )
    intermediate_size: Optional[int] = Field(
        None, description="For MoE architectures, size of the MLP activation layer."
    )

    # Whether input and output embeddings share weights; None means "not specified".
    tie_word_embeddings: Optional[bool] = Field(None)

    @property
    def bytes_per_parameter(self) -> float:
        """
        Returns the number of bytes used to store a model parameter,
        accounting for quantization or weight storage type.

        Precedence: pre-quantization level, then in-flight quantization,
        then the weight dtype (falling back to DEFAULT_WEIGHT_SIZE).
        """
        # Quantization takes precedence
        q = (self.quantization or "").lower()

        # Direct match in mapping
        if q in QUANT_MAPPING:
            return QUANT_MAPPING[q]

        # Dynamic bit-width detection (e.g. "3bit" -> 3 / 8 bytes)
        m = re.match(r"(\d+)\s*bit", q)
        if m:
            bits = int(m[1])
            return bits / 8  # bytes per parameter

        # consider in-flight quantization (case-normalised like the other lookups)
        in_flight = (self.in_flight_quantization or "").lower()
        if in_flight in QUANT_MAPPING:
            return QUANT_MAPPING[in_flight]

        # Fallback to dtype mapping
        dtype = (self.weight_dtype or DEFAULT_WEIGHT_SIZE).lower()
        return QUANT_MAPPING.get(dtype, QUANT_MAPPING[DEFAULT_WEIGHT_SIZE])

    @classmethod
    def detect_quantization_type(cls, raw: dict) -> Optional[str]:
        """
        Detects quantization type (e.g., 'gptq', 'bitsandbytes', 'awq', etc.) from Hugging Face config dict.

        Returns 'bitsandbytes' when a load_in_8bit / load_in_4bit flag is set (top-level or
        inside quantization_config); otherwise the first QUANT_METHODS entry that occurs
        anywhere in the lower-cased string form of the config, or None.
        """
        qcfg = raw.get("quantization_config")
        nested = qcfg if isinstance(qcfg, dict) else {}
        if (
            raw.get("load_in_8bit")
            or raw.get("load_in_4bit")
            or nested.get("load_in_8bit")
            or nested.get("load_in_4bit")
        ):
            return "bitsandbytes"

        # quantization_config is part of raw, so one search over the whole config
        # covers both; build the lower-cased text once rather than per key.
        haystack = str(raw).lower()
        for key in QUANT_METHODS:
            if key in haystack:
                return key
        return None

    @classmethod
    def detect_quantization_bits(cls, raw: dict) -> Optional[str]:
        """
        Detects quantization bit-width as a string (e.g., '4bit', '8bit') from Hugging Face config dict.

        Checks load_in_8bit / load_in_4bit (top-level or inside quantization_config), then
        the 'bits' / 'wbits' entries of quantization_config. Returns None if nothing matches.
        """
        qcfg = raw.get("quantization_config")
        # Tolerate a missing, null or non-dict quantization_config
        nested = qcfg if isinstance(qcfg, dict) else {}

        if raw.get("load_in_8bit") or nested.get("load_in_8bit"):
            return BITS_AND_BYTES_8BIT
        if raw.get("load_in_4bit") or nested.get("load_in_4bit"):
            return BITS_AND_BYTES_4BIT

        bits = nested.get("bits") or nested.get("wbits")
        if bits:
            return f"{bits}bit"
        return None

    @property
    def suggested_quantizations(self):
        """
        Suggests the next lower quantization options based on the current quantization level/ weight size.

        If model is un-quantized, uses the weight size.
        If model is pre-quantized, uses the quantization level.
        Returns an empty list when the current level has no entry in NEXT_QUANT.
        """
        key = (
            self.quantization
            or self.in_flight_quantization
            or self.weight_dtype
            or DEFAULT_WEIGHT_SIZE
        ).lower()
        return NEXT_QUANT.get(key, [])

    def calculate_possible_seq_len(self, min_len=2048):
        """
        Calculates a list of possible sequence lengths (in tokens).
        [min_len, 2 * min_len, ... max-length] (max-length found in model's config.json file)

        If the model's max length is below min_len, the result is [max-length] so that
        short-context models still get one candidate.

        Raises
        ------
        ValueError
            If min_len is not positive (doubling would never reach max-length).
        """
        if min_len < 1:
            raise ValueError("min_len must be a positive integer.")

        max_len = self.max_seq_len
        vals = []
        curr = min_len
        while curr <= max_len:
            vals.append(curr)
            curr *= 2
        # Always finish with the model's own maximum, including when no doubled
        # value fits below it.
        if not vals or vals[-1] != max_len:
            vals.append(max_len)
        return vals

    def optimal_config(self):
        """
        Builds a list of optimal configuration parameters (sorted descending). Combination of:
            - Quantization / weight sizes: bfloat16 weight size -> 4bit
            - max-model-len: power-of-two model lengths from max length (config.json of model) to 2048 tokens.

        Example:
            [('bfloat16', max_model_len supported by model) ('bfloat16', 1/2 of max_model_len) ... ('4bit', 4096), ('4bit', 2048)]

        """
        # TODO: use later - derive the list from a copy of self.suggested_quantizations
        quantizations = ["bfloat16", "4bit"]

        lengths = self.calculate_possible_seq_len()

        configs = [
            (quantization, length)
            for quantization in quantizations
            for length in lengths
        ]

        configs.sort(
            key=lambda x: (-QUANT_MAPPING.get(x[0], 0), -x[1])
        )  # (-quant_priority, -max_seq_len)
        return configs

    @classmethod
    def validate_model_support(cls, raw: dict) -> None:
        """
        Validates if model is decoder-only. Check for text-generation model occurs at DataScienceModel level.

        Raises
        ------
        AquaRecommendationError
            If the config marks the model as encoder-decoder, as explicitly not a decoder,
            or its model_type is in the excluded set.
        """
        excluded_models = {"t5", "gemma", "bart", "bert", "roberta", "albert"}
        # model_type may be absent or null in config.json
        model_type = str(raw.get("model_type") or "").lower()
        if (
            raw.get("is_encoder_decoder", False)  # exclude encoder-decoder models
            or (
                raw.get("is_decoder") is False
            )  # exclude explicit encoder-only models (altho no text-generation task ones, just dbl check)
            or model_type in excluded_models  # exclude by known model types
        ):
            raise AquaRecommendationError(
                "Please provide a decoder-only text-generation model (ex. Llama, Falcon, etc). "
                "Encoder-decoder models (ex. T5, Gemma) and encoder-only (BERT) are not supported at this time."
            )

    @classmethod
    def from_raw_config(cls, raw: dict) -> "LLMConfig":
        """
        Instantiates an LLMConfig from a raw Hugging Face config.json file,
        using robust key detection and fallback for architecture.

        Raises
        ------
        AquaRecommendationError
            If the model is not a supported decoder-only model.
        ValueError
            If a required architecture value cannot be found in the config.
        """
        cls.validate_model_support(raw)

        # Field mappings with fallback
        num_hidden_layers = (
            raw.get("num_hidden_layers") or raw.get("n_layer") or raw.get("num_layers")
        )
        hidden_size = raw.get("hidden_size") or raw.get("n_embd") or raw.get("d_model")
        vocab_size = raw.get("vocab_size")
        # A null torch_dtype must not become the string "None"; also accept the
        # "dtype" key as an alternative spelling.
        weight_dtype = str(
            raw.get("torch_dtype") or raw.get("dtype") or DEFAULT_WEIGHT_SIZE
        )
        quantization = cls.detect_quantization_bits(raw)
        quantization_type = cls.detect_quantization_type(raw)

        if not quantization and quantization_type in QUANT_MAPPING:
            quantization = quantization_type

        num_key_value_heads = (
            raw.get("num_key_value_heads")  # GQA models (ex. Llama-type)
        )

        num_attention_heads = (
            raw.get("num_attention_heads") or raw.get("n_head") or raw.get("num_heads")
        )

        head_dim = raw.get("head_dim") or (
            int(hidden_size) // int(num_attention_heads)
            if hidden_size and num_attention_heads
            else None
        )
        max_seq_len = (
            raw.get("max_position_embeddings")
            or raw.get("n_positions")
            or raw.get("max_seq_len")
            or 2048
        )

        num_local_experts = (
            raw.get("num_local_experts")
            or raw.get("n_routed_experts")
            or raw.get("num_experts")
        )
        intermediate_size = raw.get("moe_intermediate_size") or raw.get(
            "intermediate_size"
        )

        # Embeddings are treated as tied unless the config says otherwise
        # (assumed to match the Hugging Face default — TODO confirm).
        tie_word_embeddings = raw.get("tie_word_embeddings")
        if tie_word_embeddings is None:
            tie_word_embeddings = True

        # Type safety: minimal assertion
        if None in [
            num_hidden_layers,
            hidden_size,
            vocab_size,
            num_attention_heads,
            head_dim,
        ]:
            raise ValueError("Missing required value in model config.")

        return cls(
            num_hidden_layers=int(num_hidden_layers),
            hidden_size=int(hidden_size),
            num_attention_heads=int(num_attention_heads),
            num_key_value_heads=num_key_value_heads,
            head_dim=int(head_dim),
            vocab_size=int(vocab_size),
            weight_dtype=weight_dtype,
            quantization=quantization,
            quantization_type=quantization_type,
            max_seq_len=int(max_seq_len),
            num_local_experts=num_local_experts,
            intermediate_size=intermediate_size,
            tie_word_embeddings=bool(tie_word_embeddings),
        )