npm - @aws/ml-container-creator - Versions diffs - 0.4.0 → 0.6.0 - Mend

@aws/ml-container-creator 0.4.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38) hide show

package/bin/cli.js +5 -2
package/config/bootstrap-stack.json +40 -9
package/infra/ci-harness/buildspec.yml +60 -0
package/infra/ci-harness/package-lock.json +5 -1
package/package.json +1 -1
package/servers/README.md +41 -1
package/servers/instance-sizer/index.js +10 -4
package/servers/instance-sizer/lib/model-resolver.js +1 -1
package/servers/lib/catalogs/model-sizes.json +135 -90
package/servers/lib/catalogs/models.json +483 -411
package/src/app.js +33 -2
package/src/lib/bootstrap-command-handler.js +6 -0
package/src/lib/cli-handler.js +1 -1
package/src/lib/config-manager.js +41 -2
package/src/lib/deployment-entry-schema.js +16 -0
package/src/lib/mcp-client.js +3 -3
package/src/lib/prompt-runner.js +179 -8
package/src/lib/prompts.js +253 -7
package/src/lib/registry-command-handler.js +12 -0
package/templates/Dockerfile +12 -0
package/templates/code/serving.properties +14 -0
package/templates/do/adapter +1230 -0
package/templates/do/adapters/.gitkeep +2 -0
package/templates/do/add-ic +130 -0
package/templates/do/benchmark +81 -9
package/templates/do/clean +507 -17
package/templates/do/config +28 -5
package/templates/do/deploy +513 -367
package/templates/do/ic/default.conf +32 -0
package/templates/do/lib/endpoint-config.sh +216 -0
package/templates/do/lib/inference-component.sh +167 -0
package/templates/do/lib/secrets.sh +44 -0
package/templates/do/lib/wait.sh +131 -0
package/templates/do/logs +107 -27
package/templates/do/optimize +528 -0
package/templates/do/register +111 -1
package/templates/do/status +337 -0
package/templates/do/test +80 -28

package/servers/lib/catalogs/models.json CHANGED Viewed

@@ -1,372 +1,555 @@
 {
-    "openai/gpt-oss-20b": {
-        "family": "gpt-oss",
-        "parameterCount": 20000000000,
+    "meta-llama/Llama-3.2-1B-Instruct": {
+        "family": "llama-3",
+        "parameterCount": 1235814400,
         "defaultDtype": "bfloat16",
-        "maxPositionEmbeddings": 8192,
-        "gated": false,
+        "maxPositionEmbeddings": 131072,
+        "gated": true,
         "tags": [
             "text-generation",
-            "openai",
+            "llama-3",
             "conversational"
         ],
-        "architecture": "GPT2LMHeadModel",
-        "notes": "Open-source 20B parameter model. Requires significant GPU memory for inference",
+        "architecture": "LlamaForCausalLM",
+        "notes": "Llama 3.2 1B Instruct. Lightweight model suitable for single-GPU deployment",
         "chatTemplate": "",
         "frameworkCompatibility": {
-            "vllm": ">=0.3.0",
-            "tensorrt-llm": ">=0.8.0",
-            "sglang": ">=0.2.0"
+            "vllm": ">=0.5.0",
+            "tensorrt-llm": ">=0.9.0",
+            "sglang": ">=0.3.0"
         },
-        "validationLevel": "community-validated",
+        "validationLevel": "tested",
         "modelType": "transformer",
         "tasks": [
             "text-generation"
         ]
     },
-    "meta-llama/Llama-2-7b-chat-hf": {
-        "family": "llama-2",
+    "meta-llama/Llama-3.2-3B-Instruct": {
+        "family": "llama-3",
+        "parameterCount": 3212749824,
+        "defaultDtype": "bfloat16",
+        "maxPositionEmbeddings": 131072,
         "gated": true,
         "tags": [
             "text-generation",
-            "llama-2",
+            "llama-3",
             "conversational"
         ],
         "architecture": "LlamaForCausalLM",
-        "profiles": {
-            "7b": {
-                "displayName": "Llama-2 7B",
-                "envVars": {
-                    "MAX_MODEL_LEN": "4096",
-                    "GPU_MEMORY_UTILIZATION": "0.9"
-                }
-            }
-        },
-        "notes": "Llama-2 7B chat model with official chat template. Requires HuggingFace authentication for download",
-        "chatTemplate": "{% for message in messages %}{% if message['role'] == 'system' %}{{ '[INST] <<SYS>>\\n' + message['content'] + '\\n<</SYS>>\\n\\n' }}{% elif message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + ' ' }}{% endif %}{% endfor %}",
+        "notes": "Llama 3.2 3B Instruct. Compact model with strong performance for its size",
+        "chatTemplate": "",
         "frameworkCompatibility": {
-            "vllm": ">=0.3.0",
-            "tensorrt-llm": ">=0.8.0",
-            "sglang": ">=0.2.0"
+            "vllm": ">=0.5.0",
+            "tensorrt-llm": ">=0.9.0",
+            "sglang": ">=0.3.0"
         },
         "validationLevel": "tested",
         "modelType": "transformer",
-        "parameterCount": 6738415616,
-        "defaultDtype": "float16",
-        "maxPositionEmbeddings": 4096,
-        "recommendedQuantizations": [
-            "awq",
-            "gptq"
-        ],
         "tasks": [
             "text-generation"
         ]
     },
-    "meta-llama/Llama-2-13b-chat-hf": {
-        "family": "llama-2",
+    "meta-llama/Llama-3.1-8B-Instruct": {
+        "family": "llama-3",
+        "parameterCount": 8030261248,
+        "defaultDtype": "bfloat16",
+        "maxPositionEmbeddings": 131072,
         "gated": true,
         "tags": [
             "text-generation",
-            "llama-2",
+            "llama-3",
             "conversational"
         ],
         "architecture": "LlamaForCausalLM",
-        "profiles": {
-            "13b": {
-                "displayName": "Llama-2 13B",
-                "envVars": {
-                    "MAX_MODEL_LEN": "4096",
-                    "GPU_MEMORY_UTILIZATION": "0.9"
-                }
-            }
-        },
-        "notes": "Llama-2 13B chat model. Requires more GPU memory than 7B variant",
-        "chatTemplate": "{% for message in messages %}{% if message['role'] == 'system' %}{{ '[INST] <<SYS>>\\n' + message['content'] + '\\n<</SYS>>\\n\\n' }}{% elif message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + ' ' }}{% endif %}{% endfor %}",
+        "notes": "Llama 3.1 8B Instruct with 128K context window",
+        "chatTemplate": "",
         "frameworkCompatibility": {
-            "vllm": ">=0.3.0",
-            "tensorrt-llm": ">=0.8.0",
-            "sglang": ">=0.2.0"
+            "vllm": ">=0.5.0",
+            "tensorrt-llm": ">=0.9.0",
+            "sglang": ">=0.3.0"
         },
         "validationLevel": "tested",
         "modelType": "transformer",
-        "parameterCount": 13015864320,
-        "defaultDtype": "float16",
-        "maxPositionEmbeddings": 4096,
-        "recommendedQuantizations": [
-            "awq",
-            "gptq"
-        ],
         "tasks": [
             "text-generation"
         ]
     },
-    "meta-llama/Llama-2-70b-chat-hf": {
-        "family": "llama-2",
-        "parameterCount": 70000000000,
-        "defaultDtype": "float16",
-        "maxPositionEmbeddings": 4096,
+    "meta-llama/Llama-3.3-70B-Instruct": {
+        "family": "llama-3",
+        "parameterCount": 70553706496,
+        "defaultDtype": "bfloat16",
+        "maxPositionEmbeddings": 131072,
         "gated": true,
         "tags": [
             "text-generation",
-            "llama-2",
+            "llama-3",
             "conversational"
         ],
         "architecture": "LlamaForCausalLM",
-        "profiles": {
-            "70b-tp2": {
-                "displayName": "Llama-2 70B (2-GPU)",
-                "envVars": {
-                    "TENSOR_PARALLEL_SIZE": "2",
-                    "MAX_MODEL_LEN": "4096",
-                    "GPU_MEMORY_UTILIZATION": "0.95"
-                }
-            },
-            "70b-tp4": {
-                "displayName": "Llama-2 70B (4-GPU)",
-                "envVars": {
-                    "TENSOR_PARALLEL_SIZE": "4",
-                    "MAX_MODEL_LEN": "4096",
-                    "GPU_MEMORY_UTILIZATION": "0.9"
-                }
-            }
+        "notes": "Llama 3.3 70B Instruct. Requires multi-GPU tensor parallelism",
+        "chatTemplate": "",
+        "frameworkCompatibility": {
+            "vllm": ">=0.5.0",
+            "tensorrt-llm": ">=0.9.0",
+            "sglang": ">=0.3.0"
         },
-        "notes": "Llama-2 70B requires tensor parallelism across multiple GPUs",
-        "chatTemplate": "{% for message in messages %}{% if message['role'] == 'system' %}{{ '[INST] <<SYS>>\\n' + message['content'] + '\\n<</SYS>>\\n\\n' }}{% elif message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + ' ' }}{% endif %}{% endfor %}",
+        "validationLevel": "tested",
+        "modelType": "transformer",
+        "tasks": [
+            "text-generation"
+        ]
+    },
+    "Qwen/Qwen3-0.6B": {
+        "family": "qwen3",
+        "parameterCount": 600000000,
+        "defaultDtype": "bfloat16",
+        "maxPositionEmbeddings": 32768,
+        "gated": false,
+        "tags": [
+            "text-generation",
+            "qwen",
+            "conversational"
+        ],
+        "architecture": "Qwen3ForCausalLM",
+        "notes": "Qwen3 0.6B. Ultra-lightweight model for edge and low-resource deployments",
+        "chatTemplate": "",
         "frameworkCompatibility": {
-            "vllm": ">=0.3.0",
-            "tensorrt-llm": ">=0.8.0",
-            "sglang": ">=0.2.0"
+            "vllm": ">=0.5.0",
+            "tensorrt-llm": ">=0.9.0",
+            "sglang": ">=0.3.0"
         },
-        "validationLevel": "community-validated",
+        "validationLevel": "tested",
         "modelType": "transformer",
-        "parameterCount": 68976648192,
-        "defaultDtype": "float16",
-        "maxPositionEmbeddings": 4096,
-        "recommendedQuantizations": [
-            "awq",
-            "gptq"
+        "tasks": [
+            "text-generation"
+        ]
+    },
+    "Qwen/Qwen3-1.7B": {
+        "family": "qwen3",
+        "parameterCount": 1700000000,
+        "defaultDtype": "bfloat16",
+        "maxPositionEmbeddings": 32768,
+        "gated": false,
+        "tags": [
+            "text-generation",
+            "qwen",
+            "conversational"
         ],
+        "architecture": "Qwen3ForCausalLM",
+        "notes": "Qwen3 1.7B. Lightweight model with strong reasoning capabilities",
+        "chatTemplate": "",
+        "frameworkCompatibility": {
+            "vllm": ">=0.5.0",
+            "tensorrt-llm": ">=0.9.0",
+            "sglang": ">=0.3.0"
+        },
+        "validationLevel": "tested",
+        "modelType": "transformer",
         "tasks": [
             "text-generation"
         ]
     },
-    "mistralai/Mistral-7B-Instruct-v0.1": {
-        "family": "mistral",
+    "Qwen/Qwen3-4B": {
+        "family": "qwen3",
+        "parameterCount": 4000000000,
+        "defaultDtype": "bfloat16",
+        "maxPositionEmbeddings": 32768,
         "gated": false,
         "tags": [
             "text-generation",
-            "mistral",
+            "qwen",
             "conversational"
         ],
-        "architecture": "MistralForCausalLM",
-        "profiles": {
-            "7b": {
-                "displayName": "Mistral 7B Instruct",
-                "envVars": {
-                    "MAX_MODEL_LEN": "8192",
-                    "GPU_MEMORY_UTILIZATION": "0.9"
-                }
-            }
+        "architecture": "Qwen3ForCausalLM",
+        "notes": "Qwen3 4B. Balanced model for single-GPU inference",
+        "chatTemplate": "",
+        "frameworkCompatibility": {
+            "vllm": ">=0.5.0",
+            "tensorrt-llm": ">=0.9.0",
+            "sglang": ">=0.3.0"
         },
-        "notes": "Mistral 7B v0.1 with 8K context window",
-        "chatTemplate": "{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %}",
+        "validationLevel": "tested",
+        "modelType": "transformer",
+        "tasks": [
+            "text-generation"
+        ]
+    },
+    "Qwen/Qwen3-8B": {
+        "family": "qwen3",
+        "parameterCount": 8000000000,
+        "defaultDtype": "bfloat16",
+        "maxPositionEmbeddings": 32768,
+        "gated": false,
+        "tags": [
+            "text-generation",
+            "qwen",
+            "conversational"
+        ],
+        "architecture": "Qwen3ForCausalLM",
+        "notes": "Qwen3 8B. Strong general-purpose model for single-GPU deployment",
+        "chatTemplate": "",
         "frameworkCompatibility": {
-            "vllm": ">=0.3.0",
-            "tensorrt-llm": ">=0.8.0",
-            "sglang": ">=0.2.0"
+            "vllm": ">=0.5.0",
+            "tensorrt-llm": ">=0.9.0",
+            "sglang": ">=0.3.0"
         },
         "validationLevel": "tested",
         "modelType": "transformer",
-        "parameterCount": 7241732096,
+        "tasks": [
+            "text-generation"
+        ]
+    },
+    "Qwen/Qwen3-14B": {
+        "family": "qwen3",
+        "parameterCount": 14000000000,
         "defaultDtype": "bfloat16",
         "maxPositionEmbeddings": 32768,
-        "recommendedQuantizations": [
-            "awq",
-            "gptq"
+        "gated": false,
+        "tags": [
+            "text-generation",
+            "qwen",
+            "conversational"
         ],
+        "architecture": "Qwen3ForCausalLM",
+        "notes": "Qwen3 14B. High-quality model requiring larger GPU memory",
+        "chatTemplate": "",
+        "frameworkCompatibility": {
+            "vllm": ">=0.5.0",
+            "tensorrt-llm": ">=0.9.0",
+            "sglang": ">=0.3.0"
+        },
+        "validationLevel": "tested",
+        "modelType": "transformer",
         "tasks": [
             "text-generation"
         ]
     },
-    "mistralai/Mistral-7B-Instruct-v0.2": {
-        "family": "mistral",
+    "Qwen/Qwen3-32B": {
+        "family": "qwen3",
+        "parameterCount": 32000000000,
+        "defaultDtype": "bfloat16",
+        "maxPositionEmbeddings": 32768,
         "gated": false,
         "tags": [
             "text-generation",
-            "mistral",
+            "qwen",
             "conversational"
         ],
-        "architecture": "MistralForCausalLM",
-        "profiles": {
-            "7b": {
-                "displayName": "Mistral 7B Instruct v0.2",
-                "envVars": {
-                    "MAX_MODEL_LEN": "32768",
-                    "GPU_MEMORY_UTILIZATION": "0.9"
-                }
-            }
+        "architecture": "Qwen3ForCausalLM",
+        "notes": "Qwen3 32B. Large model requiring multi-GPU or quantization",
+        "chatTemplate": "",
+        "frameworkCompatibility": {
+            "vllm": ">=0.5.0",
+            "tensorrt-llm": ">=0.9.0",
+            "sglang": ">=0.3.0"
         },
-        "notes": "Mistral 7B v0.2 with extended 32K context window. Requires more memory for long contexts",
-        "chatTemplate": "{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %}",
+        "validationLevel": "tested",
+        "modelType": "transformer",
+        "tasks": [
+            "text-generation"
+        ]
+    },
+    "Qwen/Qwen2.5-7B-Instruct": {
+        "family": "qwen2.5",
+        "parameterCount": 7721324544,
+        "defaultDtype": "bfloat16",
+        "maxPositionEmbeddings": 131072,
+        "gated": false,
+        "tags": [
+            "text-generation",
+            "qwen",
+            "conversational"
+        ],
+        "architecture": "Qwen2ForCausalLM",
+        "notes": "Qwen2.5 7B Instruct with 128K context window",
+        "chatTemplate": "",
         "frameworkCompatibility": {
-            "vllm": ">=0.3.0",
-            "tensorrt-llm": ">=0.8.0",
-            "sglang": ">=0.2.0"
+            "vllm": ">=0.5.0",
+            "tensorrt-llm": ">=0.9.0",
+            "sglang": ">=0.3.0"
         },
         "validationLevel": "tested",
         "modelType": "transformer",
-        "parameterCount": 7241732096,
+        "tasks": [
+            "text-generation"
+        ]
+    },
+    "Qwen/Qwen2.5-14B-Instruct": {
+        "family": "qwen2.5",
+        "parameterCount": 14167134208,
         "defaultDtype": "bfloat16",
-        "maxPositionEmbeddings": 32768,
-        "recommendedQuantizations": [
-            "awq",
-            "gptq"
+        "maxPositionEmbeddings": 131072,
+        "gated": false,
+        "tags": [
+            "text-generation",
+            "qwen",
+            "conversational"
         ],
+        "architecture": "Qwen2ForCausalLM",
+        "notes": "Qwen2.5 14B Instruct with 128K context window",
+        "chatTemplate": "",
+        "frameworkCompatibility": {
+            "vllm": ">=0.5.0",
+            "tensorrt-llm": ">=0.9.0",
+            "sglang": ">=0.3.0"
+        },
+        "validationLevel": "tested",
+        "modelType": "transformer",
         "tasks": [
             "text-generation"
         ]
     },
-    "mistralai/Mixtral-8x7B-Instruct-v0.1": {
-        "family": "mistral",
+    "Qwen/Qwen2.5-32B-Instruct": {
+        "family": "qwen2.5",
+        "parameterCount": 32000000000,
+        "defaultDtype": "bfloat16",
+        "maxPositionEmbeddings": 131072,
         "gated": false,
         "tags": [
             "text-generation",
-            "mistral",
-            "mixture-of-experts"
+            "qwen",
+            "conversational"
         ],
-        "architecture": "MixtralForCausalLM",
-        "profiles": {
-            "8x7b-tp2": {
-                "displayName": "Mixtral 8x7B (2-GPU)",
-                "envVars": {
-                    "TENSOR_PARALLEL_SIZE": "2",
-                    "MAX_MODEL_LEN": "32768",
-                    "GPU_MEMORY_UTILIZATION": "0.95"
-                }
-            }
+        "architecture": "Qwen2ForCausalLM",
+        "notes": "Qwen2.5 32B Instruct with 128K context window. Requires multi-GPU or quantization",
+        "chatTemplate": "",
+        "frameworkCompatibility": {
+            "vllm": ">=0.5.0",
+            "tensorrt-llm": ">=0.9.0",
+            "sglang": ">=0.3.0"
         },
-        "notes": "Mixtral 8x7B MoE model. Requires tensor parallelism for efficient inference",
-        "chatTemplate": "{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %}",
+        "validationLevel": "tested",
+        "modelType": "transformer",
+        "tasks": [
+            "text-generation"
+        ]
+    },
+    "Qwen/Qwen2.5-72B-Instruct": {
+        "family": "qwen2.5",
+        "parameterCount": 72710410240,
+        "defaultDtype": "bfloat16",
+        "maxPositionEmbeddings": 131072,
+        "gated": false,
+        "tags": [
+            "text-generation",
+            "qwen",
+            "conversational"
+        ],
+        "architecture": "Qwen2ForCausalLM",
+        "notes": "Qwen2.5 72B Instruct with 128K context window. Requires multi-GPU tensor parallelism",
+        "chatTemplate": "",
         "frameworkCompatibility": {
-            "vllm": ">=0.3.0",
-            "tensorrt-llm": ">=0.8.0",
-            "sglang": ">=0.2.0"
+            "vllm": ">=0.5.0",
+            "tensorrt-llm": ">=0.9.0",
+            "sglang": ">=0.3.0"
         },
-        "validationLevel": "community-validated",
+        "validationLevel": "tested",
         "modelType": "transformer",
-        "parameterCount": 46702792704,
+        "tasks": [
+            "text-generation"
+        ]
+    },
+    "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B": {
+        "family": "deepseek-r1",
+        "parameterCount": 1500000000,
         "defaultDtype": "bfloat16",
-        "maxPositionEmbeddings": 32768,
-        "recommendedQuantizations": [
-            "awq",
-            "gptq"
+        "maxPositionEmbeddings": 131072,
+        "gated": false,
+        "tags": [
+            "text-generation",
+            "deepseek",
+            "reasoning",
+            "conversational"
         ],
+        "architecture": "Qwen2ForCausalLM",
+        "notes": "DeepSeek R1 Distill Qwen 1.5B. Reasoning-focused distilled model",
+        "chatTemplate": "",
+        "frameworkCompatibility": {
+            "vllm": ">=0.5.0",
+            "tensorrt-llm": ">=0.9.0",
+            "sglang": ">=0.3.0"
+        },
+        "validationLevel": "tested",
+        "modelType": "transformer",
         "tasks": [
             "text-generation"
         ]
     },
-    "meta-llama/Llama-2-70b-hf": {
-        "family": "llama-2",
-        "parameterCount": 70000000000,
-        "defaultDtype": "float16",
-        "maxPositionEmbeddings": 4096,
-        "gated": true,
+    "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B": {
+        "family": "deepseek-r1",
+        "parameterCount": 7000000000,
+        "defaultDtype": "bfloat16",
+        "maxPositionEmbeddings": 131072,
+        "gated": false,
         "tags": [
             "text-generation",
-            "llama-2"
+            "deepseek",
+            "reasoning",
+            "conversational"
         ],
-        "architecture": "LlamaForCausalLM",
-        "notes": "Llama-2 70B base model (non-chat). Requires multi-GPU for inference.",
+        "architecture": "Qwen2ForCausalLM",
+        "notes": "DeepSeek R1 Distill Qwen 7B. Reasoning-focused distilled model",
         "chatTemplate": "",
         "frameworkCompatibility": {
-            "vllm": ">=0.3.0",
-            "tensorrt-llm": ">=0.8.0",
-            "sglang": ">=0.2.0"
+            "vllm": ">=0.5.0",
+            "tensorrt-llm": ">=0.9.0",
+            "sglang": ">=0.3.0"
         },
-        "validationLevel": "community-validated",
+        "validationLevel": "tested",
         "modelType": "transformer",
         "tasks": [
             "text-generation"
         ]
     },
-    "meta-llama/Llama-2-*": {
-        "family": "llama-2",
-        "gated": true,
+    "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B": {
+        "family": "deepseek-r1",
+        "parameterCount": 14000000000,
+        "defaultDtype": "bfloat16",
+        "maxPositionEmbeddings": 131072,
+        "gated": false,
         "tags": [
             "text-generation",
-            "llama-2"
+            "deepseek",
+            "reasoning",
+            "conversational"
         ],
-        "architecture": null,
-        "notes": "Fallback configuration for Llama-2 models not explicitly listed. Uses standard Llama-2 chat template",
-        "chatTemplate": "{% for message in messages %}{% if message['role'] == 'system' %}{{ '[INST] <<SYS>>\\n' + message['content'] + '\\n<</SYS>>\\n\\n' }}{% elif message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + ' ' }}{% endif %}{% endfor %}",
+        "architecture": "Qwen2ForCausalLM",
+        "notes": "DeepSeek R1 Distill Qwen 14B. Reasoning-focused distilled model",
+        "chatTemplate": "",
         "frameworkCompatibility": {
-            "vllm": ">=0.3.0",
-            "tensorrt-llm": ">=0.8.0",
-            "sglang": ">=0.2.0"
+            "vllm": ">=0.5.0",
+            "tensorrt-llm": ">=0.9.0",
+            "sglang": ">=0.3.0"
         },
-        "validationLevel": "experimental",
+        "validationLevel": "tested",
         "modelType": "transformer",
         "tasks": [
             "text-generation"
         ]
     },
-    "mistralai/Mistral-*": {
-        "family": "mistral",
+    "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B": {
+        "family": "deepseek-r1",
+        "parameterCount": 32000000000,
+        "defaultDtype": "bfloat16",
+        "maxPositionEmbeddings": 131072,
         "gated": false,
         "tags": [
             "text-generation",
-            "mistral"
+            "deepseek",
+            "reasoning",
+            "conversational"
         ],
-        "architecture": null,
-        "notes": "Fallback configuration for Mistral models not explicitly listed. Uses standard Mistral chat template",
-        "chatTemplate": "{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %}",
+        "architecture": "Qwen2ForCausalLM",
+        "notes": "DeepSeek R1 Distill Qwen 32B. Reasoning-focused distilled model. Requires multi-GPU or quantization",
+        "chatTemplate": "",
         "frameworkCompatibility": {
-            "vllm": ">=0.3.0",
-            "tensorrt-llm": ">=0.8.0",
-            "sglang": ">=0.2.0"
+            "vllm": ">=0.5.0",
+            "tensorrt-llm": ">=0.9.0",
+            "sglang": ">=0.3.0"
         },
-        "validationLevel": "experimental",
+        "validationLevel": "tested",
         "modelType": "transformer",
         "tasks": [
             "text-generation"
         ]
     },
-    "codellama/*": {
-        "family": "codellama",
+    "deepseek-ai/DeepSeek-R1-Distill-Llama-8B": {
+        "family": "deepseek-r1",
+        "parameterCount": 8000000000,
+        "defaultDtype": "bfloat16",
+        "maxPositionEmbeddings": 131072,
         "gated": false,
         "tags": [
             "text-generation",
-            "code",
-            "codellama"
+            "deepseek",
+            "reasoning",
+            "conversational"
         ],
-        "architecture": null,
-        "notes": "CodeLlama models use Llama-2 chat template. Optimized for code generation",
-        "chatTemplate": "{% for message in messages %}{% if message['role'] == 'system' %}{{ '[INST] <<SYS>>\\n' + message['content'] + '\\n<</SYS>>\\n\\n' }}{% elif message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + ' ' }}{% endif %}{% endfor %}",
+        "architecture": "LlamaForCausalLM",
+        "notes": "DeepSeek R1 Distill Llama 8B. Reasoning-focused distilled model based on Llama architecture",
+        "chatTemplate": "",
         "frameworkCompatibility": {
-            "vllm": ">=0.3.0",
-            "tensorrt-llm": ">=0.8.0"
+            "vllm": ">=0.5.0",
+            "tensorrt-llm": ">=0.9.0",
+            "sglang": ">=0.3.0"
         },
-        "validationLevel": "experimental",
+        "validationLevel": "tested",
         "modelType": "transformer",
         "tasks": [
             "text-generation"
         ]
     },
-    "tiiuae/falcon-*": {
-        "family": "falcon",
+    "deepseek-ai/DeepSeek-R1-Distill-Llama-70B": {
+        "family": "deepseek-r1",
+        "parameterCount": 70000000000,
+        "defaultDtype": "bfloat16",
+        "maxPositionEmbeddings": 131072,
         "gated": false,
         "tags": [
             "text-generation",
-            "falcon"
+            "deepseek",
+            "reasoning",
+            "conversational"
         ],
-        "architecture": null,
-        "notes": "Falcon models typically don't require chat templates for instruction following",
-        "chatTemplate": null,
+        "architecture": "LlamaForCausalLM",
+        "notes": "DeepSeek R1 Distill Llama 70B. Reasoning-focused distilled model. Requires multi-GPU tensor parallelism",
+        "chatTemplate": "",
         "frameworkCompatibility": {
-            "vllm": ">=0.3.0",
-            "tensorrt-llm": ">=0.8.0"
+            "vllm": ">=0.5.0",
+            "tensorrt-llm": ">=0.9.0",
+            "sglang": ">=0.3.0"
         },
-        "validationLevel": "experimental",
+        "validationLevel": "tested",
+        "modelType": "transformer",
+        "tasks": [
+            "text-generation"
+        ]
+    },
+    "openai/gpt-oss-20b": {
+        "family": "gpt-oss",
+        "parameterCount": 20000000000,
+        "defaultDtype": "bfloat16",
+        "maxPositionEmbeddings": 8192,
+        "gated": false,
+        "tags": [
+            "text-generation",
+            "openai",
+            "conversational"
+        ],
+        "architecture": "GPT2LMHeadModel",
+        "notes": "GPT-OSS 20B. Open-source 20B parameter model from OpenAI",
+        "chatTemplate": "",
+        "frameworkCompatibility": {
+            "vllm": ">=0.5.0",
+            "tensorrt-llm": ">=0.9.0",
+            "sglang": ">=0.3.0"
+        },
+        "validationLevel": "tested",
+        "modelType": "transformer",
+        "tasks": [
+            "text-generation"
+        ]
+    },
+    "openai/gpt-oss-120b": {
+        "family": "gpt-oss",
+        "parameterCount": 120000000000,
+        "defaultDtype": "bfloat16",
+        "maxPositionEmbeddings": 8192,
+        "gated": false,
+        "tags": [
+            "text-generation",
+            "openai",
+            "conversational"
+        ],
+        "architecture": "GPT2LMHeadModel",
+        "notes": "GPT-OSS 120B. Large open-source model from OpenAI. Requires multi-GPU tensor parallelism",
+        "chatTemplate": "",
+        "frameworkCompatibility": {
+            "vllm": ">=0.5.0",
+            "tensorrt-llm": ">=0.9.0",
+            "sglang": ">=0.3.0"
+        },
+        "validationLevel": "tested",
         "modelType": "transformer",
         "tasks": [
             "text-generation"
@@ -464,240 +647,129 @@
             "text-to-video"
         ]
     },
-    "stabilityai/stable-diffusion-*": {
-        "family": "stable-diffusion",
-        "gated": false,
+    "meta-llama/Llama-3*": {
+        "family": "llama-3",
+        "gated": true,
         "tags": [
-            "image-generation",
-            "diffusion",
-            "stable-diffusion"
+            "text-generation",
+            "llama-3"
         ],
         "architecture": null,
-        "notes": "Fallback for Stable Diffusion variants not explicitly listed",
-        "chatTemplate": null,
+        "notes": "Fallback configuration for Llama 3.x models not explicitly listed",
+        "chatTemplate": "",
         "frameworkCompatibility": {
-            "vllm-omni": ">=0.14.0"
+            "vllm": ">=0.5.0",
+            "tensorrt-llm": ">=0.9.0",
+            "sglang": ">=0.3.0"
         },
         "validationLevel": "experimental",
-        "modelType": "diffusor",
+        "modelType": "transformer",
         "tasks": [
-            "text-to-image"
+            "text-generation"
         ]
     },
-    "black-forest-labs/FLUX*": {
-        "family": "flux",
+    "Qwen/Qwen*": {
+        "family": "qwen",
         "gated": false,
         "tags": [
-            "image-generation",
-            "diffusion",
-            "flux"
+            "text-generation",
+            "qwen"
         ],
         "architecture": null,
-        "notes": "Fallback for FLUX model variants not explicitly listed",
-        "chatTemplate": null,
+        "notes": "Fallback configuration for Qwen models not explicitly listed",
+        "chatTemplate": "",
         "frameworkCompatibility": {
-            "vllm-omni": ">=0.14.0"
+            "vllm": ">=0.5.0",
+            "tensorrt-llm": ">=0.9.0",
+            "sglang": ">=0.3.0"
         },
         "validationLevel": "experimental",
-        "modelType": "diffusor",
-        "tasks": [
-            "text-to-image"
-        ]
-    },
-    "meta-llama/Meta-Llama-3-8B*": {
-        "parameterCount": 8030261248,
-        "defaultDtype": "bfloat16",
-        "architecture": "LlamaForCausalLM",
-        "maxPositionEmbeddings": 8192,
-        "recommendedQuantizations": [
-            "awq",
-            "gptq"
-        ],
-        "modelType": "transformer",
-        "tasks": [
-            "text-generation"
-        ]
-    },
-    "meta-llama/Meta-Llama-3-70B*": {
-        "parameterCount": 70553706496,
-        "defaultDtype": "bfloat16",
-        "architecture": "LlamaForCausalLM",
-        "maxPositionEmbeddings": 8192,
-        "recommendedQuantizations": [
-            "awq",
-            "gptq"
-        ],
-        "modelType": "transformer",
-        "tasks": [
-            "text-generation"
-        ]
-    },
-    "meta-llama/Llama-3.1-8B*": {
-        "parameterCount": 8030261248,
-        "defaultDtype": "bfloat16",
-        "architecture": "LlamaForCausalLM",
-        "maxPositionEmbeddings": 131072,
-        "recommendedQuantizations": [
-            "awq",
-            "gptq"
-        ],
-        "modelType": "transformer",
-        "tasks": [
-            "text-generation"
-        ]
-    },
-    "meta-llama/Llama-3.1-70B*": {
-        "parameterCount": 70553706496,
-        "defaultDtype": "bfloat16",
-        "architecture": "LlamaForCausalLM",
-        "maxPositionEmbeddings": 131072,
-        "recommendedQuantizations": [
-            "awq",
-            "gptq"
-        ],
-        "modelType": "transformer",
-        "tasks": [
-            "text-generation"
-        ]
-    },
-    "meta-llama/Llama-3.1-405B*": {
-        "parameterCount": 405000000000,
-        "defaultDtype": "bfloat16",
-        "architecture": "LlamaForCausalLM",
-        "maxPositionEmbeddings": 131072,
-        "recommendedQuantizations": [
-            "awq",
-            "gptq",
-            "fp8"
-        ],
         "modelType": "transformer",
         "tasks": [
             "text-generation"
         ]
     },
-    "meta-llama/Llama-3.2-1B*": {
-        "parameterCount": 1235814400,
-        "defaultDtype": "bfloat16",
-        "architecture": "LlamaForCausalLM",
-        "maxPositionEmbeddings": 131072,
-        "recommendedQuantizations": [
-            "awq",
-            "gptq"
-        ],
-        "modelType": "transformer",
-        "tasks": [
-            "text-generation"
-        ]
-    },
-    "meta-llama/Llama-3.2-3B*": {
-        "parameterCount": 3212749824,
-        "defaultDtype": "bfloat16",
-        "architecture": "LlamaForCausalLM",
-        "maxPositionEmbeddings": 131072,
-        "recommendedQuantizations": [
-            "awq",
-            "gptq"
-        ],
-        "modelType": "transformer",
-        "tasks": [
-            "text-generation"
-        ]
-    },
-    "Qwen/Qwen-7B*": {
-        "parameterCount": 7721324544,
-        "defaultDtype": "bfloat16",
-        "architecture": "QWenLMHeadModel",
-        "maxPositionEmbeddings": 8192,
-        "recommendedQuantizations": [
-            "awq",
-            "gptq"
-        ],
-        "modelType": "transformer",
-        "tasks": [
-            "text-generation"
-        ]
-    },
-    "Qwen/Qwen2-7B*": {
-        "parameterCount": 7721324544,
-        "defaultDtype": "bfloat16",
-        "architecture": "Qwen2ForCausalLM",
-        "maxPositionEmbeddings": 32768,
-        "recommendedQuantizations": [
-            "awq",
-            "gptq"
-        ],
-        "modelType": "transformer",
-        "tasks": [
-            "text-generation"
-        ]
-    },
-    "Qwen/Qwen-14B*": {
-        "parameterCount": 14167134208,
-        "defaultDtype": "bfloat16",
-        "architecture": "QWenLMHeadModel",
-        "maxPositionEmbeddings": 8192,
-        "recommendedQuantizations": [
-            "awq",
-            "gptq"
-        ],
-        "modelType": "transformer",
-        "tasks": [
-            "text-generation"
-        ]
-    },
-    "Qwen/Qwen2-14B*": {
-        "parameterCount": 14167134208,
-        "defaultDtype": "bfloat16",
-        "architecture": "Qwen2ForCausalLM",
-        "maxPositionEmbeddings": 32768,
-        "recommendedQuantizations": [
-            "awq",
-            "gptq"
+    "deepseek-ai/DeepSeek*": {
+        "family": "deepseek",
+        "gated": false,
+        "tags": [
+            "text-generation",
+            "deepseek",
+            "reasoning"
         ],
+        "architecture": null,
+        "notes": "Fallback configuration for DeepSeek models not explicitly listed",
+        "chatTemplate": "",
+        "frameworkCompatibility": {
+            "vllm": ">=0.5.0",
+            "tensorrt-llm": ">=0.9.0",
+            "sglang": ">=0.3.0"
+        },
+        "validationLevel": "experimental",
         "modelType": "transformer",
         "tasks": [
             "text-generation"
         ]
     },
-    "Qwen/Qwen-72B*": {
-        "parameterCount": 72710410240,
-        "defaultDtype": "bfloat16",
-        "architecture": "QWenLMHeadModel",
-        "maxPositionEmbeddings": 32768,
-        "recommendedQuantizations": [
-            "awq",
-            "gptq"
+    "openai/gpt-oss*": {
+        "family": "gpt-oss",
+        "gated": false,
+        "tags": [
+            "text-generation",
+            "openai"
         ],
+        "architecture": null,
+        "notes": "Fallback configuration for OpenAI GPT-OSS models not explicitly listed",
+        "chatTemplate": "",
+        "frameworkCompatibility": {
+            "vllm": ">=0.5.0",
+            "tensorrt-llm": ">=0.9.0",
+            "sglang": ">=0.3.0"
+        },
+        "validationLevel": "experimental",
         "modelType": "transformer",
         "tasks": [
             "text-generation"
         ]
     },
-    "Qwen/Qwen2-72B*": {
-        "parameterCount": 72710410240,
-        "defaultDtype": "bfloat16",
-        "architecture": "Qwen2ForCausalLM",
-        "maxPositionEmbeddings": 32768,
-        "recommendedQuantizations": [
-            "awq",
-            "gptq"
+    "stabilityai/stable-diffusion-*": {
+        "family": "stable-diffusion",
+        "gated": false,
+        "tags": [
+            "image-generation",
+            "diffusion",
+            "stable-diffusion"
         ],
-        "modelType": "transformer",
+        "architecture": null,
+        "notes": "Fallback for Stable Diffusion variants not explicitly listed",
+        "chatTemplate": null,
+        "frameworkCompatibility": {
+            "vllm-omni": ">=0.14.0"
+        },
+        "validationLevel": "experimental",
+        "modelType": "diffusor",
         "tasks": [
-            "text-generation"
+            "text-to-image"
         ]
     },
-    "EleutherAI/gpt-neox-20b*": {
-        "parameterCount": 20554568704,
-        "defaultDtype": "float16",
-        "architecture": "GPTNeoXForCausalLM",
-        "maxPositionEmbeddings": 2048,
-        "recommendedQuantizations": [
-            "gptq"
+    "black-forest-labs/FLUX*": {
+        "family": "flux",
+        "gated": false,
+        "tags": [
+            "image-generation",
+            "diffusion",
+            "flux"
         ],
-        "modelType": "transformer",
+        "architecture": null,
+        "notes": "Fallback for FLUX model variants not explicitly listed",
+        "chatTemplate": null,
+        "frameworkCompatibility": {
+            "vllm-omni": ">=0.14.0"
+        },
+        "validationLevel": "experimental",
+        "modelType": "diffusor",
         "tasks": [
-            "text-generation"
+            "text-to-image"
         ]
     }
 }