@aws/ml-container-creator 1.0.2 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/cli.js +1 -1
- package/config/tune-catalog.json +303 -1
- package/package.json +2 -1
- package/servers/lib/catalogs/model-servers.json +334 -120
- package/src/lib/bootstrap-command-handler.js +12 -2
- package/src/lib/bootstrap-profile-manager.js +16 -0
- package/src/lib/cross-cutting-checker.js +6 -1
- package/src/lib/generated/cli-options.js +1 -1
- package/src/lib/generated/parameter-matrix.js +1 -1
- package/src/lib/generated/validation-rules.js +1 -1
package/bin/cli.js
CHANGED
|
@@ -162,7 +162,7 @@ program
|
|
|
162
162
|
.command('bootstrap')
|
|
163
163
|
.description('Set up AWS infrastructure (IAM role, ECR repo, S3 buckets)')
|
|
164
164
|
.passThroughOptions()
|
|
165
|
-
.argument('[action]', 'Bootstrap action (status, use, list, remove, scan, prune, update, sync-schemas)')
|
|
165
|
+
.argument('[action]', 'Bootstrap action (status, use, list, remove, scan, prune, update, migrate, sync-schemas, sync-model-families)')
|
|
166
166
|
.argument('[args...]', 'Additional arguments')
|
|
167
167
|
.option('--profile <profile>', 'AWS profile name')
|
|
168
168
|
.option('--region <region>', 'AWS region')
|
package/config/tune-catalog.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"version": "2026-05-27",
|
|
3
|
-
"lastSynced": "2026-
|
|
3
|
+
"lastSynced": "2026-06-26T19:01:02.821Z",
|
|
4
4
|
"source": "https://docs.aws.amazon.com/sagemaker/latest/dg/model-customize-open-weight.html",
|
|
5
5
|
"models": {
|
|
6
6
|
"huggingface-llm-qwen2-5-7b-instruct": {
|
|
@@ -1614,6 +1614,24 @@
|
|
|
1614
1614
|
"prompt": "array"
|
|
1615
1615
|
}
|
|
1616
1616
|
}
|
|
1617
|
+
},
|
|
1618
|
+
"dpo": {
|
|
1619
|
+
"trainingTypes": [
|
|
1620
|
+
"lora"
|
|
1621
|
+
],
|
|
1622
|
+
"datasetFormat": "default-dpo",
|
|
1623
|
+
"datasetSchema": {
|
|
1624
|
+
"required": [
|
|
1625
|
+
"prompt",
|
|
1626
|
+
"chosen",
|
|
1627
|
+
"rejected"
|
|
1628
|
+
],
|
|
1629
|
+
"types": {
|
|
1630
|
+
"prompt": "string",
|
|
1631
|
+
"chosen": "string",
|
|
1632
|
+
"rejected": "string"
|
|
1633
|
+
}
|
|
1634
|
+
}
|
|
1617
1635
|
}
|
|
1618
1636
|
},
|
|
1619
1637
|
"goldenPath": false
|
|
@@ -1667,6 +1685,24 @@
|
|
|
1667
1685
|
"prompt": "array"
|
|
1668
1686
|
}
|
|
1669
1687
|
}
|
|
1688
|
+
},
|
|
1689
|
+
"dpo": {
|
|
1690
|
+
"trainingTypes": [
|
|
1691
|
+
"lora"
|
|
1692
|
+
],
|
|
1693
|
+
"datasetFormat": "default-dpo",
|
|
1694
|
+
"datasetSchema": {
|
|
1695
|
+
"required": [
|
|
1696
|
+
"prompt",
|
|
1697
|
+
"chosen",
|
|
1698
|
+
"rejected"
|
|
1699
|
+
],
|
|
1700
|
+
"types": {
|
|
1701
|
+
"prompt": "string",
|
|
1702
|
+
"chosen": "string",
|
|
1703
|
+
"rejected": "string"
|
|
1704
|
+
}
|
|
1705
|
+
}
|
|
1670
1706
|
}
|
|
1671
1707
|
},
|
|
1672
1708
|
"goldenPath": false
|
|
@@ -1773,6 +1809,272 @@
|
|
|
1773
1809
|
"prompt": "array"
|
|
1774
1810
|
}
|
|
1775
1811
|
}
|
|
1812
|
+
},
|
|
1813
|
+
"dpo": {
|
|
1814
|
+
"trainingTypes": [
|
|
1815
|
+
"lora"
|
|
1816
|
+
],
|
|
1817
|
+
"datasetFormat": "default-dpo",
|
|
1818
|
+
"datasetSchema": {
|
|
1819
|
+
"required": [
|
|
1820
|
+
"prompt",
|
|
1821
|
+
"chosen",
|
|
1822
|
+
"rejected"
|
|
1823
|
+
],
|
|
1824
|
+
"types": {
|
|
1825
|
+
"prompt": "string",
|
|
1826
|
+
"chosen": "string",
|
|
1827
|
+
"rejected": "string"
|
|
1828
|
+
}
|
|
1829
|
+
}
|
|
1830
|
+
}
|
|
1831
|
+
},
|
|
1832
|
+
"goldenPath": false
|
|
1833
|
+
},
|
|
1834
|
+
"huggingface-llm-nvidia-nemotron-3-super-120b-a12b-bf16": {
|
|
1835
|
+
"family": "huggingface-llm-nvidia-nemotron",
|
|
1836
|
+
"provider": "unknown",
|
|
1837
|
+
"displayName": "NVIDIA-Nemotron-3-Super-120B-A12B-BF16",
|
|
1838
|
+
"huggingFaceId": "",
|
|
1839
|
+
"techniques": {
|
|
1840
|
+
"sft": {
|
|
1841
|
+
"trainingTypes": [
|
|
1842
|
+
"lora"
|
|
1843
|
+
],
|
|
1844
|
+
"datasetFormat": "default-sft",
|
|
1845
|
+
"datasetSchema": {
|
|
1846
|
+
"required": [
|
|
1847
|
+
"prompt",
|
|
1848
|
+
"completion"
|
|
1849
|
+
],
|
|
1850
|
+
"types": {
|
|
1851
|
+
"prompt": "string",
|
|
1852
|
+
"completion": "string"
|
|
1853
|
+
}
|
|
1854
|
+
}
|
|
1855
|
+
},
|
|
1856
|
+
"rlvr": {
|
|
1857
|
+
"trainingTypes": [
|
|
1858
|
+
"lora"
|
|
1859
|
+
],
|
|
1860
|
+
"datasetFormat": "default-rlvr",
|
|
1861
|
+
"datasetSchema": {
|
|
1862
|
+
"required": [
|
|
1863
|
+
"prompt"
|
|
1864
|
+
],
|
|
1865
|
+
"types": {
|
|
1866
|
+
"prompt": "array"
|
|
1867
|
+
}
|
|
1868
|
+
}
|
|
1869
|
+
},
|
|
1870
|
+
"rlaif": {
|
|
1871
|
+
"trainingTypes": [
|
|
1872
|
+
"lora"
|
|
1873
|
+
],
|
|
1874
|
+
"datasetFormat": "default-rlaif",
|
|
1875
|
+
"datasetSchema": {
|
|
1876
|
+
"required": [
|
|
1877
|
+
"prompt"
|
|
1878
|
+
],
|
|
1879
|
+
"types": {
|
|
1880
|
+
"prompt": "array"
|
|
1881
|
+
}
|
|
1882
|
+
}
|
|
1883
|
+
}
|
|
1884
|
+
},
|
|
1885
|
+
"goldenPath": false
|
|
1886
|
+
},
|
|
1887
|
+
"huggingface-reasoning-nvidia-nemotron-3-nano-30b-a3b-bf16": {
|
|
1888
|
+
"family": "huggingface-reasoning-nvidia-nemotron",
|
|
1889
|
+
"provider": "unknown",
|
|
1890
|
+
"displayName": "NVIDIA-Nemotron-3-Nano-30B-A3B-BF16",
|
|
1891
|
+
"huggingFaceId": "",
|
|
1892
|
+
"techniques": {
|
|
1893
|
+
"sft": {
|
|
1894
|
+
"trainingTypes": [
|
|
1895
|
+
"lora"
|
|
1896
|
+
],
|
|
1897
|
+
"datasetFormat": "default-sft",
|
|
1898
|
+
"datasetSchema": {
|
|
1899
|
+
"required": [
|
|
1900
|
+
"prompt",
|
|
1901
|
+
"completion"
|
|
1902
|
+
],
|
|
1903
|
+
"types": {
|
|
1904
|
+
"prompt": "string",
|
|
1905
|
+
"completion": "string"
|
|
1906
|
+
}
|
|
1907
|
+
}
|
|
1908
|
+
},
|
|
1909
|
+
"rlaif": {
|
|
1910
|
+
"trainingTypes": [
|
|
1911
|
+
"lora"
|
|
1912
|
+
],
|
|
1913
|
+
"datasetFormat": "default-rlaif",
|
|
1914
|
+
"datasetSchema": {
|
|
1915
|
+
"required": [
|
|
1916
|
+
"prompt"
|
|
1917
|
+
],
|
|
1918
|
+
"types": {
|
|
1919
|
+
"prompt": "array"
|
|
1920
|
+
}
|
|
1921
|
+
}
|
|
1922
|
+
},
|
|
1923
|
+
"rlvr": {
|
|
1924
|
+
"trainingTypes": [
|
|
1925
|
+
"lora"
|
|
1926
|
+
],
|
|
1927
|
+
"datasetFormat": "default-rlvr",
|
|
1928
|
+
"datasetSchema": {
|
|
1929
|
+
"required": [
|
|
1930
|
+
"prompt"
|
|
1931
|
+
],
|
|
1932
|
+
"types": {
|
|
1933
|
+
"prompt": "array"
|
|
1934
|
+
}
|
|
1935
|
+
}
|
|
1936
|
+
}
|
|
1937
|
+
},
|
|
1938
|
+
"goldenPath": false
|
|
1939
|
+
},
|
|
1940
|
+
"huggingface-vlm-gemma-4-e4b-it": {
|
|
1941
|
+
"family": "huggingface-vlm",
|
|
1942
|
+
"provider": "unknown",
|
|
1943
|
+
"displayName": "gemma-4-e4b-it",
|
|
1944
|
+
"huggingFaceId": "",
|
|
1945
|
+
"techniques": {
|
|
1946
|
+
"dpo": {
|
|
1947
|
+
"trainingTypes": [
|
|
1948
|
+
"lora"
|
|
1949
|
+
],
|
|
1950
|
+
"datasetFormat": "default-dpo",
|
|
1951
|
+
"datasetSchema": {
|
|
1952
|
+
"required": [
|
|
1953
|
+
"prompt",
|
|
1954
|
+
"chosen",
|
|
1955
|
+
"rejected"
|
|
1956
|
+
],
|
|
1957
|
+
"types": {
|
|
1958
|
+
"prompt": "string",
|
|
1959
|
+
"chosen": "string",
|
|
1960
|
+
"rejected": "string"
|
|
1961
|
+
}
|
|
1962
|
+
}
|
|
1963
|
+
},
|
|
1964
|
+
"sft": {
|
|
1965
|
+
"trainingTypes": [
|
|
1966
|
+
"lora"
|
|
1967
|
+
],
|
|
1968
|
+
"datasetFormat": "default-sft",
|
|
1969
|
+
"datasetSchema": {
|
|
1970
|
+
"required": [
|
|
1971
|
+
"prompt",
|
|
1972
|
+
"completion"
|
|
1973
|
+
],
|
|
1974
|
+
"types": {
|
|
1975
|
+
"prompt": "string",
|
|
1976
|
+
"completion": "string"
|
|
1977
|
+
}
|
|
1978
|
+
}
|
|
1979
|
+
},
|
|
1980
|
+
"rlvr": {
|
|
1981
|
+
"trainingTypes": [
|
|
1982
|
+
"lora"
|
|
1983
|
+
],
|
|
1984
|
+
"datasetFormat": "default-rlvr",
|
|
1985
|
+
"datasetSchema": {
|
|
1986
|
+
"required": [
|
|
1987
|
+
"prompt"
|
|
1988
|
+
],
|
|
1989
|
+
"types": {
|
|
1990
|
+
"prompt": "array"
|
|
1991
|
+
}
|
|
1992
|
+
}
|
|
1993
|
+
},
|
|
1994
|
+
"rlaif": {
|
|
1995
|
+
"trainingTypes": [
|
|
1996
|
+
"lora"
|
|
1997
|
+
],
|
|
1998
|
+
"datasetFormat": "default-rlaif",
|
|
1999
|
+
"datasetSchema": {
|
|
2000
|
+
"required": [
|
|
2001
|
+
"prompt"
|
|
2002
|
+
],
|
|
2003
|
+
"types": {
|
|
2004
|
+
"prompt": "array"
|
|
2005
|
+
}
|
|
2006
|
+
}
|
|
2007
|
+
}
|
|
2008
|
+
},
|
|
2009
|
+
"goldenPath": false
|
|
2010
|
+
},
|
|
2011
|
+
"huggingface-vlm-gemma-4-31b-it": {
|
|
2012
|
+
"family": "huggingface-vlm",
|
|
2013
|
+
"provider": "unknown",
|
|
2014
|
+
"displayName": "gemma-4-31b-it",
|
|
2015
|
+
"huggingFaceId": "",
|
|
2016
|
+
"techniques": {
|
|
2017
|
+
"dpo": {
|
|
2018
|
+
"trainingTypes": [
|
|
2019
|
+
"lora"
|
|
2020
|
+
],
|
|
2021
|
+
"datasetFormat": "default-dpo",
|
|
2022
|
+
"datasetSchema": {
|
|
2023
|
+
"required": [
|
|
2024
|
+
"prompt",
|
|
2025
|
+
"chosen",
|
|
2026
|
+
"rejected"
|
|
2027
|
+
],
|
|
2028
|
+
"types": {
|
|
2029
|
+
"prompt": "string",
|
|
2030
|
+
"chosen": "string",
|
|
2031
|
+
"rejected": "string"
|
|
2032
|
+
}
|
|
2033
|
+
}
|
|
2034
|
+
},
|
|
2035
|
+
"sft": {
|
|
2036
|
+
"trainingTypes": [
|
|
2037
|
+
"lora"
|
|
2038
|
+
],
|
|
2039
|
+
"datasetFormat": "default-sft",
|
|
2040
|
+
"datasetSchema": {
|
|
2041
|
+
"required": [
|
|
2042
|
+
"prompt",
|
|
2043
|
+
"completion"
|
|
2044
|
+
],
|
|
2045
|
+
"types": {
|
|
2046
|
+
"prompt": "string",
|
|
2047
|
+
"completion": "string"
|
|
2048
|
+
}
|
|
2049
|
+
}
|
|
2050
|
+
},
|
|
2051
|
+
"rlaif": {
|
|
2052
|
+
"trainingTypes": [
|
|
2053
|
+
"lora"
|
|
2054
|
+
],
|
|
2055
|
+
"datasetFormat": "default-rlaif",
|
|
2056
|
+
"datasetSchema": {
|
|
2057
|
+
"required": [
|
|
2058
|
+
"prompt"
|
|
2059
|
+
],
|
|
2060
|
+
"types": {
|
|
2061
|
+
"prompt": "array"
|
|
2062
|
+
}
|
|
2063
|
+
}
|
|
2064
|
+
},
|
|
2065
|
+
"rlvr": {
|
|
2066
|
+
"trainingTypes": [
|
|
2067
|
+
"lora"
|
|
2068
|
+
],
|
|
2069
|
+
"datasetFormat": "default-rlvr",
|
|
2070
|
+
"datasetSchema": {
|
|
2071
|
+
"required": [
|
|
2072
|
+
"prompt"
|
|
2073
|
+
],
|
|
2074
|
+
"types": {
|
|
2075
|
+
"prompt": "array"
|
|
2076
|
+
}
|
|
2077
|
+
}
|
|
1776
2078
|
}
|
|
1777
2079
|
},
|
|
1778
2080
|
"goldenPath": false
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@aws/ml-container-creator",
|
|
3
|
-
"version": "1.0.
|
|
3
|
+
"version": "1.0.3",
|
|
4
4
|
"description": "Build and deploy custom ML containers on AWS SageMaker with minimal configuration.",
|
|
5
5
|
"main": "src/index.js",
|
|
6
6
|
"bin": {
|
|
@@ -107,6 +107,7 @@
|
|
|
107
107
|
"prepare": "husky || true"
|
|
108
108
|
},
|
|
109
109
|
"dependencies": {
|
|
110
|
+
"@aws/ml-container-creator": "^1.0.2",
|
|
110
111
|
"@inquirer/prompts": "^8.4.2",
|
|
111
112
|
"@modelcontextprotocol/sdk": "^1.27.1",
|
|
112
113
|
"ajv": "^8.12.0",
|
|
package/servers/lib/catalogs/model-servers.json
CHANGED
|
@@ -1,14 +1,12 @@
|
|
|
1
1
|
{
|
|
2
2
|
"vllm": [
|
|
3
3
|
{
|
|
4
|
-
"image": "vllm/vllm-openai:v0.
|
|
5
|
-
"tag": "v0.
|
|
4
|
+
"image": "vllm/vllm-openai:v0.23.0",
|
|
5
|
+
"tag": "v0.23.0",
|
|
6
6
|
"architecture": "amd64",
|
|
7
|
-
"created": "2026-
|
|
7
|
+
"created": "2026-06-13T00:36:45.565402Z",
|
|
8
8
|
"labels": {
|
|
9
|
-
"
|
|
10
|
-
"python_version": "3.12",
|
|
11
|
-
"framework_version": "0.20.2"
|
|
9
|
+
"framework_version": "0.23.0"
|
|
12
10
|
},
|
|
13
11
|
"registry": "dockerhub",
|
|
14
12
|
"repository": "vllm/vllm-openai",
|
|
@@ -22,15 +20,6 @@
|
|
|
22
20
|
},
|
|
23
21
|
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-1"
|
|
24
22
|
},
|
|
25
|
-
"accelerator": {
|
|
26
|
-
"type": "cuda",
|
|
27
|
-
"version": "12.9",
|
|
28
|
-
"versionRange": {
|
|
29
|
-
"min": "12.4",
|
|
30
|
-
"max": "12.9"
|
|
31
|
-
}
|
|
32
|
-
},
|
|
33
|
-
"validationLevel": "community-validated",
|
|
34
23
|
"profiles": {
|
|
35
24
|
"low-latency": {
|
|
36
25
|
"displayName": "Low Latency",
|
|
@@ -64,7 +53,16 @@
|
|
|
64
53
|
"notes": "Requires instance with 4+ GPUs. Set TENSOR_PARALLEL_SIZE to match GPU count"
|
|
65
54
|
}
|
|
66
55
|
},
|
|
56
|
+
"accelerator": {
|
|
57
|
+
"type": "cuda",
|
|
58
|
+
"version": "12.9",
|
|
59
|
+
"versionRange": {
|
|
60
|
+
"min": "12.4",
|
|
61
|
+
"max": "12.9"
|
|
62
|
+
}
|
|
63
|
+
},
|
|
67
64
|
"notes": "vLLM 0.20.2 adds Gemma 4 support, CUDA 12.9, improved multi-GPU. Requires CUDA compat on drivers < 570.",
|
|
65
|
+
"validationLevel": "community-validated",
|
|
68
66
|
"supportedModelTypes": [
|
|
69
67
|
"afmoe",
|
|
70
68
|
"apertus",
|
|
@@ -84,12 +82,15 @@
|
|
|
84
82
|
"chatglm",
|
|
85
83
|
"cheers",
|
|
86
84
|
"clip",
|
|
85
|
+
"cohere2_moe",
|
|
86
|
+
"cohere_eagle",
|
|
87
87
|
"colbert",
|
|
88
88
|
"colmodernvbert",
|
|
89
89
|
"colpali",
|
|
90
90
|
"colqwen3",
|
|
91
91
|
"colqwen3_5",
|
|
92
92
|
"commandr",
|
|
93
|
+
"cosmos3",
|
|
93
94
|
"dbrx",
|
|
94
95
|
"deepseek_eagle",
|
|
95
96
|
"deepseek_eagle3",
|
|
@@ -97,8 +98,6 @@
|
|
|
97
98
|
"deepseek_ocr",
|
|
98
99
|
"deepseek_ocr2",
|
|
99
100
|
"deepseek_v2",
|
|
100
|
-
"deepseek_v4",
|
|
101
|
-
"deepseek_v4_mtp",
|
|
102
101
|
"deepseek_vl2",
|
|
103
102
|
"dots1",
|
|
104
103
|
"dots_ocr",
|
|
@@ -125,6 +124,7 @@
|
|
|
125
124
|
"gemma3n",
|
|
126
125
|
"gemma4",
|
|
127
126
|
"gemma4_mm",
|
|
127
|
+
"gemma4_mtp",
|
|
128
128
|
"glm",
|
|
129
129
|
"glm4",
|
|
130
130
|
"glm4_1v",
|
|
@@ -159,7 +159,6 @@
|
|
|
159
159
|
"internvl",
|
|
160
160
|
"iquest_loopcoder",
|
|
161
161
|
"isaac",
|
|
162
|
-
"jais",
|
|
163
162
|
"jais2",
|
|
164
163
|
"jamba",
|
|
165
164
|
"jina",
|
|
@@ -170,6 +169,7 @@
|
|
|
170
169
|
"kimi_k25",
|
|
171
170
|
"kimi_linear",
|
|
172
171
|
"kimi_vl",
|
|
172
|
+
"laguna",
|
|
173
173
|
"lfm2",
|
|
174
174
|
"lfm2_moe",
|
|
175
175
|
"lfm2_vl",
|
|
@@ -184,10 +184,13 @@
|
|
|
184
184
|
"mamba",
|
|
185
185
|
"mamba2",
|
|
186
186
|
"medusa",
|
|
187
|
+
"mellum",
|
|
187
188
|
"midashenglm",
|
|
188
189
|
"mimo",
|
|
189
190
|
"mimo_mtp",
|
|
190
|
-
"
|
|
191
|
+
"mimo_v2",
|
|
192
|
+
"mimo_v2_mtp",
|
|
193
|
+
"mimo_v2_omni",
|
|
191
194
|
"minicpm",
|
|
192
195
|
"minicpm3",
|
|
193
196
|
"minicpm_eagle",
|
|
@@ -196,6 +199,7 @@
|
|
|
196
199
|
"minimax_m2",
|
|
197
200
|
"minimax_text_01",
|
|
198
201
|
"mistral",
|
|
202
|
+
"mistral_eagle",
|
|
199
203
|
"mistral_large_3",
|
|
200
204
|
"mixtral",
|
|
201
205
|
"mllama4",
|
|
@@ -203,6 +207,7 @@
|
|
|
203
207
|
"modernbert",
|
|
204
208
|
"molmo",
|
|
205
209
|
"molmo2",
|
|
210
|
+
"moondream3",
|
|
206
211
|
"mpt",
|
|
207
212
|
"nano_nemotron_vl",
|
|
208
213
|
"nemotron",
|
|
@@ -218,6 +223,7 @@
|
|
|
218
223
|
"opencua",
|
|
219
224
|
"openpangu",
|
|
220
225
|
"openpangu_mtp",
|
|
226
|
+
"openvla",
|
|
221
227
|
"opt",
|
|
222
228
|
"orion",
|
|
223
229
|
"ouro",
|
|
@@ -265,6 +271,7 @@
|
|
|
265
271
|
"step3_vl",
|
|
266
272
|
"step3p5",
|
|
267
273
|
"step3p5_mtp",
|
|
274
|
+
"step3p7",
|
|
268
275
|
"step_vl",
|
|
269
276
|
"tarsier",
|
|
270
277
|
"telechat2",
|
|
@@ -279,14 +286,12 @@
|
|
|
279
286
|
]
|
|
280
287
|
},
|
|
281
288
|
{
|
|
282
|
-
"image": "vllm/vllm-openai:v0.
|
|
283
|
-
"tag": "v0.
|
|
289
|
+
"image": "vllm/vllm-openai:v0.22.1",
|
|
290
|
+
"tag": "v0.22.1",
|
|
284
291
|
"architecture": "amd64",
|
|
285
|
-
"created": "
|
|
292
|
+
"created": "2026-06-05T07:16:13.856004Z",
|
|
286
293
|
"labels": {
|
|
287
|
-
"
|
|
288
|
-
"python_version": "3.12",
|
|
289
|
-
"framework_version": "0.10.1"
|
|
294
|
+
"framework_version": "0.22.1"
|
|
290
295
|
},
|
|
291
296
|
"registry": "dockerhub",
|
|
292
297
|
"repository": "vllm/vllm-openai",
|
|
@@ -300,15 +305,6 @@
|
|
|
300
305
|
},
|
|
301
306
|
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-1"
|
|
302
307
|
},
|
|
303
|
-
"accelerator": {
|
|
304
|
-
"type": "cuda",
|
|
305
|
-
"version": "12.1",
|
|
306
|
-
"versionRange": {
|
|
307
|
-
"min": "12.0",
|
|
308
|
-
"max": "12.3"
|
|
309
|
-
}
|
|
310
|
-
},
|
|
311
|
-
"validationLevel": "tested",
|
|
312
308
|
"profiles": {
|
|
313
309
|
"low-latency": {
|
|
314
310
|
"displayName": "Low Latency",
|
|
@@ -342,58 +338,94 @@
|
|
|
342
338
|
"notes": "Requires instance with 4+ GPUs. Set TENSOR_PARALLEL_SIZE to match GPU count"
|
|
343
339
|
}
|
|
344
340
|
},
|
|
345
|
-
"
|
|
341
|
+
"accelerator": {
|
|
342
|
+
"type": "cuda",
|
|
343
|
+
"version": "12.9",
|
|
344
|
+
"versionRange": {
|
|
345
|
+
"min": "12.4",
|
|
346
|
+
"max": "12.9"
|
|
347
|
+
}
|
|
348
|
+
},
|
|
349
|
+
"notes": "vLLM 0.20.2 adds Gemma 4 support, CUDA 12.9, improved multi-GPU. Requires CUDA compat on drivers < 570.",
|
|
350
|
+
"validationLevel": "community-validated",
|
|
346
351
|
"supportedModelTypes": [
|
|
352
|
+
"afmoe",
|
|
353
|
+
"apertus",
|
|
347
354
|
"arcee",
|
|
348
355
|
"arctic",
|
|
349
356
|
"aria",
|
|
350
|
-
"
|
|
357
|
+
"bagel",
|
|
351
358
|
"baichuan",
|
|
352
359
|
"bailing_moe",
|
|
360
|
+
"bailing_moe_linear",
|
|
353
361
|
"bamba",
|
|
354
|
-
"
|
|
362
|
+
"bee",
|
|
355
363
|
"bert",
|
|
356
364
|
"bert_with_rope",
|
|
357
365
|
"blip2",
|
|
358
366
|
"bloom",
|
|
359
|
-
"chameleon",
|
|
360
367
|
"chatglm",
|
|
361
|
-
"
|
|
368
|
+
"cheers",
|
|
369
|
+
"clip",
|
|
370
|
+
"cohere2_moe",
|
|
371
|
+
"cohere_eagle",
|
|
372
|
+
"colbert",
|
|
373
|
+
"colmodernvbert",
|
|
374
|
+
"colpali",
|
|
375
|
+
"colqwen3",
|
|
376
|
+
"colqwen3_5",
|
|
362
377
|
"commandr",
|
|
363
378
|
"dbrx",
|
|
364
|
-
"
|
|
379
|
+
"deepseek_eagle",
|
|
380
|
+
"deepseek_eagle3",
|
|
365
381
|
"deepseek_mtp",
|
|
382
|
+
"deepseek_ocr",
|
|
383
|
+
"deepseek_ocr2",
|
|
366
384
|
"deepseek_v2",
|
|
367
385
|
"deepseek_vl2",
|
|
368
386
|
"dots1",
|
|
387
|
+
"dots_ocr",
|
|
388
|
+
"ernie",
|
|
369
389
|
"ernie45",
|
|
370
390
|
"ernie45_moe",
|
|
391
|
+
"ernie_mtp",
|
|
371
392
|
"exaone",
|
|
372
393
|
"exaone4",
|
|
394
|
+
"exaone4_5_mtp",
|
|
395
|
+
"exaone_moe",
|
|
396
|
+
"exaone_moe_mtp",
|
|
397
|
+
"extract_hidden_states",
|
|
373
398
|
"fairseq2_llama",
|
|
374
399
|
"falcon",
|
|
375
400
|
"falcon_h1",
|
|
376
|
-
"
|
|
401
|
+
"flex_olmo",
|
|
402
|
+
"funasr",
|
|
377
403
|
"fuyu",
|
|
378
404
|
"gemma",
|
|
379
405
|
"gemma2",
|
|
380
406
|
"gemma3",
|
|
381
407
|
"gemma3_mm",
|
|
382
408
|
"gemma3n",
|
|
383
|
-
"
|
|
409
|
+
"gemma4",
|
|
410
|
+
"gemma4_mm",
|
|
411
|
+
"gemma4_mtp",
|
|
384
412
|
"glm",
|
|
385
413
|
"glm4",
|
|
386
414
|
"glm4_1v",
|
|
387
415
|
"glm4_moe",
|
|
416
|
+
"glm4_moe_lite",
|
|
417
|
+
"glm4_moe_lite_mtp",
|
|
388
418
|
"glm4_moe_mtp",
|
|
389
419
|
"glm4v",
|
|
420
|
+
"glm_ocr",
|
|
421
|
+
"glm_ocr_mtp",
|
|
422
|
+
"glmasr",
|
|
390
423
|
"gpt2",
|
|
391
424
|
"gpt_bigcode",
|
|
392
425
|
"gpt_j",
|
|
393
426
|
"gpt_neox",
|
|
394
427
|
"gpt_oss",
|
|
395
428
|
"granite",
|
|
396
|
-
"granite_speech",
|
|
397
429
|
"granitemoe",
|
|
398
430
|
"granitemoehybrid",
|
|
399
431
|
"granitemoeshared",
|
|
@@ -401,108 +433,149 @@
|
|
|
401
433
|
"grok1",
|
|
402
434
|
"h2ovl",
|
|
403
435
|
"hunyuan_v1",
|
|
436
|
+
"hy_v3",
|
|
437
|
+
"hy_v3_mtp",
|
|
438
|
+
"hyperclovax",
|
|
404
439
|
"hyperclovax_vision",
|
|
405
|
-
"
|
|
440
|
+
"hyperclovax_vision_v2",
|
|
406
441
|
"internlm2",
|
|
407
442
|
"internlm2_ve",
|
|
408
|
-
"interns1",
|
|
409
443
|
"internvl",
|
|
444
|
+
"iquest_loopcoder",
|
|
445
|
+
"isaac",
|
|
410
446
|
"jais",
|
|
447
|
+
"jais2",
|
|
411
448
|
"jamba",
|
|
449
|
+
"jina",
|
|
412
450
|
"jina_vl",
|
|
451
|
+
"kanana_v",
|
|
413
452
|
"keye",
|
|
453
|
+
"kimi_audio",
|
|
454
|
+
"kimi_k25",
|
|
455
|
+
"kimi_linear",
|
|
414
456
|
"kimi_vl",
|
|
457
|
+
"laguna",
|
|
458
|
+
"lfm2",
|
|
459
|
+
"lfm2_moe",
|
|
460
|
+
"lfm2_vl",
|
|
415
461
|
"llama",
|
|
416
462
|
"llama4",
|
|
417
463
|
"llama4_eagle",
|
|
418
464
|
"llama_eagle",
|
|
419
465
|
"llama_eagle3",
|
|
420
466
|
"llava",
|
|
421
|
-
"
|
|
422
|
-
"
|
|
423
|
-
"llava_onevision",
|
|
467
|
+
"longcat_flash",
|
|
468
|
+
"longcat_flash_mtp",
|
|
424
469
|
"mamba",
|
|
425
470
|
"mamba2",
|
|
426
471
|
"medusa",
|
|
472
|
+
"mellum",
|
|
473
|
+
"midashenglm",
|
|
427
474
|
"mimo",
|
|
428
475
|
"mimo_mtp",
|
|
476
|
+
"mimo_v2",
|
|
477
|
+
"mimo_v2_mtp",
|
|
478
|
+
"mimo_v2_omni",
|
|
429
479
|
"minicpm",
|
|
430
480
|
"minicpm3",
|
|
431
481
|
"minicpm_eagle",
|
|
432
482
|
"minicpmo",
|
|
433
483
|
"minicpmv",
|
|
484
|
+
"minimax_m2",
|
|
434
485
|
"minimax_text_01",
|
|
435
|
-
"
|
|
436
|
-
"
|
|
486
|
+
"mistral",
|
|
487
|
+
"mistral_eagle",
|
|
488
|
+
"mistral_large_3",
|
|
437
489
|
"mixtral",
|
|
438
|
-
"mixtral_quant",
|
|
439
|
-
"mllama",
|
|
440
490
|
"mllama4",
|
|
441
491
|
"mlp_speculator",
|
|
442
492
|
"modernbert",
|
|
443
493
|
"molmo",
|
|
494
|
+
"molmo2",
|
|
495
|
+
"moondream3",
|
|
444
496
|
"mpt",
|
|
497
|
+
"nano_nemotron_vl",
|
|
445
498
|
"nemotron",
|
|
446
499
|
"nemotron_h",
|
|
500
|
+
"nemotron_h_mtp",
|
|
447
501
|
"nemotron_nas",
|
|
448
502
|
"nemotron_vl",
|
|
449
503
|
"nvlm_d",
|
|
450
504
|
"olmo",
|
|
451
505
|
"olmo2",
|
|
506
|
+
"olmo_hybrid",
|
|
452
507
|
"olmoe",
|
|
508
|
+
"opencua",
|
|
509
|
+
"openpangu",
|
|
510
|
+
"openpangu_mtp",
|
|
511
|
+
"openvla",
|
|
453
512
|
"opt",
|
|
454
513
|
"orion",
|
|
514
|
+
"ouro",
|
|
455
515
|
"ovis",
|
|
456
|
-
"
|
|
516
|
+
"ovis2_5",
|
|
517
|
+
"param2moe",
|
|
457
518
|
"persimmon",
|
|
458
519
|
"phi",
|
|
459
520
|
"phi3",
|
|
460
521
|
"phi3v",
|
|
461
|
-
"phi4_multimodal",
|
|
462
|
-
"phi4flash",
|
|
463
522
|
"phi4mm",
|
|
523
|
+
"phi4siglip",
|
|
464
524
|
"phimoe",
|
|
465
525
|
"pixtral",
|
|
466
526
|
"plamo2",
|
|
467
|
-
"
|
|
527
|
+
"plamo3",
|
|
468
528
|
"qwen",
|
|
469
529
|
"qwen2",
|
|
470
|
-
"qwen2_5_omni_thinker",
|
|
471
|
-
"qwen2_5_vl",
|
|
472
|
-
"qwen2_audio",
|
|
473
530
|
"qwen2_moe",
|
|
474
531
|
"qwen2_rm",
|
|
475
532
|
"qwen2_vl",
|
|
476
533
|
"qwen3",
|
|
534
|
+
"qwen3_5",
|
|
535
|
+
"qwen3_5_mtp",
|
|
536
|
+
"qwen3_asr_realtime",
|
|
537
|
+
"qwen3_dflash",
|
|
477
538
|
"qwen3_moe",
|
|
539
|
+
"qwen3_next",
|
|
540
|
+
"qwen3_next_mtp",
|
|
541
|
+
"qwen3_vl",
|
|
478
542
|
"qwen_vl",
|
|
543
|
+
"rnj1",
|
|
479
544
|
"roberta",
|
|
545
|
+
"rvl",
|
|
546
|
+
"sarvam",
|
|
547
|
+
"seed_oss",
|
|
548
|
+
"siglip",
|
|
480
549
|
"skyworkr1v",
|
|
481
550
|
"smolvlm",
|
|
482
551
|
"solar",
|
|
483
552
|
"stablelm",
|
|
484
553
|
"starcoder2",
|
|
554
|
+
"step1",
|
|
485
555
|
"step3_text",
|
|
486
556
|
"step3_vl",
|
|
557
|
+
"step3p5",
|
|
558
|
+
"step3p5_mtp",
|
|
559
|
+
"step_vl",
|
|
487
560
|
"tarsier",
|
|
488
561
|
"telechat2",
|
|
489
562
|
"teleflm",
|
|
563
|
+
"terratorch",
|
|
490
564
|
"transformers",
|
|
491
565
|
"ultravox",
|
|
492
566
|
"voxtral",
|
|
567
|
+
"voxtral_realtime",
|
|
493
568
|
"whisper",
|
|
494
569
|
"zamba2"
|
|
495
570
|
]
|
|
496
571
|
},
|
|
497
572
|
{
|
|
498
|
-
"image": "vllm/vllm-openai:v0.
|
|
499
|
-
"tag": "v0.
|
|
573
|
+
"image": "vllm/vllm-openai:v0.22.0",
|
|
574
|
+
"tag": "v0.22.0",
|
|
500
575
|
"architecture": "amd64",
|
|
501
|
-
"created": "
|
|
576
|
+
"created": "2026-05-29T09:06:43.475324Z",
|
|
502
577
|
"labels": {
|
|
503
|
-
"
|
|
504
|
-
"python_version": "3.12",
|
|
505
|
-
"framework_version": "0.9.1"
|
|
578
|
+
"framework_version": "0.22.0"
|
|
506
579
|
},
|
|
507
580
|
"registry": "dockerhub",
|
|
508
581
|
"repository": "vllm/vllm-openai",
|
|
@@ -516,15 +589,6 @@
|
|
|
516
589
|
},
|
|
517
590
|
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-1"
|
|
518
591
|
},
|
|
519
|
-
"accelerator": {
|
|
520
|
-
"type": "cuda",
|
|
521
|
-
"version": "12.1",
|
|
522
|
-
"versionRange": {
|
|
523
|
-
"min": "12.0",
|
|
524
|
-
"max": "12.3"
|
|
525
|
-
}
|
|
526
|
-
},
|
|
527
|
-
"validationLevel": "tested",
|
|
528
592
|
"profiles": {
|
|
529
593
|
"low-latency": {
|
|
530
594
|
"displayName": "Low Latency",
|
|
@@ -558,130 +622,232 @@
|
|
|
558
622
|
"notes": "Requires instance with 4+ GPUs. Set TENSOR_PARALLEL_SIZE to match GPU count"
|
|
559
623
|
}
|
|
560
624
|
},
|
|
561
|
-
"
|
|
625
|
+
"accelerator": {
|
|
626
|
+
"type": "cuda",
|
|
627
|
+
"version": "12.9",
|
|
628
|
+
"versionRange": {
|
|
629
|
+
"min": "12.4",
|
|
630
|
+
"max": "12.9"
|
|
631
|
+
}
|
|
632
|
+
},
|
|
633
|
+
"notes": "vLLM 0.20.2 adds Gemma 4 support, CUDA 12.9, improved multi-GPU. Requires CUDA compat on drivers < 570.",
|
|
634
|
+
"validationLevel": "community-validated",
|
|
562
635
|
"supportedModelTypes": [
|
|
636
|
+
"afmoe",
|
|
637
|
+
"apertus",
|
|
638
|
+
"arcee",
|
|
563
639
|
"arctic",
|
|
564
640
|
"aria",
|
|
565
|
-
"
|
|
641
|
+
"bagel",
|
|
566
642
|
"baichuan",
|
|
643
|
+
"bailing_moe",
|
|
644
|
+
"bailing_moe_linear",
|
|
567
645
|
"bamba",
|
|
568
|
-
"
|
|
646
|
+
"bee",
|
|
569
647
|
"bert",
|
|
570
648
|
"bert_with_rope",
|
|
571
649
|
"blip2",
|
|
572
650
|
"bloom",
|
|
573
|
-
"chameleon",
|
|
574
651
|
"chatglm",
|
|
652
|
+
"cheers",
|
|
653
|
+
"clip",
|
|
654
|
+
"cohere2_moe",
|
|
655
|
+
"cohere_eagle",
|
|
656
|
+
"colbert",
|
|
657
|
+
"colmodernvbert",
|
|
658
|
+
"colpali",
|
|
659
|
+
"colqwen3",
|
|
660
|
+
"colqwen3_5",
|
|
575
661
|
"commandr",
|
|
576
662
|
"dbrx",
|
|
577
|
-
"
|
|
663
|
+
"deepseek_eagle",
|
|
664
|
+
"deepseek_eagle3",
|
|
578
665
|
"deepseek_mtp",
|
|
666
|
+
"deepseek_ocr",
|
|
667
|
+
"deepseek_ocr2",
|
|
579
668
|
"deepseek_v2",
|
|
580
669
|
"deepseek_vl2",
|
|
581
|
-
"
|
|
670
|
+
"dots1",
|
|
671
|
+
"dots_ocr",
|
|
672
|
+
"ernie",
|
|
673
|
+
"ernie45",
|
|
674
|
+
"ernie45_moe",
|
|
675
|
+
"ernie_mtp",
|
|
582
676
|
"exaone",
|
|
677
|
+
"exaone4",
|
|
678
|
+
"exaone4_5_mtp",
|
|
679
|
+
"exaone_moe",
|
|
680
|
+
"exaone_moe_mtp",
|
|
681
|
+
"extract_hidden_states",
|
|
583
682
|
"fairseq2_llama",
|
|
584
683
|
"falcon",
|
|
585
684
|
"falcon_h1",
|
|
586
|
-
"
|
|
685
|
+
"flex_olmo",
|
|
686
|
+
"funasr",
|
|
587
687
|
"fuyu",
|
|
588
688
|
"gemma",
|
|
589
689
|
"gemma2",
|
|
590
690
|
"gemma3",
|
|
591
691
|
"gemma3_mm",
|
|
692
|
+
"gemma3n",
|
|
693
|
+
"gemma4",
|
|
694
|
+
"gemma4_mm",
|
|
695
|
+
"gemma4_mtp",
|
|
592
696
|
"glm",
|
|
593
697
|
"glm4",
|
|
698
|
+
"glm4_1v",
|
|
699
|
+
"glm4_moe",
|
|
700
|
+
"glm4_moe_lite",
|
|
701
|
+
"glm4_moe_lite_mtp",
|
|
702
|
+
"glm4_moe_mtp",
|
|
594
703
|
"glm4v",
|
|
704
|
+
"glm_ocr",
|
|
705
|
+
"glm_ocr_mtp",
|
|
706
|
+
"glmasr",
|
|
595
707
|
"gpt2",
|
|
596
708
|
"gpt_bigcode",
|
|
597
709
|
"gpt_j",
|
|
598
710
|
"gpt_neox",
|
|
711
|
+
"gpt_oss",
|
|
599
712
|
"granite",
|
|
600
|
-
"granite_speech",
|
|
601
713
|
"granitemoe",
|
|
602
714
|
"granitemoehybrid",
|
|
603
715
|
"granitemoeshared",
|
|
604
716
|
"gritlm",
|
|
605
717
|
"grok1",
|
|
606
718
|
"h2ovl",
|
|
607
|
-
"
|
|
719
|
+
"hunyuan_v1",
|
|
720
|
+
"hy_v3",
|
|
721
|
+
"hy_v3_mtp",
|
|
722
|
+
"hyperclovax",
|
|
723
|
+
"hyperclovax_vision",
|
|
724
|
+
"hyperclovax_vision_v2",
|
|
608
725
|
"internlm2",
|
|
609
726
|
"internlm2_ve",
|
|
610
727
|
"internvl",
|
|
728
|
+
"iquest_loopcoder",
|
|
729
|
+
"isaac",
|
|
611
730
|
"jais",
|
|
731
|
+
"jais2",
|
|
612
732
|
"jamba",
|
|
733
|
+
"jina",
|
|
734
|
+
"jina_vl",
|
|
735
|
+
"kanana_v",
|
|
736
|
+
"keye",
|
|
737
|
+
"kimi_audio",
|
|
738
|
+
"kimi_k25",
|
|
739
|
+
"kimi_linear",
|
|
613
740
|
"kimi_vl",
|
|
741
|
+
"laguna",
|
|
742
|
+
"lfm2",
|
|
743
|
+
"lfm2_moe",
|
|
744
|
+
"lfm2_vl",
|
|
614
745
|
"llama",
|
|
746
|
+
"llama4",
|
|
747
|
+
"llama4_eagle",
|
|
615
748
|
"llama_eagle",
|
|
616
749
|
"llama_eagle3",
|
|
617
750
|
"llava",
|
|
618
|
-
"
|
|
619
|
-
"
|
|
620
|
-
"llava_onevision",
|
|
751
|
+
"longcat_flash",
|
|
752
|
+
"longcat_flash_mtp",
|
|
621
753
|
"mamba",
|
|
622
754
|
"mamba2",
|
|
623
755
|
"medusa",
|
|
756
|
+
"midashenglm",
|
|
624
757
|
"mimo",
|
|
625
758
|
"mimo_mtp",
|
|
759
|
+
"mimo_v2",
|
|
760
|
+
"mimo_v2_mtp",
|
|
761
|
+
"mimo_v2_omni",
|
|
626
762
|
"minicpm",
|
|
627
763
|
"minicpm3",
|
|
628
764
|
"minicpm_eagle",
|
|
629
765
|
"minicpmo",
|
|
630
766
|
"minicpmv",
|
|
767
|
+
"minimax_m2",
|
|
631
768
|
"minimax_text_01",
|
|
632
|
-
"
|
|
633
|
-
"
|
|
769
|
+
"mistral",
|
|
770
|
+
"mistral_eagle",
|
|
771
|
+
"mistral_large_3",
|
|
634
772
|
"mixtral",
|
|
635
|
-
"mixtral_quant",
|
|
636
|
-
"mllama",
|
|
637
773
|
"mllama4",
|
|
638
774
|
"mlp_speculator",
|
|
639
775
|
"modernbert",
|
|
640
776
|
"molmo",
|
|
777
|
+
"molmo2",
|
|
778
|
+
"moondream3",
|
|
641
779
|
"mpt",
|
|
780
|
+
"nano_nemotron_vl",
|
|
642
781
|
"nemotron",
|
|
643
782
|
"nemotron_h",
|
|
783
|
+
"nemotron_h_mtp",
|
|
644
784
|
"nemotron_nas",
|
|
785
|
+
"nemotron_vl",
|
|
645
786
|
"nvlm_d",
|
|
646
787
|
"olmo",
|
|
647
788
|
"olmo2",
|
|
789
|
+
"olmo_hybrid",
|
|
648
790
|
"olmoe",
|
|
791
|
+
"opencua",
|
|
792
|
+
"openpangu",
|
|
793
|
+
"openpangu_mtp",
|
|
794
|
+
"openvla",
|
|
649
795
|
"opt",
|
|
650
796
|
"orion",
|
|
797
|
+
"ouro",
|
|
651
798
|
"ovis",
|
|
652
|
-
"
|
|
799
|
+
"ovis2_5",
|
|
800
|
+
"param2moe",
|
|
653
801
|
"persimmon",
|
|
654
802
|
"phi",
|
|
655
803
|
"phi3",
|
|
656
|
-
"phi3_small",
|
|
657
804
|
"phi3v",
|
|
658
805
|
"phi4mm",
|
|
806
|
+
"phi4siglip",
|
|
659
807
|
"phimoe",
|
|
660
808
|
"pixtral",
|
|
661
809
|
"plamo2",
|
|
662
|
-
"
|
|
810
|
+
"plamo3",
|
|
663
811
|
"qwen",
|
|
664
812
|
"qwen2",
|
|
665
|
-
"qwen2_5_omni_thinker",
|
|
666
|
-
"qwen2_5_vl",
|
|
667
|
-
"qwen2_audio",
|
|
668
813
|
"qwen2_moe",
|
|
669
814
|
"qwen2_rm",
|
|
670
815
|
"qwen2_vl",
|
|
671
816
|
"qwen3",
|
|
817
|
+
"qwen3_5",
|
|
818
|
+
"qwen3_5_mtp",
|
|
819
|
+
"qwen3_asr_realtime",
|
|
820
|
+
"qwen3_dflash",
|
|
672
821
|
"qwen3_moe",
|
|
822
|
+
"qwen3_next",
|
|
823
|
+
"qwen3_next_mtp",
|
|
824
|
+
"qwen3_vl",
|
|
673
825
|
"qwen_vl",
|
|
826
|
+
"rnj1",
|
|
674
827
|
"roberta",
|
|
828
|
+
"rvl",
|
|
829
|
+
"sarvam",
|
|
830
|
+
"seed_oss",
|
|
831
|
+
"siglip",
|
|
675
832
|
"skyworkr1v",
|
|
676
833
|
"smolvlm",
|
|
677
834
|
"solar",
|
|
678
835
|
"stablelm",
|
|
679
836
|
"starcoder2",
|
|
837
|
+
"step1",
|
|
838
|
+
"step3_text",
|
|
839
|
+
"step3_vl",
|
|
840
|
+
"step3p5",
|
|
841
|
+
"step3p5_mtp",
|
|
842
|
+
"step_vl",
|
|
680
843
|
"tarsier",
|
|
681
844
|
"telechat2",
|
|
682
845
|
"teleflm",
|
|
846
|
+
"terratorch",
|
|
683
847
|
"transformers",
|
|
684
848
|
"ultravox",
|
|
849
|
+
"voxtral",
|
|
850
|
+
"voxtral_realtime",
|
|
685
851
|
"whisper",
|
|
686
852
|
"zamba2"
|
|
687
853
|
]
|
|
@@ -689,14 +855,12 @@
|
|
|
689
855
|
],
|
|
690
856
|
"sglang": [
|
|
691
857
|
{
|
|
692
|
-
"image": "lmsysorg/sglang:v0.5.
|
|
693
|
-
"tag": "v0.5.
|
|
858
|
+
"image": "lmsysorg/sglang:v0.5.14",
|
|
859
|
+
"tag": "v0.5.14",
|
|
694
860
|
"architecture": "amd64",
|
|
695
|
-
"created": "
|
|
861
|
+
"created": "2026-06-26T04:19:52.602207Z",
|
|
696
862
|
"labels": {
|
|
697
|
-
"
|
|
698
|
-
"python_version": "3.10",
|
|
699
|
-
"framework_version": "0.5.4"
|
|
863
|
+
"framework_version": "0.5.14"
|
|
700
864
|
},
|
|
701
865
|
"registry": "dockerhub",
|
|
702
866
|
"repository": "lmsysorg/sglang",
|
|
@@ -709,15 +873,6 @@
|
|
|
709
873
|
},
|
|
710
874
|
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-1"
|
|
711
875
|
},
|
|
712
|
-
"accelerator": {
|
|
713
|
-
"type": "cuda",
|
|
714
|
-
"version": "12.1",
|
|
715
|
-
"versionRange": {
|
|
716
|
-
"min": "11.8",
|
|
717
|
-
"max": "12.2"
|
|
718
|
-
}
|
|
719
|
-
},
|
|
720
|
-
"validationLevel": "experimental",
|
|
721
876
|
"profiles": {
|
|
722
877
|
"default": {
|
|
723
878
|
"displayName": "Default Configuration",
|
|
@@ -740,17 +895,24 @@
|
|
|
740
895
|
"notes": "RadixAttention provides automatic KV cache reuse for improved throughput"
|
|
741
896
|
}
|
|
742
897
|
},
|
|
743
|
-
"
|
|
898
|
+
"accelerator": {
|
|
899
|
+
"type": "cuda",
|
|
900
|
+
"version": "12.1",
|
|
901
|
+
"versionRange": {
|
|
902
|
+
"min": "11.8",
|
|
903
|
+
"max": "12.2"
|
|
904
|
+
}
|
|
905
|
+
},
|
|
906
|
+
"notes": "SGLang 0.2.0 features RadixAttention for automatic KV cache reuse. Experimental support",
|
|
907
|
+
"validationLevel": "experimental"
|
|
744
908
|
},
|
|
745
909
|
{
|
|
746
|
-
"image": "lmsysorg/sglang:v0.
|
|
747
|
-
"tag": "v0.
|
|
910
|
+
"image": "lmsysorg/sglang:v0.5.13",
|
|
911
|
+
"tag": "v0.5.13",
|
|
748
912
|
"architecture": "amd64",
|
|
749
|
-
"created": "
|
|
913
|
+
"created": "2026-06-11T10:15:46.142149Z",
|
|
750
914
|
"labels": {
|
|
751
|
-
"
|
|
752
|
-
"python_version": "3.10",
|
|
753
|
-
"framework_version": "0.4.6"
|
|
915
|
+
"framework_version": "0.5.13"
|
|
754
916
|
},
|
|
755
917
|
"registry": "dockerhub",
|
|
756
918
|
"repository": "lmsysorg/sglang",
|
|
@@ -763,6 +925,28 @@
|
|
|
763
925
|
},
|
|
764
926
|
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-1"
|
|
765
927
|
},
|
|
928
|
+
"profiles": {
|
|
929
|
+
"default": {
|
|
930
|
+
"displayName": "Default Configuration",
|
|
931
|
+
"description": "Balanced configuration for general use",
|
|
932
|
+
"envVars": {
|
|
933
|
+
"SGLANG_MAX_RUNNING_REQUESTS": "256",
|
|
934
|
+
"SGLANG_MEM_FRACTION": "0.9"
|
|
935
|
+
},
|
|
936
|
+
"notes": "Good starting point for most workloads"
|
|
937
|
+
},
|
|
938
|
+
"high-throughput": {
|
|
939
|
+
"displayName": "High Throughput",
|
|
940
|
+
"description": "Optimized for maximum throughput with RadixAttention",
|
|
941
|
+
"envVars": {
|
|
942
|
+
"SGLANG_MAX_RUNNING_REQUESTS": "512",
|
|
943
|
+
"SGLANG_MEM_FRACTION": "0.95",
|
|
944
|
+
"SGLANG_CONTEXT_LENGTH": "2048",
|
|
945
|
+
"SGLANG_ENABLE_RADIX_CACHE": "true"
|
|
946
|
+
},
|
|
947
|
+
"notes": "RadixAttention provides automatic KV cache reuse for improved throughput"
|
|
948
|
+
}
|
|
949
|
+
},
|
|
766
950
|
"accelerator": {
|
|
767
951
|
"type": "cuda",
|
|
768
952
|
"version": "12.1",
|
|
@@ -771,7 +955,28 @@
|
|
|
771
955
|
"max": "12.2"
|
|
772
956
|
}
|
|
773
957
|
},
|
|
774
|
-
"
|
|
958
|
+
"notes": "SGLang 0.2.0 features RadixAttention for automatic KV cache reuse. Experimental support",
|
|
959
|
+
"validationLevel": "experimental"
|
|
960
|
+
},
|
|
961
|
+
{
|
|
962
|
+
"image": "lmsysorg/sglang:v0.5.12",
|
|
963
|
+
"tag": "v0.5.12",
|
|
964
|
+
"architecture": "amd64",
|
|
965
|
+
"created": "2026-05-16T18:18:22.925418Z",
|
|
966
|
+
"labels": {
|
|
967
|
+
"framework_version": "0.5.12"
|
|
968
|
+
},
|
|
969
|
+
"registry": "dockerhub",
|
|
970
|
+
"repository": "lmsysorg/sglang",
|
|
971
|
+
"defaults": {
|
|
972
|
+
"envVars": {
|
|
973
|
+
"SGLANG_TENSOR_PARALLEL_SIZE": "1",
|
|
974
|
+
"SGLANG_MEM_FRACTION": "0.9",
|
|
975
|
+
"SGLANG_MAX_RUNNING_REQUESTS": "256",
|
|
976
|
+
"SGLANG_CONTEXT_LENGTH": "4096"
|
|
977
|
+
},
|
|
978
|
+
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-1"
|
|
979
|
+
},
|
|
775
980
|
"profiles": {
|
|
776
981
|
"default": {
|
|
777
982
|
"displayName": "Default Configuration",
|
|
@@ -794,7 +999,16 @@
|
|
|
794
999
|
"notes": "RadixAttention provides automatic KV cache reuse for improved throughput"
|
|
795
1000
|
}
|
|
796
1001
|
},
|
|
797
|
-
"
|
|
1002
|
+
"accelerator": {
|
|
1003
|
+
"type": "cuda",
|
|
1004
|
+
"version": "12.1",
|
|
1005
|
+
"versionRange": {
|
|
1006
|
+
"min": "11.8",
|
|
1007
|
+
"max": "12.2"
|
|
1008
|
+
}
|
|
1009
|
+
},
|
|
1010
|
+
"notes": "SGLang 0.2.0 features RadixAttention for automatic KV cache reuse. Experimental support",
|
|
1011
|
+
"validationLevel": "experimental"
|
|
798
1012
|
}
|
|
799
1013
|
],
|
|
800
1014
|
"tensorrt-llm": [
|
|
@@ -64,6 +64,7 @@ export default class BootstrapCommandHandler {
|
|
|
64
64
|
_handlePrune() { return this.profileManager._handlePrune(); }
|
|
65
65
|
_handleSyncSchemas() { return this.profileManager._handleSyncSchemas(); }
|
|
66
66
|
_handleSyncModelFamilies() { return this.profileManager._handleSyncModelFamilies(); }
|
|
67
|
+
_handleSyncServingVersions() { return this.profileManager._handleSyncServingVersions(); }
|
|
67
68
|
|
|
68
69
|
/**
|
|
69
70
|
* Dispatch bootstrap subcommands.
|
|
@@ -132,6 +133,9 @@ export default class BootstrapCommandHandler {
|
|
|
132
133
|
case 'sync-model-families':
|
|
133
134
|
await this._handleSyncModelFamilies();
|
|
134
135
|
break;
|
|
136
|
+
case 'sync-serving-versions':
|
|
137
|
+
await this._handleSyncServingVersions();
|
|
138
|
+
break;
|
|
135
139
|
// Migration path: upgrades legacy profiles to current naming conventions.
|
|
136
140
|
// Corrects stackName to mlcc-bootstrap-{profileName}, renames sharedStackFrom
|
|
137
141
|
// to sharedInfraFrom. Idempotent — safe to run multiple times.
|
|
@@ -1467,7 +1471,9 @@ SUBCOMMANDS:
|
|
|
1467
1471
|
prune Remove deleted and unknown records from the deployment manifest
|
|
1468
1472
|
update Re-deploy bootstrap stacks using active profile (no prompts)
|
|
1469
1473
|
migrate Upgrade legacy profiles to current naming conventions
|
|
1474
|
+
sync-schemas Download AWS service model schemas (sagemaker, iam, ecr, s3)
|
|
1470
1475
|
sync-model-families Discover tune-eligible models from JumpStart Hub and update catalog
|
|
1476
|
+
sync-serving-versions Discover latest vLLM/SGLang/TRT-LLM image versions and update catalog
|
|
1471
1477
|
|
|
1472
1478
|
SETUP OPTIONS:
|
|
1473
1479
|
--non-interactive Run without interactive prompts
|
|
@@ -1477,8 +1483,10 @@ SETUP OPTIONS:
|
|
|
1477
1483
|
--role-arn <arn> Use existing IAM role ARN (skip role creation)
|
|
1478
1484
|
--skip-s3 Skip S3 bucket creation
|
|
1479
1485
|
--ci Provision CI testing infrastructure
|
|
1486
|
+
--benchmark-infra Provision Athena/Glue benchmark infrastructure (requires --ci)
|
|
1480
1487
|
--skip-ci Skip CI infrastructure provisioning
|
|
1481
1488
|
--skip-post-setup Skip post-setup chain (mcp init, sync-architectures, sync-schemas)
|
|
1489
|
+
--ignore-staleness Suppress schema staleness warnings
|
|
1482
1490
|
|
|
1483
1491
|
STATUS OPTIONS:
|
|
1484
1492
|
--verify Check each active resource against AWS APIs for drift detection
|
|
@@ -1495,13 +1503,15 @@ EXAMPLES:
|
|
|
1495
1503
|
ml-container-creator bootstrap list
|
|
1496
1504
|
ml-container-creator bootstrap remove dev
|
|
1497
1505
|
ml-container-creator bootstrap remove dev --force --delete-stack
|
|
1506
|
+
ml-container-creator bootstrap update
|
|
1507
|
+
ml-container-creator bootstrap update --ci --benchmark-infra
|
|
1498
1508
|
ml-container-creator bootstrap scan
|
|
1509
|
+
ml-container-creator bootstrap sync-schemas
|
|
1499
1510
|
ml-container-creator bootstrap sync-model-families
|
|
1511
|
+
ml-container-creator bootstrap sync-serving-versions
|
|
1500
1512
|
ml-container-creator bootstrap migrate
|
|
1501
1513
|
ml-container-creator bootstrap --non-interactive --profile my-aws-profile --region us-west-2
|
|
1502
|
-
ml-container-creator bootstrap --non-interactive --profile my-aws-profile --role-arn arn:aws:iam::123456789012:role/MyRole --skip-s3
|
|
1503
1514
|
ml-container-creator bootstrap --non-interactive --profile my-aws-profile --region us-west-2 --ci
|
|
1504
|
-
ml-container-creator bootstrap --non-interactive --profile my-aws-profile --region us-west-2 --skip-ci
|
|
1505
1515
|
`);
|
|
1506
1516
|
}
|
|
1507
1517
|
|
|
@@ -655,4 +655,20 @@ export default class BootstrapProfileManager {
|
|
|
655
655
|
process.exit(1);
|
|
656
656
|
}
|
|
657
657
|
}
|
|
658
|
+
|
|
659
|
+
/**
|
|
660
|
+
* Handle sync-serving-versions subcommand: discover latest container image
|
|
661
|
+
* versions for vLLM, SGLang, and TensorRT-LLM and update the model-servers catalog.
|
|
662
|
+
*/
|
|
663
|
+
async _handleSyncServingVersions() {
|
|
664
|
+
console.log('\nš Sync Serving Versions ā Discovering latest container images...\n');
|
|
665
|
+
try {
|
|
666
|
+
const { syncServingVersions } = await import('../../scripts/sync-serving-versions.js');
|
|
667
|
+
const result = await syncServingVersions();
|
|
668
|
+
console.log(`\n✅ Sync complete: ${result.totalAdded} new, ${result.totalRemoved} pruned\n`);
|
|
669
|
+
} catch (err) {
|
|
670
|
+
console.log(`❌ Sync failed: ${err.message}`);
|
|
671
|
+
process.exit(1);
|
|
672
|
+
}
|
|
673
|
+
}
|
|
658
674
|
}
|
|
@@ -290,7 +290,12 @@ export default class CrossCuttingChecker {
|
|
|
290
290
|
if (!modelType || !server || !serverVersion) return findings;
|
|
291
291
|
|
|
292
292
|
const entries = modelServersCatalog[server] || [];
|
|
293
|
-
|
|
293
|
+
// Try exact version match first, then fall back to nearest entry with supportedModelTypes
|
|
294
|
+
let entry = entries.find(e => e.labels?.framework_version === serverVersion);
|
|
295
|
+
if (!entry?.supportedModelTypes?.length) {
|
|
296
|
+
// Fall back to any entry that has supportedModelTypes populated
|
|
297
|
+
entry = entries.find(e => e.supportedModelTypes?.length > 0);
|
|
298
|
+
}
|
|
294
299
|
if (!entry?.supportedModelTypes?.length) return findings;
|
|
295
300
|
|
|
296
301
|
if (!entry.supportedModelTypes.includes(modelType.toLowerCase())) {
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
// AUTO-GENERATED by scripts/codegen-parameter-matrix.js — DO NOT EDIT
|
|
2
2
|
// Source: config/parameter-schema-v2.json
|
|
3
|
-
// Generated: 2026-06-
|
|
3
|
+
// Generated: 2026-06-29T13:37:06.375Z
|
|
4
4
|
|
|
5
5
|
/**
|
|
6
6
|
* Parameter matrix defining how each parameter is loaded from various sources.
|