@aws/ml-container-creator 0.5.0 → 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/cli.js +9 -0
- package/config/bootstrap-stack.json +106 -9
- package/infra/ci-harness/package-lock.json +5 -1
- package/package.json +1 -1
- package/servers/instance-sizer/index.js +4 -4
- package/servers/instance-sizer/lib/model-resolver.js +1 -1
- package/servers/lib/catalogs/model-sizes.json +135 -90
- package/servers/lib/catalogs/models.json +483 -411
- package/src/app.js +29 -1
- package/src/lib/bootstrap-command-handler.js +71 -23
- package/src/lib/cli-handler.js +1 -1
- package/src/lib/config-manager.js +1 -1
- package/src/lib/mcp-client.js +3 -3
- package/src/lib/prompt-runner.js +5 -5
- package/src/lib/prompts.js +31 -5
- package/src/lib/tune-catalog-validator.js +143 -0
- package/src/lib/tune-config-state.js +116 -0
- package/src/lib/tune-dataset-validator.js +279 -0
- package/src/lib/tune-output-resolver.js +66 -0
- package/templates/do/.tune_helper.py +768 -0
- package/templates/do/adapter +128 -17
- package/templates/do/add-ic +155 -19
- package/templates/do/config +11 -4
- package/templates/do/tune +1143 -0
package/bin/cli.js
CHANGED
|
@@ -102,6 +102,15 @@ program
|
|
|
102
102
|
.addOption(new Option('--max-loras <n>', 'Maximum concurrent LoRA adapters in GPU memory (default: 30)'))
|
|
103
103
|
.addOption(new Option('--max-lora-rank <n>', 'Maximum LoRA rank (default: 64)'))
|
|
104
104
|
|
|
105
|
+
// --- Benchmarking ---
|
|
106
|
+
.addOption(new Option('--include-benchmark', 'Include SageMaker AI Benchmarking (transformers/diffusors only)'))
|
|
107
|
+
.addOption(new Option('--benchmark-concurrency <n>', 'Benchmark concurrent requests (default: 10)'))
|
|
108
|
+
.addOption(new Option('--benchmark-input-tokens <n>', 'Benchmark mean input tokens (default: 550)'))
|
|
109
|
+
.addOption(new Option('--benchmark-output-tokens <n>', 'Benchmark mean output tokens (default: 150)'))
|
|
110
|
+
.addOption(new Option('--benchmark-streaming', 'Enable streaming in benchmark (default: true)'))
|
|
111
|
+
.addOption(new Option('--benchmark-request-count <n>', 'Total benchmark requests (optional)'))
|
|
112
|
+
.addOption(new Option('--benchmark-s3-output-path <path>', 'S3 path for benchmark results'))
|
|
113
|
+
|
|
105
114
|
// --- MCP & Discovery ---
|
|
106
115
|
.addOption(new Option('--smart', 'Enable Bedrock-powered smart mode on MCP servers'))
|
|
107
116
|
.addOption(new Option('--discover', 'Enable live registry lookups via MCP discovery'))
|
|
package/config/bootstrap-stack.json
CHANGED
|
@@ -7,7 +7,7 @@
|
|
|
7
7
|
"Type": "String",
|
|
8
8
|
"Default": "false",
|
|
9
9
|
"AllowedValues": ["true", "false"],
|
|
10
|
-
"Description": "Whether to create S3 buckets for async inference
|
|
10
|
+
"Description": "Whether to create S3 buckets for async inference, batch transform, adapters, and benchmarks"
|
|
11
11
|
},
|
|
12
12
|
"UseExistingRoleArn": {
|
|
13
13
|
"Type": "String",
|
|
@@ -62,6 +62,7 @@
|
|
|
62
62
|
"sagemaker:DescribeEndpointConfig",
|
|
63
63
|
"sagemaker:DescribeModel",
|
|
64
64
|
"sagemaker:DescribeInferenceComponent",
|
|
65
|
+
"sagemaker:ListInferenceComponents",
|
|
65
66
|
"sagemaker:InvokeEndpoint",
|
|
66
67
|
"sagemaker:InvokeEndpointAsync"
|
|
67
68
|
],
|
|
@@ -131,9 +132,12 @@
|
|
|
131
132
|
"Action": [
|
|
132
133
|
"s3:GetObject",
|
|
133
134
|
"s3:PutObject",
|
|
135
|
+
"s3:AbortMultipartUpload",
|
|
134
136
|
"s3:ListBucket"
|
|
135
137
|
],
|
|
136
138
|
"Resource": [
|
|
139
|
+
"arn:aws:s3:::mlcc-*",
|
|
140
|
+
"arn:aws:s3:::mlcc-*/*",
|
|
137
141
|
"arn:aws:s3:::ml-container-creator-*",
|
|
138
142
|
"arn:aws:s3:::ml-container-creator-*/*"
|
|
139
143
|
]
|
|
@@ -163,18 +167,55 @@
|
|
|
163
167
|
"arn:aws:secretsmanager:*:*:secret:ml-container-creator/*"
|
|
164
168
|
]
|
|
165
169
|
},
|
|
170
|
+
{
|
|
171
|
+
"Sid": "SNSPublish",
|
|
172
|
+
"Effect": "Allow",
|
|
173
|
+
"Action": "sns:Publish",
|
|
174
|
+
"Resource": [
|
|
175
|
+
{ "Fn::Sub": "arn:aws:sns:*:${AWS::AccountId}:mlcc-*" },
|
|
176
|
+
{ "Fn::Sub": "arn:aws:sns:*:${AWS::AccountId}:ml-container-creator-*" }
|
|
177
|
+
]
|
|
178
|
+
},
|
|
166
179
|
{
|
|
167
180
|
"Sid": "QuotaAndAvailability",
|
|
168
181
|
"Effect": "Allow",
|
|
169
182
|
"Action": [
|
|
170
183
|
"service-quotas:GetServiceQuota",
|
|
171
184
|
"service-quotas:ListServiceQuotas",
|
|
172
|
-
"ec2:DescribeCapacityReservations",
|
|
173
185
|
"sagemaker:ListTrainingPlans",
|
|
174
186
|
"sagemaker:DescribeTrainingPlan",
|
|
175
187
|
"sagemaker:ListEndpoints"
|
|
176
188
|
],
|
|
177
189
|
"Resource": "*"
|
|
190
|
+
},
|
|
191
|
+
{
|
|
192
|
+
"Sid": "SageMakerModelCustomization",
|
|
193
|
+
"Effect": "Allow",
|
|
194
|
+
"Action": [
|
|
195
|
+
"sagemaker:CreateTrainingJob",
|
|
196
|
+
"sagemaker:DescribeTrainingJob",
|
|
197
|
+
"sagemaker:ListTrainingJobs",
|
|
198
|
+
"sagemaker:StopTrainingJob",
|
|
199
|
+
"sagemaker:CreateModelPackage",
|
|
200
|
+
"sagemaker:CreateModelPackageGroup",
|
|
201
|
+
"sagemaker:DescribeModelPackage",
|
|
202
|
+
"sagemaker:DescribeModelPackageGroup",
|
|
203
|
+
"sagemaker:ListModelPackages",
|
|
204
|
+
"sagemaker:CallMlflowAppApi"
|
|
205
|
+
],
|
|
206
|
+
"Resource": "*"
|
|
207
|
+
},
|
|
208
|
+
{
|
|
209
|
+
"Sid": "SageMakerMLflow",
|
|
210
|
+
"Effect": "Allow",
|
|
211
|
+
"Action": "sagemaker-mlflow:*",
|
|
212
|
+
"Resource": "*"
|
|
213
|
+
},
|
|
214
|
+
{
|
|
215
|
+
"Sid": "LambdaInvokeForReward",
|
|
216
|
+
"Effect": "Allow",
|
|
217
|
+
"Action": "lambda:InvokeFunction",
|
|
218
|
+
"Resource": { "Fn::Sub": "arn:aws:lambda:${AWS::Region}:${AWS::AccountId}:function:*" }
|
|
178
219
|
}
|
|
179
220
|
]
|
|
180
221
|
}
|
|
@@ -209,7 +250,7 @@
|
|
|
209
250
|
"DeletionPolicy": "Retain",
|
|
210
251
|
"UpdateReplacePolicy": "Retain",
|
|
211
252
|
"Properties": {
|
|
212
|
-
"BucketName": { "Fn::Sub": "
|
|
253
|
+
"BucketName": { "Fn::Sub": "mlcc-async-${AWS::AccountId}-${AWS::Region}" },
|
|
213
254
|
"VersioningConfiguration": { "Status": "Enabled" },
|
|
214
255
|
"BucketEncryption": {
|
|
215
256
|
"ServerSideEncryptionConfiguration": [
|
|
@@ -218,7 +259,8 @@
|
|
|
218
259
|
},
|
|
219
260
|
"Tags": [
|
|
220
261
|
{ "Key": "mlcc:managed-by", "Value": "ml-container-creator" },
|
|
221
|
-
{ "Key": "mlcc:created-by", "Value": "bootstrap" }
|
|
262
|
+
{ "Key": "mlcc:created-by", "Value": "bootstrap" },
|
|
263
|
+
{ "Key": "mlcc:purpose", "Value": "async-inference-output" }
|
|
222
264
|
]
|
|
223
265
|
}
|
|
224
266
|
},
|
|
@@ -229,7 +271,7 @@
|
|
|
229
271
|
"DeletionPolicy": "Retain",
|
|
230
272
|
"UpdateReplacePolicy": "Retain",
|
|
231
273
|
"Properties": {
|
|
232
|
-
"BucketName": { "Fn::Sub": "
|
|
274
|
+
"BucketName": { "Fn::Sub": "mlcc-batch-${AWS::AccountId}-${AWS::Region}" },
|
|
233
275
|
"VersioningConfiguration": { "Status": "Enabled" },
|
|
234
276
|
"BucketEncryption": {
|
|
235
277
|
"ServerSideEncryptionConfiguration": [
|
|
@@ -238,17 +280,40 @@
|
|
|
238
280
|
},
|
|
239
281
|
"Tags": [
|
|
240
282
|
{ "Key": "mlcc:managed-by", "Value": "ml-container-creator" },
|
|
241
|
-
{ "Key": "mlcc:created-by", "Value": "bootstrap" }
|
|
283
|
+
{ "Key": "mlcc:created-by", "Value": "bootstrap" },
|
|
284
|
+
{ "Key": "mlcc:purpose", "Value": "batch-transform-io" }
|
|
285
|
+
]
|
|
286
|
+
}
|
|
287
|
+
},
|
|
288
|
+
|
|
289
|
+
"AdapterS3Bucket": {
|
|
290
|
+
"Type": "AWS::S3::Bucket",
|
|
291
|
+
"Condition": "ShouldCreateS3Buckets",
|
|
292
|
+
"DeletionPolicy": "Retain",
|
|
293
|
+
"UpdateReplacePolicy": "Retain",
|
|
294
|
+
"Properties": {
|
|
295
|
+
"BucketName": { "Fn::Sub": "mlcc-adapters-${AWS::AccountId}-${AWS::Region}" },
|
|
296
|
+
"VersioningConfiguration": { "Status": "Enabled" },
|
|
297
|
+
"BucketEncryption": {
|
|
298
|
+
"ServerSideEncryptionConfiguration": [
|
|
299
|
+
{ "ServerSideEncryptionByDefault": { "SSEAlgorithm": "AES256" } }
|
|
300
|
+
]
|
|
301
|
+
},
|
|
302
|
+
"Tags": [
|
|
303
|
+
{ "Key": "mlcc:managed-by", "Value": "ml-container-creator" },
|
|
304
|
+
{ "Key": "mlcc:created-by", "Value": "bootstrap" },
|
|
305
|
+
{ "Key": "mlcc:purpose", "Value": "lora-adapter-storage" }
|
|
242
306
|
]
|
|
243
307
|
}
|
|
244
308
|
},
|
|
245
309
|
|
|
246
310
|
"BenchmarkS3Bucket": {
|
|
247
311
|
"Type": "AWS::S3::Bucket",
|
|
312
|
+
"Condition": "ShouldCreateS3Buckets",
|
|
248
313
|
"DeletionPolicy": "Retain",
|
|
249
314
|
"UpdateReplacePolicy": "Retain",
|
|
250
315
|
"Properties": {
|
|
251
|
-
"BucketName": { "Fn::Sub": "
|
|
316
|
+
"BucketName": { "Fn::Sub": "mlcc-benchmark-${AWS::AccountId}-${AWS::Region}" },
|
|
252
317
|
"VersioningConfiguration": { "Status": "Enabled" },
|
|
253
318
|
"BucketEncryption": {
|
|
254
319
|
"ServerSideEncryptionConfiguration": [
|
|
@@ -257,7 +322,28 @@
|
|
|
257
322
|
},
|
|
258
323
|
"Tags": [
|
|
259
324
|
{ "Key": "mlcc:managed-by", "Value": "ml-container-creator" },
|
|
260
|
-
{ "Key": "mlcc:created-by", "Value": "bootstrap" }
|
|
325
|
+
{ "Key": "mlcc:created-by", "Value": "bootstrap" },
|
|
326
|
+
{ "Key": "mlcc:purpose", "Value": "benchmark-results" }
|
|
327
|
+
]
|
|
328
|
+
}
|
|
329
|
+
},
|
|
330
|
+
|
|
331
|
+
"TuneS3Bucket": {
|
|
332
|
+
"Type": "AWS::S3::Bucket",
|
|
333
|
+
"Condition": "ShouldCreateS3Buckets",
|
|
334
|
+
"DeletionPolicy": "Retain",
|
|
335
|
+
"UpdateReplacePolicy": "Retain",
|
|
336
|
+
"Properties": {
|
|
337
|
+
"BucketName": { "Fn::Sub": "mlcc-tune-${AWS::AccountId}-${AWS::Region}" },
|
|
338
|
+
"VersioningConfiguration": { "Status": "Enabled" },
|
|
339
|
+
"BucketEncryption": {
|
|
340
|
+
"ServerSideEncryptionConfiguration": [
|
|
341
|
+
{ "ServerSideEncryptionByDefault": { "SSEAlgorithm": "AES256" } }
|
|
342
|
+
]
|
|
343
|
+
},
|
|
344
|
+
"Tags": [
|
|
345
|
+
{ "Key": "mlcc:managed-by", "Value": "ml-container-creator" },
|
|
346
|
+
{ "Key": "mlcc:purpose", "Value": "tune-datasets-and-output" }
|
|
261
347
|
]
|
|
262
348
|
}
|
|
263
349
|
}
|
|
@@ -292,13 +378,24 @@
|
|
|
292
378
|
"Description": "S3 bucket for batch transform I/O",
|
|
293
379
|
"Value": { "Ref": "BatchS3Bucket" }
|
|
294
380
|
},
|
|
381
|
+
"AdapterS3BucketName": {
|
|
382
|
+
"Condition": "ShouldCreateS3Buckets",
|
|
383
|
+
"Description": "S3 bucket for LoRA adapter storage",
|
|
384
|
+
"Value": { "Ref": "AdapterS3Bucket" }
|
|
385
|
+
},
|
|
295
386
|
"BenchmarkS3BucketName": {
|
|
387
|
+
"Condition": "ShouldCreateS3Buckets",
|
|
296
388
|
"Description": "S3 bucket for benchmark results output",
|
|
297
389
|
"Value": { "Ref": "BenchmarkS3Bucket" }
|
|
298
390
|
},
|
|
391
|
+
"TuneS3BucketName": {
|
|
392
|
+
"Condition": "ShouldCreateS3Buckets",
|
|
393
|
+
"Description": "S3 bucket for tune datasets and output",
|
|
394
|
+
"Value": { "Ref": "TuneS3Bucket" }
|
|
395
|
+
},
|
|
299
396
|
"StackVersion": {
|
|
300
397
|
"Description": "Bootstrap stack template version for forward compatibility tracking",
|
|
301
|
-
"Value": "2026-05-
|
|
398
|
+
"Value": "2026-05-18"
|
|
302
399
|
}
|
|
303
400
|
}
|
|
304
401
|
}
|
|
package/infra/ci-harness/package-lock.json
CHANGED
|
@@ -48,6 +48,7 @@
|
|
|
48
48
|
"semver"
|
|
49
49
|
],
|
|
50
50
|
"license": "Apache-2.0",
|
|
51
|
+
"peer": true,
|
|
51
52
|
"dependencies": {
|
|
52
53
|
"jsonschema": "~1.4.1",
|
|
53
54
|
"semver": "^7.7.4"
|
|
@@ -2150,6 +2151,7 @@
|
|
|
2150
2151
|
"integrity": "sha512-wGdMcf+vPYM6jikpS/qhg6WiqSV/OhG+jeeHT/KlVqxYfD40iYJf9/AE1uQxVWFvU7MipKRkRv8NSHiCGgPr8Q==",
|
|
2151
2152
|
"dev": true,
|
|
2152
2153
|
"license": "MIT",
|
|
2154
|
+
"peer": true,
|
|
2153
2155
|
"dependencies": {
|
|
2154
2156
|
"undici-types": "~6.21.0"
|
|
2155
2157
|
}
|
|
@@ -2789,7 +2791,8 @@
|
|
|
2789
2791
|
"version": "10.6.0",
|
|
2790
2792
|
"resolved": "https://registry.npmjs.org/constructs/-/constructs-10.6.0.tgz",
|
|
2791
2793
|
"integrity": "sha512-TxHOnBO5zMo/G76ykzGF/wMpEHu257TbWiIxP9K0Yv/+t70UzgBQiTqjkAsWOPC6jW91DzJI0+ehQV6xDRNBuQ==",
|
|
2792
|
-
"license": "Apache-2.0"
|
|
2794
|
+
"license": "Apache-2.0",
|
|
2795
|
+
"peer": true
|
|
2793
2796
|
},
|
|
2794
2797
|
"node_modules/create-require": {
|
|
2795
2798
|
"version": "1.1.1",
|
|
@@ -3694,6 +3697,7 @@
|
|
|
3694
3697
|
"integrity": "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==",
|
|
3695
3698
|
"dev": true,
|
|
3696
3699
|
"license": "Apache-2.0",
|
|
3700
|
+
"peer": true,
|
|
3697
3701
|
"bin": {
|
|
3698
3702
|
"tsc": "bin/tsc",
|
|
3699
3703
|
"tsserver": "bin/tsserver"
|
package/package.json
CHANGED
|
package/servers/instance-sizer/index.js
CHANGED
|
@@ -51,7 +51,7 @@ try {
|
|
|
51
51
|
|
|
52
52
|
// ── Mode configuration ───────────────────────────────────────────────────────
|
|
53
53
|
|
|
54
|
-
const DISCOVER_MODE = process.
|
|
54
|
+
const DISCOVER_MODE = process.env.DISCOVER_MODE !== 'false' && !process.argv.includes('--no-discover')
|
|
55
55
|
const SMART_MODE = process.env.BEDROCK_SMART === 'true'
|
|
56
56
|
const BEDROCK_MODEL = process.env.BEDROCK_MODEL || 'global.anthropic.claude-sonnet-4-20250514-v1:0'
|
|
57
57
|
const BEDROCK_REGION = process.env.BEDROCK_REGION || process.env.AWS_REGION || 'us-east-1'
|
|
@@ -593,10 +593,10 @@ const isMain = process.argv[1] && resolve(process.argv[1]) === __filename
|
|
|
593
593
|
if (isMain) {
|
|
594
594
|
if (SMART_MODE) {
|
|
595
595
|
log(`Smart mode enabled (model: ${BEDROCK_MODEL}, region: ${BEDROCK_REGION})`)
|
|
596
|
-
} else if (DISCOVER_MODE) {
|
|
597
|
-
log('
|
|
596
|
+
} else if (!DISCOVER_MODE) {
|
|
597
|
+
log('Static mode (catalog-only, no network calls) — use --no-discover to force this')
|
|
598
598
|
} else {
|
|
599
|
-
log('
|
|
599
|
+
log('Discover mode (HuggingFace API + quota lookups active)')
|
|
600
600
|
}
|
|
601
601
|
|
|
602
602
|
const transport = new StdioServerTransport()
|
|
package/servers/instance-sizer/lib/model-resolver.js
CHANGED
|
@@ -207,7 +207,7 @@ const isHuggingFacePattern = (modelName) => {
|
|
|
207
207
|
* @returns {Promise<{ parameterCount: number, dtype: string, architecture: string, maxPositionEmbeddings: number, source: string } | null>}
|
|
208
208
|
*/
|
|
209
209
|
const resolveModelMetadata = async (modelName, options = {}) => {
|
|
210
|
-
const { discover =
|
|
210
|
+
const { discover = true, catalogPath } = options
|
|
211
211
|
|
|
212
212
|
// Tier 1: Catalog lookup
|
|
213
213
|
const catalog = await loadCatalog(catalogPath)
|
|
package/servers/lib/catalogs/model-sizes.json
CHANGED
|
@@ -1,50 +1,23 @@
|
|
|
1
1
|
{
|
|
2
2
|
"catalogVersion": "1.0.0",
|
|
3
3
|
"models": {
|
|
4
|
-
"meta-llama/Llama-2-
|
|
5
|
-
"parameterCount":
|
|
6
|
-
"defaultDtype": "float16",
|
|
7
|
-
"architecture": "LlamaForCausalLM",
|
|
8
|
-
"maxPositionEmbeddings": 4096,
|
|
9
|
-
"recommendedQuantizations": ["awq", "gptq"],
|
|
10
|
-
"minVramGb": 18,
|
|
11
|
-
"recommendedInstances": ["ml.g5.2xlarge", "ml.g5.4xlarge"]
|
|
12
|
-
},
|
|
13
|
-
"meta-llama/Llama-2-13b*": {
|
|
14
|
-
"parameterCount": 13015864320,
|
|
15
|
-
"defaultDtype": "float16",
|
|
16
|
-
"architecture": "LlamaForCausalLM",
|
|
17
|
-
"maxPositionEmbeddings": 4096,
|
|
18
|
-
"recommendedQuantizations": ["awq", "gptq"],
|
|
19
|
-
"minVramGb": 34,
|
|
20
|
-
"recommendedInstances": ["ml.g5.4xlarge", "ml.g5.12xlarge"]
|
|
21
|
-
},
|
|
22
|
-
"meta-llama/Llama-2-70b*": {
|
|
23
|
-
"parameterCount": 68976648192,
|
|
24
|
-
"defaultDtype": "float16",
|
|
25
|
-
"architecture": "LlamaForCausalLM",
|
|
26
|
-
"maxPositionEmbeddings": 4096,
|
|
27
|
-
"recommendedQuantizations": ["awq", "gptq"],
|
|
28
|
-
"minVramGb": 180,
|
|
29
|
-
"recommendedInstances": ["ml.g5.48xlarge", "ml.p4d.24xlarge"]
|
|
30
|
-
},
|
|
31
|
-
"meta-llama/Meta-Llama-3-8B*": {
|
|
32
|
-
"parameterCount": 8030261248,
|
|
4
|
+
"meta-llama/Llama-3.2-1B*": {
|
|
5
|
+
"parameterCount": 1235814400,
|
|
33
6
|
"defaultDtype": "bfloat16",
|
|
34
7
|
"architecture": "LlamaForCausalLM",
|
|
35
|
-
"maxPositionEmbeddings":
|
|
8
|
+
"maxPositionEmbeddings": 131072,
|
|
36
9
|
"recommendedQuantizations": ["awq", "gptq"],
|
|
37
|
-
"minVramGb":
|
|
38
|
-
"recommendedInstances": ["ml.g5.
|
|
10
|
+
"minVramGb": 5,
|
|
11
|
+
"recommendedInstances": ["ml.g5.xlarge", "ml.g6.xlarge"]
|
|
39
12
|
},
|
|
40
|
-
"meta-llama/
|
|
41
|
-
"parameterCount":
|
|
13
|
+
"meta-llama/Llama-3.2-3B*": {
|
|
14
|
+
"parameterCount": 3212749824,
|
|
42
15
|
"defaultDtype": "bfloat16",
|
|
43
16
|
"architecture": "LlamaForCausalLM",
|
|
44
|
-
"maxPositionEmbeddings":
|
|
17
|
+
"maxPositionEmbeddings": 131072,
|
|
45
18
|
"recommendedQuantizations": ["awq", "gptq"],
|
|
46
|
-
"minVramGb":
|
|
47
|
-
"recommendedInstances": ["ml.g5.
|
|
19
|
+
"minVramGb": 9,
|
|
20
|
+
"recommendedInstances": ["ml.g5.xlarge", "ml.g6.xlarge"]
|
|
48
21
|
},
|
|
49
22
|
"meta-llama/Llama-3.1-8B*": {
|
|
50
23
|
"parameterCount": 8030261248,
|
|
@@ -55,104 +28,176 @@
|
|
|
55
28
|
"minVramGb": 20,
|
|
56
29
|
"recommendedInstances": ["ml.g5.2xlarge", "ml.g6.2xlarge"]
|
|
57
30
|
},
|
|
58
|
-
"meta-llama/Llama-3.
|
|
59
|
-
"parameterCount":
|
|
31
|
+
"meta-llama/Llama-3.3-70B*": {
|
|
32
|
+
"parameterCount": 70553706496,
|
|
60
33
|
"defaultDtype": "bfloat16",
|
|
61
34
|
"architecture": "LlamaForCausalLM",
|
|
62
35
|
"maxPositionEmbeddings": 131072,
|
|
36
|
+
"recommendedQuantizations": ["awq", "gptq", "fp8"],
|
|
37
|
+
"minVramGb": 184,
|
|
38
|
+
"recommendedInstances": ["ml.g5.48xlarge", "ml.p4d.24xlarge", "ml.g6e.48xlarge"]
|
|
39
|
+
},
|
|
40
|
+
"Qwen/Qwen3-0.6B*": {
|
|
41
|
+
"parameterCount": 600000000,
|
|
42
|
+
"defaultDtype": "bfloat16",
|
|
43
|
+
"architecture": "Qwen3ForCausalLM",
|
|
44
|
+
"maxPositionEmbeddings": 32768,
|
|
63
45
|
"recommendedQuantizations": ["awq", "gptq"],
|
|
64
|
-
"minVramGb":
|
|
46
|
+
"minVramGb": 3,
|
|
65
47
|
"recommendedInstances": ["ml.g5.xlarge", "ml.g6.xlarge"]
|
|
66
48
|
},
|
|
67
|
-
"
|
|
68
|
-
"parameterCount":
|
|
49
|
+
"Qwen/Qwen3-1.7B*": {
|
|
50
|
+
"parameterCount": 1700000000,
|
|
69
51
|
"defaultDtype": "bfloat16",
|
|
70
|
-
"architecture": "
|
|
71
|
-
"maxPositionEmbeddings":
|
|
52
|
+
"architecture": "Qwen3ForCausalLM",
|
|
53
|
+
"maxPositionEmbeddings": 32768,
|
|
72
54
|
"recommendedQuantizations": ["awq", "gptq"],
|
|
73
|
-
"minVramGb":
|
|
55
|
+
"minVramGb": 6,
|
|
74
56
|
"recommendedInstances": ["ml.g5.xlarge", "ml.g6.xlarge"]
|
|
75
57
|
},
|
|
76
|
-
"
|
|
77
|
-
"parameterCount":
|
|
58
|
+
"Qwen/Qwen3-4B*": {
|
|
59
|
+
"parameterCount": 4000000000,
|
|
78
60
|
"defaultDtype": "bfloat16",
|
|
79
|
-
"architecture": "
|
|
61
|
+
"architecture": "Qwen3ForCausalLM",
|
|
80
62
|
"maxPositionEmbeddings": 32768,
|
|
81
63
|
"recommendedQuantizations": ["awq", "gptq"],
|
|
82
|
-
"minVramGb":
|
|
83
|
-
"recommendedInstances": ["ml.g5.
|
|
64
|
+
"minVramGb": 11,
|
|
65
|
+
"recommendedInstances": ["ml.g5.xlarge", "ml.g6.xlarge"]
|
|
84
66
|
},
|
|
85
|
-
"
|
|
86
|
-
"parameterCount":
|
|
67
|
+
"Qwen/Qwen3-8B*": {
|
|
68
|
+
"parameterCount": 8000000000,
|
|
87
69
|
"defaultDtype": "bfloat16",
|
|
88
|
-
"architecture": "
|
|
70
|
+
"architecture": "Qwen3ForCausalLM",
|
|
89
71
|
"maxPositionEmbeddings": 32768,
|
|
90
72
|
"recommendedQuantizations": ["awq", "gptq"],
|
|
91
|
-
"minVramGb":
|
|
92
|
-
"recommendedInstances": ["ml.g5.
|
|
73
|
+
"minVramGb": 20,
|
|
74
|
+
"recommendedInstances": ["ml.g5.2xlarge", "ml.g6.2xlarge"]
|
|
93
75
|
},
|
|
94
|
-
"Qwen/
|
|
95
|
-
"parameterCount":
|
|
76
|
+
"Qwen/Qwen3-14B*": {
|
|
77
|
+
"parameterCount": 14000000000,
|
|
96
78
|
"defaultDtype": "bfloat16",
|
|
97
|
-
"architecture": "
|
|
98
|
-
"maxPositionEmbeddings":
|
|
79
|
+
"architecture": "Qwen3ForCausalLM",
|
|
80
|
+
"maxPositionEmbeddings": 32768,
|
|
99
81
|
"recommendedQuantizations": ["awq", "gptq"],
|
|
100
|
-
"minVramGb":
|
|
101
|
-
"recommendedInstances": ["ml.g5.
|
|
82
|
+
"minVramGb": 37,
|
|
83
|
+
"recommendedInstances": ["ml.g5.4xlarge", "ml.g5.12xlarge"]
|
|
84
|
+
},
|
|
85
|
+
"Qwen/Qwen3-32B*": {
|
|
86
|
+
"parameterCount": 32000000000,
|
|
87
|
+
"defaultDtype": "bfloat16",
|
|
88
|
+
"architecture": "Qwen3ForCausalLM",
|
|
89
|
+
"maxPositionEmbeddings": 32768,
|
|
90
|
+
"recommendedQuantizations": ["awq", "gptq"],
|
|
91
|
+
"minVramGb": 84,
|
|
92
|
+
"recommendedInstances": ["ml.g5.12xlarge", "ml.g5.48xlarge"]
|
|
102
93
|
},
|
|
103
|
-
"Qwen/Qwen2-7B*": {
|
|
94
|
+
"Qwen/Qwen2.5-7B*": {
|
|
104
95
|
"parameterCount": 7721324544,
|
|
105
96
|
"defaultDtype": "bfloat16",
|
|
106
97
|
"architecture": "Qwen2ForCausalLM",
|
|
107
|
-
"maxPositionEmbeddings":
|
|
98
|
+
"maxPositionEmbeddings": 131072,
|
|
108
99
|
"recommendedQuantizations": ["awq", "gptq"],
|
|
109
100
|
"minVramGb": 20,
|
|
110
|
-
"recommendedInstances": ["ml.g5.2xlarge", "ml.
|
|
101
|
+
"recommendedInstances": ["ml.g5.2xlarge", "ml.g6.2xlarge"]
|
|
111
102
|
},
|
|
112
|
-
"Qwen/
|
|
103
|
+
"Qwen/Qwen2.5-14B*": {
|
|
113
104
|
"parameterCount": 14167134208,
|
|
114
105
|
"defaultDtype": "bfloat16",
|
|
115
|
-
"architecture": "
|
|
116
|
-
"maxPositionEmbeddings":
|
|
106
|
+
"architecture": "Qwen2ForCausalLM",
|
|
107
|
+
"maxPositionEmbeddings": 131072,
|
|
117
108
|
"recommendedQuantizations": ["awq", "gptq"],
|
|
118
109
|
"minVramGb": 37,
|
|
119
110
|
"recommendedInstances": ["ml.g5.4xlarge", "ml.g5.12xlarge"]
|
|
120
111
|
},
|
|
121
|
-
"Qwen/Qwen2-
|
|
122
|
-
"parameterCount":
|
|
112
|
+
"Qwen/Qwen2.5-32B*": {
|
|
113
|
+
"parameterCount": 32000000000,
|
|
123
114
|
"defaultDtype": "bfloat16",
|
|
124
115
|
"architecture": "Qwen2ForCausalLM",
|
|
125
|
-
"maxPositionEmbeddings":
|
|
116
|
+
"maxPositionEmbeddings": 131072,
|
|
126
117
|
"recommendedQuantizations": ["awq", "gptq"],
|
|
127
|
-
"minVramGb":
|
|
128
|
-
"recommendedInstances": ["ml.g5.
|
|
118
|
+
"minVramGb": 84,
|
|
119
|
+
"recommendedInstances": ["ml.g5.12xlarge", "ml.g5.48xlarge"]
|
|
129
120
|
},
|
|
130
|
-
"Qwen/
|
|
121
|
+
"Qwen/Qwen2.5-72B*": {
|
|
131
122
|
"parameterCount": 72710410240,
|
|
132
123
|
"defaultDtype": "bfloat16",
|
|
133
|
-
"architecture": "
|
|
134
|
-
"maxPositionEmbeddings":
|
|
135
|
-
"recommendedQuantizations": ["awq", "gptq"],
|
|
124
|
+
"architecture": "Qwen2ForCausalLM",
|
|
125
|
+
"maxPositionEmbeddings": 131072,
|
|
126
|
+
"recommendedQuantizations": ["awq", "gptq", "fp8"],
|
|
136
127
|
"minVramGb": 190,
|
|
137
|
-
"recommendedInstances": ["ml.g5.48xlarge", "ml.p4d.24xlarge"]
|
|
128
|
+
"recommendedInstances": ["ml.g5.48xlarge", "ml.p4d.24xlarge", "ml.g6e.48xlarge"]
|
|
138
129
|
},
|
|
139
|
-
"
|
|
140
|
-
"parameterCount":
|
|
130
|
+
"deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B*": {
|
|
131
|
+
"parameterCount": 1500000000,
|
|
141
132
|
"defaultDtype": "bfloat16",
|
|
142
133
|
"architecture": "Qwen2ForCausalLM",
|
|
143
|
-
"maxPositionEmbeddings":
|
|
134
|
+
"maxPositionEmbeddings": 131072,
|
|
144
135
|
"recommendedQuantizations": ["awq", "gptq"],
|
|
145
|
-
"minVramGb":
|
|
146
|
-
"recommendedInstances": ["ml.g5.
|
|
147
|
-
},
|
|
148
|
-
"
|
|
149
|
-
"parameterCount":
|
|
150
|
-
"defaultDtype": "
|
|
151
|
-
"architecture": "
|
|
152
|
-
"maxPositionEmbeddings":
|
|
153
|
-
"recommendedQuantizations": ["gptq"],
|
|
154
|
-
"minVramGb":
|
|
136
|
+
"minVramGb": 5,
|
|
137
|
+
"recommendedInstances": ["ml.g5.xlarge", "ml.g6.xlarge"]
|
|
138
|
+
},
|
|
139
|
+
"deepseek-ai/DeepSeek-R1-Distill-Qwen-7B*": {
|
|
140
|
+
"parameterCount": 7000000000,
|
|
141
|
+
"defaultDtype": "bfloat16",
|
|
142
|
+
"architecture": "Qwen2ForCausalLM",
|
|
143
|
+
"maxPositionEmbeddings": 131072,
|
|
144
|
+
"recommendedQuantizations": ["awq", "gptq"],
|
|
145
|
+
"minVramGb": 18,
|
|
146
|
+
"recommendedInstances": ["ml.g5.2xlarge", "ml.g6.2xlarge"]
|
|
147
|
+
},
|
|
148
|
+
"deepseek-ai/DeepSeek-R1-Distill-Qwen-14B*": {
|
|
149
|
+
"parameterCount": 14000000000,
|
|
150
|
+
"defaultDtype": "bfloat16",
|
|
151
|
+
"architecture": "Qwen2ForCausalLM",
|
|
152
|
+
"maxPositionEmbeddings": 131072,
|
|
153
|
+
"recommendedQuantizations": ["awq", "gptq"],
|
|
154
|
+
"minVramGb": 37,
|
|
155
|
+
"recommendedInstances": ["ml.g5.4xlarge", "ml.g5.12xlarge"]
|
|
156
|
+
},
|
|
157
|
+
"deepseek-ai/DeepSeek-R1-Distill-Qwen-32B*": {
|
|
158
|
+
"parameterCount": 32000000000,
|
|
159
|
+
"defaultDtype": "bfloat16",
|
|
160
|
+
"architecture": "Qwen2ForCausalLM",
|
|
161
|
+
"maxPositionEmbeddings": 131072,
|
|
162
|
+
"recommendedQuantizations": ["awq", "gptq"],
|
|
163
|
+
"minVramGb": 84,
|
|
164
|
+
"recommendedInstances": ["ml.g5.12xlarge", "ml.g5.48xlarge"]
|
|
165
|
+
},
|
|
166
|
+
"deepseek-ai/DeepSeek-R1-Distill-Llama-8B*": {
|
|
167
|
+
"parameterCount": 8000000000,
|
|
168
|
+
"defaultDtype": "bfloat16",
|
|
169
|
+
"architecture": "LlamaForCausalLM",
|
|
170
|
+
"maxPositionEmbeddings": 131072,
|
|
171
|
+
"recommendedQuantizations": ["awq", "gptq"],
|
|
172
|
+
"minVramGb": 20,
|
|
173
|
+
"recommendedInstances": ["ml.g5.2xlarge", "ml.g6.2xlarge"]
|
|
174
|
+
},
|
|
175
|
+
"deepseek-ai/DeepSeek-R1-Distill-Llama-70B*": {
|
|
176
|
+
"parameterCount": 70000000000,
|
|
177
|
+
"defaultDtype": "bfloat16",
|
|
178
|
+
"architecture": "LlamaForCausalLM",
|
|
179
|
+
"maxPositionEmbeddings": 131072,
|
|
180
|
+
"recommendedQuantizations": ["awq", "gptq", "fp8"],
|
|
181
|
+
"minVramGb": 184,
|
|
182
|
+
"recommendedInstances": ["ml.g5.48xlarge", "ml.p4d.24xlarge", "ml.g6e.48xlarge"]
|
|
183
|
+
},
|
|
184
|
+
"openai/gpt-oss-20b*": {
|
|
185
|
+
"parameterCount": 20000000000,
|
|
186
|
+
"defaultDtype": "bfloat16",
|
|
187
|
+
"architecture": "GPT2LMHeadModel",
|
|
188
|
+
"maxPositionEmbeddings": 8192,
|
|
189
|
+
"recommendedQuantizations": ["awq", "gptq"],
|
|
190
|
+
"minVramGb": 52,
|
|
155
191
|
"recommendedInstances": ["ml.g5.12xlarge", "ml.g5.48xlarge"]
|
|
192
|
+
},
|
|
193
|
+
"openai/gpt-oss-120b*": {
|
|
194
|
+
"parameterCount": 120000000000,
|
|
195
|
+
"defaultDtype": "bfloat16",
|
|
196
|
+
"architecture": "GPT2LMHeadModel",
|
|
197
|
+
"maxPositionEmbeddings": 8192,
|
|
198
|
+
"recommendedQuantizations": ["awq", "gptq", "fp8"],
|
|
199
|
+
"minVramGb": 312,
|
|
200
|
+
"recommendedInstances": ["ml.p4d.24xlarge", "ml.p5.48xlarge"]
|
|
156
201
|
}
|
|
157
202
|
}
|
|
158
203
|
}
|