@aws/ml-container-creator 0.5.0 → 0.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/bin/cli.js CHANGED
@@ -102,6 +102,15 @@ program
102
102
  .addOption(new Option('--max-loras <n>', 'Maximum concurrent LoRA adapters in GPU memory (default: 30)'))
103
103
  .addOption(new Option('--max-lora-rank <n>', 'Maximum LoRA rank (default: 64)'))
104
104
 
105
+ // --- Benchmarking ---
106
+ .addOption(new Option('--include-benchmark', 'Include SageMaker AI Benchmarking (transformers/diffusors only)'))
107
+ .addOption(new Option('--benchmark-concurrency <n>', 'Benchmark concurrent requests (default: 10)'))
108
+ .addOption(new Option('--benchmark-input-tokens <n>', 'Benchmark mean input tokens (default: 550)'))
109
+ .addOption(new Option('--benchmark-output-tokens <n>', 'Benchmark mean output tokens (default: 150)'))
110
+ .addOption(new Option('--benchmark-streaming', 'Enable streaming in benchmark (default: true)'))
111
+ .addOption(new Option('--benchmark-request-count <n>', 'Total benchmark requests (optional)'))
112
+ .addOption(new Option('--benchmark-s3-output-path <path>', 'S3 path for benchmark results'))
113
+
105
114
  // --- MCP & Discovery ---
106
115
  .addOption(new Option('--smart', 'Enable Bedrock-powered smart mode on MCP servers'))
107
116
  .addOption(new Option('--discover', 'Enable live registry lookups via MCP discovery'))
@@ -7,7 +7,7 @@
7
7
  "Type": "String",
8
8
  "Default": "false",
9
9
  "AllowedValues": ["true", "false"],
10
- "Description": "Whether to create S3 buckets for async inference and batch transform"
10
+ "Description": "Whether to create S3 buckets for async inference, batch transform, adapters, and benchmarks"
11
11
  },
12
12
  "UseExistingRoleArn": {
13
13
  "Type": "String",
@@ -62,6 +62,7 @@
62
62
  "sagemaker:DescribeEndpointConfig",
63
63
  "sagemaker:DescribeModel",
64
64
  "sagemaker:DescribeInferenceComponent",
65
+ "sagemaker:ListInferenceComponents",
65
66
  "sagemaker:InvokeEndpoint",
66
67
  "sagemaker:InvokeEndpointAsync"
67
68
  ],
@@ -131,9 +132,12 @@
131
132
  "Action": [
132
133
  "s3:GetObject",
133
134
  "s3:PutObject",
135
+ "s3:AbortMultipartUpload",
134
136
  "s3:ListBucket"
135
137
  ],
136
138
  "Resource": [
139
+ "arn:aws:s3:::mlcc-*",
140
+ "arn:aws:s3:::mlcc-*/*",
137
141
  "arn:aws:s3:::ml-container-creator-*",
138
142
  "arn:aws:s3:::ml-container-creator-*/*"
139
143
  ]
@@ -163,18 +167,55 @@
163
167
  "arn:aws:secretsmanager:*:*:secret:ml-container-creator/*"
164
168
  ]
165
169
  },
170
+ {
171
+ "Sid": "SNSPublish",
172
+ "Effect": "Allow",
173
+ "Action": "sns:Publish",
174
+ "Resource": [
175
+ { "Fn::Sub": "arn:aws:sns:*:${AWS::AccountId}:mlcc-*" },
176
+ { "Fn::Sub": "arn:aws:sns:*:${AWS::AccountId}:ml-container-creator-*" }
177
+ ]
178
+ },
166
179
  {
167
180
  "Sid": "QuotaAndAvailability",
168
181
  "Effect": "Allow",
169
182
  "Action": [
170
183
  "service-quotas:GetServiceQuota",
171
184
  "service-quotas:ListServiceQuotas",
172
- "ec2:DescribeCapacityReservations",
173
185
  "sagemaker:ListTrainingPlans",
174
186
  "sagemaker:DescribeTrainingPlan",
175
187
  "sagemaker:ListEndpoints"
176
188
  ],
177
189
  "Resource": "*"
190
+ },
191
+ {
192
+ "Sid": "SageMakerModelCustomization",
193
+ "Effect": "Allow",
194
+ "Action": [
195
+ "sagemaker:CreateTrainingJob",
196
+ "sagemaker:DescribeTrainingJob",
197
+ "sagemaker:ListTrainingJobs",
198
+ "sagemaker:StopTrainingJob",
199
+ "sagemaker:CreateModelPackage",
200
+ "sagemaker:CreateModelPackageGroup",
201
+ "sagemaker:DescribeModelPackage",
202
+ "sagemaker:DescribeModelPackageGroup",
203
+ "sagemaker:ListModelPackages",
204
+ "sagemaker:CallMlflowAppApi"
205
+ ],
206
+ "Resource": "*"
207
+ },
208
+ {
209
+ "Sid": "SageMakerMLflow",
210
+ "Effect": "Allow",
211
+ "Action": "sagemaker-mlflow:*",
212
+ "Resource": "*"
213
+ },
214
+ {
215
+ "Sid": "LambdaInvokeForReward",
216
+ "Effect": "Allow",
217
+ "Action": "lambda:InvokeFunction",
218
+ "Resource": { "Fn::Sub": "arn:aws:lambda:${AWS::Region}:${AWS::AccountId}:function:*" }
178
219
  }
179
220
  ]
180
221
  }
@@ -209,7 +250,7 @@
209
250
  "DeletionPolicy": "Retain",
210
251
  "UpdateReplacePolicy": "Retain",
211
252
  "Properties": {
212
- "BucketName": { "Fn::Sub": "${AWS::AccountId}-${AWS::Region}-ml-container-creator-async" },
253
+ "BucketName": { "Fn::Sub": "mlcc-async-${AWS::AccountId}-${AWS::Region}" },
213
254
  "VersioningConfiguration": { "Status": "Enabled" },
214
255
  "BucketEncryption": {
215
256
  "ServerSideEncryptionConfiguration": [
@@ -218,7 +259,8 @@
218
259
  },
219
260
  "Tags": [
220
261
  { "Key": "mlcc:managed-by", "Value": "ml-container-creator" },
221
- { "Key": "mlcc:created-by", "Value": "bootstrap" }
262
+ { "Key": "mlcc:created-by", "Value": "bootstrap" },
263
+ { "Key": "mlcc:purpose", "Value": "async-inference-output" }
222
264
  ]
223
265
  }
224
266
  },
@@ -229,7 +271,7 @@
229
271
  "DeletionPolicy": "Retain",
230
272
  "UpdateReplacePolicy": "Retain",
231
273
  "Properties": {
232
- "BucketName": { "Fn::Sub": "${AWS::AccountId}-${AWS::Region}-ml-container-creator-batch" },
274
+ "BucketName": { "Fn::Sub": "mlcc-batch-${AWS::AccountId}-${AWS::Region}" },
233
275
  "VersioningConfiguration": { "Status": "Enabled" },
234
276
  "BucketEncryption": {
235
277
  "ServerSideEncryptionConfiguration": [
@@ -238,17 +280,40 @@
238
280
  },
239
281
  "Tags": [
240
282
  { "Key": "mlcc:managed-by", "Value": "ml-container-creator" },
241
- { "Key": "mlcc:created-by", "Value": "bootstrap" }
283
+ { "Key": "mlcc:created-by", "Value": "bootstrap" },
284
+ { "Key": "mlcc:purpose", "Value": "batch-transform-io" }
285
+ ]
286
+ }
287
+ },
288
+
289
+ "AdapterS3Bucket": {
290
+ "Type": "AWS::S3::Bucket",
291
+ "Condition": "ShouldCreateS3Buckets",
292
+ "DeletionPolicy": "Retain",
293
+ "UpdateReplacePolicy": "Retain",
294
+ "Properties": {
295
+ "BucketName": { "Fn::Sub": "mlcc-adapters-${AWS::AccountId}-${AWS::Region}" },
296
+ "VersioningConfiguration": { "Status": "Enabled" },
297
+ "BucketEncryption": {
298
+ "ServerSideEncryptionConfiguration": [
299
+ { "ServerSideEncryptionByDefault": { "SSEAlgorithm": "AES256" } }
300
+ ]
301
+ },
302
+ "Tags": [
303
+ { "Key": "mlcc:managed-by", "Value": "ml-container-creator" },
304
+ { "Key": "mlcc:created-by", "Value": "bootstrap" },
305
+ { "Key": "mlcc:purpose", "Value": "lora-adapter-storage" }
242
306
  ]
243
307
  }
244
308
  },
245
309
 
246
310
  "BenchmarkS3Bucket": {
247
311
  "Type": "AWS::S3::Bucket",
312
+ "Condition": "ShouldCreateS3Buckets",
248
313
  "DeletionPolicy": "Retain",
249
314
  "UpdateReplacePolicy": "Retain",
250
315
  "Properties": {
251
- "BucketName": { "Fn::Sub": "ml-container-creator-benchmark-${AWS::Region}-${AWS::AccountId}" },
316
+ "BucketName": { "Fn::Sub": "mlcc-benchmark-${AWS::AccountId}-${AWS::Region}" },
252
317
  "VersioningConfiguration": { "Status": "Enabled" },
253
318
  "BucketEncryption": {
254
319
  "ServerSideEncryptionConfiguration": [
@@ -257,7 +322,28 @@
257
322
  },
258
323
  "Tags": [
259
324
  { "Key": "mlcc:managed-by", "Value": "ml-container-creator" },
260
- { "Key": "mlcc:created-by", "Value": "bootstrap" }
325
+ { "Key": "mlcc:created-by", "Value": "bootstrap" },
326
+ { "Key": "mlcc:purpose", "Value": "benchmark-results" }
327
+ ]
328
+ }
329
+ },
330
+
331
+ "TuneS3Bucket": {
332
+ "Type": "AWS::S3::Bucket",
333
+ "Condition": "ShouldCreateS3Buckets",
334
+ "DeletionPolicy": "Retain",
335
+ "UpdateReplacePolicy": "Retain",
336
+ "Properties": {
337
+ "BucketName": { "Fn::Sub": "mlcc-tune-${AWS::AccountId}-${AWS::Region}" },
338
+ "VersioningConfiguration": { "Status": "Enabled" },
339
+ "BucketEncryption": {
340
+ "ServerSideEncryptionConfiguration": [
341
+ { "ServerSideEncryptionByDefault": { "SSEAlgorithm": "AES256" } }
342
+ ]
343
+ },
344
+ "Tags": [
345
+ { "Key": "mlcc:managed-by", "Value": "ml-container-creator" },
346
+ { "Key": "mlcc:purpose", "Value": "tune-datasets-and-output" }
261
347
  ]
262
348
  }
263
349
  }
@@ -292,13 +378,24 @@
292
378
  "Description": "S3 bucket for batch transform I/O",
293
379
  "Value": { "Ref": "BatchS3Bucket" }
294
380
  },
381
+ "AdapterS3BucketName": {
382
+ "Condition": "ShouldCreateS3Buckets",
383
+ "Description": "S3 bucket for LoRA adapter storage",
384
+ "Value": { "Ref": "AdapterS3Bucket" }
385
+ },
295
386
  "BenchmarkS3BucketName": {
387
+ "Condition": "ShouldCreateS3Buckets",
296
388
  "Description": "S3 bucket for benchmark results output",
297
389
  "Value": { "Ref": "BenchmarkS3Bucket" }
298
390
  },
391
+ "TuneS3BucketName": {
392
+ "Condition": "ShouldCreateS3Buckets",
393
+ "Description": "S3 bucket for tune datasets and output",
394
+ "Value": { "Ref": "TuneS3Bucket" }
395
+ },
299
396
  "StackVersion": {
300
397
  "Description": "Bootstrap stack template version for forward compatibility tracking",
301
- "Value": "2026-05-04"
398
+ "Value": "2026-05-18"
302
399
  }
303
400
  }
304
401
  }
@@ -48,6 +48,7 @@
48
48
  "semver"
49
49
  ],
50
50
  "license": "Apache-2.0",
51
+ "peer": true,
51
52
  "dependencies": {
52
53
  "jsonschema": "~1.4.1",
53
54
  "semver": "^7.7.4"
@@ -2150,6 +2151,7 @@
2150
2151
  "integrity": "sha512-wGdMcf+vPYM6jikpS/qhg6WiqSV/OhG+jeeHT/KlVqxYfD40iYJf9/AE1uQxVWFvU7MipKRkRv8NSHiCGgPr8Q==",
2151
2152
  "dev": true,
2152
2153
  "license": "MIT",
2154
+ "peer": true,
2153
2155
  "dependencies": {
2154
2156
  "undici-types": "~6.21.0"
2155
2157
  }
@@ -2789,7 +2791,8 @@
2789
2791
  "version": "10.6.0",
2790
2792
  "resolved": "https://registry.npmjs.org/constructs/-/constructs-10.6.0.tgz",
2791
2793
  "integrity": "sha512-TxHOnBO5zMo/G76ykzGF/wMpEHu257TbWiIxP9K0Yv/+t70UzgBQiTqjkAsWOPC6jW91DzJI0+ehQV6xDRNBuQ==",
2792
- "license": "Apache-2.0"
2794
+ "license": "Apache-2.0",
2795
+ "peer": true
2793
2796
  },
2794
2797
  "node_modules/create-require": {
2795
2798
  "version": "1.1.1",
@@ -3694,6 +3697,7 @@
3694
3697
  "integrity": "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==",
3695
3698
  "dev": true,
3696
3699
  "license": "Apache-2.0",
3700
+ "peer": true,
3697
3701
  "bin": {
3698
3702
  "tsc": "bin/tsc",
3699
3703
  "tsserver": "bin/tsserver"
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@aws/ml-container-creator",
3
- "version": "0.5.0",
3
+ "version": "0.6.1",
4
4
  "description": "Generator for SageMaker AI BYOC paradigm for predictive inference use-cases.",
5
5
  "type": "module",
6
6
  "main": "src/app.js",
@@ -51,7 +51,7 @@ try {
51
51
 
52
52
  // ── Mode configuration ───────────────────────────────────────────────────────
53
53
 
54
- const DISCOVER_MODE = process.argv.includes('--discover') || process.env.DISCOVER_MODE === 'true'
54
+ const DISCOVER_MODE = process.env.DISCOVER_MODE !== 'false' && !process.argv.includes('--no-discover')
55
55
  const SMART_MODE = process.env.BEDROCK_SMART === 'true'
56
56
  const BEDROCK_MODEL = process.env.BEDROCK_MODEL || 'global.anthropic.claude-sonnet-4-20250514-v1:0'
57
57
  const BEDROCK_REGION = process.env.BEDROCK_REGION || process.env.AWS_REGION || 'us-east-1'
@@ -593,10 +593,10 @@ const isMain = process.argv[1] && resolve(process.argv[1]) === __filename
593
593
  if (isMain) {
594
594
  if (SMART_MODE) {
595
595
  log(`Smart mode enabled (model: ${BEDROCK_MODEL}, region: ${BEDROCK_REGION})`)
596
- } else if (DISCOVER_MODE) {
597
- log('Discover mode enabled (HuggingFace API lookups active)')
596
+ } else if (!DISCOVER_MODE) {
597
+ log('Static mode (catalog-only, no network calls) — use --no-discover to force this')
598
598
  } else {
599
- log('Static mode (catalog-only, no network calls)')
599
+ log('Discover mode (HuggingFace API + quota lookups active)')
600
600
  }
601
601
 
602
602
  const transport = new StdioServerTransport()
@@ -207,7 +207,7 @@ const isHuggingFacePattern = (modelName) => {
207
207
  * @returns {Promise<{ parameterCount: number, dtype: string, architecture: string, maxPositionEmbeddings: number, source: string } | null>}
208
208
  */
209
209
  const resolveModelMetadata = async (modelName, options = {}) => {
210
- const { discover = false, catalogPath } = options
210
+ const { discover = true, catalogPath } = options
211
211
 
212
212
  // Tier 1: Catalog lookup
213
213
  const catalog = await loadCatalog(catalogPath)
@@ -1,50 +1,23 @@
1
1
  {
2
2
  "catalogVersion": "1.0.0",
3
3
  "models": {
4
- "meta-llama/Llama-2-7b*": {
5
- "parameterCount": 6738415616,
6
- "defaultDtype": "float16",
7
- "architecture": "LlamaForCausalLM",
8
- "maxPositionEmbeddings": 4096,
9
- "recommendedQuantizations": ["awq", "gptq"],
10
- "minVramGb": 18,
11
- "recommendedInstances": ["ml.g5.2xlarge", "ml.g5.4xlarge"]
12
- },
13
- "meta-llama/Llama-2-13b*": {
14
- "parameterCount": 13015864320,
15
- "defaultDtype": "float16",
16
- "architecture": "LlamaForCausalLM",
17
- "maxPositionEmbeddings": 4096,
18
- "recommendedQuantizations": ["awq", "gptq"],
19
- "minVramGb": 34,
20
- "recommendedInstances": ["ml.g5.4xlarge", "ml.g5.12xlarge"]
21
- },
22
- "meta-llama/Llama-2-70b*": {
23
- "parameterCount": 68976648192,
24
- "defaultDtype": "float16",
25
- "architecture": "LlamaForCausalLM",
26
- "maxPositionEmbeddings": 4096,
27
- "recommendedQuantizations": ["awq", "gptq"],
28
- "minVramGb": 180,
29
- "recommendedInstances": ["ml.g5.48xlarge", "ml.p4d.24xlarge"]
30
- },
31
- "meta-llama/Meta-Llama-3-8B*": {
32
- "parameterCount": 8030261248,
4
+ "meta-llama/Llama-3.2-1B*": {
5
+ "parameterCount": 1235814400,
33
6
  "defaultDtype": "bfloat16",
34
7
  "architecture": "LlamaForCausalLM",
35
- "maxPositionEmbeddings": 8192,
8
+ "maxPositionEmbeddings": 131072,
36
9
  "recommendedQuantizations": ["awq", "gptq"],
37
- "minVramGb": 21,
38
- "recommendedInstances": ["ml.g5.2xlarge", "ml.g5.4xlarge"]
10
+ "minVramGb": 5,
11
+ "recommendedInstances": ["ml.g5.xlarge", "ml.g6.xlarge"]
39
12
  },
40
- "meta-llama/Meta-Llama-3-70B*": {
41
- "parameterCount": 70553706496,
13
+ "meta-llama/Llama-3.2-3B*": {
14
+ "parameterCount": 3212749824,
42
15
  "defaultDtype": "bfloat16",
43
16
  "architecture": "LlamaForCausalLM",
44
- "maxPositionEmbeddings": 8192,
17
+ "maxPositionEmbeddings": 131072,
45
18
  "recommendedQuantizations": ["awq", "gptq"],
46
- "minVramGb": 184,
47
- "recommendedInstances": ["ml.g5.48xlarge", "ml.p4d.24xlarge"]
19
+ "minVramGb": 9,
20
+ "recommendedInstances": ["ml.g5.xlarge", "ml.g6.xlarge"]
48
21
  },
49
22
  "meta-llama/Llama-3.1-8B*": {
50
23
  "parameterCount": 8030261248,
@@ -55,104 +28,176 @@
55
28
  "minVramGb": 20,
56
29
  "recommendedInstances": ["ml.g5.2xlarge", "ml.g6.2xlarge"]
57
30
  },
58
- "meta-llama/Llama-3.2-1B*": {
59
- "parameterCount": 1235814400,
31
+ "meta-llama/Llama-3.3-70B*": {
32
+ "parameterCount": 70553706496,
60
33
  "defaultDtype": "bfloat16",
61
34
  "architecture": "LlamaForCausalLM",
62
35
  "maxPositionEmbeddings": 131072,
36
+ "recommendedQuantizations": ["awq", "gptq", "fp8"],
37
+ "minVramGb": 184,
38
+ "recommendedInstances": ["ml.g5.48xlarge", "ml.p4d.24xlarge", "ml.g6e.48xlarge"]
39
+ },
40
+ "Qwen/Qwen3-0.6B*": {
41
+ "parameterCount": 600000000,
42
+ "defaultDtype": "bfloat16",
43
+ "architecture": "Qwen3ForCausalLM",
44
+ "maxPositionEmbeddings": 32768,
63
45
  "recommendedQuantizations": ["awq", "gptq"],
64
- "minVramGb": 5,
46
+ "minVramGb": 3,
65
47
  "recommendedInstances": ["ml.g5.xlarge", "ml.g6.xlarge"]
66
48
  },
67
- "meta-llama/Llama-3.2-3B*": {
68
- "parameterCount": 3212749824,
49
+ "Qwen/Qwen3-1.7B*": {
50
+ "parameterCount": 1700000000,
69
51
  "defaultDtype": "bfloat16",
70
- "architecture": "LlamaForCausalLM",
71
- "maxPositionEmbeddings": 131072,
52
+ "architecture": "Qwen3ForCausalLM",
53
+ "maxPositionEmbeddings": 32768,
72
54
  "recommendedQuantizations": ["awq", "gptq"],
73
- "minVramGb": 9,
55
+ "minVramGb": 6,
74
56
  "recommendedInstances": ["ml.g5.xlarge", "ml.g6.xlarge"]
75
57
  },
76
- "mistralai/Mistral-7B*": {
77
- "parameterCount": 7241732096,
58
+ "Qwen/Qwen3-4B*": {
59
+ "parameterCount": 4000000000,
78
60
  "defaultDtype": "bfloat16",
79
- "architecture": "MistralForCausalLM",
61
+ "architecture": "Qwen3ForCausalLM",
80
62
  "maxPositionEmbeddings": 32768,
81
63
  "recommendedQuantizations": ["awq", "gptq"],
82
- "minVramGb": 19,
83
- "recommendedInstances": ["ml.g5.2xlarge", "ml.g5.4xlarge"]
64
+ "minVramGb": 11,
65
+ "recommendedInstances": ["ml.g5.xlarge", "ml.g6.xlarge"]
84
66
  },
85
- "mistralai/Mixtral-8x7B*": {
86
- "parameterCount": 46702792704,
67
+ "Qwen/Qwen3-8B*": {
68
+ "parameterCount": 8000000000,
87
69
  "defaultDtype": "bfloat16",
88
- "architecture": "MixtralForCausalLM",
70
+ "architecture": "Qwen3ForCausalLM",
89
71
  "maxPositionEmbeddings": 32768,
90
72
  "recommendedQuantizations": ["awq", "gptq"],
91
- "minVramGb": 122,
92
- "recommendedInstances": ["ml.g5.48xlarge", "ml.p4d.24xlarge"]
73
+ "minVramGb": 20,
74
+ "recommendedInstances": ["ml.g5.2xlarge", "ml.g6.2xlarge"]
93
75
  },
94
- "Qwen/Qwen-7B*": {
95
- "parameterCount": 7721324544,
76
+ "Qwen/Qwen3-14B*": {
77
+ "parameterCount": 14000000000,
96
78
  "defaultDtype": "bfloat16",
97
- "architecture": "QWenLMHeadModel",
98
- "maxPositionEmbeddings": 8192,
79
+ "architecture": "Qwen3ForCausalLM",
80
+ "maxPositionEmbeddings": 32768,
99
81
  "recommendedQuantizations": ["awq", "gptq"],
100
- "minVramGb": 20,
101
- "recommendedInstances": ["ml.g5.2xlarge", "ml.g5.4xlarge"]
82
+ "minVramGb": 37,
83
+ "recommendedInstances": ["ml.g5.4xlarge", "ml.g5.12xlarge"]
84
+ },
85
+ "Qwen/Qwen3-32B*": {
86
+ "parameterCount": 32000000000,
87
+ "defaultDtype": "bfloat16",
88
+ "architecture": "Qwen3ForCausalLM",
89
+ "maxPositionEmbeddings": 32768,
90
+ "recommendedQuantizations": ["awq", "gptq"],
91
+ "minVramGb": 84,
92
+ "recommendedInstances": ["ml.g5.12xlarge", "ml.g5.48xlarge"]
102
93
  },
103
- "Qwen/Qwen2-7B*": {
94
+ "Qwen/Qwen2.5-7B*": {
104
95
  "parameterCount": 7721324544,
105
96
  "defaultDtype": "bfloat16",
106
97
  "architecture": "Qwen2ForCausalLM",
107
- "maxPositionEmbeddings": 32768,
98
+ "maxPositionEmbeddings": 131072,
108
99
  "recommendedQuantizations": ["awq", "gptq"],
109
100
  "minVramGb": 20,
110
- "recommendedInstances": ["ml.g5.2xlarge", "ml.g5.4xlarge"]
101
+ "recommendedInstances": ["ml.g5.2xlarge", "ml.g6.2xlarge"]
111
102
  },
112
- "Qwen/Qwen-14B*": {
103
+ "Qwen/Qwen2.5-14B*": {
113
104
  "parameterCount": 14167134208,
114
105
  "defaultDtype": "bfloat16",
115
- "architecture": "QWenLMHeadModel",
116
- "maxPositionEmbeddings": 8192,
106
+ "architecture": "Qwen2ForCausalLM",
107
+ "maxPositionEmbeddings": 131072,
117
108
  "recommendedQuantizations": ["awq", "gptq"],
118
109
  "minVramGb": 37,
119
110
  "recommendedInstances": ["ml.g5.4xlarge", "ml.g5.12xlarge"]
120
111
  },
121
- "Qwen/Qwen2-14B*": {
122
- "parameterCount": 14167134208,
112
+ "Qwen/Qwen2.5-32B*": {
113
+ "parameterCount": 32000000000,
123
114
  "defaultDtype": "bfloat16",
124
115
  "architecture": "Qwen2ForCausalLM",
125
- "maxPositionEmbeddings": 32768,
116
+ "maxPositionEmbeddings": 131072,
126
117
  "recommendedQuantizations": ["awq", "gptq"],
127
- "minVramGb": 37,
128
- "recommendedInstances": ["ml.g5.4xlarge", "ml.g5.12xlarge"]
118
+ "minVramGb": 84,
119
+ "recommendedInstances": ["ml.g5.12xlarge", "ml.g5.48xlarge"]
129
120
  },
130
- "Qwen/Qwen-72B*": {
121
+ "Qwen/Qwen2.5-72B*": {
131
122
  "parameterCount": 72710410240,
132
123
  "defaultDtype": "bfloat16",
133
- "architecture": "QWenLMHeadModel",
134
- "maxPositionEmbeddings": 32768,
135
- "recommendedQuantizations": ["awq", "gptq"],
124
+ "architecture": "Qwen2ForCausalLM",
125
+ "maxPositionEmbeddings": 131072,
126
+ "recommendedQuantizations": ["awq", "gptq", "fp8"],
136
127
  "minVramGb": 190,
137
- "recommendedInstances": ["ml.g5.48xlarge", "ml.p4d.24xlarge"]
128
+ "recommendedInstances": ["ml.g5.48xlarge", "ml.p4d.24xlarge", "ml.g6e.48xlarge"]
138
129
  },
139
- "Qwen/Qwen2-72B*": {
140
- "parameterCount": 72710410240,
130
+ "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B*": {
131
+ "parameterCount": 1500000000,
141
132
  "defaultDtype": "bfloat16",
142
133
  "architecture": "Qwen2ForCausalLM",
143
- "maxPositionEmbeddings": 32768,
134
+ "maxPositionEmbeddings": 131072,
144
135
  "recommendedQuantizations": ["awq", "gptq"],
145
- "minVramGb": 190,
146
- "recommendedInstances": ["ml.g5.48xlarge", "ml.p4d.24xlarge"]
147
- },
148
- "EleutherAI/gpt-neox-20b*": {
149
- "parameterCount": 20554568704,
150
- "defaultDtype": "float16",
151
- "architecture": "GPTNeoXForCausalLM",
152
- "maxPositionEmbeddings": 2048,
153
- "recommendedQuantizations": ["gptq"],
154
- "minVramGb": 54,
136
+ "minVramGb": 5,
137
+ "recommendedInstances": ["ml.g5.xlarge", "ml.g6.xlarge"]
138
+ },
139
+ "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B*": {
140
+ "parameterCount": 7000000000,
141
+ "defaultDtype": "bfloat16",
142
+ "architecture": "Qwen2ForCausalLM",
143
+ "maxPositionEmbeddings": 131072,
144
+ "recommendedQuantizations": ["awq", "gptq"],
145
+ "minVramGb": 18,
146
+ "recommendedInstances": ["ml.g5.2xlarge", "ml.g6.2xlarge"]
147
+ },
148
+ "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B*": {
149
+ "parameterCount": 14000000000,
150
+ "defaultDtype": "bfloat16",
151
+ "architecture": "Qwen2ForCausalLM",
152
+ "maxPositionEmbeddings": 131072,
153
+ "recommendedQuantizations": ["awq", "gptq"],
154
+ "minVramGb": 37,
155
+ "recommendedInstances": ["ml.g5.4xlarge", "ml.g5.12xlarge"]
156
+ },
157
+ "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B*": {
158
+ "parameterCount": 32000000000,
159
+ "defaultDtype": "bfloat16",
160
+ "architecture": "Qwen2ForCausalLM",
161
+ "maxPositionEmbeddings": 131072,
162
+ "recommendedQuantizations": ["awq", "gptq"],
163
+ "minVramGb": 84,
164
+ "recommendedInstances": ["ml.g5.12xlarge", "ml.g5.48xlarge"]
165
+ },
166
+ "deepseek-ai/DeepSeek-R1-Distill-Llama-8B*": {
167
+ "parameterCount": 8000000000,
168
+ "defaultDtype": "bfloat16",
169
+ "architecture": "LlamaForCausalLM",
170
+ "maxPositionEmbeddings": 131072,
171
+ "recommendedQuantizations": ["awq", "gptq"],
172
+ "minVramGb": 20,
173
+ "recommendedInstances": ["ml.g5.2xlarge", "ml.g6.2xlarge"]
174
+ },
175
+ "deepseek-ai/DeepSeek-R1-Distill-Llama-70B*": {
176
+ "parameterCount": 70000000000,
177
+ "defaultDtype": "bfloat16",
178
+ "architecture": "LlamaForCausalLM",
179
+ "maxPositionEmbeddings": 131072,
180
+ "recommendedQuantizations": ["awq", "gptq", "fp8"],
181
+ "minVramGb": 184,
182
+ "recommendedInstances": ["ml.g5.48xlarge", "ml.p4d.24xlarge", "ml.g6e.48xlarge"]
183
+ },
184
+ "openai/gpt-oss-20b*": {
185
+ "parameterCount": 20000000000,
186
+ "defaultDtype": "bfloat16",
187
+ "architecture": "GPT2LMHeadModel",
188
+ "maxPositionEmbeddings": 8192,
189
+ "recommendedQuantizations": ["awq", "gptq"],
190
+ "minVramGb": 52,
155
191
  "recommendedInstances": ["ml.g5.12xlarge", "ml.g5.48xlarge"]
192
+ },
193
+ "openai/gpt-oss-120b*": {
194
+ "parameterCount": 120000000000,
195
+ "defaultDtype": "bfloat16",
196
+ "architecture": "GPT2LMHeadModel",
197
+ "maxPositionEmbeddings": 8192,
198
+ "recommendedQuantizations": ["awq", "gptq", "fp8"],
199
+ "minVramGb": 312,
200
+ "recommendedInstances": ["ml.p4d.24xlarge", "ml.p5.48xlarge"]
156
201
  }
157
202
  }
158
203
  }