@aws/ml-container-creator 0.2.6 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/cli.js +38 -2
- package/config/bootstrap-stack.json +94 -1
- package/config/defaults.json +1 -1
- package/infra/ci-harness/package-lock.json +22 -9
- package/package.json +3 -1
- package/servers/instance-sizer/index.js +45 -8
- package/servers/instance-sizer/lib/instance-ranker.js +140 -11
- package/servers/instance-sizer/lib/model-resolver.js +10 -6
- package/servers/instance-sizer/lib/quota-resolver.js +368 -0
- package/servers/instance-sizer/package.json +2 -0
- package/servers/lib/catalogs/instances.json +527 -12
- package/servers/lib/catalogs/model-servers.json +298 -20
- package/servers/lib/catalogs/model-sizes.json +27 -0
- package/servers/lib/catalogs/models.json +101 -0
- package/servers/lib/schemas/image-catalog.schema.json +15 -1
- package/servers/model-picker/index.js +2 -1
- package/src/app.js +96 -2
- package/src/lib/architecture-sync.js +171 -0
- package/src/lib/arn-detection.js +22 -0
- package/src/lib/bootstrap-command-handler.js +178 -3
- package/src/lib/cli-handler.js +2 -2
- package/src/lib/config-manager.js +121 -1
- package/src/lib/cross-cutting-checker.js +119 -0
- package/src/lib/deployment-entry-schema.js +1 -2
- package/src/lib/prompt-runner.js +514 -20
- package/src/lib/prompts.js +67 -5
- package/src/lib/registry-command-handler.js +236 -0
- package/src/lib/schema-sync.js +31 -0
- package/src/lib/secret-classification.js +56 -0
- package/src/lib/secrets-command-handler.js +550 -0
- package/src/lib/template-manager.js +49 -1
- package/src/lib/validate-runner.js +174 -2
- package/src/lib/validation-report.js +8 -1
- package/src/prompt-adapter.js +3 -2
- package/templates/Dockerfile +10 -2
- package/templates/code/cuda_compat.sh +22 -0
- package/templates/code/serve +3 -0
- package/templates/code/start_server.sh +3 -0
- package/templates/diffusors/Dockerfile +2 -1
- package/templates/diffusors/serve +3 -0
- package/templates/do/README.md +33 -0
- package/templates/do/benchmark +646 -0
- package/templates/do/build +22 -0
- package/templates/do/clean +86 -0
- package/templates/do/config +41 -6
- package/templates/do/deploy +66 -6
- package/templates/do/logs +18 -3
- package/templates/do/register +8 -1
- package/templates/do/run +10 -0
- package/templates/triton/Dockerfile +5 -0
package/bin/cli.js
CHANGED
|
@@ -90,6 +90,9 @@ program
|
|
|
90
90
|
|
|
91
91
|
// --- Authentication ---
|
|
92
92
|
.addOption(new Option('--hf-token <token>', 'HuggingFace token (or "$HF_TOKEN" for env var reference)'))
|
|
93
|
+
.addOption(new Option('--hf-token-arn <arn>', 'HuggingFace token ARN from Secrets Manager'))
|
|
94
|
+
.addOption(new Option('--ngc-token <token>', 'NVIDIA NGC token (or "$NGC_API_KEY" for env var reference)'))
|
|
95
|
+
.addOption(new Option('--ngc-token-arn <arn>', 'NVIDIA NGC token ARN from Secrets Manager'))
|
|
93
96
|
|
|
94
97
|
// --- Optional Features ---
|
|
95
98
|
.addOption(new Option('--include-sample', 'Include sample model code'))
|
|
@@ -106,7 +109,18 @@ program
|
|
|
106
109
|
.addOption(new Option('--validate-with-docker', 'Enable Docker introspection validation (opt-in)'))
|
|
107
110
|
.addOption(new Option('--offline', 'Disable HuggingFace API lookups'))
|
|
108
111
|
|
|
109
|
-
.action((projectNameArgs, options) =>
|
|
112
|
+
.action((projectNameArgs, options) => {
|
|
113
|
+
// Mutual exclusion validation: plaintext token and ARN flags cannot both be provided
|
|
114
|
+
if (options.hfToken && options.hfTokenArn) {
|
|
115
|
+
console.error('❌ Cannot specify both --hf-token and --hf-token-arn. Use one or the other.');
|
|
116
|
+
process.exit(1);
|
|
117
|
+
}
|
|
118
|
+
if (options.ngcToken && options.ngcTokenArn) {
|
|
119
|
+
console.error('❌ Cannot specify both --ngc-token and --ngc-token-arn. Use one or the other.');
|
|
120
|
+
process.exit(1);
|
|
121
|
+
}
|
|
122
|
+
return run(projectNameArgs?.[0] || null, options);
|
|
123
|
+
});
|
|
110
124
|
|
|
111
125
|
// Custom help formatting — group options into logical sections (root command only)
|
|
112
126
|
program.configureHelp({
|
|
@@ -174,7 +188,7 @@ program.configureHelp({
|
|
|
174
188
|
groups.hyperpod.push(opt);
|
|
175
189
|
} else if (['--model-env', '--server-env'].includes(long)) {
|
|
176
190
|
groups.env.push(opt);
|
|
177
|
-
} else if (['--hf-token'].includes(long)) {
|
|
191
|
+
} else if (['--hf-token', '--hf-token-arn', '--ngc-token', '--ngc-token-arn'].includes(long)) {
|
|
178
192
|
groups.auth.push(opt);
|
|
179
193
|
} else if (['--include-sample', '--include-testing', '--test-types'].includes(long)) {
|
|
180
194
|
groups.features.push(opt);
|
|
@@ -255,6 +269,7 @@ program
|
|
|
255
269
|
.option('--ci', 'Provision CI integration infrastructure')
|
|
256
270
|
.option('--skip-ci', 'Skip CI infrastructure provisioning')
|
|
257
271
|
.option('--skip-s3', 'Skip S3 bucket creation')
|
|
272
|
+
.option('--skip-post-setup', 'Skip post-setup chain (mcp init, sync-architectures, sync-schemas)')
|
|
258
273
|
.action(async (action, args, options) => {
|
|
259
274
|
const { default: BootstrapCommandHandler } = await import('../src/lib/bootstrap-command-handler.js');
|
|
260
275
|
const handler = new BootstrapCommandHandler();
|
|
@@ -314,12 +329,33 @@ program
|
|
|
314
329
|
.option('--project', 'Use project-level registry')
|
|
315
330
|
.option('--parameters <json>', 'Parameters JSON string')
|
|
316
331
|
.option('--generator-version <version>', 'Generator version')
|
|
332
|
+
// Options used by `registry list-architectures`
|
|
333
|
+
.option('--server <name>', 'Filter by server name (for list-architectures)')
|
|
334
|
+
.option('--verbose', 'Show full list of supported model types (for list-architectures)')
|
|
317
335
|
.action(async (action, args, options) => {
|
|
318
336
|
const { default: RegistryCommandHandler } = await import('../src/lib/registry-command-handler.js');
|
|
319
337
|
const handler = new RegistryCommandHandler();
|
|
320
338
|
await handler.handle([action, ...args], options);
|
|
321
339
|
});
|
|
322
340
|
|
|
341
|
+
program
|
|
342
|
+
.command('secrets')
|
|
343
|
+
.description('Manage secrets in AWS Secrets Manager (create, list, describe)')
|
|
344
|
+
.argument('[action]', 'Secrets action (create, list, describe)')
|
|
345
|
+
.argument('[args...]', 'Additional arguments')
|
|
346
|
+
.option('--type <type>', 'Secret type (e.g., hf-token, ngc-token)')
|
|
347
|
+
.option('--name <label>', 'Secret label (used in naming convention)')
|
|
348
|
+
.option('--secret-value <value>', 'Secret value (masked in terminal)')
|
|
349
|
+
.option('--description <text>', 'Secret description')
|
|
350
|
+
.option('--kms-key-id <key>', 'KMS key for encryption')
|
|
351
|
+
.option('--json <json-or-path>', 'JSON input (inline or file://path)')
|
|
352
|
+
.action(async (action, args, options) => {
|
|
353
|
+
const { default: SecretsCommandHandler } = await import('../src/lib/secrets-command-handler.js');
|
|
354
|
+
const handler = new SecretsCommandHandler();
|
|
355
|
+
const allArgs = action ? [action, ...args] : [];
|
|
356
|
+
await handler.handle(allArgs, options);
|
|
357
|
+
});
|
|
358
|
+
|
|
323
359
|
program
|
|
324
360
|
.command('configure')
|
|
325
361
|
.description('Interactive configuration setup (experimental)')
|
package/config/bootstrap-stack.json
CHANGED
|
@@ -67,6 +67,37 @@
|
|
|
67
67
|
],
|
|
68
68
|
"Resource": "*"
|
|
69
69
|
},
|
|
70
|
+
{
|
|
71
|
+
"Sid": "SageMakerBenchmarking",
|
|
72
|
+
"Effect": "Allow",
|
|
73
|
+
"Action": [
|
|
74
|
+
"sagemaker:CreateAIBenchmarkJob",
|
|
75
|
+
"sagemaker:DescribeAIBenchmarkJob",
|
|
76
|
+
"sagemaker:ListAIBenchmarkJobs",
|
|
77
|
+
"sagemaker:StopAIBenchmarkJob",
|
|
78
|
+
"sagemaker:DeleteAIBenchmarkJob",
|
|
79
|
+
"sagemaker:CreateAIWorkloadConfig",
|
|
80
|
+
"sagemaker:DescribeAIWorkloadConfig",
|
|
81
|
+
"sagemaker:ListAIWorkloadConfigs",
|
|
82
|
+
"sagemaker:DeleteAIWorkloadConfig",
|
|
83
|
+
"sagemaker:CreateTrainingJob",
|
|
84
|
+
"sagemaker:DescribeTrainingJob",
|
|
85
|
+
"sagemaker:StopTrainingJob",
|
|
86
|
+
"sagemaker:AddTags"
|
|
87
|
+
],
|
|
88
|
+
"Resource": "*"
|
|
89
|
+
},
|
|
90
|
+
{
|
|
91
|
+
"Sid": "PassRoleToSageMaker",
|
|
92
|
+
"Effect": "Allow",
|
|
93
|
+
"Action": "iam:PassRole",
|
|
94
|
+
"Resource": { "Fn::Sub": "arn:aws:iam::${AWS::AccountId}:role/mlcc-sagemaker-execution-role" },
|
|
95
|
+
"Condition": {
|
|
96
|
+
"StringEquals": {
|
|
97
|
+
"iam:PassedToService": "sagemaker.amazonaws.com"
|
|
98
|
+
}
|
|
99
|
+
}
|
|
100
|
+
},
|
|
70
101
|
{
|
|
71
102
|
"Sid": "ECRPull",
|
|
72
103
|
"Effect": "Allow",
|
|
@@ -76,7 +107,7 @@
|
|
|
76
107
|
"ecr:GetDownloadUrlForLayer",
|
|
77
108
|
"ecr:BatchGetImage"
|
|
78
109
|
],
|
|
79
|
-
"Resource":
|
|
110
|
+
"Resource": "*"
|
|
80
111
|
},
|
|
81
112
|
{
|
|
82
113
|
"Sid": "ECRAuth",
|
|
@@ -99,12 +130,51 @@
|
|
|
99
130
|
"Effect": "Allow",
|
|
100
131
|
"Action": [
|
|
101
132
|
"s3:GetObject",
|
|
133
|
+
"s3:PutObject",
|
|
102
134
|
"s3:ListBucket"
|
|
103
135
|
],
|
|
104
136
|
"Resource": [
|
|
105
137
|
"arn:aws:s3:::ml-container-creator-*",
|
|
106
138
|
"arn:aws:s3:::ml-container-creator-*/*"
|
|
107
139
|
]
|
|
140
|
+
},
|
|
141
|
+
{
|
|
142
|
+
"Sid": "SecretsManagerRead",
|
|
143
|
+
"Effect": "Allow",
|
|
144
|
+
"Action": [
|
|
145
|
+
"secretsmanager:GetSecretValue",
|
|
146
|
+
"secretsmanager:DescribeSecret"
|
|
147
|
+
],
|
|
148
|
+
"Resource": [
|
|
149
|
+
"arn:aws:secretsmanager:*:*:secret:mlcc/*",
|
|
150
|
+
"arn:aws:secretsmanager:*:*:secret:ml-container-creator/*"
|
|
151
|
+
]
|
|
152
|
+
},
|
|
153
|
+
{
|
|
154
|
+
"Sid": "SecretsManagerWrite",
|
|
155
|
+
"Effect": "Allow",
|
|
156
|
+
"Action": [
|
|
157
|
+
"secretsmanager:CreateSecret",
|
|
158
|
+
"secretsmanager:PutSecretValue",
|
|
159
|
+
"secretsmanager:TagResource"
|
|
160
|
+
],
|
|
161
|
+
"Resource": [
|
|
162
|
+
"arn:aws:secretsmanager:*:*:secret:mlcc/*",
|
|
163
|
+
"arn:aws:secretsmanager:*:*:secret:ml-container-creator/*"
|
|
164
|
+
]
|
|
165
|
+
},
|
|
166
|
+
{
|
|
167
|
+
"Sid": "QuotaAndAvailability",
|
|
168
|
+
"Effect": "Allow",
|
|
169
|
+
"Action": [
|
|
170
|
+
"service-quotas:GetServiceQuota",
|
|
171
|
+
"service-quotas:ListServiceQuotas",
|
|
172
|
+
"ec2:DescribeCapacityReservations",
|
|
173
|
+
"sagemaker:ListTrainingPlans",
|
|
174
|
+
"sagemaker:DescribeTrainingPlan",
|
|
175
|
+
"sagemaker:ListEndpoints"
|
|
176
|
+
],
|
|
177
|
+
"Resource": "*"
|
|
108
178
|
}
|
|
109
179
|
]
|
|
110
180
|
}
|
|
@@ -171,6 +241,25 @@
|
|
|
171
241
|
{ "Key": "mlcc:created-by", "Value": "bootstrap" }
|
|
172
242
|
]
|
|
173
243
|
}
|
|
244
|
+
},
|
|
245
|
+
|
|
246
|
+
"BenchmarkS3Bucket": {
|
|
247
|
+
"Type": "AWS::S3::Bucket",
|
|
248
|
+
"DeletionPolicy": "Retain",
|
|
249
|
+
"UpdateReplacePolicy": "Retain",
|
|
250
|
+
"Properties": {
|
|
251
|
+
"BucketName": { "Fn::Sub": "ml-container-creator-benchmark-${AWS::Region}-${AWS::AccountId}" },
|
|
252
|
+
"VersioningConfiguration": { "Status": "Enabled" },
|
|
253
|
+
"BucketEncryption": {
|
|
254
|
+
"ServerSideEncryptionConfiguration": [
|
|
255
|
+
{ "ServerSideEncryptionByDefault": { "SSEAlgorithm": "AES256" } }
|
|
256
|
+
]
|
|
257
|
+
},
|
|
258
|
+
"Tags": [
|
|
259
|
+
{ "Key": "mlcc:managed-by", "Value": "ml-container-creator" },
|
|
260
|
+
{ "Key": "mlcc:created-by", "Value": "bootstrap" }
|
|
261
|
+
]
|
|
262
|
+
}
|
|
174
263
|
}
|
|
175
264
|
},
|
|
176
265
|
|
|
@@ -203,6 +292,10 @@
|
|
|
203
292
|
"Description": "S3 bucket for batch transform I/O",
|
|
204
293
|
"Value": { "Ref": "BatchS3Bucket" }
|
|
205
294
|
},
|
|
295
|
+
"BenchmarkS3BucketName": {
|
|
296
|
+
"Description": "S3 bucket for benchmark results output",
|
|
297
|
+
"Value": { "Ref": "BenchmarkS3Bucket" }
|
|
298
|
+
},
|
|
206
299
|
"StackVersion": {
|
|
207
300
|
"Description": "Bootstrap stack template version for forward compatibility tracking",
|
|
208
301
|
"Value": "2026-05-04"
|
package/config/defaults.json
CHANGED
|
package/infra/ci-harness/package-lock.json
CHANGED
|
@@ -48,7 +48,6 @@
|
|
|
48
48
|
"semver"
|
|
49
49
|
],
|
|
50
50
|
"license": "Apache-2.0",
|
|
51
|
-
"peer": true,
|
|
52
51
|
"dependencies": {
|
|
53
52
|
"jsonschema": "~1.4.1",
|
|
54
53
|
"semver": "^7.7.4"
|
|
@@ -2151,7 +2150,6 @@
|
|
|
2151
2150
|
"integrity": "sha512-wGdMcf+vPYM6jikpS/qhg6WiqSV/OhG+jeeHT/KlVqxYfD40iYJf9/AE1uQxVWFvU7MipKRkRv8NSHiCGgPr8Q==",
|
|
2152
2151
|
"dev": true,
|
|
2153
2152
|
"license": "MIT",
|
|
2154
|
-
"peer": true,
|
|
2155
2153
|
"dependencies": {
|
|
2156
2154
|
"undici-types": "~6.21.0"
|
|
2157
2155
|
}
|
|
@@ -2791,8 +2789,7 @@
|
|
|
2791
2789
|
"version": "10.6.0",
|
|
2792
2790
|
"resolved": "https://registry.npmjs.org/constructs/-/constructs-10.6.0.tgz",
|
|
2793
2791
|
"integrity": "sha512-TxHOnBO5zMo/G76ykzGF/wMpEHu257TbWiIxP9K0Yv/+t70UzgBQiTqjkAsWOPC6jW91DzJI0+ehQV6xDRNBuQ==",
|
|
2794
|
-
"license": "Apache-2.0"
|
|
2795
|
-
"peer": true
|
|
2792
|
+
"license": "Apache-2.0"
|
|
2796
2793
|
},
|
|
2797
2794
|
"node_modules/create-require": {
|
|
2798
2795
|
"version": "1.1.1",
|
|
@@ -2937,9 +2934,9 @@
|
|
|
2937
2934
|
}
|
|
2938
2935
|
},
|
|
2939
2936
|
"node_modules/fast-xml-builder": {
|
|
2940
|
-
"version": "1.
|
|
2941
|
-
"resolved": "https://registry.npmjs.org/fast-xml-builder/-/fast-xml-builder-1.
|
|
2942
|
-
"integrity": "sha512-
|
|
2937
|
+
"version": "1.2.0",
|
|
2938
|
+
"resolved": "https://registry.npmjs.org/fast-xml-builder/-/fast-xml-builder-1.2.0.tgz",
|
|
2939
|
+
"integrity": "sha512-00aAWieqff+ZJhsXA4g1g7M8k+7AYoMUUHF+/zFb5U6Uv/P0Vl4QZo84/IcufzYalLuEj9928bXN9PbbFzMF0Q==",
|
|
2943
2940
|
"dev": true,
|
|
2944
2941
|
"funding": [
|
|
2945
2942
|
{
|
|
@@ -2949,7 +2946,8 @@
|
|
|
2949
2946
|
],
|
|
2950
2947
|
"license": "MIT",
|
|
2951
2948
|
"dependencies": {
|
|
2952
|
-
"path-expression-matcher": "^1.
|
|
2949
|
+
"path-expression-matcher": "^1.5.0",
|
|
2950
|
+
"xml-naming": "^0.1.0"
|
|
2953
2951
|
}
|
|
2954
2952
|
},
|
|
2955
2953
|
"node_modules/fast-xml-parser": {
|
|
@@ -3696,7 +3694,6 @@
|
|
|
3696
3694
|
"integrity": "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==",
|
|
3697
3695
|
"dev": true,
|
|
3698
3696
|
"license": "Apache-2.0",
|
|
3699
|
-
"peer": true,
|
|
3700
3697
|
"bin": {
|
|
3701
3698
|
"tsc": "bin/tsc",
|
|
3702
3699
|
"tsserver": "bin/tsserver"
|
|
@@ -3837,6 +3834,22 @@
|
|
|
3837
3834
|
"url": "https://github.com/chalk/ansi-styles?sponsor=1"
|
|
3838
3835
|
}
|
|
3839
3836
|
},
|
|
3837
|
+
"node_modules/xml-naming": {
|
|
3838
|
+
"version": "0.1.0",
|
|
3839
|
+
"resolved": "https://registry.npmjs.org/xml-naming/-/xml-naming-0.1.0.tgz",
|
|
3840
|
+
"integrity": "sha512-k8KO9hrMyNk6tUWqUfkTEZbezRRpONVOzUTnc97VnCvyj6Tf9lyUR9EDAIeiVLv56jsMcoXEwjW8Kv5yPY52lw==",
|
|
3841
|
+
"dev": true,
|
|
3842
|
+
"funding": [
|
|
3843
|
+
{
|
|
3844
|
+
"type": "github",
|
|
3845
|
+
"url": "https://github.com/sponsors/NaturalIntelligence"
|
|
3846
|
+
}
|
|
3847
|
+
],
|
|
3848
|
+
"license": "MIT",
|
|
3849
|
+
"engines": {
|
|
3850
|
+
"node": ">=16.0.0"
|
|
3851
|
+
}
|
|
3852
|
+
},
|
|
3840
3853
|
"node_modules/y18n": {
|
|
3841
3854
|
"version": "5.0.8",
|
|
3842
3855
|
"resolved": "https://registry.npmjs.org/y18n/-/y18n-5.0.8.tgz",
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@aws/ml-container-creator",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.4.0",
|
|
4
4
|
"description": "Generator for SageMaker AI BYOC paradigm for predictive inference use-cases.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "src/app.js",
|
|
@@ -111,6 +111,8 @@
|
|
|
111
111
|
"tinyglobby": "^0.2.16"
|
|
112
112
|
},
|
|
113
113
|
"devDependencies": {
|
|
114
|
+
"@aws-sdk/client-sagemaker": "^3.700.0",
|
|
115
|
+
"@aws-sdk/client-service-quotas": "^3.700.0",
|
|
114
116
|
"@microsoft/eslint-formatter-sarif": "^3.1.0",
|
|
115
117
|
"eslint": "^8.57.0",
|
|
116
118
|
"fast-check": "^4.5.2",
|
package/servers/instance-sizer/index.js
CHANGED
|
@@ -26,7 +26,8 @@ import { fileURLToPath } from 'node:url'
|
|
|
26
26
|
import { resolve, dirname } from 'node:path'
|
|
27
27
|
import { resolveModelMetadata } from './lib/model-resolver.js'
|
|
28
28
|
import { estimateVram } from './lib/vram-estimator.js'
|
|
29
|
-
import { filterAndRankInstances } from './lib/instance-ranker.js'
|
|
29
|
+
import { filterAndRankInstances, applyAvailabilityRanking } from './lib/instance-ranker.js'
|
|
30
|
+
import { QuotaResolver } from './lib/quota-resolver.js'
|
|
30
31
|
import { queryBedrock } from '../lib/bedrock-client.js'
|
|
31
32
|
|
|
32
33
|
// ── Path setup ───────────────────────────────────────────────────────────────
|
|
@@ -114,11 +115,11 @@ function log(message) {
|
|
|
114
115
|
* @param {string} search - Search query string
|
|
115
116
|
* @param {object} instanceCatalog - Instance catalog object
|
|
116
117
|
* @param {object} [options={}]
|
|
117
|
-
* @param {number} [options.limit=
|
|
118
|
+
* @param {number} [options.limit=10] - Max results
|
|
118
119
|
* @returns {string[]} Matching instance type names, sorted by relevance
|
|
119
120
|
*/
|
|
120
121
|
function searchInstancesByTag(search, instanceCatalog, options = {}) {
|
|
121
|
-
const { limit =
|
|
122
|
+
const { limit = 10 } = options
|
|
122
123
|
const candidates = Object.entries(instanceCatalog)
|
|
123
124
|
|
|
124
125
|
// Tokenize search into lowercase keywords
|
|
@@ -236,7 +237,7 @@ async function handleGetInstanceRecommendation(params) {
|
|
|
236
237
|
maxSequenceLength,
|
|
237
238
|
batchSize,
|
|
238
239
|
cudaVersion,
|
|
239
|
-
limit =
|
|
240
|
+
limit = 10,
|
|
240
241
|
context
|
|
241
242
|
} = params
|
|
242
243
|
|
|
@@ -361,11 +362,14 @@ async function handleGetInstanceRecommendation(params) {
|
|
|
361
362
|
}
|
|
362
363
|
|
|
363
364
|
// Step 2: Estimate VRAM
|
|
365
|
+
// Use model's max_position_embeddings as the sequence length when no explicit value is provided.
|
|
366
|
+
// This ensures KV cache is sized for the model's actual context window, not the 4096 default.
|
|
367
|
+
const resolvedMaxSeqLen = effectiveMaxSeqLen || modelMetadata.maxPositionEmbeddings || undefined
|
|
364
368
|
const vramEstimate = estimateVram({
|
|
365
369
|
parameterCount: modelMetadata.parameterCount,
|
|
366
370
|
dtype: modelMetadata.dtype,
|
|
367
371
|
quantization: quantization || undefined,
|
|
368
|
-
maxSequenceLength:
|
|
372
|
+
maxSequenceLength: resolvedMaxSeqLen,
|
|
369
373
|
batchSize: effectiveBatchSize || undefined
|
|
370
374
|
})
|
|
371
375
|
|
|
@@ -376,6 +380,38 @@ async function handleGetInstanceRecommendation(params) {
|
|
|
376
380
|
{ limit }
|
|
377
381
|
)
|
|
378
382
|
|
|
383
|
+
// Step 3a: Quota & availability filtering (discover mode only)
|
|
384
|
+
let preQuotaFilterCount = 0
|
|
385
|
+
let allFilteredByQuota = false
|
|
386
|
+
if (DISCOVER_MODE && recommendations.length > 0) {
|
|
387
|
+
try {
|
|
388
|
+
const region = process.env.AWS_REGION || process.env.AWS_DEFAULT_REGION || BEDROCK_REGION
|
|
389
|
+
const quotaResolver = new QuotaResolver(region)
|
|
390
|
+
|
|
391
|
+
const instanceTypes = recommendations.map(r => r.instanceType)
|
|
392
|
+
const [quotas, reservations, ftps] = await Promise.allSettled([
|
|
393
|
+
quotaResolver.getQuotaHeadroom(instanceTypes),
|
|
394
|
+
quotaResolver.getCapacityReservations(),
|
|
395
|
+
quotaResolver.getTrainingPlans()
|
|
396
|
+
])
|
|
397
|
+
|
|
398
|
+
preQuotaFilterCount = recommendations.length
|
|
399
|
+
recommendations = applyAvailabilityRanking(
|
|
400
|
+
recommendations,
|
|
401
|
+
quotas.status === 'fulfilled' ? quotas.value : null,
|
|
402
|
+
reservations.status === 'fulfilled' ? reservations.value : null,
|
|
403
|
+
ftps.status === 'fulfilled' ? ftps.value : null
|
|
404
|
+
)
|
|
405
|
+
if (recommendations.length === 0 && preQuotaFilterCount > 0) {
|
|
406
|
+
allFilteredByQuota = true
|
|
407
|
+
}
|
|
408
|
+
} catch (err) {
|
|
409
|
+
// Graceful degradation: if credentials are missing or any unexpected
|
|
410
|
+
// error occurs, skip quota filtering and continue with unfiltered results
|
|
411
|
+
log(`Quota resolution skipped: ${err.message}`)
|
|
412
|
+
}
|
|
413
|
+
}
|
|
414
|
+
|
|
379
415
|
// Step 3b: If instanceSearch is also provided, further filter by tags
|
|
380
416
|
if (instanceSearch && recommendations.length > 0) {
|
|
381
417
|
const searchMatches = new Set(searchInstancesByTag(instanceSearch, effectiveCatalog, { limit: 100 }))
|
|
@@ -477,7 +513,8 @@ async function handleGetInstanceRecommendation(params) {
|
|
|
477
513
|
vramBreakdown: vramEstimate.breakdown,
|
|
478
514
|
recommendations: finalRecommendations,
|
|
479
515
|
source: modelMetadata.source,
|
|
480
|
-
smartModeUsed
|
|
516
|
+
smartModeUsed,
|
|
517
|
+
allFilteredByQuota
|
|
481
518
|
}
|
|
482
519
|
})
|
|
483
520
|
}]
|
|
@@ -502,7 +539,7 @@ server.tool(
|
|
|
502
539
|
maxSequenceLength: z.number().optional().describe('Max context/sequence length (affects KV cache estimate)'),
|
|
503
540
|
batchSize: z.number().optional().describe('Expected concurrent batch size'),
|
|
504
541
|
cudaVersion: z.string().optional().describe('Required CUDA version from base image (filters incompatible instances)'),
|
|
505
|
-
limit: z.number().optional().default(
|
|
542
|
+
limit: z.number().optional().default(10).describe('Maximum number of instance recommendations to return'),
|
|
506
543
|
context: z.object({
|
|
507
544
|
architecture: z.string().optional(),
|
|
508
545
|
backend: z.string().optional(),
|
|
@@ -526,7 +563,7 @@ server.tool(
|
|
|
526
563
|
maxSequenceLength: z.number().optional().describe('Max context/sequence length (affects KV cache estimate)'),
|
|
527
564
|
batchSize: z.number().optional().describe('Expected concurrent batch size'),
|
|
528
565
|
cudaVersion: z.string().optional().describe('Required CUDA version from base image (filters incompatible instances)'),
|
|
529
|
-
limit: z.number().optional().default(
|
|
566
|
+
limit: z.number().optional().default(10).describe('Maximum number of instance recommendations to return'),
|
|
530
567
|
context: z.object({
|
|
531
568
|
architecture: z.string().optional(),
|
|
532
569
|
backend: z.string().optional(),
|
package/servers/instance-sizer/lib/instance-ranker.js
CHANGED
|
@@ -31,14 +31,20 @@ const GPU_MEMORY_MAP = {
|
|
|
31
31
|
*/
|
|
32
32
|
const COST_TIER_MAP = {
|
|
33
33
|
'g4dn': 'low',
|
|
34
|
+
'g4ad': 'low',
|
|
34
35
|
'inf2': 'low',
|
|
35
36
|
'g5': 'medium',
|
|
36
37
|
'g6': 'medium',
|
|
38
|
+
'g6e': 'medium',
|
|
39
|
+
'g7e': 'medium',
|
|
37
40
|
'trn1': 'medium',
|
|
38
41
|
'p3': 'high',
|
|
39
42
|
'p4d': 'high',
|
|
40
43
|
'p4de': 'high',
|
|
41
|
-
'p5': 'high'
|
|
44
|
+
'p5': 'high',
|
|
45
|
+
'p5e': 'high',
|
|
46
|
+
'p5en': 'high',
|
|
47
|
+
'p6': 'high'
|
|
42
48
|
}
|
|
43
49
|
|
|
44
50
|
/**
|
|
@@ -51,6 +57,28 @@ const COST_TIER_WEIGHT = {
|
|
|
51
57
|
'high': 3
|
|
52
58
|
}
|
|
53
59
|
|
|
60
|
+
/**
|
|
61
|
+
* Generation weight by instance family.
|
|
62
|
+
* Lower weight = newer generation (sorted first); newer generations offer better perf/$. Families missing from this map fall back to weight 4 in the ranking comparator.
|
|
63
|
+
*/
|
|
64
|
+
const GENERATION_WEIGHT = {
|
|
65
|
+
'g7e': 1,
|
|
66
|
+
'p6': 1,
|
|
67
|
+
'g6e': 2,
|
|
68
|
+
'p5e': 2,
|
|
69
|
+
'p5en': 2,
|
|
70
|
+
'g6': 3,
|
|
71
|
+
'p5': 3,
|
|
72
|
+
'trn1': 3,
|
|
73
|
+
'inf2': 3,
|
|
74
|
+
'g5': 4,
|
|
75
|
+
'p4de': 5,
|
|
76
|
+
'p4d': 5,
|
|
77
|
+
'p3': 6,
|
|
78
|
+
'g4dn': 7,
|
|
79
|
+
'g4ad': 7
|
|
80
|
+
}
|
|
81
|
+
|
|
54
82
|
/**
|
|
55
83
|
* TP overhead penalty: 10% per additional GPU beyond the first.
|
|
56
84
|
* Effective VRAM = totalVram × (1 - 0.10 × (gpuCount - 1))
|
|
@@ -144,12 +172,12 @@ const effectiveVram = (totalVramGb, gpuCount) => {
|
|
|
144
172
|
* @param {number} vramRequired - Required VRAM in GB
|
|
145
173
|
* @param {object} instanceCatalog - Object keyed by instance type, values are metadata
|
|
146
174
|
* @param {object} [options={}]
|
|
147
|
-
* @param {number} [options.limit=
|
|
175
|
+
* @param {number} [options.limit=10] - Max results to return
|
|
148
176
|
* @param {boolean} [options.allowTensorParallelism=true] - Consider multi-GPU splits
|
|
149
177
|
* @returns {object[]} Ranked list of compatible instances
|
|
150
178
|
*/
|
|
151
179
|
const filterAndRankInstances = (vramRequired, instanceCatalog, options = {}) => {
|
|
152
|
-
const { limit =
|
|
180
|
+
const { limit = 10, allowTensorParallelism = true } = options
|
|
153
181
|
|
|
154
182
|
if (!vramRequired || vramRequired <= 0) {
|
|
155
183
|
return []
|
|
@@ -182,7 +210,8 @@ const filterAndRankInstances = (vramRequired, instanceCatalog, options = {}) =>
|
|
|
182
210
|
totalVramGb,
|
|
183
211
|
utilizationPercent,
|
|
184
212
|
tensorParallelism: 1,
|
|
185
|
-
costTier: getCostTier(meta)
|
|
213
|
+
costTier: getCostTier(meta),
|
|
214
|
+
family: meta.family || ''
|
|
186
215
|
})
|
|
187
216
|
}
|
|
188
217
|
} else if (allowTensorParallelism) {
|
|
@@ -196,7 +225,8 @@ const filterAndRankInstances = (vramRequired, instanceCatalog, options = {}) =>
|
|
|
196
225
|
totalVramGb,
|
|
197
226
|
utilizationPercent,
|
|
198
227
|
tensorParallelism: gpuCount,
|
|
199
|
-
costTier: getCostTier(meta)
|
|
228
|
+
costTier: getCostTier(meta),
|
|
229
|
+
family: meta.family || ''
|
|
200
230
|
})
|
|
201
231
|
}
|
|
202
232
|
}
|
|
@@ -204,24 +234,30 @@ const filterAndRankInstances = (vramRequired, instanceCatalog, options = {}) =>
|
|
|
204
234
|
|
|
205
235
|
// Sort candidates by ranking criteria:
|
|
206
236
|
// 1. Single-GPU first (TP=1), then multi-GPU by lowest TP degree
|
|
207
|
-
// 2. Within each TP tier,
|
|
208
|
-
//
|
|
237
|
+
// 2. Within each TP tier, newest generation first (g6 > g5 > g4dn)
|
|
238
|
+
// 3. Within same generation, sort by cost tier (lower is better)
|
|
239
|
+
// 4. Within same cost tier, prefer lower total VRAM (right-sized)
|
|
209
240
|
candidates.sort((a, b) => {
|
|
210
241
|
// Primary: TP degree (lower is better)
|
|
211
242
|
if (a.tensorParallelism !== b.tensorParallelism) {
|
|
212
243
|
return a.tensorParallelism - b.tensorParallelism
|
|
213
244
|
}
|
|
214
245
|
|
|
215
|
-
// Secondary:
|
|
246
|
+
// Secondary: generation (newer is better — lower weight)
|
|
247
|
+
const genA = GENERATION_WEIGHT[a.family] || 4
|
|
248
|
+
const genB = GENERATION_WEIGHT[b.family] || 4
|
|
249
|
+
if (genA !== genB) {
|
|
250
|
+
return genA - genB
|
|
251
|
+
}
|
|
252
|
+
|
|
253
|
+
// Tertiary: cost tier (lower is better)
|
|
216
254
|
const costA = COST_TIER_WEIGHT[a.costTier] || 2
|
|
217
255
|
const costB = COST_TIER_WEIGHT[b.costTier] || 2
|
|
218
256
|
if (costA !== costB) {
|
|
219
257
|
return costA - costB
|
|
220
258
|
}
|
|
221
259
|
|
|
222
|
-
//
|
|
223
|
-
// lower cost tier with higher total VRAM (more GB per dollar)
|
|
224
|
-
// Since cost tier is equal here, prefer higher total VRAM (better value)
|
|
260
|
+
// Quaternary: prefer lower total VRAM (right-sized, less waste)
|
|
225
261
|
if (a.totalVramGb !== b.totalVramGb) {
|
|
226
262
|
return a.totalVramGb - b.totalVramGb
|
|
227
263
|
}
|
|
@@ -233,13 +269,106 @@ const filterAndRankInstances = (vramRequired, instanceCatalog, options = {}) =>
|
|
|
233
269
|
return candidates.slice(0, limit)
|
|
234
270
|
}
|
|
235
271
|
|
|
272
|
+
// ── Availability Ranking ─────────────────────────────────────────────────────
|
|
273
|
+
|
|
274
|
+
/**
 * Priority weights for capacity types used in availability ranking.
 * Lower value = higher priority (sorted first).
 */
const CAPACITY_TYPE_PRIORITY = {
    reserved: 0,
    ftp: 1,
    'on-demand': 2
}

/**
 * Annotate, filter, and re-rank instance recommendations based on
 * quota headroom, capacity reservations, and Flexible Training Plans.
 *
 * Each recommendation object is annotated IN PLACE with:
 *   - capacityType: 'reserved' | 'ftp' | 'on-demand'
 *   - quotaStatus: 'available' | 'limited' | 'zero-quota'
 *   - reservationInfo + reservationType: when capacityType is 'reserved'
 *   - ftpInfo: when capacityType is 'ftp'
 *   - quotaHeadroom / quotaDeployed / quotaLimit: when a quota entry exists
 *
 * quotaStatus is 'zero-quota' when headroom is zero or negative (deployed
 * count at or above the quota) and 'limited' when headroom is below 2.
 *
 * On-demand instances with quotaStatus === 'zero-quota' are filtered out.
 * Reserved and FTP instances are always kept, even at zero quota, because
 * the capacity is already held.
 * Sort order: reserved → FTP → on-demand, preserving existing order within tiers.
 *
 * When an individual signal is null (API failure), that signal is skipped.
 * When all three are null the input array is returned as-is, without
 * annotations.
 *
 * @param {object[]} recommendations - Ranked instance recommendations from filterAndRankInstances
 * @param {Map|null} quotas - Map: instanceType → { quota, deployed, headroom }, or null
 * @param {Map|null} reservations - Map: instanceType → { reservationId, count, expiresAt }, or null
 * @param {Map|null} ftps - Map: instanceType → { planName, remainingCapacity, expiresAt }, or null
 * @returns {object[]} Filtered and re-ranked recommendations ([] for empty/missing input)
 */
const applyAvailabilityRanking = (recommendations, quotas, reservations, ftps) => {
    if (!recommendations || recommendations.length === 0) {
        return []
    }

    // If all signals are null (all API calls failed), return unmodified
    if (!quotas && !reservations && !ftps) {
        return recommendations
    }

    // Annotate each recommendation with capacityType and quotaStatus
    for (const rec of recommendations) {
        rec.capacityType = 'on-demand'
        rec.quotaStatus = 'available'

        // A reservation takes precedence over an FTP for the same instance type
        if (reservations?.has(rec.instanceType)) {
            rec.capacityType = 'reserved'
            rec.reservationInfo = reservations.get(rec.instanceType)
            // NOTE(review): label is fixed to 'training-plan' for every
            // reservation — confirm this matches what consumers expect.
            rec.reservationType = 'training-plan'
        } else if (ftps?.has(rec.instanceType)) {
            rec.capacityType = 'ftp'
            rec.ftpInfo = ftps.get(rec.instanceType)
        }

        // quotaStatus applies to all instances regardless of capacityType
        const q = quotas?.get(rec.instanceType)
        if (q) {
            // <= 0 rather than === 0: a negative headroom (more deployed than
            // the quota allows) leaves no room either and must not be
            // reported as merely 'limited'.
            if (q.headroom <= 0) {
                rec.quotaStatus = 'zero-quota'
            } else if (q.headroom < 2) {
                rec.quotaStatus = 'limited'
            }
            rec.quotaHeadroom = q.headroom
            rec.quotaDeployed = q.deployed
            rec.quotaLimit = q.quota
        }
    }

    // Filter out zero-quota instances (but never filter reserved/FTP — you have the capacity)
    const filtered = recommendations.filter(r =>
        r.quotaStatus !== 'zero-quota' || r.capacityType === 'reserved' || r.capacityType === 'ftp'
    )

    // Sort: reserved first, then FTP, then on-demand. Array.prototype.sort is
    // stable, so the incoming order is preserved within each tier.
    filtered.sort((a, b) =>
        (CAPACITY_TYPE_PRIORITY[a.capacityType] ?? 2) - (CAPACITY_TYPE_PRIORITY[b.capacityType] ?? 2)
    )

    return filtered
}
|
|
361
|
+
|
|
236
362
|
export {
|
|
237
363
|
filterAndRankInstances,
|
|
364
|
+
applyAvailabilityRanking,
|
|
238
365
|
getPerGpuMemoryGb,
|
|
239
366
|
getCostTier,
|
|
240
367
|
effectiveVram,
|
|
241
368
|
GPU_MEMORY_MAP,
|
|
242
369
|
COST_TIER_MAP,
|
|
243
370
|
COST_TIER_WEIGHT,
|
|
371
|
+
GENERATION_WEIGHT,
|
|
372
|
+
CAPACITY_TYPE_PRIORITY,
|
|
244
373
|
TP_OVERHEAD_PER_GPU
|
|
245
374
|
}
|