@aws/ml-container-creator 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/cli.js +5 -2
- package/infra/ci-harness/buildspec.yml +60 -0
- package/package.json +1 -1
- package/servers/README.md +41 -1
- package/servers/instance-sizer/index.js +6 -0
- package/src/app.js +33 -2
- package/src/lib/config-manager.js +40 -1
- package/src/lib/deployment-entry-schema.js +16 -0
- package/src/lib/prompt-runner.js +174 -3
- package/src/lib/prompts.js +222 -2
- package/src/lib/registry-command-handler.js +12 -0
- package/templates/Dockerfile +12 -0
- package/templates/code/serving.properties +14 -0
- package/templates/do/adapter +1214 -0
- package/templates/do/adapters/.gitkeep +2 -0
- package/templates/do/add-ic +130 -0
- package/templates/do/benchmark +81 -9
- package/templates/do/clean +507 -17
- package/templates/do/config +23 -1
- package/templates/do/deploy +513 -367
- package/templates/do/ic/default.conf +32 -0
- package/templates/do/lib/endpoint-config.sh +216 -0
- package/templates/do/lib/inference-component.sh +167 -0
- package/templates/do/lib/secrets.sh +44 -0
- package/templates/do/lib/wait.sh +131 -0
- package/templates/do/logs +107 -27
- package/templates/do/optimize +528 -0
- package/templates/do/register +111 -1
- package/templates/do/status +337 -0
- package/templates/do/test +80 -28
package/src/lib/prompts.js
CHANGED
|
@@ -47,6 +47,69 @@ function loadInstanceTypeRegistry() {
|
|
|
47
47
|
|
|
48
48
|
const instanceTypeRegistry = loadInstanceTypeRegistry();
|
|
49
49
|
|
|
50
|
+
/**
 * Load the raw instance catalog used for GPU/CUDA generation lookups.
 * @returns {Object} Catalog entries keyed by instance type; {} when the
 *   catalog file is missing or unparseable (best-effort, never throws).
 */
function loadInstanceCatalogRaw() {
  try {
    const parsed = JSON.parse(readFileSync(instancesCatalogPath, 'utf8'));
    return parsed?.catalog || {};
  } catch (error) {
    // Best-effort: a missing or corrupt catalog degrades to "no data".
    return {};
  }
}

const instanceCatalogRaw = loadInstanceCatalogRaw();

/**
 * Resolve the CUDA generation key for an instance type.
 * The catalog's gpuArchitecture field (e.g. "Turing", "Ampere", "Hopper")
 * is used as the generation grouping; instances in the same generation
 * share AMI compatibility.
 * @param {string} instanceType - e.g. "ml.g5.xlarge"
 * @returns {string|null} Generation key, or null when the instance is
 *   unknown, not CUDA-accelerated, or has no recorded architecture.
 */
function getInstanceCudaGeneration(instanceType) {
  const record = instanceCatalogRaw[instanceType];
  if (!record || record.acceleratorType !== 'cuda') {
    return null;
  }
  return record.gpuArchitecture || null;
}

/**
 * Keep only instances that share the CUDA generation of the first
 * (highest-priority) entry in the list.
 * @param {string[]} instanceTypes - Instance type strings in priority order.
 * @returns {{ filtered: string[], generation: string|null, removed: string[] }}
 */
function filterByCudaGeneration(instanceTypes) {
  if (!instanceTypes || instanceTypes.length === 0) {
    return { filtered: [], generation: null, removed: [] };
  }

  const referenceGen = getInstanceCudaGeneration(instanceTypes[0]);
  if (!referenceGen) {
    // First entry is unknown or not CUDA — nothing to filter against.
    return { filtered: instanceTypes, generation: null, removed: [] };
  }

  const filtered = [];
  const removed = [];
  for (const candidate of instanceTypes) {
    const gen = getInstanceCudaGeneration(candidate);
    // Unknown types (gen === null) pass through rather than being blocked.
    (gen === referenceGen || gen === null ? filtered : removed).push(candidate);
  }

  return { filtered, generation: referenceGen, removed };
}
|
|
112
|
+
|
|
50
113
|
/**
|
|
51
114
|
* Generate pseudo-randomized project name based on framework
|
|
52
115
|
* @param {string} framework - The ML framework
|
|
@@ -698,12 +761,129 @@ const infraRegionAndTargetPrompts = [
|
|
|
698
761
|
}
|
|
699
762
|
];
|
|
700
763
|
|
|
764
|
+
// Sub-phase A2: Existing endpoint prompt (only when deploymentTarget === 'realtime-inference')
const infraExistingEndpointPrompts = [
  {
    type: 'list',
    name: 'useExistingEndpoint',
    message: 'Deploy to an existing endpoint? (attach IC to running endpoint)',
    choices: [
      { name: 'No — create a new endpoint', value: 'no' },
      { name: 'Yes — attach to an existing endpoint', value: 'yes' }
    ],
    default: 'no',
    // Attaching an IC only makes sense for real-time endpoints.
    when: (answers) => answers.deploymentTarget === 'realtime-inference'
  },
  {
    type: 'list',
    name: 'existingEndpointName',
    message: 'Select endpoint:',
    // Prefer endpoints discovered via MCP; always leave a manual escape hatch.
    choices: (answers) => {
      const discovered = answers._mcpEndpointChoices || [];
      return discovered.length > 0
        ? [...discovered, { name: 'Custom (enter manually)', value: 'custom' }]
        : [{ name: 'Enter endpoint name manually', value: 'custom' }];
    },
    when: (answers) => answers.useExistingEndpoint === 'yes'
  },
  {
    type: 'input',
    name: 'customExistingEndpointName',
    message: 'Enter existing endpoint name:',
    // Reject empty/whitespace-only names.
    validate: (input) => {
      const trimmed = (input || '').trim();
      return trimmed === '' ? 'Endpoint name is required' : true;
    },
    when: (answers) =>
      answers.useExistingEndpoint === 'yes' && answers.existingEndpointName === 'custom'
  }
];
|
|
803
|
+
|
|
701
804
|
// Sub-phase B: Instance type (only when deploymentTarget === 'realtime-inference')
|
|
702
805
|
const infraInstancePrompts = [
|
|
806
|
+
// Multi-select prompt: shown when MCP sizer has choices AND deployment target is realtime-inference
|
|
807
|
+
// User can select 1-5 instances; selection count determines single-type vs instance-pools behavior
|
|
808
|
+
// Requirements: 6.4
|
|
809
|
+
{
|
|
810
|
+
type: 'checkbox',
|
|
811
|
+
name: 'instanceTypeSelections',
|
|
812
|
+
when: answers => answers.deploymentTarget === 'realtime-inference' &&
|
|
813
|
+
answers._mcpInstanceChoices && answers._mcpInstanceChoices.length > 1,
|
|
814
|
+
message: 'Select instance type(s) — select multiple for instance pools (priority = selection order, max 5):',
|
|
815
|
+
choices: (answers) => {
|
|
816
|
+
const mcpChoices = answers._mcpInstanceChoices || [];
|
|
817
|
+
// Show all compatible instances — CUDA generation filtering happens
|
|
818
|
+
// after selection to allow users to see all options and make informed choices.
|
|
819
|
+
// If they select instances from different generations, the post-selection
|
|
820
|
+
// filter (filterByCudaGeneration in prompt-runner.js) will warn and remove incompatible ones.
|
|
821
|
+
const choices = mcpChoices.map(instanceType => {
|
|
822
|
+
const entry = instanceCatalogRaw[instanceType];
|
|
823
|
+
const gpuInfo = entry ? `${entry.gpus} GPU${entry.gpus > 1 ? 's' : ''}, ${entry.gpuMemoryGb || '?'}GB` : '';
|
|
824
|
+
return {
|
|
825
|
+
name: gpuInfo ? `${instanceType} (${gpuInfo})` : instanceType,
|
|
826
|
+
value: instanceType,
|
|
827
|
+
short: instanceType
|
|
828
|
+
};
|
|
829
|
+
});
|
|
830
|
+
// Always include a "Custom Input" option at the end
|
|
831
|
+
choices.push({
|
|
832
|
+
name: 'Custom Input (enter one or comma-separated list)',
|
|
833
|
+
value: '__custom_input__',
|
|
834
|
+
short: 'Custom'
|
|
835
|
+
});
|
|
836
|
+
return choices;
|
|
837
|
+
},
|
|
838
|
+
validate: (input) => {
|
|
839
|
+
if (!input || input.length === 0) {
|
|
840
|
+
return 'Select at least one instance type';
|
|
841
|
+
}
|
|
842
|
+
if (input.length > 5) {
|
|
843
|
+
return 'Maximum 5 instance types allowed (API limit). Please deselect some.';
|
|
844
|
+
}
|
|
845
|
+
return true;
|
|
846
|
+
}
|
|
847
|
+
},
|
|
848
|
+
// Custom input prompt for multi-select: shown when user selects "Custom Input" in instanceTypeSelections
|
|
849
|
+
{
|
|
850
|
+
type: 'input',
|
|
851
|
+
name: 'customInstanceTypeSelections',
|
|
852
|
+
message: 'Enter instance type(s) — single for homogeneous, comma-separated for heterogeneous (e.g., ml.g5.xlarge or ml.g5.xlarge,ml.g5.2xlarge):',
|
|
853
|
+
when: answers => Array.isArray(answers.instanceTypeSelections) &&
|
|
854
|
+
answers.instanceTypeSelections.includes('__custom_input__'),
|
|
855
|
+
validate: (input) => {
|
|
856
|
+
if (!input || input.trim() === '') {
|
|
857
|
+
return 'At least one instance type is required';
|
|
858
|
+
}
|
|
859
|
+
const instancePattern = /^ml\.[a-z0-9]+\.(nano|micro|small|medium|large|xlarge|[0-9]+xlarge)$/;
|
|
860
|
+
const instances = input.split(',').map(s => s.trim()).filter(s => s.length > 0);
|
|
861
|
+
if (instances.length === 0) {
|
|
862
|
+
return 'At least one instance type is required';
|
|
863
|
+
}
|
|
864
|
+
if (instances.length > 5) {
|
|
865
|
+
return 'Maximum 5 instance types allowed (API limit).';
|
|
866
|
+
}
|
|
867
|
+
for (const inst of instances) {
|
|
868
|
+
if (!instancePattern.test(inst)) {
|
|
869
|
+
return `Invalid instance type format: "${inst}". Expected format: ml.{family}.{size} (e.g., ml.g5.xlarge)`;
|
|
870
|
+
}
|
|
871
|
+
}
|
|
872
|
+
return true;
|
|
873
|
+
}
|
|
874
|
+
},
|
|
875
|
+
// Single-select prompt: shown when no MCP choices, or for non-realtime targets, or only 1 MCP choice
|
|
703
876
|
{
|
|
704
877
|
type: 'list',
|
|
705
878
|
name: 'instanceType',
|
|
706
|
-
when: answers =>
|
|
879
|
+
when: answers => {
|
|
880
|
+
// Skip if multi-select was shown (realtime with multiple MCP choices)
|
|
881
|
+
if (answers.deploymentTarget === 'realtime-inference' &&
|
|
882
|
+
answers._mcpInstanceChoices && answers._mcpInstanceChoices.length > 1) {
|
|
883
|
+
return false;
|
|
884
|
+
}
|
|
885
|
+
return answers.deploymentTarget === 'realtime-inference' || answers.deploymentTarget === 'async-inference' || answers.deploymentTarget === 'batch-transform' || answers.deploymentTarget === 'hyperpod-eks';
|
|
886
|
+
},
|
|
707
887
|
message: (answers) => {
|
|
708
888
|
const framework = answers.framework || answers.deploymentConfig?.split('-')[0];
|
|
709
889
|
|
|
@@ -1121,6 +1301,41 @@ const baseImagePrompts = [
|
|
|
1121
1301
|
}
|
|
1122
1302
|
];
|
|
1123
1303
|
|
|
1304
|
+
/**
 * LoRA adapter prompts for multi-adapter serving configuration.
 * Only shown when architecture is transformers AND model server is vllm, sglang, or djl-lmi.
 * Requirements: 1.1, 1.2, 1.4
 */
const loraPrompts = [
  {
    type: 'confirm',
    name: 'enableLora',
    message: 'Enable LoRA adapter serving?',
    default: false,
    when: (answers) => {
      // architecture/backend may be answered directly or encoded in
      // deploymentConfig as "{architecture}-{backend}" (backend may contain dashes,
      // e.g. "djl-lmi", hence slice(1).join('-')).
      const architecture = answers.architecture || answers.deploymentConfig?.split('-')[0];
      const backend = answers.backend || answers.deploymentConfig?.split('-').slice(1).join('-');
      if (architecture !== 'transformers') return false;
      const loraCapableServers = ['vllm', 'sglang', 'djl-lmi', 'lmi', 'djl'];
      return loraCapableServers.includes(backend);
    }
  },
  {
    type: 'number',
    name: 'maxLoras',
    message: 'Maximum concurrent LoRA adapters in GPU memory:',
    default: 30,
    // inquirer's number prompt yields NaN for non-numeric input; without
    // validation that NaN would flow into templates (e.g. VLLM_MAX_LORAS=NaN).
    validate: (input) =>
      Number.isInteger(input) && input > 0 ? true : 'Enter a positive integer',
    when: (answers) => answers.enableLora === true
  },
  {
    type: 'number',
    name: 'maxLoraRank',
    message: 'Maximum LoRA rank:',
    default: 64,
    validate: (input) =>
      Number.isInteger(input) && input > 0 ? true : 'Enter a positive integer',
    when: (answers) => answers.enableLora === true
  }
];
|
|
1338
|
+
|
|
1124
1339
|
/**
|
|
1125
1340
|
* Benchmark prompts for SageMaker AI Benchmarking (NVIDIA AIPerf)
|
|
1126
1341
|
* Sub-prompts shown when 'sagemaker-ai-automated-benchmarking' is selected in testTypes.
|
|
@@ -1184,9 +1399,11 @@ export {
|
|
|
1184
1399
|
hfTokenPrompts,
|
|
1185
1400
|
ngcApiKeyPrompts,
|
|
1186
1401
|
modulePrompts,
|
|
1402
|
+
loraPrompts,
|
|
1187
1403
|
benchmarkPrompts,
|
|
1188
1404
|
infrastructurePrompts,
|
|
1189
1405
|
infraRegionAndTargetPrompts,
|
|
1406
|
+
infraExistingEndpointPrompts,
|
|
1190
1407
|
infraInstancePrompts,
|
|
1191
1408
|
infraAsyncPrompts,
|
|
1192
1409
|
infraBatchTransformPrompts,
|
|
@@ -1196,5 +1413,8 @@ export {
|
|
|
1196
1413
|
destinationPrompts,
|
|
1197
1414
|
baseImageSearchPrompts,
|
|
1198
1415
|
baseImagePrompts,
|
|
1199
|
-
formatImageChoices
|
|
1416
|
+
formatImageChoices,
|
|
1417
|
+
filterByCudaGeneration,
|
|
1418
|
+
getInstanceCudaGeneration,
|
|
1419
|
+
instanceCatalogRaw
|
|
1200
1420
|
};
|
|
@@ -150,6 +150,18 @@ export default class RegistryCommandHandler {
|
|
|
150
150
|
}
|
|
151
151
|
}
|
|
152
152
|
|
|
153
|
+
// Parse icList from JSON string if provided
const rawIcList = options.icList || options['ic-list'];
if (rawIcList) {
  try {
    // CLI flags arrive as strings; programmatic callers may pass the value directly.
    entry.deployment.icList =
      typeof rawIcList === 'string' ? JSON.parse(rawIcList) : rawIcList;
  } catch (err) {
    console.log(`Warning: Could not parse ic-list JSON: ${err.message}`);
  }
}
|
|
164
|
+
|
|
153
165
|
try {
|
|
154
166
|
const id = registry.add(entry);
|
|
155
167
|
console.log('✅ Deployment entry logged successfully.');
|
package/templates/Dockerfile
CHANGED
|
@@ -238,6 +238,18 @@ ENV <%= key %>=<%= value %>
|
|
|
238
238
|
<% }); %>
|
|
239
239
|
<% } %>
|
|
240
240
|
|
|
241
|
+
<%# typeof guard: enableLora is only defined when the LoRA prompts ran;
    bare access would throw ReferenceError during rendering. Matches the
    typeof-guard convention used for modelSource elsewhere in this template. %>
<% if (typeof enableLora !== 'undefined' && enableLora && modelServer === 'vllm') { %>
# LoRA adapter serving configuration
ENV VLLM_ENABLE_LORA=true
ENV VLLM_MAX_LORAS=<%= maxLoras %>
ENV VLLM_MAX_LORA_RANK=<%= maxLoraRank %>
<% } %>
<% if (typeof enableLora !== 'undefined' && enableLora && modelServer === 'sglang') { %>
# LoRA adapter serving configuration
ENV SGLANG_ENABLE_LORA=true
ENV SGLANG_MAX_LORAS=<%= maxLoras %>
<% } %>
|
|
252
|
+
|
|
241
253
|
<% if (typeof modelSource !== 'undefined' && modelSource && modelSource !== 'huggingface' && modelServer !== 'lmi' && modelServer !== 'djl') { %>
|
|
242
254
|
# Install AWS CLI for S3 model downloads
|
|
243
255
|
RUN pip install --no-cache-dir awscli
|
|
@@ -53,6 +53,13 @@ option.chat_template=<%= chatTemplate %>
|
|
|
53
53
|
# option.gpu_memory_utilization=0.9
|
|
54
54
|
# option.enable_chunked_prefill=true
|
|
55
55
|
|
|
56
|
+
<%# typeof guard: enableLora is only defined when the LoRA prompts ran;
    bare access would throw ReferenceError during rendering. %>
<% if (typeof enableLora !== 'undefined' && enableLora) { %>
# LoRA adapter serving configuration
option.enable_lora=true
option.max_loras=<%= maxLoras %>
# NOTE(review): fixed CPU adapter cache size — consider deriving from maxLoras
option.max_cpu_loras=70
<% } %>
|
|
62
|
+
|
|
56
63
|
<% } else if (modelServer === 'djl') { %>
|
|
57
64
|
# DJL Serving Configuration
|
|
58
65
|
# DJL provides flexible model serving with multiple framework support
|
|
@@ -94,6 +101,13 @@ option.chat_template=<%= chatTemplate %>
|
|
|
94
101
|
# option.tensor_parallel_degree=1
|
|
95
102
|
# option.device_map=auto
|
|
96
103
|
|
|
104
|
+
<%# typeof guard: enableLora is only defined when the LoRA prompts ran;
    bare access would throw ReferenceError during rendering. %>
<% if (typeof enableLora !== 'undefined' && enableLora) { %>
# LoRA adapter serving configuration
option.enable_lora=true
option.max_loras=<%= maxLoras %>
# NOTE(review): fixed CPU adapter cache size — consider deriving from maxLoras
option.max_cpu_loras=70
<% } %>
|
|
110
|
+
|
|
97
111
|
<% } %>
|
|
98
112
|
|
|
99
113
|
# Additional Environment-Specific Configuration
|