@aws/ml-container-creator 0.4.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. package/bin/cli.js +5 -2
  2. package/config/bootstrap-stack.json +40 -9
  3. package/infra/ci-harness/buildspec.yml +60 -0
  4. package/infra/ci-harness/package-lock.json +5 -1
  5. package/package.json +1 -1
  6. package/servers/README.md +41 -1
  7. package/servers/instance-sizer/index.js +10 -4
  8. package/servers/instance-sizer/lib/model-resolver.js +1 -1
  9. package/servers/lib/catalogs/model-sizes.json +135 -90
  10. package/servers/lib/catalogs/models.json +483 -411
  11. package/src/app.js +33 -2
  12. package/src/lib/bootstrap-command-handler.js +6 -0
  13. package/src/lib/cli-handler.js +1 -1
  14. package/src/lib/config-manager.js +41 -2
  15. package/src/lib/deployment-entry-schema.js +16 -0
  16. package/src/lib/mcp-client.js +3 -3
  17. package/src/lib/prompt-runner.js +179 -8
  18. package/src/lib/prompts.js +253 -7
  19. package/src/lib/registry-command-handler.js +12 -0
  20. package/templates/Dockerfile +12 -0
  21. package/templates/code/serving.properties +14 -0
  22. package/templates/do/adapter +1230 -0
  23. package/templates/do/adapters/.gitkeep +2 -0
  24. package/templates/do/add-ic +130 -0
  25. package/templates/do/benchmark +81 -9
  26. package/templates/do/clean +507 -17
  27. package/templates/do/config +28 -5
  28. package/templates/do/deploy +513 -367
  29. package/templates/do/ic/default.conf +32 -0
  30. package/templates/do/lib/endpoint-config.sh +216 -0
  31. package/templates/do/lib/inference-component.sh +167 -0
  32. package/templates/do/lib/secrets.sh +44 -0
  33. package/templates/do/lib/wait.sh +131 -0
  34. package/templates/do/logs +107 -27
  35. package/templates/do/optimize +528 -0
  36. package/templates/do/register +111 -1
  37. package/templates/do/status +337 -0
  38. package/templates/do/test +80 -28
@@ -47,6 +47,69 @@ function loadInstanceTypeRegistry() {
47
47
 
48
48
  const instanceTypeRegistry = loadInstanceTypeRegistry();
49
49
 
50
/**
 * Load the raw instance catalog for GPU/CUDA generation lookups.
 *
 * Reads a JSON file and returns its top-level `catalog` member, which is
 * looked up by instance type elsewhere in this module. Loading is best-effort:
 * a missing file, unreadable file, or invalid JSON yields an empty object
 * instead of throwing, so prompt construction can continue without a catalog.
 *
 * @param {string} [catalogPath] - Path of the catalog JSON file. Defaults to
 *   the module-level `instancesCatalogPath`.
 * @returns {Object} Catalog entries keyed by instance type, or `{}` when the
 *   file cannot be loaded or `catalog` is not a plain key/value object.
 */
function loadInstanceCatalogRaw(catalogPath) {
    try {
        // The default path is resolved inside the try block so that any
        // problem with it degrades to the same empty-catalog fallback.
        const parsed = JSON.parse(readFileSync(catalogPath ?? instancesCatalogPath, 'utf8'));
        const entries = parsed?.catalog;
        // Callers index the result by instance type, so only accept a
        // key/value object; arrays and primitives are treated as "no catalog".
        const isKeyedObject = entries !== null &&
            typeof entries === 'object' &&
            !Array.isArray(entries);
        return isKeyedObject ? entries : {};
    } catch {
        return {};
    }
}

// Loaded once at module evaluation and shared by the lookups below.
const instanceCatalogRaw = loadInstanceCatalogRaw();
65
+
66
/**
 * Resolve the CUDA generation key for an instance type.
 *
 * The key is the `gpuArchitecture` value of the matching catalog entry
 * (e.g., "Turing", "Ampere", "Hopper"). It is used as the grouping key when
 * deciding which instance types may be combined; the grouping is intended to
 * reflect AMI compatibility.
 *
 * @param {string} instanceType - e.g., "ml.g5.xlarge"
 * @returns {string|null} The generation key, or null when the instance type is
 *   absent from the catalog, its `acceleratorType` is not 'cuda', or it has no
 *   `gpuArchitecture` value.
 */
function getInstanceCudaGeneration(instanceType) {
    const catalogEntry = instanceCatalogRaw[instanceType];
    const isCudaEntry = Boolean(catalogEntry) && catalogEntry.acceleratorType === 'cuda';
    return isCudaEntry ? (catalogEntry.gpuArchitecture || null) : null;
}
79
+
80
/**
 * Restrict a prioritized list of instance types to the CUDA generation of its
 * first (highest-priority) entry.
 *
 * Entries whose generation resolves to null are always kept; that covers
 * instance types missing from the catalog as well as catalog entries that are
 * not CUDA or have no architecture recorded. When the first entry itself has
 * no generation, nothing can be compared and the input list is returned as-is.
 *
 * @param {string[]} instanceTypes - Array of instance type strings
 * @returns {{ filtered: string[], generation: string|null, removed: string[] }}
 *   `filtered` keeps the input order, `generation` is the reference generation
 *   (null when no filtering was possible), `removed` lists the dropped types.
 */
function filterByCudaGeneration(instanceTypes) {
    if (!instanceTypes || instanceTypes.length === 0) {
        return { filtered: [], generation: null, removed: [] };
    }

    // The first entry sets the reference generation for the whole list.
    const generation = getInstanceCudaGeneration(instanceTypes[0]);
    if (!generation) {
        return { filtered: instanceTypes, generation: null, removed: [] };
    }

    const kept = [];
    const dropped = [];
    for (const candidate of instanceTypes) {
        const candidateGeneration = getInstanceCudaGeneration(candidate);
        const isCompatible = candidateGeneration === null || candidateGeneration === generation;
        (isCompatible ? kept : dropped).push(candidate);
    }

    return { filtered: kept, generation, removed: dropped };
}
112
+
50
113
  /**
51
114
  * Generate pseudo-randomized project name based on framework
52
115
  * @param {string} framework - The ML framework
@@ -336,9 +399,33 @@ const modelFormatPrompts = [
336
399
  ];
337
400
  }
338
401
  return [
339
- 'openai/gpt-oss-20b',
340
- 'meta-llama/Llama-3.2-3B-Instruct',
402
+ { type: 'separator', separator: '── Meta Llama ──' },
341
403
  'meta-llama/Llama-3.2-1B-Instruct',
404
+ 'meta-llama/Llama-3.2-3B-Instruct',
405
+ 'meta-llama/Llama-3.1-8B-Instruct',
406
+ 'meta-llama/Llama-3.3-70B-Instruct',
407
+ { type: 'separator', separator: '── Qwen (Alibaba) ──' },
408
+ 'Qwen/Qwen3-0.6B',
409
+ 'Qwen/Qwen3-1.7B',
410
+ 'Qwen/Qwen3-4B',
411
+ 'Qwen/Qwen3-8B',
412
+ 'Qwen/Qwen3-14B',
413
+ 'Qwen/Qwen3-32B',
414
+ 'Qwen/Qwen2.5-7B-Instruct',
415
+ 'Qwen/Qwen2.5-14B-Instruct',
416
+ 'Qwen/Qwen2.5-32B-Instruct',
417
+ 'Qwen/Qwen2.5-72B-Instruct',
418
+ { type: 'separator', separator: '── DeepSeek ──' },
419
+ 'deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B',
420
+ 'deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',
421
+ 'deepseek-ai/DeepSeek-R1-Distill-Qwen-14B',
422
+ 'deepseek-ai/DeepSeek-R1-Distill-Qwen-32B',
423
+ 'deepseek-ai/DeepSeek-R1-Distill-Llama-8B',
424
+ 'deepseek-ai/DeepSeek-R1-Distill-Llama-70B',
425
+ { type: 'separator', separator: '── OpenAI ──' },
426
+ 'openai/gpt-oss-20b',
427
+ 'openai/gpt-oss-120b',
428
+ { type: 'separator', separator: '──────────────' },
342
429
  'Custom (enter manually)'
343
430
  ];
344
431
  },
@@ -350,7 +437,7 @@ const modelFormatPrompts = [
350
437
  if (architecture === 'diffusors') {
351
438
  return 'stabilityai/stable-diffusion-3.5-medium';
352
439
  }
353
- return 'openai/gpt-oss-20b';
440
+ return 'meta-llama/Llama-3.1-8B-Instruct';
354
441
  },
355
442
  when: answers => {
356
443
  const architecture = answers.architecture || answers.deploymentConfig?.split('-')[0];
@@ -465,9 +552,11 @@ const modelProfilePrompts = [
465
552
  */
466
553
  // eslint-disable-next-line no-unused-vars -- reference list for future use
467
554
  const EXAMPLE_MODEL_IDS = [
468
- 'openai/gpt-oss-20b',
555
+ 'meta-llama/Llama-3.1-8B-Instruct',
469
556
  'meta-llama/Llama-3.2-3B-Instruct',
470
- 'meta-llama/Llama-3.2-1B-Instruct'
557
+ 'Qwen/Qwen3-8B',
558
+ 'deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',
559
+ 'openai/gpt-oss-20b'
471
560
  ];
472
561
 
473
562
  const hfTokenPrompts = [
@@ -698,12 +787,129 @@ const infraRegionAndTargetPrompts = [
698
787
  }
699
788
  ];
700
789
 
790
// Sub-phase A2: Existing endpoint prompts (only when deploymentTarget === 'realtime-inference').
// 1. useExistingEndpoint        — choose between a new endpoint and attaching to an existing one.
// 2. existingEndpointName       — pick from answers._mcpEndpointChoices (when populated) or 'custom'.
// 3. customExistingEndpointName — free-text name, asked only when 'custom' was picked.
const infraExistingEndpointPrompts = [
    {
        type: 'list',
        name: 'useExistingEndpoint',
        message: 'Deploy to an existing endpoint? (attach IC to running endpoint)',
        choices: [
            { name: 'No — create a new endpoint', value: 'no' },
            { name: 'Yes — attach to an existing endpoint', value: 'yes' }
        ],
        default: 'no',
        when: answers => answers.deploymentTarget === 'realtime-inference'
    },
    {
        type: 'list',
        name: 'existingEndpointName',
        message: 'Select endpoint:',
        choices: (answers) => {
            const mcpChoices = answers._mcpEndpointChoices || [];
            if (mcpChoices.length > 0) {
                // Discovered endpoints first, manual entry as the last option.
                return [...mcpChoices, { name: 'Custom (enter manually)', value: 'custom' }];
            }
            return [{ name: 'Enter endpoint name manually', value: 'custom' }];
        },
        when: answers => answers.useExistingEndpoint === 'yes'
    },
    {
        type: 'input',
        name: 'customExistingEndpointName',
        message: 'Enter existing endpoint name:',
        // Store the trimmed name so stray whitespace never reaches later steps.
        // NOTE(review): relies on the prompt runner honoring `filter` — confirm.
        filter: (input) => (typeof input === 'string' ? input.trim() : input),
        validate: (input) => {
            const endpointName = typeof input === 'string' ? input.trim() : '';
            if (endpointName === '') {
                return 'Endpoint name is required';
            }
            // SageMaker endpoint names are limited to 63 characters and may
            // contain only alphanumerics with hyphens between them.
            if (endpointName.length > 63) {
                return 'Endpoint name must be 63 characters or fewer';
            }
            if (!/^[a-zA-Z0-9](-*[a-zA-Z0-9])*$/.test(endpointName)) {
                return 'Endpoint name may contain only letters, numbers, and hyphens, and must start and end with a letter or number';
            }
            return true;
        },
        when: answers => answers.useExistingEndpoint === 'yes' && answers.existingEndpointName === 'custom'
    }
];
829
+
701
830
  // Sub-phase B: Instance type (only when deploymentTarget === 'realtime-inference')
702
831
  const infraInstancePrompts = [
832
+ // Multi-select prompt: shown when MCP sizer has choices AND deployment target is realtime-inference
833
+ // User can select 1-5 instances; selection count determines single-type vs instance-pools behavior
834
+ // Requirements: 6.4
835
+ {
836
+ type: 'checkbox',
837
+ name: 'instanceTypeSelections',
838
+ when: answers => answers.deploymentTarget === 'realtime-inference' &&
839
+ answers._mcpInstanceChoices && answers._mcpInstanceChoices.length > 1,
840
+ message: 'Select instance type(s) — select multiple for instance pools (priority = selection order, max 5):',
841
+ choices: (answers) => {
842
+ const mcpChoices = answers._mcpInstanceChoices || [];
843
+ // Show all compatible instances — CUDA generation filtering happens
844
+ // after selection to allow users to see all options and make informed choices.
845
+ // If they select instances from different generations, the post-selection
846
+ // filter (filterByCudaGeneration in prompt-runner.js) will warn and remove incompatible ones.
847
+ const choices = mcpChoices.map(instanceType => {
848
+ const entry = instanceCatalogRaw[instanceType];
849
+ const gpuInfo = entry ? `${entry.gpus} GPU${entry.gpus > 1 ? 's' : ''}, ${entry.gpuMemoryGb || '?'}GB` : '';
850
+ return {
851
+ name: gpuInfo ? `${instanceType} (${gpuInfo})` : instanceType,
852
+ value: instanceType,
853
+ short: instanceType
854
+ };
855
+ });
856
+ // Always include a "Custom Input" option at the end
857
+ choices.push({
858
+ name: 'Custom Input (enter one or comma-separated list)',
859
+ value: '__custom_input__',
860
+ short: 'Custom'
861
+ });
862
+ return choices;
863
+ },
864
+ validate: (input) => {
865
+ if (!input || input.length === 0) {
866
+ return 'Select at least one instance type';
867
+ }
868
+ if (input.length > 5) {
869
+ return 'Maximum 5 instance types allowed (API limit). Please deselect some.';
870
+ }
871
+ return true;
872
+ }
873
+ },
874
+ // Custom input prompt for multi-select: shown when user selects "Custom Input" in instanceTypeSelections
875
+ {
876
+ type: 'input',
877
+ name: 'customInstanceTypeSelections',
878
+ message: 'Enter instance type(s) — single for homogeneous, comma-separated for heterogeneous (e.g., ml.g5.xlarge or ml.g5.xlarge,ml.g5.2xlarge):',
879
+ when: answers => Array.isArray(answers.instanceTypeSelections) &&
880
+ answers.instanceTypeSelections.includes('__custom_input__'),
881
+ validate: (input) => {
882
+ if (!input || input.trim() === '') {
883
+ return 'At least one instance type is required';
884
+ }
885
+ const instancePattern = /^ml\.[a-z0-9]+\.(nano|micro|small|medium|large|xlarge|[0-9]+xlarge)$/;
886
+ const instances = input.split(',').map(s => s.trim()).filter(s => s.length > 0);
887
+ if (instances.length === 0) {
888
+ return 'At least one instance type is required';
889
+ }
890
+ if (instances.length > 5) {
891
+ return 'Maximum 5 instance types allowed (API limit).';
892
+ }
893
+ for (const inst of instances) {
894
+ if (!instancePattern.test(inst)) {
895
+ return `Invalid instance type format: "${inst}". Expected format: ml.{family}.{size} (e.g., ml.g5.xlarge)`;
896
+ }
897
+ }
898
+ return true;
899
+ }
900
+ },
901
+ // Single-select prompt: shown when no MCP choices, or for non-realtime targets, or only 1 MCP choice
703
902
  {
704
903
  type: 'list',
705
904
  name: 'instanceType',
706
- when: answers => answers.deploymentTarget === 'realtime-inference' || answers.deploymentTarget === 'async-inference' || answers.deploymentTarget === 'batch-transform' || answers.deploymentTarget === 'hyperpod-eks',
905
+ when: answers => {
906
+ // Skip if multi-select was shown (realtime with multiple MCP choices)
907
+ if (answers.deploymentTarget === 'realtime-inference' &&
908
+ answers._mcpInstanceChoices && answers._mcpInstanceChoices.length > 1) {
909
+ return false;
910
+ }
911
+ return answers.deploymentTarget === 'realtime-inference' || answers.deploymentTarget === 'async-inference' || answers.deploymentTarget === 'batch-transform' || answers.deploymentTarget === 'hyperpod-eks';
912
+ },
707
913
  message: (answers) => {
708
914
  const framework = answers.framework || answers.deploymentConfig?.split('-')[0];
709
915
 
@@ -1121,6 +1327,41 @@ const baseImagePrompts = [
1121
1327
  }
1122
1328
  ];
1123
1329
 
1330
/**
 * Build a validator that accepts only whole numbers greater than or equal to 1.
 * Used by the numeric LoRA prompts so NaN, zero, negative, or fractional
 * answers are rejected at the prompt instead of being stored as answers.
 * @param {string} label - Human-readable name used in the error message
 * @returns {function(*): (true|string)} Validator returning true or an error message
 */
const validateLoraPositiveInteger = (label) => (input) => {
    const isNumericInput = typeof input === 'number' || typeof input === 'string';
    const value = isNumericInput ? Number(input) : NaN;
    if (!Number.isInteger(value) || value < 1) {
        return `${label} must be a whole number greater than or equal to 1`;
    }
    return true;
};

/**
 * LoRA adapter prompts for multi-adapter serving configuration.
 * Only shown when architecture is transformers AND the backend is one of
 * vllm, sglang, djl-lmi, lmi, or djl. Architecture and backend are read from
 * the answers directly, falling back to the parts of `deploymentConfig`
 * ("<architecture>-<backend>").
 * The two numeric prompts are asked only after `enableLora` is confirmed.
 * Requirements: 1.1, 1.2, 1.4
 */
const loraPrompts = [
    {
        type: 'confirm',
        name: 'enableLora',
        message: 'Enable LoRA adapter serving?',
        default: false,
        when: (answers) => {
            const architecture = answers.architecture || answers.deploymentConfig?.split('-')[0];
            // The backend may itself contain hyphens (e.g. "djl-lmi"), so
            // everything after the first segment is rejoined.
            const backend = answers.backend || answers.deploymentConfig?.split('-').slice(1).join('-');
            if (architecture !== 'transformers') return false;
            const loraCapableServers = ['vllm', 'sglang', 'djl-lmi', 'lmi', 'djl'];
            return loraCapableServers.includes(backend);
        }
    },
    {
        type: 'number',
        name: 'maxLoras',
        message: 'Maximum concurrent LoRA adapters in GPU memory:',
        default: 30,
        validate: validateLoraPositiveInteger('Maximum concurrent LoRA adapters'),
        when: (answers) => answers.enableLora === true
    },
    {
        type: 'number',
        name: 'maxLoraRank',
        message: 'Maximum LoRA rank:',
        default: 64,
        validate: validateLoraPositiveInteger('Maximum LoRA rank'),
        when: (answers) => answers.enableLora === true
    }
];
1364
+
1124
1365
  /**
1125
1366
  * Benchmark prompts for SageMaker AI Benchmarking (NVIDIA AIPerf)
1126
1367
  * Sub-prompts shown when 'sagemaker-ai-automated-benchmarking' is selected in testTypes.
@@ -1184,9 +1425,11 @@ export {
1184
1425
  hfTokenPrompts,
1185
1426
  ngcApiKeyPrompts,
1186
1427
  modulePrompts,
1428
+ loraPrompts,
1187
1429
  benchmarkPrompts,
1188
1430
  infrastructurePrompts,
1189
1431
  infraRegionAndTargetPrompts,
1432
+ infraExistingEndpointPrompts,
1190
1433
  infraInstancePrompts,
1191
1434
  infraAsyncPrompts,
1192
1435
  infraBatchTransformPrompts,
@@ -1196,5 +1439,8 @@ export {
1196
1439
  destinationPrompts,
1197
1440
  baseImageSearchPrompts,
1198
1441
  baseImagePrompts,
1199
- formatImageChoices
1442
+ formatImageChoices,
1443
+ filterByCudaGeneration,
1444
+ getInstanceCudaGeneration,
1445
+ instanceCatalogRaw
1200
1446
  };
@@ -150,6 +150,18 @@ export default class RegistryCommandHandler {
150
150
  }
151
151
  }
152
152
 
153
+ // Parse icList from JSON string if provided
154
+ const icListRaw = options.icList || options['ic-list'];
155
+ if (icListRaw) {
156
+ try {
157
+ entry.deployment.icList = typeof icListRaw === 'string'
158
+ ? JSON.parse(icListRaw)
159
+ : icListRaw;
160
+ } catch (err) {
161
+ console.log(`Warning: Could not parse ic-list JSON: ${err.message}`);
162
+ }
163
+ }
164
+
153
165
  try {
154
166
  const id = registry.add(entry);
155
167
  console.log('✅ Deployment entry logged successfully.');
@@ -238,6 +238,18 @@ ENV <%= key %>=<%= value %>
238
238
  <% }); %>
239
239
  <% } %>
240
240
 
241
+ <% if (enableLora && modelServer === 'vllm') { %>
242
+ # LoRA adapter serving configuration
243
+ ENV VLLM_ENABLE_LORA=true
244
+ ENV VLLM_MAX_LORAS=<%= maxLoras %>
245
+ ENV VLLM_MAX_LORA_RANK=<%= maxLoraRank %>
246
+ <% } %>
247
+ <% if (enableLora && modelServer === 'sglang') { %>
248
+ # LoRA adapter serving configuration
249
+ ENV SGLANG_ENABLE_LORA=true
250
+ ENV SGLANG_MAX_LORAS=<%= maxLoras %>
251
+ <% } %>
252
+
241
253
  <% if (typeof modelSource !== 'undefined' && modelSource && modelSource !== 'huggingface' && modelServer !== 'lmi' && modelServer !== 'djl') { %>
242
254
  # Install AWS CLI for S3 model downloads
243
255
  RUN pip install --no-cache-dir awscli
@@ -53,6 +53,13 @@ option.chat_template=<%= chatTemplate %>
53
53
  # option.gpu_memory_utilization=0.9
54
54
  # option.enable_chunked_prefill=true
55
55
 
56
+ <% if (enableLora) { %>
57
+ # LoRA adapter serving configuration
58
+ option.enable_lora=true
59
+ option.max_loras=<%= maxLoras %>
60
+ option.max_cpu_loras=70
61
+ <% } %>
62
+
56
63
  <% } else if (modelServer === 'djl') { %>
57
64
  # DJL Serving Configuration
58
65
  # DJL provides flexible model serving with multiple framework support
@@ -94,6 +101,13 @@ option.chat_template=<%= chatTemplate %>
94
101
  # option.tensor_parallel_degree=1
95
102
  # option.device_map=auto
96
103
 
104
+ <% if (enableLora) { %>
105
+ # LoRA adapter serving configuration
106
+ option.enable_lora=true
107
+ option.max_loras=<%= maxLoras %>
108
+ option.max_cpu_loras=70
109
+ <% } %>
110
+
97
111
  <% } %>
98
112
 
99
113
  # Additional Environment-Specific Configuration