@aws/ml-container-creator 0.3.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. package/bin/cli.js +5 -2
  2. package/config/bootstrap-stack.json +86 -7
  3. package/config/defaults.json +1 -1
  4. package/infra/ci-harness/buildspec.yml +60 -0
  5. package/package.json +3 -1
  6. package/servers/README.md +41 -1
  7. package/servers/instance-sizer/index.js +42 -2
  8. package/servers/instance-sizer/lib/instance-ranker.js +114 -10
  9. package/servers/instance-sizer/lib/quota-resolver.js +368 -0
  10. package/servers/instance-sizer/package.json +2 -0
  11. package/servers/lib/catalogs/instances.json +527 -12
  12. package/servers/lib/catalogs/model-servers.json +15 -15
  13. package/servers/lib/catalogs/model-sizes.json +27 -0
  14. package/servers/lib/catalogs/models.json +71 -0
  15. package/servers/lib/schemas/image-catalog.schema.json +9 -1
  16. package/src/app.js +109 -3
  17. package/src/lib/bootstrap-command-handler.js +96 -3
  18. package/src/lib/cli-handler.js +2 -2
  19. package/src/lib/config-manager.js +117 -1
  20. package/src/lib/deployment-entry-schema.js +16 -0
  21. package/src/lib/prompt-runner.js +270 -12
  22. package/src/lib/prompts.js +288 -6
  23. package/src/lib/registry-command-handler.js +12 -0
  24. package/src/lib/schema-sync.js +31 -0
  25. package/src/lib/template-manager.js +49 -1
  26. package/src/lib/validate-runner.js +125 -2
  27. package/templates/Dockerfile +22 -2
  28. package/templates/code/cuda_compat.sh +22 -0
  29. package/templates/code/serve +3 -0
  30. package/templates/code/serving.properties +14 -0
  31. package/templates/code/start_server.sh +3 -0
  32. package/templates/diffusors/Dockerfile +2 -1
  33. package/templates/diffusors/serve +3 -0
  34. package/templates/do/README.md +33 -0
  35. package/templates/do/adapter +1214 -0
  36. package/templates/do/adapters/.gitkeep +2 -0
  37. package/templates/do/add-ic +130 -0
  38. package/templates/do/benchmark +718 -0
  39. package/templates/do/clean +593 -17
  40. package/templates/do/config +49 -4
  41. package/templates/do/deploy +513 -362
  42. package/templates/do/ic/default.conf +32 -0
  43. package/templates/do/lib/endpoint-config.sh +216 -0
  44. package/templates/do/lib/inference-component.sh +167 -0
  45. package/templates/do/lib/secrets.sh +44 -0
  46. package/templates/do/lib/wait.sh +131 -0
  47. package/templates/do/logs +107 -27
  48. package/templates/do/optimize +528 -0
  49. package/templates/do/register +119 -2
  50. package/templates/do/status +337 -0
  51. package/templates/do/test +80 -28
  52. package/templates/triton/Dockerfile +5 -0
@@ -18,7 +18,10 @@ import {
18
18
  modelLoadStrategyPrompts,
19
19
  modelProfilePrompts,
20
20
  modulePrompts,
21
+ loraPrompts,
22
+ benchmarkPrompts,
21
23
  infraRegionAndTargetPrompts,
24
+ infraExistingEndpointPrompts,
22
25
  infraInstancePrompts,
23
26
  infraAsyncPrompts,
24
27
  infraBatchTransformPrompts,
@@ -28,7 +31,9 @@ import {
28
31
  destinationPrompts,
29
32
  baseImageSearchPrompts,
30
33
  baseImagePrompts,
31
- formatImageChoices
34
+ formatImageChoices,
35
+ filterByCudaGeneration,
36
+ instanceCatalogRaw
32
37
  } from './prompts.js';
33
38
 
34
39
  import fs from 'fs';
@@ -186,12 +191,40 @@ export default class PromptRunner {
186
191
  // 3a. Region query
187
192
  await this._queryMcpForRegion(frameworkAnswers, explicitConfig);
188
193
 
194
+ // 3a2. Existing endpoint prompt (only for realtime-inference)
195
+ // Requirements: 3.3, 4.3, 4.4 — endpoint-picker MCP query
196
+ let existingEndpointAnswers = {};
197
+ if (regionAndTargetAnswers.deploymentTarget === 'realtime-inference') {
198
+ // Query endpoint-picker MCP server for available endpoints
199
+ const resolvedRegion = regionAndTargetAnswers.customAwsRegion || regionAndTargetAnswers.awsRegion;
200
+ await this._queryMcpForEndpoints({ ...regionAndTargetAnswers, awsRegion: resolvedRegion }, explicitConfig);
201
+
202
+ const endpointPreviousAnswers = {
203
+ ...regionAndTargetAnswers,
204
+ ...(this._mcpEndpointChoices ? { _mcpEndpointChoices: this._mcpEndpointChoices } : {})
205
+ };
206
+ existingEndpointAnswers = await this._runPhase(
207
+ infraExistingEndpointPrompts,
208
+ endpointPreviousAnswers,
209
+ explicitConfig,
210
+ existingConfig
211
+ );
212
+
213
+ // Resolve custom endpoint name
214
+ if (existingEndpointAnswers.customExistingEndpointName) {
215
+ existingEndpointAnswers.existingEndpointName = existingEndpointAnswers.customExistingEndpointName;
216
+ delete existingEndpointAnswers.customExistingEndpointName;
217
+ }
218
+ }
219
+
189
220
  // 3b. Instance type — query instance-sizer with full context (model + profile + CUDA)
190
221
  let instanceAnswers = {};
191
- const needsInstance = regionAndTargetAnswers.deploymentTarget === 'realtime-inference' ||
222
+ // Skip instance prompts when attaching to an existing endpoint (instance is inherited)
223
+ const useExistingEndpoint = !!(existingEndpointAnswers.existingEndpointName);
224
+ const needsInstance = !useExistingEndpoint && (regionAndTargetAnswers.deploymentTarget === 'realtime-inference' ||
192
225
  regionAndTargetAnswers.deploymentTarget === 'async-inference' ||
193
226
  regionAndTargetAnswers.deploymentTarget === 'batch-transform' ||
194
- regionAndTargetAnswers.deploymentTarget === 'hyperpod-eks';
227
+ regionAndTargetAnswers.deploymentTarget === 'hyperpod-eks');
195
228
 
196
229
  if (needsInstance) {
197
230
  // Determine architecture type for heuristic fallback
@@ -229,6 +262,74 @@ export default class PromptRunner {
229
262
  if (!instanceAnswers.instanceType && !explicitConfig.instanceType && this._architectureHeuristicDefault) {
230
263
  instanceAnswers.instanceType = this._architectureHeuristicDefault;
231
264
  }
265
+
266
+ // Process multi-select instance type results (Requirements: 6.4)
267
+ // When user selects multiple instances via checkbox, derive instanceType and instancePools
268
+ if (instanceAnswers.instanceTypeSelections && instanceAnswers.instanceTypeSelections.length > 0) {
269
+ let selections = instanceAnswers.instanceTypeSelections.slice(0, 5); // Cap at 5 (API limit)
270
+
271
+ // Resolve custom input: replace __custom_input__ sentinel with parsed instances
272
+ if (selections.includes('__custom_input__') && instanceAnswers.customInstanceTypeSelections) {
273
+ const customInstances = instanceAnswers.customInstanceTypeSelections
274
+ .split(',').map(s => s.trim()).filter(s => s.length > 0);
275
+ // Remove the sentinel and any other MCP selections, replace with custom entries
276
+ selections = selections.filter(s => s !== '__custom_input__');
277
+ selections = [...selections, ...customInstances];
278
+ delete instanceAnswers.customInstanceTypeSelections;
279
+ } else if (selections.includes('__custom_input__')) {
280
+ // Sentinel selected but no custom input provided — remove it
281
+ selections = selections.filter(s => s !== '__custom_input__');
282
+ }
283
+
284
+ // Cap at 5 after custom expansion
285
+ if (selections.length > 5) {
286
+ console.log(' ⚠️ Maximum 5 instance types allowed. Using first 5 selections.');
287
+ selections = selections.slice(0, 5);
288
+ }
289
+
290
+ // Filter to same CUDA generation and warn about incompatible removals
291
+ const { filtered, generation, removed } = filterByCudaGeneration(selections);
292
+ if (removed.length > 0) {
293
+ console.log(` ⚠️ Removed incompatible instances (different CUDA generation): ${removed.join(', ')}`);
294
+ console.log(` Keeping ${generation} generation: ${filtered.join(', ')}`);
295
+ }
296
+
297
+ const finalSelections = filtered.length > 0 ? filtered : selections;
298
+
299
+ if (finalSelections.length === 1) {
300
+ // Single selection → standard single instance type (no pools)
301
+ instanceAnswers.instanceType = finalSelections[0];
302
+ console.log(` ✓ Single instance selected: ${finalSelections[0]}`);
303
+ } else {
304
+ // Multiple selections → instance pools with priority = selection order
305
+ instanceAnswers.instanceType = finalSelections[0]; // backward compat: first is primary
306
+ instanceAnswers.instancePools = finalSelections.map((it, idx) => ({
307
+ InstanceType: it,
308
+ Priority: idx + 1
309
+ }));
310
+
311
+ // Auto-generate multi-spec IC config from catalog
312
+ instanceAnswers.instancePoolSpecs = finalSelections.map(it => {
313
+ const entry = instanceCatalogRaw[it];
314
+ return {
315
+ instanceType: it,
316
+ gpuCount: entry?.gpus || 1,
317
+ minMemoryMb: entry?.gpuMemoryGb ? entry.gpuMemoryGb * 1024 : 1024
318
+ };
319
+ });
320
+
321
+ console.log(` ✓ Instance pools configured (${finalSelections.length} types):`);
322
+ finalSelections.forEach((it, idx) => {
323
+ const entry = instanceCatalogRaw[it];
324
+ const gpus = entry?.gpus || '?';
325
+ const mem = entry?.gpuMemoryGb || '?';
326
+ console.log(` Priority ${idx + 1}: ${it} (${gpus} GPUs, ${mem}GB GPU memory)`);
327
+ });
328
+ }
329
+
330
+ // Clean up the raw selections from answers (not needed downstream)
331
+ delete instanceAnswers.instanceTypeSelections;
332
+ }
232
333
  }
233
334
 
234
335
  // In auto-prompt mode, use instance-sizer's top recommendation as the instance type
@@ -252,6 +353,29 @@ export default class PromptRunner {
252
353
  this._autoGpuCount = tpRec.gpuCount;
253
354
  console.log(` ✓ Auto-set tensor parallelism: TP=${tpRec.tensorParallelism} (${tpRec.gpuCount} GPUs)`);
254
355
  }
356
+
357
+ // Display capacity type confirmation for selected instance
358
+ // Requirements: 5.4
359
+ if (matchingRec && matchingRec.capacityType) {
360
+ if (matchingRec.capacityType === 'reserved') {
361
+ const resType = matchingRec.reservationType === 'capacity-block' ? 'Capacity Block' : 'ODCR';
362
+ const endInfo = matchingRec.reservationType === 'capacity-block' && matchingRec.reservationInfo?.endDate
363
+ ? `, ends ${new Date(matchingRec.reservationInfo.endDate).toLocaleDateString()}`
364
+ : '';
365
+ console.log(` ✓ Using reserved capacity — ${resType} (reservation ${matchingRec.reservationInfo?.reservationId || 'unknown'}${endInfo})`);
366
+ } else if (matchingRec.capacityType === 'ftp') {
367
+ console.log(` ✓ Using reserved capacity (plan ${matchingRec.ftpInfo?.planName || 'unknown'})`);
368
+ } else {
369
+ const headroom = matchingRec.quotaHeadroom;
370
+ console.log(` ✓ Using on-demand capacity (quota headroom: ${headroom ?? 'unknown'})`);
371
+ }
372
+ }
373
+
374
+ // Extract reservation ARN from selected instance for deployment config
375
+ // Requirements: 2.3
376
+ if (matchingRec && matchingRec.capacityType === 'reserved' && matchingRec.reservationInfo?.reservationArn) {
377
+ this._selectedCapacityReservationArn = matchingRec.reservationInfo.reservationArn;
378
+ }
255
379
  }
256
380
 
257
381
  // 3c. Async-specific prompts (only when deploymentTarget === 'async-inference')
@@ -294,6 +418,7 @@ export default class PromptRunner {
294
418
  // Combine all infrastructure answers
295
419
  const infraAnswers = {
296
420
  ...regionAndTargetAnswers,
421
+ ...existingEndpointAnswers,
297
422
  ...instanceAnswers,
298
423
  ...asyncAnswers,
299
424
  ...batchTransformAnswers,
@@ -375,6 +500,29 @@ export default class PromptRunner {
375
500
  moduleAnswers.includeSampleModel = false;
376
501
  }
377
502
 
503
+ // Benchmark prompts — derive includeBenchmark from testTypes selection or CLI flag
504
+ // Requirements: 1.1, 1.2
505
+ let benchmarkAnswers = {};
506
+ if (frameworkAnswers.architecture === 'transformers' || frameworkAnswers.architecture === 'diffusors') {
507
+ const testTypes = moduleAnswers.testTypes || [];
508
+ const includeBenchmark = testTypes.includes('sagemaker-ai-automated-benchmarking') ||
509
+ explicitConfig.includeBenchmark === true ||
510
+ explicitConfig.includeBenchmark === 'true';
511
+ benchmarkAnswers.includeBenchmark = includeBenchmark;
512
+ if (includeBenchmark) {
513
+ const subAnswers = await this._runPhase(benchmarkPrompts, { ...frameworkAnswers, ...moduleAnswers, includeBenchmark }, explicitConfig, existingConfig);
514
+ benchmarkAnswers = { ...benchmarkAnswers, ...subAnswers };
515
+ }
516
+ }
517
+
518
+ // LoRA adapter prompts — only for transformers with vllm/sglang/djl-lmi
519
+ // Requirements: 1.1, 1.2, 1.4
520
+ let loraAnswers = {};
521
+ const loraSubAnswers = await this._runPhase(loraPrompts, { ...frameworkAnswers, ...engineAnswers }, explicitConfig, existingConfig);
522
+ if (loraSubAnswers.enableLora !== undefined) {
523
+ loraAnswers = loraSubAnswers;
524
+ }
525
+
378
526
  // Validate instance type against framework requirements (now that framework version is known)
379
527
  const finalInstanceType = infraAnswers.customInstanceType || infraAnswers.instanceType;
380
528
  if (finalInstanceType && frameworkVersionAnswers.frameworkVersion) {
@@ -416,6 +564,8 @@ export default class PromptRunner {
416
564
  ...hfTokenAnswers,
417
565
  ...ngcApiKeyAnswers,
418
566
  ...moduleAnswers,
567
+ ...benchmarkAnswers,
568
+ ...loraAnswers,
419
569
  ...projectAnswers,
420
570
  ...destinationAnswers,
421
571
  buildTimestamp
@@ -435,6 +585,12 @@ export default class PromptRunner {
435
585
  combinedAnswers.artifactUri = this._mcpArtifactUri;
436
586
  }
437
587
 
588
+ // Flow capacity reservation ARN from instance-sizer selection
589
+ // Requirements: 2.3
590
+ if (this._selectedCapacityReservationArn) {
591
+ combinedAnswers.capacityReservationArn = this._selectedCapacityReservationArn;
592
+ }
593
+
438
594
  // Validate: non-HF model sources require an artifact URI
439
595
  // Without it, the serve script can't download the model at runtime
440
596
  // Infer modelSource from model name prefix if not set by MCP
@@ -1036,13 +1192,58 @@ export default class PromptRunner {
1036
1192
  : '';
1037
1193
 
1038
1194
  console.log(` ✓ ${choices.length} compatible instance(s) found${vramInfo}`);
1039
- // Display compact recommendation table
1040
- for (const rec of recommendations) {
1041
- const tp = rec.tensorParallelism > 1 ? ` TP=${rec.tensorParallelism}` : '';
1042
- const vram = rec.totalVramGb ? `${rec.totalVramGb}GB` : '?';
1043
- const util = rec.utilizationPercent ? `${rec.utilizationPercent}%` : '?';
1044
- console.log(` ${rec === topRec ? '→' : ' '} ${rec.instanceType.padEnd(20)} ${vram.padStart(5)} VRAM ${util.padStart(4)} util${tp}`);
1195
+
1196
+ // Warn if all instances had zero quota but were restored for visibility
1197
+ if (parsed.metadata?.allFilteredByQuota) {
1198
+ console.log(' ⚠️ All instances have zero quota request a quota increase for your preferred type');
1045
1199
  }
1200
+
1201
+ // Check if availability data is present (recommendations have capacityType)
1202
+ const hasAvailabilityData = recommendations.some(r => r.capacityType);
1203
+
1204
+ if (hasAvailabilityData) {
1205
+ // Group by capacityType for display
1206
+ const reserved = recommendations.filter(r => r.capacityType === 'reserved' || r.capacityType === 'ftp');
1207
+ const onDemand = recommendations.filter(r => r.capacityType === 'on-demand');
1208
+
1209
+ if (reserved.length > 0) {
1210
+ console.log(' ── Reserved Capacity ──');
1211
+ for (const rec of reserved) {
1212
+ const tp = rec.tensorParallelism > 1 ? ` TP=${rec.tensorParallelism}` : '';
1213
+ const vram = rec.totalVramGb ? `${rec.totalVramGb}GB` : '?';
1214
+ const util = rec.utilizationPercent ? `${rec.utilizationPercent}%` : '?';
1215
+ const tag = rec.capacityType === 'reserved'
1216
+ ? ` [CR] ${rec.reservationInfo?.planName || rec.reservationInfo?.reservationId || ''}`
1217
+ : ` [FTP] ${rec.ftpInfo?.planName || ''}`;
1218
+ console.log(` ${rec === topRec ? '→' : ' '} ${rec.instanceType.padEnd(20)} ${vram.padStart(5)} VRAM ${util.padStart(4)} util${tp}${tag}`);
1219
+ }
1220
+ }
1221
+
1222
+ if (onDemand.length > 0) {
1223
+ console.log(' ── On-Demand ──');
1224
+ for (const rec of onDemand) {
1225
+ const tp = rec.tensorParallelism > 1 ? ` TP=${rec.tensorParallelism}` : '';
1226
+ const vram = rec.totalVramGb ? `${rec.totalVramGb}GB` : '?';
1227
+ const util = rec.utilizationPercent ? `${rec.utilizationPercent}%` : '?';
1228
+ const deployed = rec.quotaDeployed;
1229
+ const quota = rec.quotaLimit;
1230
+ const tag = quota !== null && quota !== undefined ? ` [Q:${deployed ?? 0}/${quota}]` : '';
1231
+ console.log(` ${rec === topRec ? '→' : ' '} ${rec.instanceType.padEnd(20)} ${vram.padStart(5)} VRAM ${util.padStart(4)} util${tp}${tag}`);
1232
+ }
1233
+ }
1234
+ } else {
1235
+ // Fallback: display compact recommendation table (no availability data)
1236
+ for (const rec of recommendations) {
1237
+ const tp = rec.tensorParallelism > 1 ? ` TP=${rec.tensorParallelism}` : '';
1238
+ const vram = rec.totalVramGb ? `${rec.totalVramGb}GB` : '?';
1239
+ const util = rec.utilizationPercent ? `${rec.utilizationPercent}%` : '?';
1240
+ console.log(` ${rec === topRec ? '→' : ' '} ${rec.instanceType.padEnd(20)} ${vram.padStart(5)} VRAM ${util.padStart(4)} util${tp}`);
1241
+ }
1242
+ }
1243
+ } else if (parsed.metadata?.allFilteredByQuota) {
1244
+ // All VRAM-compatible instances had zero quota
1245
+ console.log(' ⚠️ No quota available for compatible instances. Request a quota increase.');
1246
+ this._instanceSizerMetadata = parsed.metadata || null;
1046
1247
  } else if (parsed.metadata?.warning) {
1047
1248
  console.log(` ⚠️ ${parsed.metadata.warning}`);
1048
1249
  } else {
@@ -1101,6 +1302,62 @@ export default class PromptRunner {
1101
1302
  }
1102
1303
  }
1103
1304
 
1305
+ /**
1306
+ * Query the endpoint-picker MCP server for available InService real-time endpoints.
1307
+ * Populates this._mcpEndpointChoices for the existing endpoint selection prompt.
1308
+ * Graceful fallback: if MCP server fails (no credentials, timeout), skip and create new endpoint.
1309
+ * Requirements: 3.3, 4.3, 4.4
1310
+ * @private
1311
+ */
1312
+ async _queryMcpForEndpoints(infraAnswers, explicitConfig) {
1313
+ const cm = this.configManager;
1314
+ if (!cm) return;
1315
+
1316
+ const mcpServers = cm.getMcpServerNames();
1317
+ if (!mcpServers.includes('endpoint-picker')) return;
1318
+
1319
+ // Skip if existing endpoint already provided via CLI/config
1320
+ if (explicitConfig.existingEndpointName) return;
1321
+
1322
+ console.log(' 🔍 Querying endpoint-picker...');
1323
+
1324
+ try {
1325
+ const result = await cm.queryMcpServer('endpoint-picker', {
1326
+ awsRegion: infraAnswers.awsRegion,
1327
+ deploymentTarget: 'realtime-inference'
1328
+ });
1329
+
1330
+ if (result && result.choices?.endpointName?.length > 0) {
1331
+ const endpointNames = result.choices.endpointName;
1332
+ const metadata = result.metadata || {};
1333
+
1334
+ // Build choices with metadata annotations
1335
+ this._mcpEndpointChoices = endpointNames.map(name => {
1336
+ const meta = metadata[name];
1337
+ if (meta) {
1338
+ const gpuInfo = meta.availableGpus === '?' ? 'GPUs: ?' : `${meta.availableGpus} GPUs free`;
1339
+ return {
1340
+ name: `${name} (${meta.instanceType}, ${gpuInfo}, ${meta.icCount} IC${meta.icCount !== 1 ? 's' : ''})`,
1341
+ value: name
1342
+ };
1343
+ }
1344
+ return { name, value: name };
1345
+ });
1346
+
1347
+ console.log(` ✓ ${endpointNames.length} endpoint(s) with available capacity`);
1348
+ } else {
1349
+ if (result?.message) {
1350
+ console.log(` ↳ ${result.message}`);
1351
+ } else {
1352
+ console.log(' ↳ No endpoints with available capacity found');
1353
+ }
1354
+ }
1355
+ } catch (err) {
1356
+ // Graceful fallback: if MCP server fails, skip and create new endpoint
1357
+ console.log(` ⚠️ endpoint-picker: ${err.message || 'query failed'} — will create new endpoint`);
1358
+ }
1359
+ }
1360
+
1104
1361
  /**
1105
1362
  * Query MCP base-image-picker server after deployment config is selected.
1106
1363
  * Populates _mcpBaseImageChoices for the base image selection prompt.
@@ -1972,9 +2229,10 @@ export default class PromptRunner {
1972
2229
  '11.4': 'al2-ami-sagemaker-inference-gpu-2-1',
1973
2230
  '11.8': 'al2-ami-sagemaker-inference-gpu-2-1',
1974
2231
  '12.1': 'al2-ami-sagemaker-inference-gpu-3-1',
1975
- '12.2': 'al2023-ami-sagemaker-inference-gpu-4-1',
1976
- '12.4': 'al2023-ami-sagemaker-inference-gpu-4-1',
1977
- '12.6': 'al2023-ami-sagemaker-inference-gpu-4-1'
2232
+ '12.2': 'al2-ami-sagemaker-inference-gpu-3-1',
2233
+ '12.4': 'al2-ami-sagemaker-inference-gpu-3-1',
2234
+ '12.6': 'al2-ami-sagemaker-inference-gpu-3-1',
2235
+ '13.0': 'al2023-ami-sagemaker-inference-gpu-4-1'
1978
2236
  };
1979
2237
 
1980
2238
  /**