@aws/ml-container-creator 0.10.0 → 0.12.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. package/LICENSE-THIRD-PARTY +9304 -0
  2. package/bin/cli.js +2 -0
  3. package/config/bootstrap-e2e-stack.json +341 -0
  4. package/config/bootstrap-stack.json +40 -3
  5. package/config/parameter-schema-v2.json +33 -22
  6. package/config/tune-catalog.json +1781 -0
  7. package/infra/ci-harness/buildspec.yml +1 -0
  8. package/infra/ci-harness/lambda/path-prover/brain.ts +306 -0
  9. package/infra/ci-harness/lambda/path-prover/write-results.ts +152 -0
  10. package/infra/ci-harness/lib/ci-harness-stack.ts +851 -7
  11. package/infra/ci-harness/state-machines/path-prover.asl.json +496 -0
  12. package/package.json +53 -67
  13. package/servers/base-image-picker/index.js +121 -121
  14. package/servers/e2e-status/index.js +297 -0
  15. package/servers/e2e-status/manifest.json +14 -0
  16. package/servers/e2e-status/package.json +15 -0
  17. package/servers/endpoint-picker/LICENSE +202 -0
  18. package/servers/endpoint-picker/index.js +536 -0
  19. package/servers/endpoint-picker/manifest.json +14 -0
  20. package/servers/endpoint-picker/package.json +18 -0
  21. package/servers/hyperpod-cluster-picker/index.js +125 -125
  22. package/servers/instance-sizer/index.js +166 -153
  23. package/servers/instance-sizer/lib/instance-ranker.js +120 -76
  24. package/servers/instance-sizer/lib/model-resolver.js +61 -61
  25. package/servers/instance-sizer/lib/quota-resolver.js +113 -113
  26. package/servers/instance-sizer/lib/vram-estimator.js +31 -31
  27. package/servers/lib/bedrock-client.js +38 -38
  28. package/servers/lib/catalogs/instances.json +27 -0
  29. package/servers/lib/catalogs/model-servers.json +201 -3
  30. package/servers/lib/custom-validators.js +13 -13
  31. package/servers/lib/dynamic-resolver.js +4 -4
  32. package/servers/marketplace-picker/index.js +342 -0
  33. package/servers/marketplace-picker/manifest.json +14 -0
  34. package/servers/marketplace-picker/package.json +18 -0
  35. package/servers/model-picker/index.js +382 -382
  36. package/servers/region-picker/index.js +56 -56
  37. package/servers/workload-picker/LICENSE +202 -0
  38. package/servers/workload-picker/catalogs/workload-profiles.json +67 -0
  39. package/servers/workload-picker/index.js +171 -0
  40. package/servers/workload-picker/manifest.json +16 -0
  41. package/servers/workload-picker/package.json +16 -0
  42. package/src/app.js +12 -3
  43. package/src/lib/bootstrap-command-handler.js +609 -15
  44. package/src/lib/bootstrap-config.js +36 -0
  45. package/src/lib/bootstrap-profile-manager.js +48 -41
  46. package/src/lib/ci-register-helpers.js +74 -0
  47. package/src/lib/config-loader.js +3 -0
  48. package/src/lib/config-manager.js +7 -0
  49. package/src/lib/config-validator.js +1 -1
  50. package/src/lib/cuda-resolver.js +17 -8
  51. package/src/lib/generated/cli-options.js +319 -314
  52. package/src/lib/generated/parameter-matrix.js +672 -661
  53. package/src/lib/generated/validation-rules.js +76 -72
  54. package/src/lib/path-prover-brain.js +664 -0
  55. package/src/lib/prompts/infrastructure-prompts.js +2 -2
  56. package/src/lib/prompts/model-prompts.js +6 -0
  57. package/src/lib/prompts/project-prompts.js +12 -0
  58. package/src/lib/secrets-prompt-runner.js +4 -0
  59. package/src/lib/template-manager.js +1 -1
  60. package/src/lib/template-variable-resolver.js +87 -1
  61. package/src/lib/tune-catalog-validator.js +37 -4
  62. package/templates/Dockerfile +9 -0
  63. package/templates/code/adapter_sidecar.py +444 -0
  64. package/templates/code/serve +6 -0
  65. package/templates/code/serve.d/vllm.ejs +1 -1
  66. package/templates/do/.benchmark_writer.py +1476 -0
  67. package/templates/do/.tune_helper.py +982 -57
  68. package/templates/do/__pycache__/.benchmark_writer.cpython-312.pyc +0 -0
  69. package/templates/do/adapter +154 -0
  70. package/templates/do/benchmark +639 -85
  71. package/templates/do/build +5 -0
  72. package/templates/do/clean.d/async-inference.ejs +5 -0
  73. package/templates/do/clean.d/batch-transform.ejs +5 -0
  74. package/templates/do/clean.d/hyperpod-eks.ejs +5 -0
  75. package/templates/do/clean.d/managed-inference.ejs +5 -0
  76. package/templates/do/config +115 -45
  77. package/templates/do/deploy.d/async-inference.ejs +30 -3
  78. package/templates/do/deploy.d/batch-transform.ejs +29 -3
  79. package/templates/do/deploy.d/hyperpod-eks.ejs +4 -0
  80. package/templates/do/deploy.d/managed-inference.ejs +216 -14
  81. package/templates/do/lib/endpoint-config.sh +1 -1
  82. package/templates/do/lib/profile.sh +44 -0
  83. package/templates/do/optimize +106 -37
  84. package/templates/do/push +5 -0
  85. package/templates/do/register +94 -0
  86. package/templates/do/stage +567 -0
  87. package/templates/do/submit +7 -0
  88. package/templates/do/test +14 -0
  89. package/templates/do/tune +382 -59
  90. package/templates/do/validate +44 -4
@@ -29,6 +29,8 @@
29
29
  * Optional CI fields (added by bootstrap --ci):
30
30
  * - ciInfraProvisioned (boolean): Whether CI harness infrastructure has been deployed. Defaults to false.
31
31
  * - ciTableName (string): Name of the DynamoDB CI table. Defaults to "mlcc-ci-table".
32
+ * - ciGlueDatabase (string|null): Name of the Glue database for benchmark results. Defaults to null (benchmark infra not provisioned).
33
+ * - ciBenchmarkResultsBucket (string|null): Name of the S3 bucket for benchmark Parquet files. Defaults to null (benchmark infra not provisioned).
32
34
  */
33
35
 
34
36
  import { readFileSync, writeFileSync, mkdirSync, existsSync } from 'node:fs';
@@ -134,6 +136,8 @@ export default class BootstrapConfig {
134
136
  return {
135
137
  ciInfraProvisioned: false,
136
138
  ciTableName: 'mlcc-ci-table',
139
+ ciGlueDatabase: null,
140
+ ciBenchmarkResultsBucket: null,
137
141
  ...profile
138
142
  };
139
143
  }
@@ -156,11 +160,43 @@ export default class BootstrapConfig {
156
160
  config: {
157
161
  ciInfraProvisioned: false,
158
162
  ciTableName: 'mlcc-ci-table',
163
+ ciGlueDatabase: null,
164
+ ciBenchmarkResultsBucket: null,
159
165
  ...active.config
160
166
  }
161
167
  };
162
168
  }
163
169
 
170
+ /**
171
+ * Find the profile that has CI infrastructure provisioned.
172
+ * Scans all profiles and returns the first one with ciInfraProvisioned: true.
173
+ *
174
+ * @returns {{ name: string, config: Object }|null} The CI profile, or null if none found
175
+ */
176
+ findCiProfile() {
177
+ const config = this.read();
178
+ if (!config || !config.profiles) return null;
179
+
180
+ for (const [name, profileConfig] of Object.entries(config.profiles)) {
181
+ if (profileConfig.ciInfraProvisioned) {
182
+ return { name, config: profileConfig };
183
+ }
184
+ }
185
+ return null;
186
+ }
187
+
188
+ /**
189
+ * Get the sharedInfraFrom field, handling legacy sharedStackFrom.
190
+ * Returns the source stack name if infrastructure was shared from another profile,
191
+ * or null if this profile has standalone infrastructure.
192
+ *
193
+ * @param {Object} profileConfig - A profile configuration object
194
+ * @returns {string|null} The source stack name or null
195
+ */
196
+ getSharedInfraSource(profileConfig) {
197
+ return profileConfig.sharedInfraFrom || profileConfig.sharedStackFrom || null;
198
+ }
199
+
164
200
  /**
165
201
  * Create or update a profile in the config.
166
202
  * Sets the given profile as the active profile and writes the config.
@@ -313,7 +313,12 @@ export default class BootstrapProfileManager {
313
313
  }
314
314
 
315
315
  /**
316
- * Remove a bootstrap profile.
316
+ * Remove a bootstrap profile (metadata-only).
317
+ *
318
+ * Only removes the profile entry from config.json and the local manifest file.
319
+ * AWS resources (CloudFormation stack, S3 buckets, ECR repo, IAM roles) are
320
+ * intentionally retained — they may be shared across profiles or still in use.
321
+ *
317
322
  * @param {string} profileName - Profile name to remove
318
323
  * @param {object} options - Parsed CLI options (e.g., --force)
319
324
  */
@@ -340,23 +345,6 @@ export default class BootstrapProfileManager {
340
345
  }
341
346
  }
342
347
 
343
- // Check for CloudFormation stack
344
- const stackName = profile.stackName || `${STACK_NAME_PREFIX}-${profileName}`;
345
- let hasStack = false;
346
- try {
347
- hasStack = this.handler._resourceExists(
348
- `cloudformation describe-stacks --stack-name ${stackName} --region ${profile.awsRegion}`,
349
- profile.awsProfile
350
- );
351
- } catch {
352
- // ignore
353
- }
354
-
355
- if (hasStack && !options.force) {
356
- console.log(`⚠️ Profile "${profileName}" has a CloudFormation stack: ${stackName}`);
357
- console.log(' Use --delete-stack to also delete the AWS resources, or --force to remove the profile only.');
358
- }
359
-
360
348
  if (!options.force) {
361
349
  const { confirm } = await this.handler._promptFn([{
362
350
  type: 'confirm',
@@ -371,29 +359,6 @@ export default class BootstrapProfileManager {
371
359
  }
372
360
  }
373
361
 
374
- // Delete CloudFormation stack if requested
375
- if (hasStack && options['delete-stack']) {
376
- try {
377
- console.log(`🗑️ Deleting CloudFormation stack: ${stackName}`);
378
- execSync(
379
- `aws cloudformation delete-stack --stack-name ${stackName} --region ${profile.awsRegion} --profile ${profile.awsProfile}`,
380
- { encoding: 'utf8', stdio: ['pipe', 'pipe', 'pipe'] }
381
- );
382
- console.log('⏳ Waiting for stack deletion...');
383
- execSync(
384
- `aws cloudformation wait stack-delete-complete --stack-name ${stackName} --region ${profile.awsRegion} --profile ${profile.awsProfile}`,
385
- { encoding: 'utf8', stdio: ['pipe', 'pipe', 'pipe'] }
386
- );
387
- console.log(`✅ Stack "${stackName}" deleted.`);
388
- } catch (err) {
389
- console.log(`⚠️ Could not delete stack "${stackName}": ${err.message}`);
390
- console.log(' You may need to delete it manually from the CloudFormation console.');
391
- }
392
- } else if (hasStack) {
393
- console.log(`Note: CloudFormation stack "${stackName}" was left in place.`);
394
- console.log(' To delete AWS resources, re-run with --delete-stack');
395
- }
396
-
397
362
  // Delete manifest file if it exists
398
363
  if (hasManifest) {
399
364
  try {
@@ -406,6 +371,12 @@ export default class BootstrapProfileManager {
406
371
 
407
372
  this.handler.config.removeProfile(profileName);
408
373
  console.log(`Profile "${profileName}" removed.`);
374
+
375
+ // Advisory: AWS resources are retained for safety
376
+ const stackName = profile.stackName || `${STACK_NAME_PREFIX}-${profileName}`;
377
+ console.log('');
378
+ console.log('ℹ️ Profile removed from config. AWS resources (CloudFormation stack, S3 buckets, ECR repo, IAM roles) have been retained.');
379
+ console.log(` To delete AWS resources, manually delete the CloudFormation stack "${stackName}" in the AWS console.`);
409
380
  }
410
381
 
411
382
  /**
@@ -631,4 +602,40 @@ export default class BootstrapProfileManager {
631
602
 
632
603
  console.log(` Manifest written: lastSynced = ${result.manifest.lastSynced}\n`);
633
604
  }
605
+
606
+ /**
607
+ * Handle sync-model-families subcommand: discover tune-eligible models from
608
+ * the SageMaker JumpStart Hub and update the tune catalog.
609
+ *
610
+ * Requires AWS credentials with sagemaker:ListHubContents and
611
+ * sagemaker:DescribeHubContent permissions.
612
+ */
613
+ async _handleSyncModelFamilies() {
614
+ console.log('\n📦 Sync Model Families — Discovering supported models...\n');
615
+
616
+ // Determine region from active profile or environment
617
+ const profile = this.handler.config.getActiveProfile();
618
+ const region = profile?.config?.awsRegion || process.env.AWS_REGION || 'us-west-2';
619
+
620
+ try {
621
+ const { syncModelFamilies } = await import('../../scripts/sync-model-families.js');
622
+ const result = await syncModelFamilies({ region });
623
+ console.log(`\n✅ Sync complete: ${result.added} new, ${result.total} total models`);
624
+ } catch (err) {
625
+ if (err.name === 'CredentialsProviderError' || err.message?.includes('credentials') || err.message?.includes('Could not load credentials')) {
626
+ console.log('❌ AWS credentials not available or insufficient permissions.');
627
+ console.log('');
628
+ console.log(' Required permissions:');
629
+ console.log(' • sagemaker:ListHubContents');
630
+ console.log(' • sagemaker:DescribeHubContent');
631
+ console.log('');
632
+ console.log(' Ensure your AWS credentials are configured:');
633
+ console.log(' aws configure');
634
+ console.log(' # or set AWS_PROFILE to a profile with SageMaker AI access');
635
+ } else {
636
+ console.log(`❌ Sync failed: ${err.message}`);
637
+ }
638
+ process.exit(1);
639
+ }
640
+ }
634
641
  }
@@ -104,6 +104,15 @@ export function applyRecordDefaults(record) {
104
104
  if (!record.projectName) {
105
105
  record.projectName = '';
106
106
  }
107
+ // Benchmark fields — optional, backward-compatible defaults (Requirement 7.1, 7.4)
108
+ if (record.benchmarkEnabled === undefined || record.benchmarkEnabled === null) {
109
+ record.benchmarkEnabled = false;
110
+ }
111
+ if (!record.benchmarkConcurrencyLevels) {
112
+ record.benchmarkConcurrencyLevels = [1, 4, 8];
113
+ }
114
+ // lastBenchmarkRunId, lastBenchmarkTimestamp, lastBenchmarkStatus are intentionally
115
+ // NOT defaulted — their absence indicates "never benchmarked" (Requirement 7.4)
107
116
  return record;
108
117
  }
109
118
 
@@ -122,3 +131,68 @@ export function extractBaseImageVersion(baseImage) {
122
131
  }
123
132
  return baseImage.split(':').pop();
124
133
  }
134
+
135
+ /**
136
+ * Build the benchmark fields to update on a DynamoDB CI record after
137
+ * a benchmark stage completes (or fails).
138
+ *
139
+ * Only returns the benchmark-specific fields — caller merges into the
140
+ * existing record. Existing fields (testStatus, configJson, etc.) are
141
+ * intentionally NOT included to satisfy Requirement 7.3.
142
+ *
143
+ * @param {string} runId - Benchmark run identifier (e.g., "bmk-20260609T143022Z")
144
+ * @param {string} status - One of: "completed", "failed", "in-progress"
145
+ * @param {string} [timestamp] - ISO 8601 timestamp; defaults to current time
146
+ * @returns {object} Object with lastBenchmarkRunId, lastBenchmarkTimestamp, lastBenchmarkStatus
147
+ */
148
+ export function buildBenchmarkFields(runId, status, timestamp) {
149
+ const validStatuses = ['completed', 'failed', 'in-progress'];
150
+ if (!validStatuses.includes(status)) {
151
+ throw new Error(`Invalid benchmark status: '${status}'. Must be one of: ${validStatuses.join(', ')}`);
152
+ }
153
+ if (!runId || typeof runId !== 'string') {
154
+ throw new Error('Benchmark runId is required and must be a non-empty string');
155
+ }
156
+ return {
157
+ lastBenchmarkRunId: runId,
158
+ lastBenchmarkTimestamp: timestamp || new Date().toISOString().replace(/\.\d{3}Z$/, 'Z'),
159
+ lastBenchmarkStatus: status
160
+ };
161
+ }
162
+
163
+ /**
164
+ * Check whether a CI record has ever been benchmarked.
165
+ *
166
+ * Per Requirement 7.4, absence of `lastBenchmarkRunId` indicates
167
+ * "never benchmarked" — this is the canonical check.
168
+ *
169
+ * @param {object} record - A CI DynamoDB record
170
+ * @returns {boolean} True if the record has benchmark data
171
+ */
172
+ export function hasBeenBenchmarked(record) {
173
+ return !!(record && record.lastBenchmarkRunId);
174
+ }
175
+
176
+ /**
177
+ * Check whether benchmarking is enabled for a CI record/config.
178
+ *
179
+ * @param {object} record - A CI DynamoDB record (with defaults applied)
180
+ * @returns {boolean} True if benchmarkEnabled is true
181
+ */
182
+ export function isBenchmarkEnabled(record) {
183
+ if (!record) return false;
184
+ return record.benchmarkEnabled === true;
185
+ }
186
+
187
+ /**
188
+ * Get the benchmark concurrency levels for a CI record/config.
189
+ *
190
+ * @param {object} record - A CI DynamoDB record (with defaults applied)
191
+ * @returns {number[]} Array of concurrency level integers
192
+ */
193
+ export function getBenchmarkConcurrencyLevels(record) {
194
+ if (!record || !Array.isArray(record.benchmarkConcurrencyLevels)) {
195
+ return [1, 4, 8];
196
+ }
197
+ return record.benchmarkConcurrencyLevels;
198
+ }
@@ -41,6 +41,9 @@ export default class ConfigLoader {
41
41
  if (profileConfig.awsProfile) {
42
42
  mapped.awsProfile = profileConfig.awsProfile;
43
43
  }
44
+ if (profileConfig.ciBenchmarkResultsBucket) {
45
+ mapped.ciBenchmarkResultsBucket = profileConfig.ciBenchmarkResultsBucket;
46
+ }
44
47
 
45
48
  this.manager._mergeConfig(mapped);
46
49
  } catch (error) {
@@ -300,6 +300,13 @@ export default class ConfigManager {
300
300
  (!finalConfig.destinationDir || finalConfig.destinationDir === '.')) {
301
301
  finalConfig.destinationDir = `./${finalConfig.projectName}`;
302
302
  }
303
+
304
+ // Ensure destinationDir is never null — derive from projectName if not set.
305
+ // This covers interactive mode where destinationDir is non-promptable and no
306
+ // CLI positional argument was provided.
307
+ if (!finalConfig.destinationDir) {
308
+ finalConfig.destinationDir = `./${finalConfig.projectName}`;
309
+ }
303
310
 
304
311
  // Generate CodeBuild project name if buildTarget is codebuild
305
312
  if ((finalConfig.buildTarget === 'codebuild' || finalConfig.deployTarget === 'codebuild') && !finalConfig.codebuildProjectName) {
@@ -361,7 +361,7 @@ export default class ConfigValidator {
361
361
 
362
362
  case 'instanceType':
363
363
  if (value) {
364
- const instancePattern = /^ml\.[a-z0-9]+\.(nano|micro|small|medium|large|xlarge|[0-9]+xlarge)$/;
364
+ const instancePattern = /^ml\.[a-z0-9-]+\.(nano|micro|small|medium|large|xlarge|[0-9]+xlarge)$/;
365
365
  if (!instancePattern.test(value)) {
366
366
  throw new ValidationError(
367
367
  `Invalid instance type format: ${value}. Expected format: ml.{family}.{size} (e.g., ml.m5.large, ml.g4dn.xlarge)`,
@@ -96,22 +96,31 @@ export default class CudaResolver {
96
96
  return inferenceAmiVersion ? { cudaVersion, inferenceAmiVersion } : null;
97
97
  }
98
98
 
99
- // Multiple options — let the user choose (or auto-select in auto-prompt mode)
99
+ // Multiple options — determine the best default
100
100
  const defaultVersion = frameworkAccel?.version
101
101
  && compatibleVersions.includes(frameworkAccel.version)
102
102
  ? frameworkAccel.version
103
103
  : instanceInfo.accelerator.default || compatibleVersions[compatibleVersions.length - 1];
104
104
 
105
- // In auto-prompt mode, auto-select the default without prompting
106
- if (this.runner.configManager?.isAutoPrompt()) {
105
+ // Auto-select when we have a reliable default — no need to prompt the user about
106
+ // AMI internals they shouldn't need to care about. The default is derived from:
107
+ // 1. Framework's declared CUDA version (highest confidence)
108
+ // 2. Instance catalog's defaultCudaVersion (hardware-appropriate)
109
+ // 3. Highest compatible version (safe fallback)
110
+ // Only prompt if none of these sources yields a default with a known AMI mapping (shouldn't happen in practice).
111
+ if (defaultVersion && CUDA_AMI_MAP[defaultVersion]) {
107
112
  const inferenceAmiVersion = CUDA_AMI_MAP[defaultVersion];
108
- if (inferenceAmiVersion) {
109
- console.log(`\n🔧 CUDA ${defaultVersion} auto-selected (auto-prompt mode)`);
110
- console.log(` AMI: ${inferenceAmiVersion}`);
111
- }
112
- return inferenceAmiVersion ? { cudaVersion: defaultVersion, inferenceAmiVersion } : null;
113
+ const source = frameworkAccel?.version && compatibleVersions.includes(frameworkAccel.version)
114
+ ? 'framework requirement'
115
+ : instanceInfo.accelerator.default === defaultVersion
116
+ ? 'instance default'
117
+ : 'highest compatible';
118
+ console.log(`\n🔧 CUDA ${defaultVersion} auto-selected (${source})`);
119
+ console.log(` AMI: ${inferenceAmiVersion}`);
120
+ return { cudaVersion: defaultVersion, inferenceAmiVersion };
113
121
  }
114
122
 
123
+ // Fallback: prompt only when no reliable default exists (edge case)
115
124
  const choices = compatibleVersions.map(v => {
116
125
  const ami = CUDA_AMI_MAP[v] || 'unknown';
117
126
  const isDefault = v === defaultVersion ? ' (recommended)' : '';