@aws/ml-container-creator 0.10.0 → 0.10.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. package/LICENSE-THIRD-PARTY +9304 -0
  2. package/bin/cli.js +2 -0
  3. package/config/bootstrap-e2e-stack.json +341 -0
  4. package/config/bootstrap-stack.json +40 -3
  5. package/config/parameter-schema-v2.json +5 -21
  6. package/config/tune-catalog.json +1781 -0
  7. package/infra/ci-harness/buildspec.yml +1 -0
  8. package/infra/ci-harness/lambda/path-prover/brain.ts +306 -0
  9. package/infra/ci-harness/lambda/path-prover/write-results.ts +152 -0
  10. package/infra/ci-harness/lib/ci-harness-stack.ts +837 -7
  11. package/infra/ci-harness/state-machines/path-prover.asl.json +496 -0
  12. package/package.json +51 -66
  13. package/servers/base-image-picker/index.js +121 -121
  14. package/servers/e2e-status/index.js +297 -0
  15. package/servers/e2e-status/manifest.json +14 -0
  16. package/servers/e2e-status/package.json +15 -0
  17. package/servers/endpoint-picker/LICENSE +202 -0
  18. package/servers/endpoint-picker/index.js +536 -0
  19. package/servers/endpoint-picker/manifest.json +14 -0
  20. package/servers/endpoint-picker/package.json +18 -0
  21. package/servers/hyperpod-cluster-picker/index.js +125 -125
  22. package/servers/instance-sizer/index.js +138 -138
  23. package/servers/instance-sizer/lib/instance-ranker.js +76 -76
  24. package/servers/instance-sizer/lib/model-resolver.js +61 -61
  25. package/servers/instance-sizer/lib/quota-resolver.js +113 -113
  26. package/servers/instance-sizer/lib/vram-estimator.js +31 -31
  27. package/servers/lib/bedrock-client.js +38 -38
  28. package/servers/lib/catalogs/model-servers.json +201 -3
  29. package/servers/lib/custom-validators.js +13 -13
  30. package/servers/lib/dynamic-resolver.js +4 -4
  31. package/servers/marketplace-picker/index.js +342 -0
  32. package/servers/marketplace-picker/manifest.json +14 -0
  33. package/servers/marketplace-picker/package.json +18 -0
  34. package/servers/model-picker/index.js +382 -382
  35. package/servers/region-picker/index.js +56 -56
  36. package/servers/workload-picker/LICENSE +202 -0
  37. package/servers/workload-picker/catalogs/workload-profiles.json +67 -0
  38. package/servers/workload-picker/index.js +171 -0
  39. package/servers/workload-picker/manifest.json +16 -0
  40. package/servers/workload-picker/package.json +16 -0
  41. package/src/app.js +4 -2
  42. package/src/lib/bootstrap-command-handler.js +579 -14
  43. package/src/lib/bootstrap-config.js +36 -0
  44. package/src/lib/bootstrap-profile-manager.js +48 -41
  45. package/src/lib/ci-register-helpers.js +74 -0
  46. package/src/lib/config-loader.js +3 -0
  47. package/src/lib/config-manager.js +7 -0
  48. package/src/lib/cuda-resolver.js +17 -8
  49. package/src/lib/generated/cli-options.js +315 -315
  50. package/src/lib/generated/parameter-matrix.js +661 -661
  51. package/src/lib/generated/validation-rules.js +71 -71
  52. package/src/lib/path-prover-brain.js +607 -0
  53. package/src/lib/prompts/project-prompts.js +12 -0
  54. package/src/lib/template-variable-resolver.js +25 -1
  55. package/src/lib/tune-catalog-validator.js +37 -4
  56. package/templates/Dockerfile +9 -0
  57. package/templates/code/adapter_sidecar.py +444 -0
  58. package/templates/code/serve +6 -0
  59. package/templates/code/serve.d/vllm.ejs +1 -1
  60. package/templates/do/.benchmark_writer.py +1476 -0
  61. package/templates/do/.tune_helper.py +982 -57
  62. package/templates/do/__pycache__/.benchmark_writer.cpython-312.pyc +0 -0
  63. package/templates/do/adapter +149 -0
  64. package/templates/do/benchmark +639 -85
  65. package/templates/do/config +108 -5
  66. package/templates/do/deploy.d/managed-inference.ejs +192 -11
  67. package/templates/do/optimize +106 -37
  68. package/templates/do/register +89 -0
  69. package/templates/do/test +13 -0
  70. package/templates/do/tune +378 -59
  71. package/templates/do/validate +44 -4
@@ -62,6 +62,7 @@ export default class BootstrapCommandHandler {
62
62
  _handleScan() { return this.profileManager._handleScan(); }
63
63
  _handlePrune() { return this.profileManager._handlePrune(); }
64
64
  _handleSyncSchemas() { return this.profileManager._handleSyncSchemas(); }
65
+ _handleSyncModelFamilies() { return this.profileManager._handleSyncModelFamilies(); }
65
66
 
66
67
  /**
67
68
  * Dispatch bootstrap subcommands.
@@ -69,8 +70,28 @@ export default class BootstrapCommandHandler {
69
70
  * @param {object} options - Parsed CLI options
70
71
  */
71
72
  async handle(args, options) {
73
+ // Commander.js with passThroughOptions() captures flags after positional
74
+ // arguments in args rather than options. Extract known flags from args.
75
+ const extractedOptions = { ...options };
76
+ const cleanArgs = [];
77
+ for (const arg of args) {
78
+ if (arg === '--ci') extractedOptions.ci = true;
79
+ else if (arg === '--benchmark-infra') extractedOptions.benchmarkInfra = true;
80
+ else if (arg === '--skip-ci') extractedOptions.skipCi = true;
81
+ else if (arg === '--skip-s3') extractedOptions.skipS3 = true;
82
+ else if (arg === '--skip-post-setup') extractedOptions.skipPostSetup = true;
83
+ else if (arg === '--force') extractedOptions.force = true;
84
+ else if (arg === '--verify') extractedOptions.verify = true;
85
+ else if (arg === '--delete-stack') extractedOptions.deleteStack = true;
86
+ else if (arg === '--non-interactive') extractedOptions.nonInteractive = true;
87
+ else if (arg === '--ignore-staleness') extractedOptions.ignoreStaleness = true;
88
+ else cleanArgs.push(arg);
89
+ }
90
+ args = cleanArgs;
91
+ options = extractedOptions;
92
+
72
93
  // Handle legacy --sync-schemas flag for backward compatibility
73
- if (options['sync-schemas']) {
94
+ if ((options['sync-schemas'] || options.syncSchemas)) {
74
95
  await this._handleSyncSchemas();
75
96
  if (args.length === 0) return;
76
97
  }
@@ -107,6 +128,15 @@ export default class BootstrapCommandHandler {
107
128
  case 'sync-schemas':
108
129
  await this._handleSyncSchemas();
109
130
  break;
131
+ case 'sync-model-families':
132
+ await this._handleSyncModelFamilies();
133
+ break;
134
+ // Migration path: upgrades legacy profiles to current naming conventions.
135
+ // Corrects stackName to mlcc-bootstrap-{profileName}, renames sharedStackFrom
136
+ // to sharedInfraFrom. Idempotent — safe to run multiple times.
137
+ case 'migrate':
138
+ await this._handleMigrate();
139
+ break;
110
140
  default:
111
141
  console.log(`Unknown bootstrap subcommand: ${subcommand}`);
112
142
  this._showHelp();
@@ -119,7 +149,8 @@ export default class BootstrapCommandHandler {
119
149
  * @param {object} options - Parsed CLI options
120
150
  */
121
151
  async _handleInteractiveSetup(options) {
122
- const nonInteractive = options['non-interactive'];
152
+ // Commander.js converts --non-interactive to options.nonInteractive (camelCase)
153
+ const nonInteractive = options['non-interactive'] || options.nonInteractive;
123
154
 
124
155
  // Non-interactive mode: validate required flags upfront
125
156
  if (nonInteractive) {
@@ -180,13 +211,13 @@ export default class BootstrapCommandHandler {
180
211
 
181
212
  // Step 3: Determine stack parameters
182
213
  let useExistingRoleArn = '';
183
- if (nonInteractive && options['role-arn']) {
184
- useExistingRoleArn = options['role-arn'];
185
- console.log(` Using provided IAM role ARN: ${options['role-arn']}`);
214
+ if (nonInteractive && (options['role-arn'] || options.roleArn)) {
215
+ useExistingRoleArn = (options['role-arn'] || options.roleArn);
216
+ console.log(` Using provided IAM role ARN: ${(options['role-arn'] || options.roleArn)}`);
186
217
  }
187
218
 
188
219
  let createS3Buckets = false;
189
- if (nonInteractive && options['skip-s3']) {
220
+ if (nonInteractive && (options['skip-s3'] || options.skipS3)) {
190
221
  console.log(' ⏭️ Skipping S3 bucket creation (--skip-s3)');
191
222
  } else if (nonInteractive) {
192
223
  createS3Buckets = true;
@@ -231,7 +262,8 @@ export default class BootstrapCommandHandler {
231
262
 
232
263
  profileData.roleArn = stackOutputs.RoleArn;
233
264
  profileData.ecrRepositoryName = stackOutputs.EcrRepositoryName;
234
- profileData.stackName = otherStack;
265
+ profileData.stackName = stackName;
266
+ profileData.sharedInfraFrom = otherStack; // Track that this profile reuses another's stack
235
267
  if (stackOutputs.AsyncS3BucketName) profileData.asyncS3Bucket = stackOutputs.AsyncS3BucketName;
236
268
  if (stackOutputs.BatchS3BucketName) profileData.batchS3Bucket = stackOutputs.BatchS3BucketName;
237
269
  if (stackOutputs.AdapterS3BucketName) profileData.adapterS3Bucket = stackOutputs.AdapterS3BucketName;
@@ -245,15 +277,45 @@ export default class BootstrapCommandHandler {
245
277
  }
246
278
 
247
279
  if (!profileData.stackName) {
280
+ // Pre-check: if IAM role already exists globally (from another region's deployment),
281
+ // pass its ARN so CloudFormation skips re-creation (account-level singleton)
282
+ if (!useExistingRoleArn) {
283
+ try {
284
+ const roleResult = this._execAws(
285
+ 'iam get-role --role-name mlcc-sagemaker-execution-role',
286
+ awsProfile
287
+ );
288
+ const roleArn = roleResult && roleResult.Role && roleResult.Role.Arn;
289
+ if (roleArn && roleArn.startsWith('arn:aws:iam::')) {
290
+ useExistingRoleArn = roleArn;
291
+ console.log(` ℹ️ Reusing existing IAM role: ${roleArn}`);
292
+ }
293
+ } catch (_) {
294
+ // Role doesn't exist yet — will be created by the stack
295
+ }
296
+ }
297
+
248
298
  try {
299
+ // Check if ECR repo already exists (avoid ResourceExistenceCheck failure)
300
+ let skipEcr = 'false';
301
+ try {
302
+ this._execAws(
303
+ `ecr describe-repositories --repository-names ml-container-creator --region ${region}`,
304
+ awsProfile
305
+ );
306
+ skipEcr = 'true';
307
+ console.log(' ℹ️ ECR repository already exists — skipping creation');
308
+ } catch (_) { /* doesn't exist — will be created */ }
309
+
249
310
  const stackOutputs = this._deployStack(stackName, {
250
311
  CreateS3Buckets: createS3Buckets ? 'true' : 'false',
251
- UseExistingRoleArn: useExistingRoleArn
312
+ UseExistingRoleArn: useExistingRoleArn,
313
+ SkipEcrCreation: skipEcr
252
314
  }, awsProfile, region);
253
315
 
254
316
  // Read outputs into profile data
255
317
  profileData.roleArn = stackOutputs.RoleArn;
256
- profileData.ecrRepositoryName = stackOutputs.EcrRepositoryName;
318
+ profileData.ecrRepositoryName = stackOutputs.EcrRepositoryName || 'ml-container-creator';
257
319
  profileData.stackName = stackName;
258
320
 
259
321
  if (stackOutputs.AsyncS3BucketName) {
@@ -278,6 +340,23 @@ export default class BootstrapCommandHandler {
278
340
  }
279
341
  } // end if (!profileData.stackName)
280
342
 
343
+ // Step 4b: MLflow App for model customization experiment tracking
344
+ this._displayProgress('📊', 'MLflow App for experiment tracking...');
345
+ try {
346
+ if (!profileData.mlflowAppArn) {
347
+ const mlflowAppArn = this._ensureMlflowApp(profileData, awsProfile);
348
+ if (mlflowAppArn) {
349
+ profileData.mlflowAppArn = mlflowAppArn;
350
+ console.log(` ✅ MLflow App ready: ${mlflowAppArn}`);
351
+ }
352
+ } else {
353
+ console.log(` ✅ MLflow App already configured: ${profileData.mlflowAppArn}`);
354
+ }
355
+ } catch (error) {
356
+ console.log(` ⚠️ MLflow App setup skipped: ${error.message}`);
357
+ console.log(' Tune jobs will still work but experiment tracking may not be available.');
358
+ }
359
+
281
360
  // Step 5: CI Infrastructure setup (separate CDK stack — unchanged)
282
361
  this._displayProgress('🧪', 'CI Testing Infrastructure...');
283
362
  try {
@@ -286,7 +365,7 @@ export default class BootstrapCommandHandler {
286
365
  if (nonInteractive) {
287
366
  if (options.ci) {
288
367
  provisionCi = true;
289
- } else if (options['skip-ci']) {
368
+ } else if ((options['skip-ci'] || options.skipCi)) {
290
369
  console.log(' ⏭️ Skipping CI infrastructure (--skip-ci)');
291
370
  provisionCi = false;
292
371
  } else {
@@ -303,6 +382,21 @@ export default class BootstrapCommandHandler {
303
382
  }
304
383
 
305
384
  if (provisionCi) {
385
+ // --- CI single-region enforcement ---
386
+ const ciConflict = this._findExistingCiProfile(profileName);
387
+ if (ciConflict) {
388
+ console.log(`❌ CI infrastructure already deployed in region ${ciConflict.config.awsRegion} (profile: ${ciConflict.name}).`);
389
+ console.log(' CI can only be deployed in one region per account.');
390
+ provisionCi = false;
391
+ }
392
+ }
393
+
394
+ if (provisionCi) {
395
+ // Persist CI intent immediately so that `bootstrap update --ci` can
396
+ // retry if the CDK deploy fails. Don't wait for success.
397
+ profileData.ciInfraProvisioned = true;
398
+ profileData.ciTableName = profileData.ciTableName || 'mlcc-ci-table';
399
+
306
400
  // Ensure CDK is bootstrapped in this account/region
307
401
  const cdkBootstrapped = this._resourceExists(
308
402
  `ssm get-parameter --name /cdk-bootstrap/hnb659fds/version --region ${profileData.awsRegion}`,
@@ -358,14 +452,25 @@ export default class BootstrapCommandHandler {
358
452
  stdio: ['pipe', 'pipe', 'pipe']
359
453
  });
360
454
 
455
+ // Warn if shell AWS_REGION differs from profile region
456
+ if (process.env.AWS_REGION && process.env.AWS_REGION !== profileData.awsRegion) {
457
+ console.log(` ⚠️ AWS_REGION env var (${process.env.AWS_REGION}) differs from profile region (${profileData.awsRegion}) — using profile region`);
458
+ }
459
+
460
+ // --no-rollback prevents rollback on AlreadyExists errors for IAM roles
461
+ // that may pre-exist from a prior deployment or another region.
462
+ const cdkDeployCmd = options.benchmarkInfra
463
+ ? 'npx cdk deploy MlccCiHarnessStack --require-approval never --no-rollback --parameters MlccCiHarnessStack:CreateBenchmarkInfra=true'
464
+ : 'npx cdk deploy MlccCiHarnessStack --require-approval never --no-rollback';
361
465
  execSync(
362
- 'npx cdk deploy MlccCiHarnessStack --require-approval never',
466
+ cdkDeployCmd,
363
467
  {
364
468
  cwd: ciHarnessDir,
365
469
  encoding: 'utf8',
366
470
  stdio: 'inherit',
367
471
  env: {
368
472
  ...process.env,
473
+ AWS_REGION: profileData.awsRegion,
369
474
  CDK_DEFAULT_REGION: profileData.awsRegion,
370
475
  CDK_DEFAULT_ACCOUNT: profileData.accountId,
371
476
  AWS_PROFILE: profileData.awsProfile
@@ -376,6 +481,11 @@ export default class BootstrapCommandHandler {
376
481
 
377
482
  profileData.ciInfraProvisioned = true;
378
483
  profileData.ciTableName = 'mlcc-ci-table';
484
+ if (options.benchmarkInfra) {
485
+ profileData.benchmarkInfraProvisioned = true;
486
+ profileData.ciGlueDatabase = 'mlcc_ci';
487
+ profileData.ciBenchmarkResultsBucket = `mlcc-benchmark-results-${profileData.accountId}-${profileData.awsRegion}`;
488
+ }
379
489
  }
380
490
  }
381
491
  } catch (error) {
@@ -413,14 +523,80 @@ export default class BootstrapCommandHandler {
413
523
  console.log(` Region: ${profileConfig.awsRegion}`);
414
524
  console.log(` Account: ${profileConfig.accountId}`);
415
525
 
526
+ // --- SANITY CHECK 1: Account identity ---
527
+ const callerAccount = this._getCallerAccount(profileConfig.awsProfile);
528
+ if (callerAccount !== profileConfig.accountId) {
529
+ console.log(`❌ Account mismatch: profile expects ${profileConfig.accountId} but credentials resolve to ${callerAccount}`);
530
+ return;
531
+ }
532
+
416
533
  // Re-deploy the CloudFormation bootstrap stack
417
534
  const stackName = profileConfig.stackName || `${STACK_NAME_PREFIX}-${name}`;
535
+
536
+ // Sanity check: stack name consistency (warn-and-continue)
537
+ const expectedStackName = `${STACK_NAME_PREFIX}-${name}`;
538
+ if (profileConfig.stackName && profileConfig.stackName !== expectedStackName) {
539
+ console.log(`⚠️ Stack name mismatch: expected "${expectedStackName}" but profile has "${profileConfig.stackName}"`);
540
+ console.log(' Run `ml-container-creator bootstrap migrate` to fix.');
541
+ console.log(' Proceeding with stored stack name...');
542
+ }
543
+
544
+ // --- SANITY CHECK 3: Stack exists in target region ---
545
+ const stackExists = this._resourceExists(
546
+ `cloudformation describe-stacks --stack-name ${stackName} --region ${profileConfig.awsRegion}`,
547
+ profileConfig.awsProfile
548
+ );
549
+ if (!stackExists) {
550
+ console.log(`❌ Stack "${stackName}" not found in ${profileConfig.awsRegion}.`);
551
+ console.log(' Run `ml-container-creator bootstrap` to create it.');
552
+ return;
553
+ }
554
+
555
+ // --- CI single-region enforcement ---
556
+ if (options.ci) {
557
+ const ciConflict = this._findExistingCiProfile(name);
558
+ if (ciConflict) {
559
+ console.log(`❌ CI infrastructure already deployed in region ${ciConflict.config.awsRegion} (profile: ${ciConflict.name}).`);
560
+ console.log(' CI can only be deployed in one region per account.');
561
+ return;
562
+ }
563
+ }
564
+
418
565
  this._displayProgress('☁️', 'Updating bootstrap stack...');
419
566
 
567
+ // Pre-check: if IAM role already exists globally (from another region's deployment),
568
+ // pass its ARN so CloudFormation skips re-creation (account-level singleton)
569
+ let useExistingRoleArn = profileConfig.roleArn || '';
570
+ if (!useExistingRoleArn) {
571
+ try {
572
+ const roleResult = this._execAws(
573
+ 'iam get-role --role-name mlcc-sagemaker-execution-role',
574
+ profileConfig.awsProfile
575
+ );
576
+ const roleArn = roleResult && roleResult.Role && roleResult.Role.Arn;
577
+ if (roleArn && roleArn.startsWith('arn:aws:iam::')) {
578
+ useExistingRoleArn = roleArn;
579
+ }
580
+ } catch (_) {
581
+ // Role doesn't exist yet — will be created by the stack
582
+ }
583
+ }
584
+
420
585
  try {
586
+ // Check if ECR repo already exists (avoid ResourceExistenceCheck failure)
587
+ let skipEcr = 'false';
588
+ try {
589
+ this._execAws(
590
+ `ecr describe-repositories --repository-names ml-container-creator --region ${profileConfig.awsRegion}`,
591
+ profileConfig.awsProfile
592
+ );
593
+ skipEcr = 'true';
594
+ } catch (_) { /* doesn't exist */ }
595
+
421
596
  const stackOutputs = this._deployStack(stackName, {
422
597
  CreateS3Buckets: (profileConfig.asyncS3Bucket || profileConfig.batchS3Bucket) ? 'true' : 'false',
423
- UseExistingRoleArn: ''
598
+ UseExistingRoleArn: useExistingRoleArn,
599
+ SkipEcrCreation: skipEcr
424
600
  }, profileConfig.awsProfile, profileConfig.awsRegion);
425
601
 
426
602
  // Update profile with any new outputs
@@ -456,14 +632,20 @@ export default class BootstrapCommandHandler {
456
632
  stdio: ['pipe', 'pipe', 'pipe']
457
633
  });
458
634
 
635
+ // --no-rollback prevents rollback on AlreadyExists errors for IAM roles
636
+ // that may pre-exist from a prior deployment or another region.
637
+ const updateCdkCmd = (options.benchmarkInfra || profileConfig.benchmarkInfraProvisioned)
638
+ ? 'npx cdk deploy MlccCiHarnessStack --require-approval never --no-rollback --parameters MlccCiHarnessStack:CreateBenchmarkInfra=true'
639
+ : 'npx cdk deploy MlccCiHarnessStack --require-approval never --no-rollback';
459
640
  execSync(
460
- 'npx cdk deploy MlccCiHarnessStack --require-approval never',
641
+ updateCdkCmd,
461
642
  {
462
643
  cwd: ciHarnessDir,
463
644
  encoding: 'utf8',
464
645
  stdio: 'inherit',
465
646
  env: {
466
647
  ...process.env,
648
+ AWS_REGION: profileConfig.awsRegion,
467
649
  CDK_DEFAULT_REGION: profileConfig.awsRegion,
468
650
  CDK_DEFAULT_ACCOUNT: profileConfig.accountId,
469
651
  AWS_PROFILE: profileConfig.awsProfile
@@ -471,6 +653,8 @@ export default class BootstrapCommandHandler {
471
653
  }
472
654
  );
473
655
  profileConfig.ciInfraProvisioned = true;
656
+ profileConfig.ciGlueDatabase = profileConfig.ciGlueDatabase || 'mlcc_ci';
657
+ profileConfig.ciBenchmarkResultsBucket = profileConfig.ciBenchmarkResultsBucket || `mlcc-benchmark-results-${profileConfig.accountId}-${profileConfig.awsRegion}`;
474
658
  console.log(' ✅ CI harness stack updated');
475
659
  }
476
660
  } catch (error) {
@@ -480,6 +664,18 @@ export default class BootstrapCommandHandler {
480
664
  console.log(' ⏭️ CI stack skipped (not provisioned — use --ci to force)');
481
665
  }
482
666
 
667
+ // Ensure MLflow App exists
668
+ this._displayProgress('📊', 'MLflow App for experiment tracking...');
669
+ try {
670
+ const mlflowAppArn = this._ensureMlflowApp(profileConfig, profileConfig.awsProfile);
671
+ if (mlflowAppArn) {
672
+ profileConfig.mlflowAppArn = mlflowAppArn;
673
+ console.log(` ✅ MLflow App ready: ${mlflowAppArn}`);
674
+ }
675
+ } catch (error) {
676
+ console.log(` ⚠️ MLflow App setup skipped: ${error.message}`);
677
+ }
678
+
483
679
  // Save updated profile
484
680
  this.config.setProfile(name, profileConfig);
485
681
  console.log(`\n✅ Update complete for profile "${name}"`);
@@ -488,6 +684,82 @@ export default class BootstrapCommandHandler {
488
684
  await this._runPostSetupChain(options);
489
685
  }
490
686
 
687
+ /**
688
+ * Migrate legacy profiles to current naming conventions.
689
+ * Corrects stackName mismatches and renames sharedStackFrom → sharedInfraFrom.
690
+ * Displays a preview of all changes and requires confirmation before writing.
691
+ */
692
+ async _handleMigrate() {
693
+ const config = this.config.read();
694
+ if (!config || !config.profiles) {
695
+ console.log('No profiles to migrate.');
696
+ return;
697
+ }
698
+
699
+ const changes = [];
700
+
701
+ for (const [name, profileConfig] of Object.entries(config.profiles)) {
702
+ const expected = `${STACK_NAME_PREFIX}-${name}`;
703
+
704
+ // Fix stackName mismatch
705
+ if (profileConfig.stackName && profileConfig.stackName !== expected) {
706
+ changes.push({
707
+ profile: name,
708
+ field: 'stackName',
709
+ from: profileConfig.stackName,
710
+ to: expected
711
+ });
712
+ }
713
+
714
+ // Rename sharedStackFrom → sharedInfraFrom
715
+ if (profileConfig.sharedStackFrom) {
716
+ changes.push({
717
+ profile: name,
718
+ field: 'sharedStackFrom → sharedInfraFrom',
719
+ from: profileConfig.sharedStackFrom,
720
+ to: profileConfig.sharedStackFrom
721
+ });
722
+ }
723
+ }
724
+
725
+ if (changes.length === 0) {
726
+ console.log('✅ All profiles already use current naming conventions.');
727
+ return;
728
+ }
729
+
730
+ // Display preview
731
+ console.log('📋 Migration Preview:\n');
732
+ for (const change of changes) {
733
+ console.log(` Profile "${change.profile}":`);
734
+ console.log(` ${change.field}: "${change.from}" → "${change.to}"`);
735
+ }
736
+
737
+ // Prompt for confirmation
738
+ const { confirm } = await this._promptFn([{
739
+ type: 'confirm',
740
+ name: 'confirm',
741
+ message: 'Apply these changes?',
742
+ default: true
743
+ }]);
744
+
745
+ if (!confirm) return;
746
+
747
+ // Apply changes
748
+ for (const [name, profileConfig] of Object.entries(config.profiles)) {
749
+ const expected = `${STACK_NAME_PREFIX}-${name}`;
750
+ if (profileConfig.stackName !== expected) {
751
+ profileConfig.stackName = expected;
752
+ }
753
+ if (profileConfig.sharedStackFrom) {
754
+ profileConfig.sharedInfraFrom = profileConfig.sharedStackFrom;
755
+ delete profileConfig.sharedStackFrom;
756
+ }
757
+ }
758
+
759
+ this.config.write(config);
760
+ console.log('✅ Migration complete.');
761
+ }
762
+
491
763
  /**
492
764
  * Run the post-setup chain: mcp init → registry sync-architectures → sync-schemas.
493
765
  * Each step is independent — failures are collected and reported at the end.
@@ -495,7 +767,7 @@ export default class BootstrapCommandHandler {
495
767
  * @param {object} options - Parsed CLI options (checks skipPostSetup)
496
768
  */
497
769
  async _runPostSetupChain(options = {}) {
498
- if (options['skip-post-setup']) {
770
+ if ((options['skip-post-setup'] || options.skipPostSetup)) {
499
771
  console.log('\n⏭️ Skipping post-setup chain (--skip-post-setup)');
500
772
  return;
501
773
  }
@@ -729,6 +1001,12 @@ export default class BootstrapCommandHandler {
729
1001
  /**
730
1002
  * Deploy the bootstrap CloudFormation stack and return its outputs.
731
1003
  *
1004
+ * Before deploying, checks for pre-existing S3 buckets that would cause
1005
+ * ResourceExistenceCheck failures. If the stack is in REVIEW_IN_PROGRESS
1006
+ * state (empty shell from a failed prior attempt), deletes it first.
1007
+ * If buckets exist but aren't managed by the stack, uses a CloudFormation
1008
+ * import changeset to adopt them before proceeding with the normal deploy.
1009
+ *
732
1010
  * Uses `aws cloudformation deploy` which is idempotent — it creates the
733
1011
  * stack on first run and updates it on subsequent runs. If the template
734
1012
  * hasn't changed, it exits with "No changes to deploy" which we handle
@@ -741,6 +1019,9 @@ export default class BootstrapCommandHandler {
741
1019
  * @returns {object} Map of output key → output value
742
1020
  */
743
1021
  _deployStack(stackName, parameters, profile, region) {
1022
+ // Handle ghost stacks and pre-existing resources
1023
+ this._resolveStackConflicts(stackName, parameters, profile, region);
1024
+
744
1025
  // Build parameter overrides string
745
1026
  const paramOverrides = Object.entries(parameters)
746
1027
  .map(([key, value]) => `${key}=${value}`)
@@ -764,6 +1045,32 @@ export default class BootstrapCommandHandler {
764
1045
  const stderr = error.stderr || error.message || '';
765
1046
  if (stderr.includes('No changes to deploy')) {
766
1047
  console.log(' ℹ️ Stack is up to date — no changes needed');
1048
+ } else if (stderr.includes('ResourceExistenceCheck')) {
1049
+ // Resources already exist outside the stack — attempt import and retry
1050
+ console.log(' ⚠️ Pre-existing resources detected — attempting import...');
1051
+ this._resolveStackConflicts(stackName, parameters, profile, region);
1052
+ // Rebuild deploy command with updated parameters (e.g., CreateS3Buckets may now be 'false')
1053
+ const retryParamOverrides = Object.entries(parameters)
1054
+ .map(([key, value]) => `${key}=${value}`)
1055
+ .join(' ');
1056
+ const retryDeployCommand = [
1057
+ 'aws cloudformation deploy',
1058
+ `--template-file ${STACK_TEMPLATE_PATH}`,
1059
+ `--stack-name ${stackName}`,
1060
+ '--capabilities CAPABILITY_NAMED_IAM',
1061
+ `--parameter-overrides ${retryParamOverrides}`,
1062
+ `--profile ${profile}`,
1063
+ `--region ${region}`
1064
+ ].join(' ');
1065
+ // Retry the deploy after import
1066
+ try {
1067
+ execSync(retryDeployCommand, { encoding: 'utf8', stdio: ['pipe', 'pipe', 'pipe'] });
1068
+ } catch (retryError) {
1069
+ const retryStderr = retryError.stderr || retryError.message || '';
1070
+ if (!retryStderr.includes('No changes to deploy')) {
1071
+ throw retryError;
1072
+ }
1073
+ }
767
1074
  } else {
768
1075
  throw error;
769
1076
  }
@@ -785,9 +1092,144 @@ export default class BootstrapCommandHandler {
785
1092
  outputs[output.OutputKey] = output.OutputValue;
786
1093
  }
787
1094
 
1095
+ // If S3 buckets already existed (skipped creation), inject their names
1096
+ // into outputs so the profile config gets populated correctly.
1097
+ if (this._preExistingBuckets && this._preExistingBuckets.length > 0) {
1098
+ const bucketOutputMap = {
1099
+ 'AsyncS3Bucket': 'AsyncS3BucketName',
1100
+ 'BatchS3Bucket': 'BatchS3BucketName',
1101
+ 'AdapterS3Bucket': 'AdapterS3BucketName',
1102
+ 'BenchmarkS3Bucket': 'BenchmarkS3BucketName',
1103
+ 'TuneS3Bucket': 'TuneS3BucketName'
1104
+ };
1105
+ for (const bucket of this._preExistingBuckets) {
1106
+ const outputKey = bucketOutputMap[bucket.logicalId];
1107
+ if (outputKey && !outputs[outputKey]) {
1108
+ outputs[outputKey] = bucket.name;
1109
+ }
1110
+ }
1111
+ this._preExistingBuckets = null;
1112
+ }
1113
+
788
1114
  return outputs;
789
1115
  }
790
1116
 
1117
+ /**
1118
+ * Resolve stack conflicts before deploying.
1119
+ *
1120
+ * Handles two scenarios that cause ResourceExistenceCheck failures:
1121
+ * 1. Ghost stacks (REVIEW_IN_PROGRESS) — delete them first
1122
+ * 2. Pre-existing S3 buckets not managed by the stack — import them
1123
+ *
1124
+ * @param {string} stackName - CloudFormation stack name
1125
+ * @param {object} parameters - Stack parameter key-value pairs
1126
+ * @param {string} profile - AWS CLI profile name
1127
+ * @param {string} region - AWS region
1128
+ */
1129
+ _resolveStackConflicts(stackName, parameters, profile, region) {
1130
+ // Check if stack exists and its status
1131
+ let stackStatus = null;
1132
+ let managedResources = [];
1133
+
1134
+ try {
1135
+ const describeResult = this._execAws(
1136
+ `cloudformation describe-stacks --stack-name ${stackName} --region ${region}`,
1137
+ profile
1138
+ );
1139
+ const stack = describeResult.Stacks && describeResult.Stacks[0];
1140
+ if (stack) {
1141
+ stackStatus = stack.StackStatus;
1142
+ }
1143
+ } catch (_) {
1144
+ // Stack doesn't exist — no conflicts possible
1145
+ return;
1146
+ }
1147
+
1148
+ // Handle ghost stacks (created but never successfully deployed)
1149
+ if (stackStatus === 'REVIEW_IN_PROGRESS') {
1150
+ console.log(' ⚠️ Found ghost stack (REVIEW_IN_PROGRESS) — deleting before redeploy...');
1151
+ try {
1152
+ execSync(
1153
+ `aws cloudformation delete-stack --stack-name ${stackName} --profile ${profile} --region ${region}`,
1154
+ { encoding: 'utf8', stdio: ['pipe', 'pipe', 'pipe'] }
1155
+ );
1156
+ execSync(
1157
+ `aws cloudformation wait stack-delete-complete --stack-name ${stackName} --profile ${profile} --region ${region}`,
1158
+ { encoding: 'utf8', stdio: ['pipe', 'pipe', 'pipe'], timeout: 60000 }
1159
+ );
1160
+ console.log(' ✅ Ghost stack deleted');
1161
+ } catch (err) {
1162
+ console.log(` ⚠️ Could not delete ghost stack: ${err.message || err}`);
1163
+ }
1164
+ // Don't return — fall through to check for pre-existing S3 buckets
1165
+ // that need to be imported on the fresh deploy. The ghost stack had
1166
+ // DeletionPolicy:Retain buckets that survive stack deletion.
1167
+ stackStatus = null;
1168
+ managedResources = [];
1169
+ }
1170
+
1171
+ // For active stacks (or post-ghost-deletion), check if S3 buckets exist but aren't managed
1172
+ if (parameters.CreateS3Buckets !== 'true') {
1173
+ return; // Not creating buckets — no conflict
1174
+ }
1175
+
1176
+ // Get list of resources currently managed by the stack (empty if stack was just deleted)
1177
+ if (stackStatus) {
1178
+ try {
1179
+ const resources = this._execAws(
1180
+ `cloudformation list-stack-resources --stack-name ${stackName} --region ${region}`,
1181
+ profile
1182
+ );
1183
+ managedResources = (resources.StackResourceSummaries || [])
1184
+ .map(r => r.LogicalResourceId);
1185
+ } catch (_) {
1186
+ // Stack doesn't exist or can't be queried — proceed with empty managedResources
1187
+ }
1188
+ }
1189
+
1190
+ // Check each S3 bucket that the template would create
1191
+ const accountId = this._currentAccountId;
1192
+ const bucketConfigs = [
1193
+ { logicalId: 'AsyncS3Bucket', name: `mlcc-async-${accountId}-${region}` },
1194
+ { logicalId: 'BatchS3Bucket', name: `mlcc-batch-${accountId}-${region}` },
1195
+ { logicalId: 'AdapterS3Bucket', name: `mlcc-adapters-${accountId}-${region}` },
1196
+ { logicalId: 'BenchmarkS3Bucket', name: `mlcc-benchmark-${accountId}-${region}` },
1197
+ { logicalId: 'TuneS3Bucket', name: `mlcc-tune-${accountId}-${region}` }
1198
+ ];
1199
+
1200
+ const bucketsToImport = [];
1201
+
1202
+ for (const bucket of bucketConfigs) {
1203
+ if (managedResources.includes(bucket.logicalId)) {
1204
+ continue; // Already managed by the stack — no conflict
1205
+ }
1206
+ // Check if bucket exists in AWS
1207
+ try {
1208
+ execSync(
1209
+ `aws s3api head-bucket --bucket ${bucket.name} --profile ${profile} --region ${region}`,
1210
+ { encoding: 'utf8', stdio: ['pipe', 'pipe', 'pipe'] }
1211
+ );
1212
+ // Bucket exists but not in stack — needs import
1213
+ bucketsToImport.push(bucket);
1214
+ } catch (_) {
1215
+ // Bucket doesn't exist — will be created normally
1216
+ }
1217
+ }
1218
+
1219
+ if (bucketsToImport.length > 0) {
1220
+ console.log(` ℹ️ ${bucketsToImport.length} pre-existing S3 bucket(s) detected — skipping S3 creation (buckets already exist)`);
1221
+
1222
+ // Pre-existing S3 buckets survive stack deletion (DeletionPolicy: Retain).
1223
+ // Rather than fighting CloudFormation's IMPORT limitations, just skip S3
1224
+ // creation and wire the existing bucket names into the profile config directly.
1225
+ // The naming convention is deterministic, so we know exactly what they are.
1226
+ this._preExistingBuckets = bucketsToImport;
1227
+
1228
+ // Modify the parameters to skip S3 bucket creation in the deploy
1229
+ parameters.CreateS3Buckets = 'false';
1230
+ }
1231
+ }
1232
+
791
1233
  /**
792
1234
  * Write a JSON object to a temp file and return the `file://` path.
793
1235
  * Used for passing complex JSON to AWS CLI commands without shell escaping issues.
@@ -821,6 +1263,125 @@ export default class BootstrapCommandHandler {
821
1263
  }
822
1264
  }
823
1265
 
1266
+ /**
1267
+ * Get the AWS account ID from the caller's credentials.
1268
+ * Uses `sts get-caller-identity` to resolve the actual account.
1269
+ *
1270
+ * @param {string} awsProfile - AWS CLI profile name
1271
+ * @returns {string} The 12-digit AWS account ID
1272
+ */
1273
+ _getCallerAccount(awsProfile) {
1274
+ const identity = this._execAws('sts get-caller-identity', awsProfile);
1275
+ return identity.Account;
1276
+ }
1277
+
1278
+ /**
1279
+ * Scan all profiles to find one with ciInfraProvisioned=true,
1280
+ * excluding the given profile name.
1281
+ *
1282
+ * @param {string} excludeProfile - Profile name to exclude from the scan
1283
+ * @returns {{ name: string, config: Object }|null} The CI profile, or null if none found
1284
+ */
1285
+ _findExistingCiProfile(excludeProfile) {
1286
+ const config = this.config.read();
1287
+ if (!config || !config.profiles) return null;
1288
+
1289
+ for (const [name, profileConfig] of Object.entries(config.profiles)) {
1290
+ if (name === excludeProfile) continue;
1291
+ if (profileConfig.ciInfraProvisioned) {
1292
+ return { name, config: profileConfig };
1293
+ }
1294
+ }
1295
+ return null;
1296
+ }
1297
+
1298
+ /**
1299
+ * Ensure an MLCC-owned MLflow App exists for experiment tracking.
1300
+ * Creates one if it doesn't exist, using the tune S3 bucket as artifact store.
1301
+ *
1302
+ * @param {object} profileData - Bootstrap profile data (needs roleArn, awsRegion, accountId)
1303
+ * @param {string} awsProfile - AWS CLI profile name
1304
+ * @returns {string} ARN of the existing or newly created MLflow App; the creation error is rethrown on failure
1305
+ */
1306
+ _ensureMlflowApp(profileData, awsProfile) {
1307
+ const region = profileData.awsRegion;
1308
+ const accountId = profileData.accountId;
1309
+ const roleArn = profileData.roleArn;
1310
+ const appName = 'mlcc-tune-tracking';
1311
+ const artifactBucket = `mlcc-tune-${accountId}-${region}`;
1312
+
1313
+ // Check if MLCC app already exists
1314
+ try {
1315
+ const apps = this._execAws(
1316
+ `sagemaker list-mlflow-apps --region ${region}`,
1317
+ awsProfile
1318
+ );
1319
+ const summaries = apps.Summaries || [];
1320
+ const existing = summaries.find(a => a.Name === appName);
1321
+ if (existing) {
1322
+ return existing.Arn;
1323
+ }
1324
+ } catch {
1325
+ // list-mlflow-apps may not be available in all CLI versions — proceed to create
1326
+ }
1327
+
1328
+ // Create the MLflow App
1329
+ console.log(` Creating MLflow App "${appName}" with artifact store s3://${artifactBucket}...`);
1330
+
1331
+ // Ensure the artifact bucket exists (it's the tune bucket from the stack)
1332
+ try {
1333
+ this._execAws(
1334
+ `s3api head-bucket --bucket ${artifactBucket} --region ${region}`,
1335
+ awsProfile
1336
+ );
1337
+ } catch {
1338
+ // Bucket doesn't exist — create it
1339
+ console.log(` Creating artifact bucket: ${artifactBucket}`);
1340
+ try {
1341
+ this._execAws(
1342
+ `s3api create-bucket --bucket ${artifactBucket} --region ${region} --create-bucket-configuration LocationConstraint=${region}`,
1343
+ awsProfile
1344
+ );
1345
+ } catch (bucketErr) {
1346
+ // May already exist or region doesn't need LocationConstraint (us-east-1)
1347
+ if (!bucketErr.message?.includes('BucketAlreadyOwnedByYou')) {
1348
+ try {
1349
+ this._execAws(
1350
+ `s3api create-bucket --bucket ${artifactBucket} --region ${region}`,
1351
+ awsProfile
1352
+ );
1353
+ } catch {
1354
+ // Bucket likely exists, continue
1355
+ }
1356
+ }
1357
+ }
1358
+ }
1359
+
1360
+ // Create the app
1361
+ try {
1362
+ const result = this._execAws(
1363
+ `sagemaker create-mlflow-app --name ${appName} --artifact-store-uri s3://${artifactBucket} --role-arn ${roleArn} --model-registration-mode AutoModelRegistrationEnabled --region ${region}`,
1364
+ awsProfile
1365
+ );
1366
+ return result.Arn;
1367
+ } catch (err) {
1368
+ // If app already exists (race condition), try to describe it
1369
+ if (err.message?.includes('ResourceLimitExceeded') || err.message?.includes('already exists')) {
1370
+ try {
1371
+ const apps = this._execAws(
1372
+ `sagemaker list-mlflow-apps --region ${region}`,
1373
+ awsProfile
1374
+ );
1375
+ const found = (apps.Summaries || []).find(a => a.Name === appName);
1376
+ if (found) return found.Arn;
1377
+ } catch {
1378
+ // Fall through
1379
+ }
1380
+ }
1381
+ throw err;
1382
+ }
1383
+ }
1384
+
824
1385
  /**
825
1386
  * Format tags for the AWS CLI --tags parameter.
826
1387
  * Writes tags to a temp file and returns the file:// reference
@@ -858,6 +1419,8 @@ SUBCOMMANDS:
858
1419
  scan Discover pre-existing MLCC-managed resources in AWS
859
1420
  prune Remove deleted and unknown records from the deployment manifest
860
1421
  update Re-deploy bootstrap stacks using active profile (no prompts)
1422
+ migrate Upgrade legacy profiles to current naming conventions
1423
+ sync-model-families Discover tune-eligible models from JumpStart Hub and update catalog
861
1424
 
862
1425
  SETUP OPTIONS:
863
1426
  --non-interactive Run without interactive prompts
@@ -886,6 +1449,8 @@ EXAMPLES:
886
1449
  ml-container-creator bootstrap remove dev
887
1450
  ml-container-creator bootstrap remove dev --force --delete-stack
888
1451
  ml-container-creator bootstrap scan
1452
+ ml-container-creator bootstrap sync-model-families
1453
+ ml-container-creator bootstrap migrate
889
1454
  ml-container-creator bootstrap --non-interactive --profile my-aws-profile --region us-west-2
890
1455
  ml-container-creator bootstrap --non-interactive --profile my-aws-profile --role-arn arn:aws:iam::123456789012:role/MyRole --skip-s3
891
1456
  ml-container-creator bootstrap --non-interactive --profile my-aws-profile --region us-west-2 --ci