@aws/ml-container-creator 0.10.0 → 0.12.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. package/LICENSE-THIRD-PARTY +9304 -0
  2. package/bin/cli.js +2 -0
  3. package/config/bootstrap-e2e-stack.json +341 -0
  4. package/config/bootstrap-stack.json +40 -3
  5. package/config/parameter-schema-v2.json +33 -22
  6. package/config/tune-catalog.json +1781 -0
  7. package/infra/ci-harness/buildspec.yml +1 -0
  8. package/infra/ci-harness/lambda/path-prover/brain.ts +306 -0
  9. package/infra/ci-harness/lambda/path-prover/write-results.ts +152 -0
  10. package/infra/ci-harness/lib/ci-harness-stack.ts +851 -7
  11. package/infra/ci-harness/state-machines/path-prover.asl.json +496 -0
  12. package/package.json +53 -67
  13. package/servers/base-image-picker/index.js +121 -121
  14. package/servers/e2e-status/index.js +297 -0
  15. package/servers/e2e-status/manifest.json +14 -0
  16. package/servers/e2e-status/package.json +15 -0
  17. package/servers/endpoint-picker/LICENSE +202 -0
  18. package/servers/endpoint-picker/index.js +536 -0
  19. package/servers/endpoint-picker/manifest.json +14 -0
  20. package/servers/endpoint-picker/package.json +18 -0
  21. package/servers/hyperpod-cluster-picker/index.js +125 -125
  22. package/servers/instance-sizer/index.js +166 -153
  23. package/servers/instance-sizer/lib/instance-ranker.js +120 -76
  24. package/servers/instance-sizer/lib/model-resolver.js +61 -61
  25. package/servers/instance-sizer/lib/quota-resolver.js +113 -113
  26. package/servers/instance-sizer/lib/vram-estimator.js +31 -31
  27. package/servers/lib/bedrock-client.js +38 -38
  28. package/servers/lib/catalogs/instances.json +27 -0
  29. package/servers/lib/catalogs/model-servers.json +201 -3
  30. package/servers/lib/custom-validators.js +13 -13
  31. package/servers/lib/dynamic-resolver.js +4 -4
  32. package/servers/marketplace-picker/index.js +342 -0
  33. package/servers/marketplace-picker/manifest.json +14 -0
  34. package/servers/marketplace-picker/package.json +18 -0
  35. package/servers/model-picker/index.js +382 -382
  36. package/servers/region-picker/index.js +56 -56
  37. package/servers/workload-picker/LICENSE +202 -0
  38. package/servers/workload-picker/catalogs/workload-profiles.json +67 -0
  39. package/servers/workload-picker/index.js +171 -0
  40. package/servers/workload-picker/manifest.json +16 -0
  41. package/servers/workload-picker/package.json +16 -0
  42. package/src/app.js +12 -3
  43. package/src/lib/bootstrap-command-handler.js +609 -15
  44. package/src/lib/bootstrap-config.js +36 -0
  45. package/src/lib/bootstrap-profile-manager.js +48 -41
  46. package/src/lib/ci-register-helpers.js +74 -0
  47. package/src/lib/config-loader.js +3 -0
  48. package/src/lib/config-manager.js +7 -0
  49. package/src/lib/config-validator.js +1 -1
  50. package/src/lib/cuda-resolver.js +17 -8
  51. package/src/lib/generated/cli-options.js +319 -314
  52. package/src/lib/generated/parameter-matrix.js +672 -661
  53. package/src/lib/generated/validation-rules.js +76 -72
  54. package/src/lib/path-prover-brain.js +664 -0
  55. package/src/lib/prompts/infrastructure-prompts.js +2 -2
  56. package/src/lib/prompts/model-prompts.js +6 -0
  57. package/src/lib/prompts/project-prompts.js +12 -0
  58. package/src/lib/secrets-prompt-runner.js +4 -0
  59. package/src/lib/template-manager.js +1 -1
  60. package/src/lib/template-variable-resolver.js +87 -1
  61. package/src/lib/tune-catalog-validator.js +37 -4
  62. package/templates/Dockerfile +9 -0
  63. package/templates/code/adapter_sidecar.py +444 -0
  64. package/templates/code/serve +6 -0
  65. package/templates/code/serve.d/vllm.ejs +1 -1
  66. package/templates/do/.benchmark_writer.py +1476 -0
  67. package/templates/do/.tune_helper.py +982 -57
  68. package/templates/do/__pycache__/.benchmark_writer.cpython-312.pyc +0 -0
  69. package/templates/do/adapter +154 -0
  70. package/templates/do/benchmark +639 -85
  71. package/templates/do/build +5 -0
  72. package/templates/do/clean.d/async-inference.ejs +5 -0
  73. package/templates/do/clean.d/batch-transform.ejs +5 -0
  74. package/templates/do/clean.d/hyperpod-eks.ejs +5 -0
  75. package/templates/do/clean.d/managed-inference.ejs +5 -0
  76. package/templates/do/config +115 -45
  77. package/templates/do/deploy.d/async-inference.ejs +30 -3
  78. package/templates/do/deploy.d/batch-transform.ejs +29 -3
  79. package/templates/do/deploy.d/hyperpod-eks.ejs +4 -0
  80. package/templates/do/deploy.d/managed-inference.ejs +216 -14
  81. package/templates/do/lib/endpoint-config.sh +1 -1
  82. package/templates/do/lib/profile.sh +44 -0
  83. package/templates/do/optimize +106 -37
  84. package/templates/do/push +5 -0
  85. package/templates/do/register +94 -0
  86. package/templates/do/stage +567 -0
  87. package/templates/do/submit +7 -0
  88. package/templates/do/test +14 -0
  89. package/templates/do/tune +382 -59
  90. package/templates/do/validate +44 -4
@@ -62,6 +62,7 @@ export default class BootstrapCommandHandler {
62
62
  _handleScan() { return this.profileManager._handleScan(); }
63
63
  _handlePrune() { return this.profileManager._handlePrune(); }
64
64
  _handleSyncSchemas() { return this.profileManager._handleSyncSchemas(); }
65
+ _handleSyncModelFamilies() { return this.profileManager._handleSyncModelFamilies(); }
65
66
 
66
67
  /**
67
68
  * Dispatch bootstrap subcommands.
@@ -69,8 +70,28 @@ export default class BootstrapCommandHandler {
69
70
  * @param {object} options - Parsed CLI options
70
71
  */
71
72
  async handle(args, options) {
73
+ // Commander.js with passThroughOptions() captures flags after positional
74
+ // arguments in args rather than options. Extract known flags from args.
75
+ const extractedOptions = { ...options };
76
+ const cleanArgs = [];
77
+ for (const arg of args) {
78
+ if (arg === '--ci') extractedOptions.ci = true;
79
+ else if (arg === '--benchmark-infra') extractedOptions.benchmarkInfra = true;
80
+ else if (arg === '--skip-ci') extractedOptions.skipCi = true;
81
+ else if (arg === '--skip-s3') extractedOptions.skipS3 = true;
82
+ else if (arg === '--skip-post-setup') extractedOptions.skipPostSetup = true;
83
+ else if (arg === '--force') extractedOptions.force = true;
84
+ else if (arg === '--verify') extractedOptions.verify = true;
85
+ else if (arg === '--delete-stack') extractedOptions.deleteStack = true;
86
+ else if (arg === '--non-interactive') extractedOptions.nonInteractive = true;
87
+ else if (arg === '--ignore-staleness') extractedOptions.ignoreStaleness = true;
88
+ else cleanArgs.push(arg);
89
+ }
90
+ args = cleanArgs;
91
+ options = extractedOptions;
92
+
72
93
  // Handle legacy --sync-schemas flag for backward compatibility
73
- if (options['sync-schemas']) {
94
+ if ((options['sync-schemas'] || options.syncSchemas)) {
74
95
  await this._handleSyncSchemas();
75
96
  if (args.length === 0) return;
76
97
  }
@@ -107,6 +128,15 @@ export default class BootstrapCommandHandler {
107
128
  case 'sync-schemas':
108
129
  await this._handleSyncSchemas();
109
130
  break;
131
+ case 'sync-model-families':
132
+ await this._handleSyncModelFamilies();
133
+ break;
134
+ // Migration path: upgrades legacy profiles to current naming conventions.
135
+ // Corrects stackName to mlcc-bootstrap-{profileName}, renames sharedStackFrom
136
+ // to sharedInfraFrom. Idempotent — safe to run multiple times.
137
+ case 'migrate':
138
+ await this._handleMigrate();
139
+ break;
110
140
  default:
111
141
  console.log(`Unknown bootstrap subcommand: ${subcommand}`);
112
142
  this._showHelp();
@@ -119,7 +149,8 @@ export default class BootstrapCommandHandler {
119
149
  * @param {object} options - Parsed CLI options
120
150
  */
121
151
  async _handleInteractiveSetup(options) {
122
- const nonInteractive = options['non-interactive'];
152
+ // Commander.js converts --non-interactive to options.nonInteractive (camelCase)
153
+ const nonInteractive = options['non-interactive'] || options.nonInteractive;
123
154
 
124
155
  // Non-interactive mode: validate required flags upfront
125
156
  if (nonInteractive) {
@@ -180,13 +211,13 @@ export default class BootstrapCommandHandler {
180
211
 
181
212
  // Step 3: Determine stack parameters
182
213
  let useExistingRoleArn = '';
183
- if (nonInteractive && options['role-arn']) {
184
- useExistingRoleArn = options['role-arn'];
185
- console.log(` Using provided IAM role ARN: ${options['role-arn']}`);
214
+ if (nonInteractive && (options['role-arn'] || options.roleArn)) {
215
+ useExistingRoleArn = (options['role-arn'] || options.roleArn);
216
+ console.log(` Using provided IAM role ARN: ${(options['role-arn'] || options.roleArn)}`);
186
217
  }
187
218
 
188
219
  let createS3Buckets = false;
189
- if (nonInteractive && options['skip-s3']) {
220
+ if (nonInteractive && (options['skip-s3'] || options.skipS3)) {
190
221
  console.log(' ⏭️ Skipping S3 bucket creation (--skip-s3)');
191
222
  } else if (nonInteractive) {
192
223
  createS3Buckets = true;
@@ -231,7 +262,8 @@ export default class BootstrapCommandHandler {
231
262
 
232
263
  profileData.roleArn = stackOutputs.RoleArn;
233
264
  profileData.ecrRepositoryName = stackOutputs.EcrRepositoryName;
234
- profileData.stackName = otherStack;
265
+ profileData.stackName = stackName;
266
+ profileData.sharedInfraFrom = otherStack; // Track that this profile reuses another's stack
235
267
  if (stackOutputs.AsyncS3BucketName) profileData.asyncS3Bucket = stackOutputs.AsyncS3BucketName;
236
268
  if (stackOutputs.BatchS3BucketName) profileData.batchS3Bucket = stackOutputs.BatchS3BucketName;
237
269
  if (stackOutputs.AdapterS3BucketName) profileData.adapterS3Bucket = stackOutputs.AdapterS3BucketName;
@@ -245,15 +277,45 @@ export default class BootstrapCommandHandler {
245
277
  }
246
278
 
247
279
  if (!profileData.stackName) {
280
+ // Pre-check: if IAM role already exists globally (from another region's deployment),
281
+ // pass its ARN so CloudFormation skips re-creation (account-level singleton)
282
+ if (!useExistingRoleArn) {
283
+ try {
284
+ const roleResult = this._execAws(
285
+ 'iam get-role --role-name mlcc-sagemaker-execution-role',
286
+ awsProfile
287
+ );
288
+ const roleArn = roleResult && roleResult.Role && roleResult.Role.Arn;
289
+ if (roleArn && roleArn.startsWith('arn:aws:iam::')) {
290
+ useExistingRoleArn = roleArn;
291
+ console.log(` ℹ️ Reusing existing IAM role: ${roleArn}`);
292
+ }
293
+ } catch (_) {
294
+ // Role doesn't exist yet — will be created by the stack
295
+ }
296
+ }
297
+
248
298
  try {
299
+ // Check if ECR repo already exists (avoid ResourceExistenceCheck failure)
300
+ let skipEcr = 'false';
301
+ try {
302
+ this._execAws(
303
+ `ecr describe-repositories --repository-names ml-container-creator --region ${region}`,
304
+ awsProfile
305
+ );
306
+ skipEcr = 'true';
307
+ console.log(' ℹ️ ECR repository already exists — skipping creation');
308
+ } catch (_) { /* doesn't exist — will be created */ }
309
+
249
310
  const stackOutputs = this._deployStack(stackName, {
250
311
  CreateS3Buckets: createS3Buckets ? 'true' : 'false',
251
- UseExistingRoleArn: useExistingRoleArn
312
+ UseExistingRoleArn: useExistingRoleArn,
313
+ SkipEcrCreation: skipEcr
252
314
  }, awsProfile, region);
253
315
 
254
316
  // Read outputs into profile data
255
317
  profileData.roleArn = stackOutputs.RoleArn;
256
- profileData.ecrRepositoryName = stackOutputs.EcrRepositoryName;
318
+ profileData.ecrRepositoryName = stackOutputs.EcrRepositoryName || 'ml-container-creator';
257
319
  profileData.stackName = stackName;
258
320
 
259
321
  if (stackOutputs.AsyncS3BucketName) {
@@ -278,6 +340,23 @@ export default class BootstrapCommandHandler {
278
340
  }
279
341
  } // end if (!profileData.stackName)
280
342
 
343
+ // Step 4b: MLflow App for model customization experiment tracking
344
+ this._displayProgress('📊', 'MLflow App for experiment tracking...');
345
+ try {
346
+ if (!profileData.mlflowAppArn) {
347
+ const mlflowAppArn = this._ensureMlflowApp(profileData, awsProfile);
348
+ if (mlflowAppArn) {
349
+ profileData.mlflowAppArn = mlflowAppArn;
350
+ console.log(` ✅ MLflow App ready: ${mlflowAppArn}`);
351
+ }
352
+ } else {
353
+ console.log(` ✅ MLflow App already configured: ${profileData.mlflowAppArn}`);
354
+ }
355
+ } catch (error) {
356
+ console.log(` ⚠️ MLflow App setup skipped: ${error.message}`);
357
+ console.log(' Tune jobs will still work but experiment tracking may not be available.');
358
+ }
359
+
281
360
  // Step 5: CI Infrastructure setup (separate CDK stack — unchanged)
282
361
  this._displayProgress('🧪', 'CI Testing Infrastructure...');
283
362
  try {
@@ -286,7 +365,7 @@ export default class BootstrapCommandHandler {
286
365
  if (nonInteractive) {
287
366
  if (options.ci) {
288
367
  provisionCi = true;
289
- } else if (options['skip-ci']) {
368
+ } else if ((options['skip-ci'] || options.skipCi)) {
290
369
  console.log(' ⏭️ Skipping CI infrastructure (--skip-ci)');
291
370
  provisionCi = false;
292
371
  } else {
@@ -303,6 +382,21 @@ export default class BootstrapCommandHandler {
303
382
  }
304
383
 
305
384
  if (provisionCi) {
385
+ // --- CI single-region enforcement ---
386
+ const ciConflict = this._findExistingCiProfile(profileName);
387
+ if (ciConflict) {
388
+ console.log(`❌ CI infrastructure already deployed in region ${ciConflict.config.awsRegion} (profile: ${ciConflict.name}).`);
389
+ console.log(' CI can only be deployed in one region per account.');
390
+ provisionCi = false;
391
+ }
392
+ }
393
+
394
+ if (provisionCi) {
395
+ // Persist CI intent immediately so that `bootstrap update --ci` can
396
+ // retry if the CDK deploy fails. Don't wait for success.
397
+ profileData.ciInfraProvisioned = true;
398
+ profileData.ciTableName = profileData.ciTableName || 'mlcc-ci-table';
399
+
306
400
  // Ensure CDK is bootstrapped in this account/region
307
401
  const cdkBootstrapped = this._resourceExists(
308
402
  `ssm get-parameter --name /cdk-bootstrap/hnb659fds/version --region ${profileData.awsRegion}`,
@@ -358,14 +452,39 @@ export default class BootstrapCommandHandler {
358
452
  stdio: ['pipe', 'pipe', 'pipe']
359
453
  });
360
454
 
455
+ // Warn if shell AWS_REGION differs from profile region
456
+ if (process.env.AWS_REGION && process.env.AWS_REGION !== profileData.awsRegion) {
457
+ console.log(` ⚠️ AWS_REGION env var (${process.env.AWS_REGION}) differs from profile region (${profileData.awsRegion}) — using profile region`);
458
+ }
459
+
460
+ // --no-rollback prevents rollback on AlreadyExists errors for IAM roles
461
+ // that may pre-exist from a prior deployment or another region.
462
+ // Check if benchmark bucket already exists (from a prior torn-down stack with RETAIN policy)
463
+ let importBucketCtx = '';
464
+ if (options.benchmarkInfra) {
465
+ try {
466
+ execSync(
467
+ `aws s3api head-bucket --bucket mlcc-benchmark-results-${profileData.accountId}-${profileData.awsRegion}${profileData.awsProfile ? ` --profile ${profileData.awsProfile}` : ''} --region ${profileData.awsRegion}`,
468
+ { encoding: 'utf8', stdio: ['pipe', 'pipe', 'pipe'] }
469
+ );
470
+ importBucketCtx = ' -c importExistingBenchmarkBucket=true';
471
+ console.log(' ℹ️ Benchmark results bucket already exists — importing into stack');
472
+ } catch {
473
+ // Bucket doesn't exist — will be created fresh
474
+ }
475
+ }
476
+ const cdkDeployCmd = options.benchmarkInfra
477
+ ? `npx cdk deploy MlccCiHarnessStack --require-approval never --no-rollback --parameters MlccCiHarnessStack:CreateBenchmarkInfra=true${importBucketCtx}`
478
+ : 'npx cdk deploy MlccCiHarnessStack --require-approval never --no-rollback';
361
479
  execSync(
362
- 'npx cdk deploy MlccCiHarnessStack --require-approval never',
480
+ cdkDeployCmd,
363
481
  {
364
482
  cwd: ciHarnessDir,
365
483
  encoding: 'utf8',
366
484
  stdio: 'inherit',
367
485
  env: {
368
486
  ...process.env,
487
+ AWS_REGION: profileData.awsRegion,
369
488
  CDK_DEFAULT_REGION: profileData.awsRegion,
370
489
  CDK_DEFAULT_ACCOUNT: profileData.accountId,
371
490
  AWS_PROFILE: profileData.awsProfile
@@ -376,6 +495,11 @@ export default class BootstrapCommandHandler {
376
495
 
377
496
  profileData.ciInfraProvisioned = true;
378
497
  profileData.ciTableName = 'mlcc-ci-table';
498
+ if (options.benchmarkInfra) {
499
+ profileData.benchmarkInfraProvisioned = true;
500
+ profileData.ciGlueDatabase = 'mlcc_ci';
501
+ profileData.ciBenchmarkResultsBucket = `mlcc-benchmark-results-${profileData.accountId}-${profileData.awsRegion}`;
502
+ }
379
503
  }
380
504
  }
381
505
  } catch (error) {
@@ -413,14 +537,80 @@ export default class BootstrapCommandHandler {
413
537
  console.log(` Region: ${profileConfig.awsRegion}`);
414
538
  console.log(` Account: ${profileConfig.accountId}`);
415
539
 
540
+ // --- SANITY CHECK 1: Account identity ---
541
+ const callerAccount = this._getCallerAccount(profileConfig.awsProfile);
542
+ if (callerAccount !== profileConfig.accountId) {
543
+ console.log(`❌ Account mismatch: profile expects ${profileConfig.accountId} but credentials resolve to ${callerAccount}`);
544
+ return;
545
+ }
546
+
416
547
  // Re-deploy the CloudFormation bootstrap stack
417
548
  const stackName = profileConfig.stackName || `${STACK_NAME_PREFIX}-${name}`;
549
+
550
+ // Sanity check: stack name consistency (warn-and-continue)
551
+ const expectedStackName = `${STACK_NAME_PREFIX}-${name}`;
552
+ if (profileConfig.stackName && profileConfig.stackName !== expectedStackName) {
553
+ console.log(`⚠️ Stack name mismatch: expected "${expectedStackName}" but profile has "${profileConfig.stackName}"`);
554
+ console.log(' Run `ml-container-creator bootstrap migrate` to fix.');
555
+ console.log(' Proceeding with stored stack name...');
556
+ }
557
+
558
+ // --- SANITY CHECK 3: Stack exists in target region ---
559
+ const stackExists = this._resourceExists(
560
+ `cloudformation describe-stacks --stack-name ${stackName} --region ${profileConfig.awsRegion}`,
561
+ profileConfig.awsProfile
562
+ );
563
+ if (!stackExists) {
564
+ console.log(`❌ Stack "${stackName}" not found in ${profileConfig.awsRegion}.`);
565
+ console.log(' Run `ml-container-creator bootstrap` to create it.');
566
+ return;
567
+ }
568
+
569
+ // --- CI single-region enforcement ---
570
+ if (options.ci) {
571
+ const ciConflict = this._findExistingCiProfile(name);
572
+ if (ciConflict) {
573
+ console.log(`❌ CI infrastructure already deployed in region ${ciConflict.config.awsRegion} (profile: ${ciConflict.name}).`);
574
+ console.log(' CI can only be deployed in one region per account.');
575
+ return;
576
+ }
577
+ }
578
+
418
579
  this._displayProgress('☁️', 'Updating bootstrap stack...');
419
580
 
581
+ // Pre-check: if IAM role already exists globally (from another region's deployment),
582
+ // pass its ARN so CloudFormation skips re-creation (account-level singleton)
583
+ let useExistingRoleArn = profileConfig.roleArn || '';
584
+ if (!useExistingRoleArn) {
585
+ try {
586
+ const roleResult = this._execAws(
587
+ 'iam get-role --role-name mlcc-sagemaker-execution-role',
588
+ profileConfig.awsProfile
589
+ );
590
+ const roleArn = roleResult && roleResult.Role && roleResult.Role.Arn;
591
+ if (roleArn && roleArn.startsWith('arn:aws:iam::')) {
592
+ useExistingRoleArn = roleArn;
593
+ }
594
+ } catch (_) {
595
+ // Role doesn't exist yet — will be created by the stack
596
+ }
597
+ }
598
+
420
599
  try {
600
+ // Check if ECR repo already exists (avoid ResourceExistenceCheck failure)
601
+ let skipEcr = 'false';
602
+ try {
603
+ this._execAws(
604
+ `ecr describe-repositories --repository-names ml-container-creator --region ${profileConfig.awsRegion}`,
605
+ profileConfig.awsProfile
606
+ );
607
+ skipEcr = 'true';
608
+ } catch (_) { /* doesn't exist */ }
609
+
421
610
  const stackOutputs = this._deployStack(stackName, {
422
611
  CreateS3Buckets: (profileConfig.asyncS3Bucket || profileConfig.batchS3Bucket) ? 'true' : 'false',
423
- UseExistingRoleArn: ''
612
+ UseExistingRoleArn: useExistingRoleArn,
613
+ SkipEcrCreation: skipEcr
424
614
  }, profileConfig.awsProfile, profileConfig.awsRegion);
425
615
 
426
616
  // Update profile with any new outputs
@@ -456,14 +646,34 @@ export default class BootstrapCommandHandler {
456
646
  stdio: ['pipe', 'pipe', 'pipe']
457
647
  });
458
648
 
649
+ // --no-rollback prevents rollback on AlreadyExists errors for IAM roles
650
+ // that may pre-exist from a prior deployment or another region.
651
+ // Check if benchmark bucket already exists (from a prior torn-down stack with RETAIN policy)
652
+ let updateImportBucketCtx = '';
653
+ if (options.benchmarkInfra || profileConfig.benchmarkInfraProvisioned) {
654
+ try {
655
+ execSync(
656
+ `aws s3api head-bucket --bucket mlcc-benchmark-results-${profileConfig.accountId}-${profileConfig.awsRegion}${profileConfig.awsProfile ? ` --profile ${profileConfig.awsProfile}` : ''} --region ${profileConfig.awsRegion}`,
657
+ { encoding: 'utf8', stdio: ['pipe', 'pipe', 'pipe'] }
658
+ );
659
+ updateImportBucketCtx = ' -c importExistingBenchmarkBucket=true';
660
+ console.log(' ℹ️ Benchmark results bucket already exists — importing into stack');
661
+ } catch {
662
+ // Bucket doesn't exist — will be created fresh
663
+ }
664
+ }
665
+ const updateCdkCmd = (options.benchmarkInfra || profileConfig.benchmarkInfraProvisioned)
666
+ ? `npx cdk deploy MlccCiHarnessStack --require-approval never --no-rollback --parameters MlccCiHarnessStack:CreateBenchmarkInfra=true${updateImportBucketCtx}`
667
+ : 'npx cdk deploy MlccCiHarnessStack --require-approval never --no-rollback';
459
668
  execSync(
460
- 'npx cdk deploy MlccCiHarnessStack --require-approval never',
669
+ updateCdkCmd,
461
670
  {
462
671
  cwd: ciHarnessDir,
463
672
  encoding: 'utf8',
464
673
  stdio: 'inherit',
465
674
  env: {
466
675
  ...process.env,
676
+ AWS_REGION: profileConfig.awsRegion,
467
677
  CDK_DEFAULT_REGION: profileConfig.awsRegion,
468
678
  CDK_DEFAULT_ACCOUNT: profileConfig.accountId,
469
679
  AWS_PROFILE: profileConfig.awsProfile
@@ -471,6 +681,8 @@ export default class BootstrapCommandHandler {
471
681
  }
472
682
  );
473
683
  profileConfig.ciInfraProvisioned = true;
684
+ profileConfig.ciGlueDatabase = profileConfig.ciGlueDatabase || 'mlcc_ci';
685
+ profileConfig.ciBenchmarkResultsBucket = profileConfig.ciBenchmarkResultsBucket || `mlcc-benchmark-results-${profileConfig.accountId}-${profileConfig.awsRegion}`;
474
686
  console.log(' ✅ CI harness stack updated');
475
687
  }
476
688
  } catch (error) {
@@ -480,6 +692,18 @@ export default class BootstrapCommandHandler {
480
692
  console.log(' ⏭️ CI stack skipped (not provisioned — use --ci to force)');
481
693
  }
482
694
 
695
+ // Ensure MLflow App exists
696
+ this._displayProgress('📊', 'MLflow App for experiment tracking...');
697
+ try {
698
+ const mlflowAppArn = this._ensureMlflowApp(profileConfig, profileConfig.awsProfile);
699
+ if (mlflowAppArn) {
700
+ profileConfig.mlflowAppArn = mlflowAppArn;
701
+ console.log(` ✅ MLflow App ready: ${mlflowAppArn}`);
702
+ }
703
+ } catch (error) {
704
+ console.log(` ⚠️ MLflow App setup skipped: ${error.message}`);
705
+ }
706
+
483
707
  // Save updated profile
484
708
  this.config.setProfile(name, profileConfig);
485
709
  console.log(`\n✅ Update complete for profile "${name}"`);
@@ -488,6 +712,82 @@ export default class BootstrapCommandHandler {
488
712
  await this._runPostSetupChain(options);
489
713
  }
490
714
 
715
+ /**
716
+ * Migrate legacy profiles to current naming conventions.
717
+ * Corrects stackName mismatches and renames sharedStackFrom → sharedInfraFrom.
718
+ * Displays a preview of all changes and requires confirmation before writing.
719
+ */
720
+ async _handleMigrate() {
721
+ const config = this.config.read();
722
+ if (!config || !config.profiles) {
723
+ console.log('No profiles to migrate.');
724
+ return;
725
+ }
726
+
727
+ const changes = [];
728
+
729
+ for (const [name, profileConfig] of Object.entries(config.profiles)) {
730
+ const expected = `${STACK_NAME_PREFIX}-${name}`;
731
+
732
+ // Fix stackName mismatch
733
+ if (profileConfig.stackName && profileConfig.stackName !== expected) {
734
+ changes.push({
735
+ profile: name,
736
+ field: 'stackName',
737
+ from: profileConfig.stackName,
738
+ to: expected
739
+ });
740
+ }
741
+
742
+ // Rename sharedStackFrom → sharedInfraFrom
743
+ if (profileConfig.sharedStackFrom) {
744
+ changes.push({
745
+ profile: name,
746
+ field: 'sharedStackFrom → sharedInfraFrom',
747
+ from: profileConfig.sharedStackFrom,
748
+ to: profileConfig.sharedStackFrom
749
+ });
750
+ }
751
+ }
752
+
753
+ if (changes.length === 0) {
754
+ console.log('✅ All profiles already use current naming conventions.');
755
+ return;
756
+ }
757
+
758
+ // Display preview
759
+ console.log('📋 Migration Preview:\n');
760
+ for (const change of changes) {
761
+ console.log(` Profile "${change.profile}":`);
762
+ console.log(` ${change.field}: "${change.from}" → "${change.to}"`);
763
+ }
764
+
765
+ // Prompt for confirmation
766
+ const { confirm } = await this._promptFn([{
767
+ type: 'confirm',
768
+ name: 'confirm',
769
+ message: 'Apply these changes?',
770
+ default: true
771
+ }]);
772
+
773
+ if (!confirm) return;
774
+
775
+ // Apply changes
776
+ for (const [name, profileConfig] of Object.entries(config.profiles)) {
777
+ const expected = `${STACK_NAME_PREFIX}-${name}`;
778
+ if (profileConfig.stackName !== expected) {
779
+ profileConfig.stackName = expected;
780
+ }
781
+ if (profileConfig.sharedStackFrom) {
782
+ profileConfig.sharedInfraFrom = profileConfig.sharedStackFrom;
783
+ delete profileConfig.sharedStackFrom;
784
+ }
785
+ }
786
+
787
+ this.config.write(config);
788
+ console.log('✅ Migration complete.');
789
+ }
790
+
491
791
  /**
492
792
  * Run the post-setup chain: mcp init → registry sync-architectures → sync-schemas.
493
793
  * Each step is independent — failures are collected and reported at the end.
@@ -495,7 +795,7 @@ export default class BootstrapCommandHandler {
495
795
  * @param {object} options - Parsed CLI options (checks skipPostSetup)
496
796
  */
497
797
  async _runPostSetupChain(options = {}) {
498
- if (options['skip-post-setup']) {
798
+ if ((options['skip-post-setup'] || options.skipPostSetup)) {
499
799
  console.log('\n⏭️ Skipping post-setup chain (--skip-post-setup)');
500
800
  return;
501
801
  }
@@ -717,7 +1017,8 @@ export default class BootstrapCommandHandler {
717
1017
  * @returns {object} Parsed JSON output
718
1018
  */
719
1019
  _execAws(command, profile) {
720
- const fullCommand = `aws ${command} --profile ${profile} --output json`;
1020
+ const profileFlag = profile ? `--profile ${profile}` : '';
1021
+ const fullCommand = `aws ${command} ${profileFlag} --output json`.replace(/\s+/g, ' ').trim();
721
1022
  const output = execSync(fullCommand, { encoding: 'utf8', stdio: ['pipe', 'pipe', 'pipe'] });
722
1023
  const trimmed = output.trim();
723
1024
  if (!trimmed) {
@@ -729,6 +1030,12 @@ export default class BootstrapCommandHandler {
729
1030
  /**
730
1031
  * Deploy the bootstrap CloudFormation stack and return its outputs.
731
1032
  *
1033
+ * Before deploying, checks for pre-existing S3 buckets that would cause
1034
+ * ResourceExistenceCheck failures. If the stack is in REVIEW_IN_PROGRESS
1035
+ * state (empty shell from a failed prior attempt), deletes it first.
1036
+ * If buckets exist but aren't managed by the stack, uses a CloudFormation
1037
+ * import changeset to adopt them before proceeding with the normal deploy.
1038
+ *
732
1039
  * Uses `aws cloudformation deploy` which is idempotent — it creates the
733
1040
  * stack on first run and updates it on subsequent runs. If the template
734
1041
  * hasn't changed, it exits with "No changes to deploy" which we handle
@@ -741,6 +1048,9 @@ export default class BootstrapCommandHandler {
741
1048
  * @returns {object} Map of output key → output value
742
1049
  */
743
1050
  _deployStack(stackName, parameters, profile, region) {
1051
+ // Handle ghost stacks and pre-existing resources
1052
+ this._resolveStackConflicts(stackName, parameters, profile, region);
1053
+
744
1054
  // Build parameter overrides string
745
1055
  const paramOverrides = Object.entries(parameters)
746
1056
  .map(([key, value]) => `${key}=${value}`)
@@ -764,6 +1074,32 @@ export default class BootstrapCommandHandler {
764
1074
  const stderr = error.stderr || error.message || '';
765
1075
  if (stderr.includes('No changes to deploy')) {
766
1076
  console.log(' ℹ️ Stack is up to date — no changes needed');
1077
+ } else if (stderr.includes('ResourceExistenceCheck')) {
1078
+ // Resources already exist outside the stack — attempt import and retry
1079
+ console.log(' ⚠️ Pre-existing resources detected — attempting import...');
1080
+ this._resolveStackConflicts(stackName, parameters, profile, region);
1081
+ // Rebuild deploy command with updated parameters (e.g., CreateS3Buckets may now be 'false')
1082
+ const retryParamOverrides = Object.entries(parameters)
1083
+ .map(([key, value]) => `${key}=${value}`)
1084
+ .join(' ');
1085
+ const retryDeployCommand = [
1086
+ 'aws cloudformation deploy',
1087
+ `--template-file ${STACK_TEMPLATE_PATH}`,
1088
+ `--stack-name ${stackName}`,
1089
+ '--capabilities CAPABILITY_NAMED_IAM',
1090
+ `--parameter-overrides ${retryParamOverrides}`,
1091
+ `--profile ${profile}`,
1092
+ `--region ${region}`
1093
+ ].join(' ');
1094
+ // Retry the deploy after import
1095
+ try {
1096
+ execSync(retryDeployCommand, { encoding: 'utf8', stdio: ['pipe', 'pipe', 'pipe'] });
1097
+ } catch (retryError) {
1098
+ const retryStderr = retryError.stderr || retryError.message || '';
1099
+ if (!retryStderr.includes('No changes to deploy')) {
1100
+ throw retryError;
1101
+ }
1102
+ }
767
1103
  } else {
768
1104
  throw error;
769
1105
  }
@@ -785,9 +1121,144 @@ export default class BootstrapCommandHandler {
785
1121
  outputs[output.OutputKey] = output.OutputValue;
786
1122
  }
787
1123
 
1124
+ // If S3 buckets already existed (skipped creation), inject their names
1125
+ // into outputs so the profile config gets populated correctly.
1126
+ if (this._preExistingBuckets && this._preExistingBuckets.length > 0) {
1127
+ const bucketOutputMap = {
1128
+ 'AsyncS3Bucket': 'AsyncS3BucketName',
1129
+ 'BatchS3Bucket': 'BatchS3BucketName',
1130
+ 'AdapterS3Bucket': 'AdapterS3BucketName',
1131
+ 'BenchmarkS3Bucket': 'BenchmarkS3BucketName',
1132
+ 'TuneS3Bucket': 'TuneS3BucketName'
1133
+ };
1134
+ for (const bucket of this._preExistingBuckets) {
1135
+ const outputKey = bucketOutputMap[bucket.logicalId];
1136
+ if (outputKey && !outputs[outputKey]) {
1137
+ outputs[outputKey] = bucket.name;
1138
+ }
1139
+ }
1140
+ this._preExistingBuckets = null;
1141
+ }
1142
+
788
1143
  return outputs;
789
1144
  }
790
1145
 
1146
+ /**
1147
+ * Resolve stack conflicts before deploying.
1148
+ *
1149
+ * Handles two scenarios that cause ResourceExistenceCheck failures:
1150
+ * 1. Ghost stacks (REVIEW_IN_PROGRESS) — delete them first
1151
+ * 2. Pre-existing S3 buckets not managed by the stack — import them
1152
+ *
1153
+ * @param {string} stackName - CloudFormation stack name
1154
+ * @param {object} parameters - Stack parameter key-value pairs
1155
+ * @param {string} profile - AWS CLI profile name
1156
+ * @param {string} region - AWS region
1157
+ */
1158
+ _resolveStackConflicts(stackName, parameters, profile, region) {
1159
+ // Check if stack exists and its status
1160
+ let stackStatus = null;
1161
+ let managedResources = [];
1162
+
1163
+ try {
1164
+ const describeResult = this._execAws(
1165
+ `cloudformation describe-stacks --stack-name ${stackName} --region ${region}`,
1166
+ profile
1167
+ );
1168
+ const stack = describeResult.Stacks && describeResult.Stacks[0];
1169
+ if (stack) {
1170
+ stackStatus = stack.StackStatus;
1171
+ }
1172
+ } catch (_) {
1173
+ // Stack doesn't exist — no conflicts possible
1174
+ return;
1175
+ }
1176
+
1177
+ // Handle ghost stacks (created but never successfully deployed)
1178
+ if (stackStatus === 'REVIEW_IN_PROGRESS') {
1179
+ console.log(' ⚠️ Found ghost stack (REVIEW_IN_PROGRESS) — deleting before redeploy...');
1180
+ try {
1181
+ execSync(
1182
+ `aws cloudformation delete-stack --stack-name ${stackName} --profile ${profile} --region ${region}`,
1183
+ { encoding: 'utf8', stdio: ['pipe', 'pipe', 'pipe'] }
1184
+ );
1185
+ execSync(
1186
+ `aws cloudformation wait stack-delete-complete --stack-name ${stackName} --profile ${profile} --region ${region}`,
1187
+ { encoding: 'utf8', stdio: ['pipe', 'pipe', 'pipe'], timeout: 60000 }
1188
+ );
1189
+ console.log(' ✅ Ghost stack deleted');
1190
+ } catch (err) {
1191
+ console.log(` ⚠️ Could not delete ghost stack: ${err.message || err}`);
1192
+ }
1193
+ // Don't return — fall through to check for pre-existing S3 buckets
1194
+ // that need to be imported on the fresh deploy. The ghost stack had
1195
+ // DeletionPolicy:Retain buckets that survive stack deletion.
1196
+ stackStatus = null;
1197
+ managedResources = [];
1198
+ }
1199
+
1200
+ // For active stacks (or post-ghost-deletion), check if S3 buckets exist but aren't managed
1201
+ if (parameters.CreateS3Buckets !== 'true') {
1202
+ return; // Not creating buckets — no conflict
1203
+ }
1204
+
1205
+ // Get list of resources currently managed by the stack (empty if stack was just deleted)
1206
+ if (stackStatus) {
1207
+ try {
1208
+ const resources = this._execAws(
1209
+ `cloudformation list-stack-resources --stack-name ${stackName} --region ${region}`,
1210
+ profile
1211
+ );
1212
+ managedResources = (resources.StackResourceSummaries || [])
1213
+ .map(r => r.LogicalResourceId);
1214
+ } catch (_) {
1215
+ // Stack doesn't exist or can't be queried — proceed with empty managedResources
1216
+ }
1217
+ }
1218
+
1219
+ // Check each S3 bucket that the template would create
1220
+ const accountId = this._currentAccountId;
1221
+ const bucketConfigs = [
1222
+ { logicalId: 'AsyncS3Bucket', name: `mlcc-async-${accountId}-${region}` },
1223
+ { logicalId: 'BatchS3Bucket', name: `mlcc-batch-${accountId}-${region}` },
1224
+ { logicalId: 'AdapterS3Bucket', name: `mlcc-adapters-${accountId}-${region}` },
1225
+ { logicalId: 'BenchmarkS3Bucket', name: `mlcc-benchmark-${accountId}-${region}` },
1226
+ { logicalId: 'TuneS3Bucket', name: `mlcc-tune-${accountId}-${region}` }
1227
+ ];
1228
+
1229
+ const bucketsToImport = [];
1230
+
1231
+ for (const bucket of bucketConfigs) {
1232
+ if (managedResources.includes(bucket.logicalId)) {
1233
+ continue; // Already managed by the stack — no conflict
1234
+ }
1235
+ // Check if bucket exists in AWS
1236
+ try {
1237
+ execSync(
1238
+ `aws s3api head-bucket --bucket ${bucket.name} --profile ${profile} --region ${region}`,
1239
+ { encoding: 'utf8', stdio: ['pipe', 'pipe', 'pipe'] }
1240
+ );
1241
+ // Bucket exists but not in stack — needs import
1242
+ bucketsToImport.push(bucket);
1243
+ } catch (_) {
1244
+ // Bucket doesn't exist — will be created normally
1245
+ }
1246
+ }
1247
+
1248
+ if (bucketsToImport.length > 0) {
1249
+ console.log(` ℹ️ ${bucketsToImport.length} pre-existing S3 bucket(s) detected — skipping S3 creation (buckets already exist)`);
1250
+
1251
+ // Pre-existing S3 buckets survive stack deletion (DeletionPolicy: Retain).
1252
+ // Rather than fighting CloudFormation's IMPORT limitations, just skip S3
1253
+ // creation and wire the existing bucket names into the profile config directly.
1254
+ // The naming convention is deterministic, so we know exactly what they are.
1255
+ this._preExistingBuckets = bucketsToImport;
1256
+
1257
+ // Modify the parameters to skip S3 bucket creation in the deploy
1258
+ parameters.CreateS3Buckets = 'false';
1259
+ }
1260
+ }
1261
+
791
1262
  /**
792
1263
  * Write a JSON object to a temp file and return the `file://` path.
793
1264
  * Used for passing complex JSON to AWS CLI commands without shell escaping issues.
@@ -821,6 +1292,125 @@ export default class BootstrapCommandHandler {
821
1292
  }
822
1293
  }
823
1294
 
1295
+ /**
1296
+ * Get the AWS account ID from the caller's credentials.
1297
+ * Uses `sts get-caller-identity` to resolve the actual account.
1298
+ *
1299
+ * @param {string} awsProfile - AWS CLI profile name
1300
+ * @returns {string} The 12-digit AWS account ID
1301
+ */
1302
+ _getCallerAccount(awsProfile) {
1303
+ const identity = this._execAws('sts get-caller-identity', awsProfile);
1304
+ return identity.Account;
1305
+ }
1306
+
1307
+ /**
1308
+ * Scan all profiles to find one with ciInfraProvisioned=true,
1309
+ * excluding the given profile name.
1310
+ *
1311
+ * @param {string} excludeProfile - Profile name to exclude from the scan
1312
+ * @returns {{ name: string, config: Object }|null} The CI profile, or null if none found
1313
+ */
1314
+ _findExistingCiProfile(excludeProfile) {
1315
+ const config = this.config.read();
1316
+ if (!config || !config.profiles) return null;
1317
+
1318
+ for (const [name, profileConfig] of Object.entries(config.profiles)) {
1319
+ if (name === excludeProfile) continue;
1320
+ if (profileConfig.ciInfraProvisioned) {
1321
+ return { name, config: profileConfig };
1322
+ }
1323
+ }
1324
+ return null;
1325
+ }
1326
+
1327
+ /**
1328
+ * Ensure an MLCC-owned MLflow App exists for experiment tracking.
1329
+ * Creates one if it doesn't exist, using the tune S3 bucket as artifact store.
1330
+ *
1331
+ * @param {object} profileData - Bootstrap profile data (needs roleArn, awsRegion, accountId)
1332
+ * @param {string} awsProfile - AWS CLI profile name
1333
+ * @returns {string|null} MLflow App ARN or null if creation failed
1334
+ */
1335
+ _ensureMlflowApp(profileData, awsProfile) {
1336
+ const region = profileData.awsRegion;
1337
+ const accountId = profileData.accountId;
1338
+ const roleArn = profileData.roleArn;
1339
+ const appName = 'mlcc-tune-tracking';
1340
+ const artifactBucket = `mlcc-tune-${accountId}-${region}`;
1341
+
1342
+ // Check if MLCC app already exists
1343
+ try {
1344
+ const apps = this._execAws(
1345
+ `sagemaker list-mlflow-apps --region ${region}`,
1346
+ awsProfile
1347
+ );
1348
+ const summaries = apps.Summaries || [];
1349
+ const existing = summaries.find(a => a.Name === appName);
1350
+ if (existing) {
1351
+ return existing.Arn;
1352
+ }
1353
+ } catch {
1354
+ // list-mlflow-apps may not be available in all CLI versions — proceed to create
1355
+ }
1356
+
1357
+ // Create the MLflow App
1358
+ console.log(` Creating MLflow App "${appName}" with artifact store s3://${artifactBucket}...`);
1359
+
1360
+ // Ensure the artifact bucket exists (it's the tune bucket from the stack)
1361
+ try {
1362
+ this._execAws(
1363
+ `s3api head-bucket --bucket ${artifactBucket} --region ${region}`,
1364
+ awsProfile
1365
+ );
1366
+ } catch {
1367
+ // Bucket doesn't exist — create it
1368
+ console.log(` Creating artifact bucket: ${artifactBucket}`);
1369
+ try {
1370
+ this._execAws(
1371
+ `s3api create-bucket --bucket ${artifactBucket} --region ${region} --create-bucket-configuration LocationConstraint=${region}`,
1372
+ awsProfile
1373
+ );
1374
+ } catch (bucketErr) {
1375
+ // May already exist or region doesn't need LocationConstraint (us-east-1)
1376
+ if (!bucketErr.message?.includes('BucketAlreadyOwnedByYou')) {
1377
+ try {
1378
+ this._execAws(
1379
+ `s3api create-bucket --bucket ${artifactBucket} --region ${region}`,
1380
+ awsProfile
1381
+ );
1382
+ } catch {
1383
+ // Bucket likely exists, continue
1384
+ }
1385
+ }
1386
+ }
1387
+ }
1388
+
1389
+ // Create the app
1390
+ try {
1391
+ const result = this._execAws(
1392
+ `sagemaker create-mlflow-app --name ${appName} --artifact-store-uri s3://${artifactBucket} --role-arn ${roleArn} --model-registration-mode AutoModelRegistrationEnabled --region ${region}`,
1393
+ awsProfile
1394
+ );
1395
+ return result.Arn;
1396
+ } catch (err) {
1397
+ // If app already exists (race condition), try to describe it
1398
+ if (err.message?.includes('ResourceLimitExceeded') || err.message?.includes('already exists')) {
1399
+ try {
1400
+ const apps = this._execAws(
1401
+ `sagemaker list-mlflow-apps --region ${region}`,
1402
+ awsProfile
1403
+ );
1404
+ const found = (apps.Summaries || []).find(a => a.Name === appName);
1405
+ if (found) return found.Arn;
1406
+ } catch {
1407
+ // Fall through
1408
+ }
1409
+ }
1410
+ throw err;
1411
+ }
1412
+ }
1413
+
824
1414
  /**
825
1415
  * Format tags for the AWS CLI --tags parameter.
826
1416
  * Writes tags to a temp file and returns the file:// reference
@@ -858,6 +1448,8 @@ SUBCOMMANDS:
858
1448
  scan Discover pre-existing MLCC-managed resources in AWS
859
1449
  prune Remove deleted and unknown records from the deployment manifest
860
1450
  update Re-deploy bootstrap stacks using active profile (no prompts)
1451
+ migrate Upgrade legacy profiles to current naming conventions
1452
+ sync-model-families Discover tune-eligible models from JumpStart Hub and update catalog
861
1453
 
862
1454
  SETUP OPTIONS:
863
1455
  --non-interactive Run without interactive prompts
@@ -886,6 +1478,8 @@ EXAMPLES:
886
1478
  ml-container-creator bootstrap remove dev
887
1479
  ml-container-creator bootstrap remove dev --force --delete-stack
888
1480
  ml-container-creator bootstrap scan
1481
+ ml-container-creator bootstrap sync-model-families
1482
+ ml-container-creator bootstrap migrate
889
1483
  ml-container-creator bootstrap --non-interactive --profile my-aws-profile --region us-west-2
890
1484
  ml-container-creator bootstrap --non-interactive --profile my-aws-profile --role-arn arn:aws:iam::123456789012:role/MyRole --skip-s3
891
1485
  ml-container-creator bootstrap --non-interactive --profile my-aws-profile --region us-west-2 --ci