@aws/ml-container-creator 0.9.1 → 0.10.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. package/LICENSE-THIRD-PARTY +9304 -0
  2. package/bin/cli.js +2 -0
  3. package/config/bootstrap-e2e-stack.json +341 -0
  4. package/config/bootstrap-stack.json +40 -3
  5. package/config/parameter-schema-v2.json +2049 -0
  6. package/config/tune-catalog.json +1781 -0
  7. package/infra/ci-harness/buildspec.yml +1 -0
  8. package/infra/ci-harness/lambda/path-prover/brain.ts +306 -0
  9. package/infra/ci-harness/lambda/path-prover/write-results.ts +152 -0
  10. package/infra/ci-harness/lib/ci-harness-stack.ts +837 -7
  11. package/infra/ci-harness/state-machines/path-prover.asl.json +496 -0
  12. package/package.json +53 -68
  13. package/servers/base-image-picker/index.js +121 -121
  14. package/servers/e2e-status/index.js +297 -0
  15. package/servers/e2e-status/manifest.json +14 -0
  16. package/servers/e2e-status/package.json +15 -0
  17. package/servers/endpoint-picker/LICENSE +202 -0
  18. package/servers/endpoint-picker/index.js +536 -0
  19. package/servers/endpoint-picker/manifest.json +14 -0
  20. package/servers/endpoint-picker/package.json +18 -0
  21. package/servers/hyperpod-cluster-picker/index.js +125 -125
  22. package/servers/instance-sizer/index.js +138 -138
  23. package/servers/instance-sizer/lib/instance-ranker.js +76 -76
  24. package/servers/instance-sizer/lib/model-resolver.js +61 -61
  25. package/servers/instance-sizer/lib/quota-resolver.js +113 -113
  26. package/servers/instance-sizer/lib/vram-estimator.js +31 -31
  27. package/servers/lib/bedrock-client.js +38 -38
  28. package/servers/lib/catalogs/jumpstart-public.json +101 -16
  29. package/servers/lib/catalogs/model-servers.json +201 -3
  30. package/servers/lib/catalogs/models.json +182 -26
  31. package/servers/lib/custom-validators.js +13 -13
  32. package/servers/lib/dynamic-resolver.js +4 -4
  33. package/servers/marketplace-picker/index.js +342 -0
  34. package/servers/marketplace-picker/manifest.json +14 -0
  35. package/servers/marketplace-picker/package.json +18 -0
  36. package/servers/model-picker/index.js +382 -382
  37. package/servers/region-picker/index.js +56 -56
  38. package/servers/workload-picker/LICENSE +202 -0
  39. package/servers/workload-picker/catalogs/workload-profiles.json +67 -0
  40. package/servers/workload-picker/index.js +171 -0
  41. package/servers/workload-picker/manifest.json +16 -0
  42. package/servers/workload-picker/package.json +16 -0
  43. package/src/app.js +4 -390
  44. package/src/lib/bootstrap-command-handler.js +710 -1148
  45. package/src/lib/bootstrap-config.js +36 -0
  46. package/src/lib/bootstrap-profile-manager.js +641 -0
  47. package/src/lib/bootstrap-provisioners.js +421 -0
  48. package/src/lib/ci-register-helpers.js +74 -0
  49. package/src/lib/config-loader.js +408 -0
  50. package/src/lib/config-manager.js +66 -1685
  51. package/src/lib/config-mcp-client.js +118 -0
  52. package/src/lib/config-validator.js +634 -0
  53. package/src/lib/cuda-resolver.js +149 -0
  54. package/src/lib/e2e-catalog-validator.js +251 -3
  55. package/src/lib/e2e-ci-recorder.js +103 -0
  56. package/src/lib/generated/cli-options.js +315 -311
  57. package/src/lib/generated/parameter-matrix.js +671 -0
  58. package/src/lib/generated/validation-rules.js +71 -71
  59. package/src/lib/marketplace-flow.js +276 -0
  60. package/src/lib/mcp-query-runner.js +768 -0
  61. package/src/lib/parameter-schema-validator.js +62 -18
  62. package/src/lib/path-prover-brain.js +607 -0
  63. package/src/lib/prompt-runner.js +41 -1504
  64. package/src/lib/prompts/feature-prompts.js +172 -0
  65. package/src/lib/prompts/index.js +48 -0
  66. package/src/lib/prompts/infrastructure-prompts.js +690 -0
  67. package/src/lib/prompts/model-prompts.js +552 -0
  68. package/src/lib/prompts/project-prompts.js +82 -0
  69. package/src/lib/prompts.js +2 -1446
  70. package/src/lib/registry-command-handler.js +135 -3
  71. package/src/lib/secrets-prompt-runner.js +251 -0
  72. package/src/lib/template-variable-resolver.js +422 -0
  73. package/src/lib/tune-catalog-validator.js +37 -4
  74. package/templates/Dockerfile +9 -0
  75. package/templates/code/adapter_sidecar.py +444 -0
  76. package/templates/code/serve +6 -0
  77. package/templates/code/serve.d/vllm.ejs +1 -1
  78. package/templates/do/.benchmark_writer.py +1476 -0
  79. package/templates/do/.tune_helper.py +982 -57
  80. package/templates/do/__pycache__/.benchmark_writer.cpython-312.pyc +0 -0
  81. package/templates/do/adapter +149 -0
  82. package/templates/do/benchmark +639 -85
  83. package/templates/do/config +108 -5
  84. package/templates/do/deploy.d/managed-inference.ejs +192 -11
  85. package/templates/do/optimize +106 -37
  86. package/templates/do/register +89 -0
  87. package/templates/do/test +13 -0
  88. package/templates/do/tune +378 -59
  89. package/templates/do/validate +44 -4
  90. package/config/parameter-schema.json +0 -88
@@ -17,16 +17,17 @@
17
17
  */
18
18
 
19
19
  import { execSync } from 'node:child_process';
20
- import { existsSync, readFileSync, unlinkSync, writeFileSync, mkdirSync } from 'node:fs';
20
+ import { existsSync, writeFileSync, mkdirSync } from 'node:fs';
21
21
  import path from 'node:path';
22
22
  import { tmpdir } from 'node:os';
23
23
  import { fileURLToPath } from 'node:url';
24
24
  import BootstrapConfig from './bootstrap-config.js';
25
25
  import AwsProfileParser from './aws-profile-parser.js';
26
- import AssetManager from './asset-manager.js';
27
26
  import McpCommandHandler from './mcp-command-handler.js';
28
27
  import RegistryCommandHandler from './registry-command-handler.js';
29
28
  import { runPrompts } from '../prompt-adapter.js';
29
+ import BootstrapProfileManager from './bootstrap-profile-manager.js';
30
+ import BootstrapProvisioners from './bootstrap-provisioners.js';
30
31
 
31
32
  const __filename = fileURLToPath(import.meta.url);
32
33
  const __dirname = path.dirname(__filename);
@@ -39,16 +40,58 @@ export default class BootstrapCommandHandler {
39
40
  this.config = new BootstrapConfig();
40
41
  this.profileParser = new AwsProfileParser();
41
42
  this._promptFn = promptFn || runPrompts;
43
+ this.profileManager = new BootstrapProfileManager(this);
44
+ this.provisioners = new BootstrapProvisioners(this);
42
45
  }
43
46
 
47
+ // ── Provisioner delegations (backward compat for tests) ─────────
48
+
49
+ _buildResourceTags() { return this.provisioners._buildResourceTags(); }
50
+ _setupEcrRepository() { return this.provisioners._setupEcrRepository(); }
51
+ _setupIamRole(options) { return this.provisioners._setupIamRole(options); }
52
+ _setupS3Buckets() { return this.provisioners._setupS3Buckets(); }
53
+ _createS3Bucket(name, tags) { return this.provisioners._createS3Bucket(name, tags); }
54
+ _verifyCliV2() { return this.provisioners._verifyCliV2(); }
55
+
56
+ // ── ProfileManager delegations (backward compat for tests) ──────
57
+
58
+ _handleStatus(options) { return this.profileManager._handleStatus(options); }
59
+ _handleUse(profileName) { return this.profileManager._handleUse(profileName); }
60
+ _handleList() { return this.profileManager._handleList(); }
61
+ _handleRemove(profileName, options) { return this.profileManager._handleRemove(profileName, options); }
62
+ _handleScan() { return this.profileManager._handleScan(); }
63
+ _handlePrune() { return this.profileManager._handlePrune(); }
64
+ _handleSyncSchemas() { return this.profileManager._handleSyncSchemas(); }
65
+ _handleSyncModelFamilies() { return this.profileManager._handleSyncModelFamilies(); }
66
+
44
67
  /**
45
68
  * Dispatch bootstrap subcommands.
46
69
  * @param {string[]} args - Remaining positional args after 'bootstrap'
47
70
  * @param {object} options - Parsed CLI options
48
71
  */
49
72
  async handle(args, options) {
73
+ // Commander.js with passThroughOptions() captures flags after positional
74
+ // arguments in args rather than options. Extract known flags from args.
75
+ const extractedOptions = { ...options };
76
+ const cleanArgs = [];
77
+ for (const arg of args) {
78
+ if (arg === '--ci') extractedOptions.ci = true;
79
+ else if (arg === '--benchmark-infra') extractedOptions.benchmarkInfra = true;
80
+ else if (arg === '--skip-ci') extractedOptions.skipCi = true;
81
+ else if (arg === '--skip-s3') extractedOptions.skipS3 = true;
82
+ else if (arg === '--skip-post-setup') extractedOptions.skipPostSetup = true;
83
+ else if (arg === '--force') extractedOptions.force = true;
84
+ else if (arg === '--verify') extractedOptions.verify = true;
85
+ else if (arg === '--delete-stack') extractedOptions.deleteStack = true;
86
+ else if (arg === '--non-interactive') extractedOptions.nonInteractive = true;
87
+ else if (arg === '--ignore-staleness') extractedOptions.ignoreStaleness = true;
88
+ else cleanArgs.push(arg);
89
+ }
90
+ args = cleanArgs;
91
+ options = extractedOptions;
92
+
50
93
  // Handle legacy --sync-schemas flag for backward compatibility
51
- if (options['sync-schemas']) {
94
+ if ((options['sync-schemas'] || options.syncSchemas)) {
52
95
  await this._handleSyncSchemas();
53
96
  if (args.length === 0) return;
54
97
  }
@@ -85,6 +128,15 @@ export default class BootstrapCommandHandler {
85
128
  case 'sync-schemas':
86
129
  await this._handleSyncSchemas();
87
130
  break;
131
+ case 'sync-model-families':
132
+ await this._handleSyncModelFamilies();
133
+ break;
134
+ // Migration path: upgrades legacy profiles to current naming conventions.
135
+ // Corrects stackName to mlcc-bootstrap-{profileName}, renames sharedStackFrom
136
+ // to sharedInfraFrom. Idempotent — safe to run multiple times.
137
+ case 'migrate':
138
+ await this._handleMigrate();
139
+ break;
88
140
  default:
89
141
  console.log(`Unknown bootstrap subcommand: ${subcommand}`);
90
142
  this._showHelp();
@@ -97,7 +149,8 @@ export default class BootstrapCommandHandler {
97
149
  * @param {object} options - Parsed CLI options
98
150
  */
99
151
  async _handleInteractiveSetup(options) {
100
- const nonInteractive = options['non-interactive'];
152
+ // Commander.js converts --non-interactive to options.nonInteractive (camelCase)
153
+ const nonInteractive = options['non-interactive'] || options.nonInteractive;
101
154
 
102
155
  // Non-interactive mode: validate required flags upfront
103
156
  if (nonInteractive) {
@@ -117,7 +170,7 @@ export default class BootstrapCommandHandler {
117
170
  console.log('\n🚀 Bootstrap — Shared AWS Infrastructure Setup\n');
118
171
 
119
172
  // Verify AWS CLI v2 is installed
120
- if (!this._verifyCliV2()) {
173
+ if (!this.provisioners._verifyCliV2()) {
121
174
  return;
122
175
  }
123
176
 
@@ -158,13 +211,13 @@ export default class BootstrapCommandHandler {
158
211
 
159
212
  // Step 3: Determine stack parameters
160
213
  let useExistingRoleArn = '';
161
- if (nonInteractive && options['role-arn']) {
162
- useExistingRoleArn = options['role-arn'];
163
- console.log(` Using provided IAM role ARN: ${options['role-arn']}`);
214
+ if (nonInteractive && (options['role-arn'] || options.roleArn)) {
215
+ useExistingRoleArn = (options['role-arn'] || options.roleArn);
216
+ console.log(` Using provided IAM role ARN: ${(options['role-arn'] || options.roleArn)}`);
164
217
  }
165
218
 
166
219
  let createS3Buckets = false;
167
- if (nonInteractive && options['skip-s3']) {
220
+ if (nonInteractive && (options['skip-s3'] || options.skipS3)) {
168
221
  console.log(' ⏭️ Skipping S3 bucket creation (--skip-s3)');
169
222
  } else if (nonInteractive) {
170
223
  createS3Buckets = true;
@@ -209,7 +262,8 @@ export default class BootstrapCommandHandler {
209
262
 
210
263
  profileData.roleArn = stackOutputs.RoleArn;
211
264
  profileData.ecrRepositoryName = stackOutputs.EcrRepositoryName;
212
- profileData.stackName = otherStack;
265
+ profileData.stackName = stackName;
266
+ profileData.sharedInfraFrom = otherStack; // Track that this profile reuses another's stack
213
267
  if (stackOutputs.AsyncS3BucketName) profileData.asyncS3Bucket = stackOutputs.AsyncS3BucketName;
214
268
  if (stackOutputs.BatchS3BucketName) profileData.batchS3Bucket = stackOutputs.BatchS3BucketName;
215
269
  if (stackOutputs.AdapterS3BucketName) profileData.adapterS3Bucket = stackOutputs.AdapterS3BucketName;
@@ -223,15 +277,45 @@ export default class BootstrapCommandHandler {
223
277
  }
224
278
 
225
279
  if (!profileData.stackName) {
280
+ // Pre-check: if IAM role already exists globally (from another region's deployment),
281
+ // pass its ARN so CloudFormation skips re-creation (account-level singleton)
282
+ if (!useExistingRoleArn) {
283
+ try {
284
+ const roleResult = this._execAws(
285
+ 'iam get-role --role-name mlcc-sagemaker-execution-role',
286
+ awsProfile
287
+ );
288
+ const roleArn = roleResult && roleResult.Role && roleResult.Role.Arn;
289
+ if (roleArn && roleArn.startsWith('arn:aws:iam::')) {
290
+ useExistingRoleArn = roleArn;
291
+ console.log(` ℹ️ Reusing existing IAM role: ${roleArn}`);
292
+ }
293
+ } catch (_) {
294
+ // Role doesn't exist yet — will be created by the stack
295
+ }
296
+ }
297
+
226
298
  try {
299
+ // Check if ECR repo already exists (avoid ResourceExistenceCheck failure)
300
+ let skipEcr = 'false';
301
+ try {
302
+ this._execAws(
303
+ `ecr describe-repositories --repository-names ml-container-creator --region ${region}`,
304
+ awsProfile
305
+ );
306
+ skipEcr = 'true';
307
+ console.log(' ℹ️ ECR repository already exists — skipping creation');
308
+ } catch (_) { /* doesn't exist — will be created */ }
309
+
227
310
  const stackOutputs = this._deployStack(stackName, {
228
311
  CreateS3Buckets: createS3Buckets ? 'true' : 'false',
229
- UseExistingRoleArn: useExistingRoleArn
312
+ UseExistingRoleArn: useExistingRoleArn,
313
+ SkipEcrCreation: skipEcr
230
314
  }, awsProfile, region);
231
315
 
232
316
  // Read outputs into profile data
233
317
  profileData.roleArn = stackOutputs.RoleArn;
234
- profileData.ecrRepositoryName = stackOutputs.EcrRepositoryName;
318
+ profileData.ecrRepositoryName = stackOutputs.EcrRepositoryName || 'ml-container-creator';
235
319
  profileData.stackName = stackName;
236
320
 
237
321
  if (stackOutputs.AsyncS3BucketName) {
@@ -256,6 +340,23 @@ export default class BootstrapCommandHandler {
256
340
  }
257
341
  } // end if (!profileData.stackName)
258
342
 
343
+ // Step 4b: MLflow App for model customization experiment tracking
344
+ this._displayProgress('📊', 'MLflow App for experiment tracking...');
345
+ try {
346
+ if (!profileData.mlflowAppArn) {
347
+ const mlflowAppArn = this._ensureMlflowApp(profileData, awsProfile);
348
+ if (mlflowAppArn) {
349
+ profileData.mlflowAppArn = mlflowAppArn;
350
+ console.log(` ✅ MLflow App ready: ${mlflowAppArn}`);
351
+ }
352
+ } else {
353
+ console.log(` ✅ MLflow App already configured: ${profileData.mlflowAppArn}`);
354
+ }
355
+ } catch (error) {
356
+ console.log(` ⚠️ MLflow App setup skipped: ${error.message}`);
357
+ console.log(' Tune jobs will still work but experiment tracking may not be available.');
358
+ }
359
+
259
360
  // Step 5: CI Infrastructure setup (separate CDK stack — unchanged)
260
361
  this._displayProgress('🧪', 'CI Testing Infrastructure...');
261
362
  try {
@@ -264,7 +365,7 @@ export default class BootstrapCommandHandler {
264
365
  if (nonInteractive) {
265
366
  if (options.ci) {
266
367
  provisionCi = true;
267
- } else if (options['skip-ci']) {
368
+ } else if ((options['skip-ci'] || options.skipCi)) {
268
369
  console.log(' ⏭️ Skipping CI infrastructure (--skip-ci)');
269
370
  provisionCi = false;
270
371
  } else {
@@ -281,6 +382,21 @@ export default class BootstrapCommandHandler {
281
382
  }
282
383
 
283
384
  if (provisionCi) {
385
+ // --- CI single-region enforcement ---
386
+ const ciConflict = this._findExistingCiProfile(profileName);
387
+ if (ciConflict) {
388
+ console.log(`❌ CI infrastructure already deployed in region ${ciConflict.config.awsRegion} (profile: ${ciConflict.name}).`);
389
+ console.log(' CI can only be deployed in one region per account.');
390
+ provisionCi = false;
391
+ }
392
+ }
393
+
394
+ if (provisionCi) {
395
+ // Persist CI intent immediately so that `bootstrap update --ci` can
396
+ // retry if the CDK deploy fails. Don't wait for success.
397
+ profileData.ciInfraProvisioned = true;
398
+ profileData.ciTableName = profileData.ciTableName || 'mlcc-ci-table';
399
+
284
400
  // Ensure CDK is bootstrapped in this account/region
285
401
  const cdkBootstrapped = this._resourceExists(
286
402
  `ssm get-parameter --name /cdk-bootstrap/hnb659fds/version --region ${profileData.awsRegion}`,
@@ -336,14 +452,25 @@ export default class BootstrapCommandHandler {
336
452
  stdio: ['pipe', 'pipe', 'pipe']
337
453
  });
338
454
 
455
+ // Warn if shell AWS_REGION differs from profile region
456
+ if (process.env.AWS_REGION && process.env.AWS_REGION !== profileData.awsRegion) {
457
+ console.log(` ⚠️ AWS_REGION env var (${process.env.AWS_REGION}) differs from profile region (${profileData.awsRegion}) — using profile region`);
458
+ }
459
+
460
+ // --no-rollback prevents rollback on AlreadyExists errors for IAM roles
461
+ // that may pre-exist from a prior deployment or another region.
462
+ const cdkDeployCmd = options.benchmarkInfra
463
+ ? 'npx cdk deploy MlccCiHarnessStack --require-approval never --no-rollback --parameters MlccCiHarnessStack:CreateBenchmarkInfra=true'
464
+ : 'npx cdk deploy MlccCiHarnessStack --require-approval never --no-rollback';
339
465
  execSync(
340
- 'npx cdk deploy MlccCiHarnessStack --require-approval never',
466
+ cdkDeployCmd,
341
467
  {
342
468
  cwd: ciHarnessDir,
343
469
  encoding: 'utf8',
344
470
  stdio: 'inherit',
345
471
  env: {
346
472
  ...process.env,
473
+ AWS_REGION: profileData.awsRegion,
347
474
  CDK_DEFAULT_REGION: profileData.awsRegion,
348
475
  CDK_DEFAULT_ACCOUNT: profileData.accountId,
349
476
  AWS_PROFILE: profileData.awsProfile
@@ -354,6 +481,11 @@ export default class BootstrapCommandHandler {
354
481
 
355
482
  profileData.ciInfraProvisioned = true;
356
483
  profileData.ciTableName = 'mlcc-ci-table';
484
+ if (options.benchmarkInfra) {
485
+ profileData.benchmarkInfraProvisioned = true;
486
+ profileData.ciGlueDatabase = 'mlcc_ci';
487
+ profileData.ciBenchmarkResultsBucket = `mlcc-benchmark-results-${profileData.accountId}-${profileData.awsRegion}`;
488
+ }
357
489
  }
358
490
  }
359
491
  } catch (error) {
@@ -372,241 +504,325 @@ export default class BootstrapCommandHandler {
372
504
  }
373
505
 
374
506
  /**
375
- * Display active bootstrap profile and resource state.
376
- * @param {object} [options] - Parsed CLI options (e.g., --verify)
507
+ * Re-deploy bootstrap infrastructure using the active profile.
508
+ * No prompts — reads all config from the existing profile and re-applies
509
+ * the CloudFormation stack and optionally the CI CDK stack.
510
+ *
511
+ * @param {object} [options] - Parsed CLI options (e.g., --ci to force CI update)
377
512
  */
378
- async _handleStatus(options = {}) {
379
- const config = this.config.read();
380
- if (!config) {
381
- console.log('No bootstrap configuration found.');
382
- console.log('Run `ml-container-creator bootstrap` to set up shared infrastructure.');
383
- return;
384
- }
385
-
513
+ async _handleUpdate(options = {}) {
386
514
  const profile = this.config.getActiveProfile();
387
515
  if (!profile) {
388
516
  console.log('No active bootstrap profile found.');
389
- console.log('Run `ml-container-creator bootstrap` to set up shared infrastructure.');
517
+ console.log('Run `ml-container-creator bootstrap` to set up shared infrastructure first.');
390
518
  return;
391
519
  }
392
520
 
393
- const allProfiles = this.config.listProfiles();
394
- console.log(`\n📋 Active Profile: ${profile.name} (${allProfiles.length} profile${allProfiles.length === 1 ? '' : 's'} total)`);
395
- console.log('─'.repeat(40));
521
+ const { name, config: profileConfig } = profile;
522
+ console.log(`\n🔄 Updating bootstrap infrastructure for profile "${name}"`);
523
+ console.log(` Region: ${profileConfig.awsRegion}`);
524
+ console.log(` Account: ${profileConfig.accountId}`);
396
525
 
397
- for (const [key, value] of Object.entries(profile.config)) {
398
- console.log(` ${key}: ${value}`);
526
+ // --- SANITY CHECK 1: Account identity ---
527
+ const callerAccount = this._getCallerAccount(profileConfig.awsProfile);
528
+ if (callerAccount !== profileConfig.accountId) {
529
+ console.log(`❌ Account mismatch: profile expects ${profileConfig.accountId} but credentials resolve to ${callerAccount}`);
530
+ return;
399
531
  }
400
532
 
401
- console.log('─'.repeat(40));
402
-
403
- // Validate bootstrap stack
404
- console.log('\n🔍 Resource Validation:');
405
-
406
- const stackName = profile.config.stackName || `${STACK_NAME_PREFIX}-${profile.name}`;
533
+ // Re-deploy the CloudFormation bootstrap stack
534
+ const stackName = profileConfig.stackName || `${STACK_NAME_PREFIX}-${name}`;
407
535
 
408
- try {
409
- const stackInfo = this._execAws(
410
- `cloudformation describe-stacks --stack-name ${stackName} --region ${profile.config.awsRegion}`,
411
- profile.config.awsProfile
412
- );
536
+ // Sanity check: stack name consistency (warn-and-continue)
537
+ const expectedStackName = `${STACK_NAME_PREFIX}-${name}`;
538
+ if (profileConfig.stackName && profileConfig.stackName !== expectedStackName) {
539
+ console.log(`⚠️ Stack name mismatch: expected "${expectedStackName}" but profile has "${profileConfig.stackName}"`);
540
+ console.log(' Run `ml-container-creator bootstrap migrate` to fix.');
541
+ console.log(' Proceeding with stored stack name...');
542
+ }
413
543
 
414
- const stack = stackInfo.Stacks && stackInfo.Stacks[0];
415
- if (stack) {
416
- const status = stack.StackStatus;
417
- const statusIcon = status === 'CREATE_COMPLETE' || status === 'UPDATE_COMPLETE' ? '✅' : '⚠️';
418
- console.log(` ${statusIcon} Bootstrap stack: ${stackName} (${status})`);
419
-
420
- // Show stack outputs
421
- const outputs = {};
422
- for (const output of (stack.Outputs || [])) {
423
- outputs[output.OutputKey] = output.OutputValue;
424
- }
544
+ // --- SANITY CHECK 3: Stack exists in target region ---
545
+ const stackExists = this._resourceExists(
546
+ `cloudformation describe-stacks --stack-name ${stackName} --region ${profileConfig.awsRegion}`,
547
+ profileConfig.awsProfile
548
+ );
549
+ if (!stackExists) {
550
+ console.log(`❌ Stack "${stackName}" not found in ${profileConfig.awsRegion}.`);
551
+ console.log(' Run `ml-container-creator bootstrap` to create it.');
552
+ return;
553
+ }
425
554
 
426
- if (outputs.RoleArn) {
427
- console.log(` ✅ IAM role: ${outputs.RoleArn.split('/').pop()}`);
428
- }
429
- if (outputs.EcrRepositoryName) {
430
- console.log(` ✅ ECR repository: ${outputs.EcrRepositoryName}`);
431
- }
432
- if (outputs.AsyncS3BucketName) {
433
- console.log(` ✅ S3 bucket (async): ${outputs.AsyncS3BucketName}`);
434
- }
435
- if (outputs.BatchS3BucketName) {
436
- console.log(` ✅ S3 bucket (batch): ${outputs.BatchS3BucketName}`);
437
- }
438
- if (outputs.AdapterS3BucketName) {
439
- console.log(` ✅ S3 bucket (adapters): ${outputs.AdapterS3BucketName}`);
440
- }
441
- if (outputs.BenchmarkS3BucketName) {
442
- console.log(` ✅ S3 bucket (benchmark): ${outputs.BenchmarkS3BucketName}`);
443
- }
444
- if (outputs.StackVersion) {
445
- console.log(` 📋 Stack version: ${outputs.StackVersion}`);
446
- }
555
+ // --- CI single-region enforcement ---
556
+ if (options.ci) {
557
+ const ciConflict = this._findExistingCiProfile(name);
558
+ if (ciConflict) {
559
+ console.log(`❌ CI infrastructure already deployed in region ${ciConflict.config.awsRegion} (profile: ${ciConflict.name}).`);
560
+ console.log(' CI can only be deployed in one region per account.');
561
+ return;
447
562
  }
448
- } catch {
449
- // Fall back to individual resource checks for profiles created before CloudFormation migration
450
- console.log(` ⚠️ Bootstrap stack "${stackName}" not found — checking resources individually`);
563
+ }
451
564
 
452
- try {
453
- const defaultRoleName = 'mlcc-sagemaker-execution-role';
454
- let roleName = defaultRoleName;
455
- if (profile.config.roleArn) {
456
- const arnParts = profile.config.roleArn.split('/');
457
- roleName = arnParts[arnParts.length - 1];
458
- }
565
+ this._displayProgress('☁️', 'Updating bootstrap stack...');
459
566
 
460
- const roleExists = this._resourceExists(
461
- `iam get-role --role-name ${roleName}`,
462
- profile.config.awsProfile
567
+ // Pre-check: if IAM role already exists globally (from another region's deployment),
568
+ // pass its ARN so CloudFormation skips re-creation (account-level singleton)
569
+ let useExistingRoleArn = profileConfig.roleArn || '';
570
+ if (!useExistingRoleArn) {
571
+ try {
572
+ const roleResult = this._execAws(
573
+ 'iam get-role --role-name mlcc-sagemaker-execution-role',
574
+ profileConfig.awsProfile
463
575
  );
464
- if (roleExists) {
465
- console.log(` ✅ IAM role: ${roleName}`);
466
- } else {
467
- console.log(` ⚠️ IAM role: ${roleName} — missing`);
576
+ const roleArn = roleResult && roleResult.Role && roleResult.Role.Arn;
577
+ if (roleArn && roleArn.startsWith('arn:aws:iam::')) {
578
+ useExistingRoleArn = roleArn;
468
579
  }
469
- } catch {
470
- console.log(' ⚠️ IAM role: could not validate');
580
+ } catch (_) {
581
+ // Role doesn't exist yet — will be created by the stack
471
582
  }
583
+ }
472
584
 
585
+ try {
586
+ // Check if ECR repo already exists (avoid ResourceExistenceCheck failure)
587
+ let skipEcr = 'false';
473
588
  try {
474
- const ecrExists = this._resourceExists(
475
- `ecr describe-repositories --repository-names ml-container-creator --region ${profile.config.awsRegion}`,
476
- profile.config.awsProfile
589
+ this._execAws(
590
+ `ecr describe-repositories --repository-names ml-container-creator --region ${profileConfig.awsRegion}`,
591
+ profileConfig.awsProfile
477
592
  );
478
- if (ecrExists) {
479
- console.log(' ECR repository: ml-container-creator');
480
- } else {
481
- console.log(' ⚠️ ECR repository: ml-container-creator — missing');
482
- }
483
- } catch {
484
- console.log(' ⚠️ ECR repository: could not validate');
485
- }
593
+ skipEcr = 'true';
594
+ } catch (_) { /* doesn't exist */ }
486
595
 
487
- if (profile.config.asyncS3Bucket) {
488
- try {
489
- const asyncExists = this._resourceExists(
490
- `s3api head-bucket --bucket ${profile.config.asyncS3Bucket}`,
491
- profile.config.awsProfile
492
- );
493
- console.log(asyncExists
494
- ? ` ✅ S3 bucket: ${profile.config.asyncS3Bucket}`
495
- : ` ⚠️ S3 bucket: ${profile.config.asyncS3Bucket} — missing`);
496
- } catch {
497
- console.log(` ⚠️ S3 bucket: ${profile.config.asyncS3Bucket} — could not validate`);
498
- }
499
- }
596
+ const stackOutputs = this._deployStack(stackName, {
597
+ CreateS3Buckets: (profileConfig.asyncS3Bucket || profileConfig.batchS3Bucket) ? 'true' : 'false',
598
+ UseExistingRoleArn: useExistingRoleArn,
599
+ SkipEcrCreation: skipEcr
600
+ }, profileConfig.awsProfile, profileConfig.awsRegion);
500
601
 
501
- if (profile.config.batchS3Bucket) {
502
- try {
503
- const batchExists = this._resourceExists(
504
- `s3api head-bucket --bucket ${profile.config.batchS3Bucket}`,
505
- profile.config.awsProfile
602
+ // Update profile with any new outputs
603
+ if (stackOutputs.RoleArn) profileConfig.roleArn = stackOutputs.RoleArn;
604
+ if (stackOutputs.EcrRepositoryName) profileConfig.ecrRepositoryName = stackOutputs.EcrRepositoryName;
605
+ if (stackOutputs.AsyncS3BucketName) profileConfig.asyncS3Bucket = stackOutputs.AsyncS3BucketName;
606
+ if (stackOutputs.BatchS3BucketName) profileConfig.batchS3Bucket = stackOutputs.BatchS3BucketName;
607
+ if (stackOutputs.BenchmarkS3BucketName) profileConfig.benchmarkS3Bucket = stackOutputs.BenchmarkS3BucketName;
608
+ profileConfig.stackName = stackName;
609
+
610
+ console.log(' ✅ Bootstrap stack updated');
611
+ } catch (error) {
612
+ console.log(` ❌ Stack update failed: ${error.message}`);
613
+ }
614
+
615
+ // Re-deploy CI stack if it was provisioned or --ci flag is set
616
+ const shouldUpdateCi = profileConfig.ciInfraProvisioned || options.ci;
617
+ if (shouldUpdateCi) {
618
+ this._displayProgress('🧪', 'Updating CI harness stack...');
619
+
620
+ try {
621
+ const ciHarnessDir = path.resolve(__dirname, '../../infra/ci-harness');
622
+
623
+ // CI harness source is not bundled in the npm package — only available from git clone
624
+ if (!existsSync(ciHarnessDir)) {
625
+ console.log(' ⏭️ CI harness source not available (npm install does not include infra/)');
626
+ console.log(' To update the CI stack, run from a git clone: git clone https://github.com/awslabs/ml-container-creator && cd ml-container-creator && npx cdk deploy -c region=REGION');
627
+ } else {
628
+ // Ensure dependencies are installed (handles cold starts / fresh clones)
629
+ execSync('npm install --silent', {
630
+ cwd: ciHarnessDir,
631
+ encoding: 'utf8',
632
+ stdio: ['pipe', 'pipe', 'pipe']
633
+ });
634
+
635
+ // --no-rollback prevents rollback on AlreadyExists errors for IAM roles
636
+ // that may pre-exist from a prior deployment or another region.
637
+ const updateCdkCmd = (options.benchmarkInfra || profileConfig.benchmarkInfraProvisioned)
638
+ ? 'npx cdk deploy MlccCiHarnessStack --require-approval never --no-rollback --parameters MlccCiHarnessStack:CreateBenchmarkInfra=true'
639
+ : 'npx cdk deploy MlccCiHarnessStack --require-approval never --no-rollback';
640
+ execSync(
641
+ updateCdkCmd,
642
+ {
643
+ cwd: ciHarnessDir,
644
+ encoding: 'utf8',
645
+ stdio: 'inherit',
646
+ env: {
647
+ ...process.env,
648
+ AWS_REGION: profileConfig.awsRegion,
649
+ CDK_DEFAULT_REGION: profileConfig.awsRegion,
650
+ CDK_DEFAULT_ACCOUNT: profileConfig.accountId,
651
+ AWS_PROFILE: profileConfig.awsProfile
652
+ }
653
+ }
506
654
  );
507
- console.log(batchExists
508
- ? ` ✅ S3 bucket: ${profile.config.batchS3Bucket}`
509
- : ` ⚠️ S3 bucket: ${profile.config.batchS3Bucket} — missing`);
510
- } catch {
511
- console.log(` ⚠️ S3 bucket: ${profile.config.batchS3Bucket} — could not validate`);
655
+ profileConfig.ciInfraProvisioned = true;
656
+ profileConfig.ciGlueDatabase = profileConfig.ciGlueDatabase || 'mlcc_ci';
657
+ profileConfig.ciBenchmarkResultsBucket = profileConfig.ciBenchmarkResultsBucket || `mlcc-benchmark-results-${profileConfig.accountId}-${profileConfig.awsRegion}`;
658
+ console.log(' ✅ CI harness stack updated');
512
659
  }
660
+ } catch (error) {
661
+ console.log(` ❌ CI stack update failed: ${error.message}`);
513
662
  }
663
+ } else {
664
+ console.log(' ⏭️ CI stack skipped (not provisioned — use --ci to force)');
665
+ }
514
666
 
515
- if (profile.config.benchmarkS3Bucket) {
516
- try {
517
- const benchmarkExists = this._resourceExists(
518
- `s3api head-bucket --bucket ${profile.config.benchmarkS3Bucket}`,
519
- profile.config.awsProfile
520
- );
521
- console.log(benchmarkExists
522
- ? ` ✅ S3 bucket (benchmark): ${profile.config.benchmarkS3Bucket}`
523
- : ` ⚠️ S3 bucket (benchmark): ${profile.config.benchmarkS3Bucket} — missing`);
524
- } catch {
525
- console.log(` ⚠️ S3 bucket (benchmark): ${profile.config.benchmarkS3Bucket} — could not validate`);
526
- }
667
+ // Ensure MLflow App exists
668
+ this._displayProgress('📊', 'MLflow App for experiment tracking...');
669
+ try {
670
+ const mlflowAppArn = this._ensureMlflowApp(profileConfig, profileConfig.awsProfile);
671
+ if (mlflowAppArn) {
672
+ profileConfig.mlflowAppArn = mlflowAppArn;
673
+ console.log(` ✅ MLflow App ready: ${mlflowAppArn}`);
527
674
  }
675
+ } catch (error) {
676
+ console.log(` ⚠️ MLflow App setup skipped: ${error.message}`);
528
677
  }
529
678
 
530
- // Display deployed resources from manifest
531
- console.log('\n📦 Deployed Resources:');
679
+ // Save updated profile
680
+ this.config.setProfile(name, profileConfig);
681
+ console.log(`\n✅ Update complete for profile "${name}"`);
532
682
 
533
- const assetManager = new AssetManager(profile.name);
683
+ // Re-run post-setup chain after updating AWS resources
684
+ await this._runPostSetupChain(options);
685
+ }
534
686
 
535
- if (!existsSync(assetManager.manifestPath)) {
536
- console.log(' No deployment tracking data available.');
537
- console.log(' Resources will be tracked after running deploy, push, or submit scripts.');
687
+ /**
688
+ * Migrate legacy profiles to current naming conventions.
689
+ * Corrects stackName mismatches and renames sharedStackFrom → sharedInfraFrom.
690
+ * Displays a preview of all changes and requires confirmation before writing.
691
+ */
692
+ async _handleMigrate() {
693
+ const config = this.config.read();
694
+ if (!config || !config.profiles) {
695
+ console.log('No profiles to migrate.');
538
696
  return;
539
697
  }
540
698
 
541
- const resourcesByProject = assetManager.getResourcesByProject();
699
+ const changes = [];
700
+
701
+ for (const [name, profileConfig] of Object.entries(config.profiles)) {
702
+ const expected = `${STACK_NAME_PREFIX}-${name}`;
703
+
704
+ // Fix stackName mismatch
705
+ if (profileConfig.stackName && profileConfig.stackName !== expected) {
706
+ changes.push({
707
+ profile: name,
708
+ field: 'stackName',
709
+ from: profileConfig.stackName,
710
+ to: expected
711
+ });
712
+ }
713
+
714
+ // Rename sharedStackFrom → sharedInfraFrom
715
+ if (profileConfig.sharedStackFrom) {
716
+ changes.push({
717
+ profile: name,
718
+ field: 'sharedStackFrom → sharedInfraFrom',
719
+ from: profileConfig.sharedStackFrom,
720
+ to: profileConfig.sharedStackFrom
721
+ });
722
+ }
723
+ }
542
724
 
543
- if (resourcesByProject.size === 0) {
544
- console.log(' No deployed resources tracked.');
725
+ if (changes.length === 0) {
726
+ console.log(' All profiles already use current naming conventions.');
545
727
  return;
546
728
  }
547
729
 
548
- for (const [project, resources] of resourcesByProject) {
549
- console.log(`\n Project: ${project}`);
550
- for (const resource of resources) {
551
- const timestamp = resource.createdAt || resource.lastUpdatedAt;
552
- console.log(` ${resource.resourceType} ${resource.resourceId} [${resource.status}] ${timestamp}`);
553
- }
730
+ // Display preview
731
+ console.log('📋 Migration Preview:\n');
732
+ for (const change of changes) {
733
+ console.log(` Profile "${change.profile}":`);
734
+ console.log(` ${change.field}: "${change.from}" → "${change.to}"`);
554
735
  }
555
736
 
556
- const counts = assetManager.getStatusCounts();
557
- console.log(`\n Summary: ${counts.active} active, ${counts.deleted} deleted, ${counts.unknown} unknown`);
737
+ // Prompt for confirmation
738
+ const { confirm } = await this._promptFn([{
739
+ type: 'confirm',
740
+ name: 'confirm',
741
+ message: 'Apply these changes?',
742
+ default: true
743
+ }]);
744
+
745
+ if (!confirm) return;
558
746
 
559
- // Drift detection if --verify flag is set
560
- if (options.verify) {
561
- await this._handleStatusVerify(profile, assetManager);
747
+ // Apply changes
748
+ for (const [name, profileConfig] of Object.entries(config.profiles)) {
749
+ const expected = `${STACK_NAME_PREFIX}-${name}`;
750
+ if (profileConfig.stackName !== expected) {
751
+ profileConfig.stackName = expected;
752
+ }
753
+ if (profileConfig.sharedStackFrom) {
754
+ profileConfig.sharedInfraFrom = profileConfig.sharedStackFrom;
755
+ delete profileConfig.sharedStackFrom;
756
+ }
562
757
  }
758
+
759
+ this.config.write(config);
760
+ console.log('✅ Migration complete.');
563
761
  }
564
762
 
565
763
  /**
566
- * Perform drift detection for active resources.
567
- * @param {object} profile - Active profile object with name and config
568
- * @param {AssetManager} assetManager - AssetManager instance for the profile
764
+ * Run the post-setup chain: mcp init → registry sync-architectures → sync-schemas.
765
+ * Each step is independent — failures are collected and reported at the end.
766
+ *
767
+ * @param {object} options - Parsed CLI options (checks skipPostSetup)
569
768
  */
570
- async _handleStatusVerify(profile, assetManager) {
571
- console.log('\n🔎 Drift Detection:');
572
-
573
- const activeResources = assetManager.listResources({ status: 'active' });
574
-
575
- if (activeResources.length === 0) {
576
- console.log(' No active resources to verify.');
769
+ async _runPostSetupChain(options = {}) {
770
+ if ((options['skip-post-setup'] || options.skipPostSetup)) {
771
+ console.log('\n⏭️ Skipping post-setup chain (--skip-post-setup)');
577
772
  return;
578
773
  }
579
774
 
580
- let verified = 0;
581
- let drifted = 0;
582
- let unchecked = 0;
775
+ console.log('\n🔗 Running post-setup configuration...\n');
583
776
 
584
- for (const resource of activeResources) {
585
- const checkCommand = this._buildDriftCheckCommand(resource);
777
+ const failures = [];
586
778
 
587
- if (!checkCommand) {
588
- unchecked++;
589
- continue;
590
- }
779
+ // 1. MCP init — register bundled MCP servers
780
+ console.log('📡 Registering MCP servers...');
781
+ try {
782
+ const generatorAdapter = {
783
+ destinationPath(...segments) {
784
+ return path.resolve(process.cwd(), ...segments);
785
+ }
786
+ };
787
+ const mcpHandler = new McpCommandHandler(generatorAdapter);
788
+ await mcpHandler.handle(['init'], {});
789
+ } catch (error) {
790
+ failures.push({ step: 'mcp init', error: error.message });
791
+ console.log(` ⚠️ mcp init failed: ${error.message}`);
792
+ }
591
793
 
592
- try {
593
- const exists = this._resourceExists(checkCommand, profile.config.awsProfile);
794
+ // 2. Registry sync-architectures — populate supportedModelTypes
795
+ console.log('\n📋 Syncing model architecture registry...');
796
+ try {
797
+ const registryHandler = new RegistryCommandHandler();
798
+ await registryHandler.handle(['sync-architectures'], {});
799
+ } catch (error) {
800
+ failures.push({ step: 'registry sync-architectures', error: error.message });
801
+ console.log(` ⚠️ registry sync-architectures failed: ${error.message}`);
802
+ }
594
803
 
595
- if (exists) {
596
- verified++;
597
- console.log(` ✅ ${resource.resourceType}: ${resource.resourceId}`);
598
- } else {
599
- drifted++;
600
- assetManager.updateStatus(resource.resourceId, 'unknown');
601
- console.log(` ⚠️ ${resource.resourceType}: ${resource.resourceId} — not found (status updated to unknown)`);
602
- }
603
- } catch {
604
- unchecked++;
605
- console.log(` ⚠️ ${resource.resourceType}: ${resource.resourceId} — could not verify (credentials or API unavailable)`);
606
- }
804
+ // 3. Schema sync — download AWS service models
805
+ console.log('\n📐 Syncing service schemas...');
806
+ try {
807
+ await this._handleSyncSchemas();
808
+ } catch (error) {
809
+ failures.push({ step: 'sync-schemas', error: error.message });
810
+ console.log(` ⚠️ sync-schemas failed: ${error.message}`);
607
811
  }
608
812
 
609
- console.log(`\n Drift Summary: ${verified} verified, ${drifted} drifted, ${unchecked} unchecked`);
813
+ // Report results
814
+ if (failures.length === 0) {
815
+ console.log('\n✅ Bootstrap complete — all systems operational');
816
+ } else {
817
+ console.log(`\n⚠️ Bootstrap complete with ${failures.length} warning${failures.length === 1 ? '' : 's'}:`);
818
+ for (const { step, error } of failures) {
819
+ console.log(` • ${step}: ${error}`);
820
+ }
821
+ console.log('\n These steps can be re-run individually:');
822
+ console.log(' ml-container-creator mcp init');
823
+ console.log(' ml-container-creator registry sync-architectures');
824
+ console.log(' ml-container-creator bootstrap sync-schemas');
825
+ }
610
826
  }
611
827
 
612
828
  /**
@@ -631,7 +847,6 @@ export default class BootstrapCommandHandler {
631
847
  return `sagemaker describe-inference-component --inference-component-name ${name}`;
632
848
  }
633
849
  case 'ecr-image': {
634
- // resourceId is a full image URI like 111111111111.dkr.ecr.us-east-1.amazonaws.com/repo:tag
635
850
  const parts = resourceId.split('/');
636
851
  const repoAndTag = parts[parts.length - 1];
637
852
  const [repo, tag] = repoAndTag.split(':');
@@ -652,573 +867,30 @@ export default class BootstrapCommandHandler {
652
867
 
653
868
  /**
654
869
  * Extract the resource name from an ARN.
655
- * ARN format: arn:aws:service:region:account:resource-type/resource-name
656
870
  * @param {string} arn - AWS ARN string
657
871
  * @returns {string} The resource name portion
658
872
  */
659
873
  _extractNameFromArn(arn) {
660
- // Handle ARN formats like:
661
- // arn:aws:sagemaker:us-east-1:111111111111:endpoint/my-endpoint
662
- // arn:aws:iam::111111111111:role/my-role
663
- // arn:aws:codebuild:us-east-1:111111111111:project/my-project
664
874
  const parts = arn.split('/');
665
875
  return parts[parts.length - 1];
666
876
  }
667
877
 
668
878
  /**
669
- * Switch the active bootstrap profile.
670
- * @param {string} profileName - Profile name to activate
879
+ * Infer the resource type from an ARN.
880
+ * @param {string} arn - AWS ARN
881
+ * @returns {string|null} Resource type or null if not recognized
671
882
  */
672
- async _handleUse(profileName) {
673
- if (!profileName) {
674
- console.log('Usage: ml-container-creator bootstrap use <profile>');
675
- console.log(' ml-container-creator bootstrap use none (deactivate)');
676
- return;
677
- }
678
-
679
- if (profileName === 'none') {
680
- this.config.setActiveProfile(null);
681
- console.log('Active profile cleared. No bootstrap profile is active.');
682
- return;
683
- }
684
-
685
- const profile = this.config.getProfile(profileName);
686
- if (!profile) {
687
- const available = this.config.listProfiles();
688
- console.log(`Profile "${profileName}" not found.`);
689
- if (available.length > 0) {
690
- console.log(`Available profiles: ${available.join(', ')}`);
691
- } else {
692
- console.log('No profiles configured. Run `ml-container-creator bootstrap` to create one.');
693
- }
694
- return;
695
- }
696
-
697
- this.config.setActiveProfile(profileName);
698
- console.log(`Switched active profile to "${profileName}".`);
699
- }
700
-
701
- /**
702
- * List all bootstrap profiles.
703
- */
704
- async _handleList() {
705
- const profiles = this.config.listProfiles();
706
-
707
- if (profiles.length === 0) {
708
- console.log('No bootstrap profiles configured.');
709
- console.log('Run `ml-container-creator bootstrap` to set up shared infrastructure.');
710
- return;
711
- }
712
-
713
- const config = this.config.read();
714
- const activeProfileName = config ? config.activeProfile : null;
715
-
716
- console.log('\nBootstrap Profiles:');
717
- for (const name of profiles) {
718
- if (name === activeProfileName) {
719
- console.log(` * ${name} (active)`);
720
- } else {
721
- console.log(` ${name}`);
722
- }
723
- }
724
- }
725
-
726
- /**
727
- * Remove a bootstrap profile.
728
- * @param {string} profileName - Profile name to remove
729
- * @param {object} options - Parsed CLI options (e.g., --force)
730
- */
731
- async _handleRemove(profileName, options) {
732
- if (!profileName) {
733
- console.log('Usage: ml-container-creator bootstrap remove <profile> [--force]');
734
- return;
735
- }
736
-
737
- const profile = this.config.getProfile(profileName);
738
- if (!profile) {
739
- console.log(`Profile "${profileName}" not found.`);
740
- return;
741
- }
742
-
743
- // Check for manifest file with active resources
744
- const assetManager = new AssetManager(profileName);
745
- const hasManifest = existsSync(assetManager.manifestPath);
746
-
747
- if (hasManifest) {
748
- const counts = assetManager.getStatusCounts();
749
- if (counts.active > 0 && !options.force) {
750
- console.log(`⚠️ Profile "${profileName}" has ${counts.active} active resource${counts.active === 1 ? '' : 's'} in the deployment manifest.`);
751
- }
752
- }
753
-
754
- // Check for CloudFormation stack
755
- const stackName = profile.stackName || `${STACK_NAME_PREFIX}-${profileName}`;
756
- let hasStack = false;
757
- try {
758
- hasStack = this._resourceExists(
759
- `cloudformation describe-stacks --stack-name ${stackName} --region ${profile.awsRegion}`,
760
- profile.awsProfile
761
- );
762
- } catch {
763
- // ignore
764
- }
765
-
766
- if (hasStack && !options.force) {
767
- console.log(`⚠️ Profile "${profileName}" has a CloudFormation stack: ${stackName}`);
768
- console.log(' Use --delete-stack to also delete the AWS resources, or --force to remove the profile only.');
769
- }
770
-
771
- if (!options.force) {
772
- const { confirm } = await this._promptFn([{
773
- type: 'confirm',
774
- name: 'confirm',
775
- message: `Remove bootstrap profile "${profileName}"?`,
776
- default: false
777
- }]);
778
-
779
- if (!confirm) {
780
- console.log('Removal cancelled.');
781
- return;
782
- }
783
- }
784
-
785
- // Delete CloudFormation stack if requested
786
- if (hasStack && options['delete-stack']) {
787
- try {
788
- console.log(`🗑️ Deleting CloudFormation stack: ${stackName}`);
789
- execSync(
790
- `aws cloudformation delete-stack --stack-name ${stackName} --region ${profile.awsRegion} --profile ${profile.awsProfile}`,
791
- { encoding: 'utf8', stdio: ['pipe', 'pipe', 'pipe'] }
792
- );
793
- console.log('⏳ Waiting for stack deletion...');
794
- execSync(
795
- `aws cloudformation wait stack-delete-complete --stack-name ${stackName} --region ${profile.awsRegion} --profile ${profile.awsProfile}`,
796
- { encoding: 'utf8', stdio: ['pipe', 'pipe', 'pipe'] }
797
- );
798
- console.log(`✅ Stack "${stackName}" deleted.`);
799
- } catch (err) {
800
- console.log(`⚠️ Could not delete stack "${stackName}": ${err.message}`);
801
- console.log(' You may need to delete it manually from the CloudFormation console.');
802
- }
803
- } else if (hasStack) {
804
- console.log(`Note: CloudFormation stack "${stackName}" was left in place.`);
805
- console.log(' To delete AWS resources, re-run with --delete-stack');
806
- }
807
-
808
- // Delete manifest file if it exists
809
- if (hasManifest) {
810
- try {
811
- unlinkSync(assetManager.manifestPath);
812
- console.log(`Manifest file for "${profileName}" deleted.`);
813
- } catch {
814
- console.log(`⚠️ Could not delete manifest file for "${profileName}".`);
815
- }
816
- }
817
-
818
- this.config.removeProfile(profileName);
819
- console.log(`Profile "${profileName}" removed.`);
820
- }
821
-
822
- /**
823
- * Scan AWS for pre-existing MLCC-managed resources and add them to the manifest.
824
- */
825
- async _handleScan() {
826
- const profile = this.config.getActiveProfile();
827
- if (!profile) {
828
- console.log('No active bootstrap profile found.');
829
- console.log('Run `ml-container-creator bootstrap` to set up shared infrastructure.');
830
- return;
831
- }
832
-
833
- console.log(`\n🔍 Scanning for pre-existing resources in ${profile.config.awsRegion}...`);
834
-
835
- const assetManager = new AssetManager(profile.name);
836
- const now = new Date().toISOString();
837
- let discovered = 0;
838
- let added = 0;
839
- let skipped = 0;
840
-
841
- // 1. Query Resource Groups Tagging API for mlcc:managed-by tagged resources
842
- try {
843
- console.log('\n Checking tagged resources...');
844
- const tagResult = this._execAws(
845
- `resourcegroupstaggingapi get-resources --tag-filters Key=mlcc:managed-by,Values=ml-container-creator --region ${profile.config.awsRegion}`,
846
- profile.config.awsProfile
847
- );
848
-
849
- const taggedResources = tagResult.ResourceTagMappingList || [];
850
- for (const tagged of taggedResources) {
851
- discovered++;
852
- const arn = tagged.ResourceARN;
853
- const existing = assetManager.getResource(arn);
854
- if (existing) {
855
- skipped++;
856
- continue;
857
- }
858
-
859
- const resourceType = this._inferResourceTypeFromArn(arn);
860
- if (!resourceType) {
861
- skipped++;
862
- continue;
863
- }
864
-
865
- const project = this._inferProjectFromTags(tagged.Tags) || 'unknown';
866
-
867
- try {
868
- assetManager.addResource({
869
- resourceId: arn,
870
- resourceType,
871
- createdAt: now,
872
- lastUpdatedAt: now,
873
- project,
874
- status: 'active',
875
- metadata: { discoveredBy: 'scan' }
876
- });
877
- added++;
878
- } catch {
879
- skipped++;
880
- }
881
- }
882
- } catch {
883
- console.log(' ⚠️ Could not query tagged resources (credentials or API unavailable)');
884
- }
885
-
886
- // 2. Query ECR for images in ml-container-creator repository
887
- try {
888
- console.log(' Checking ECR images...');
889
- const ecrResult = this._execAws(
890
- `ecr describe-images --repository-name ml-container-creator --region ${profile.config.awsRegion}`,
891
- profile.config.awsProfile
892
- );
893
-
894
- const images = ecrResult.imageDetails || [];
895
- for (const image of images) {
896
- const tags = image.imageTags || [];
897
- for (const tag of tags) {
898
- discovered++;
899
- const imageUri = `${profile.config.accountId}.dkr.ecr.${profile.config.awsRegion}.amazonaws.com/ml-container-creator:${tag}`;
900
- const existing = assetManager.getResource(imageUri);
901
- if (existing) {
902
- skipped++;
903
- continue;
904
- }
905
-
906
- try {
907
- assetManager.addResource({
908
- resourceId: imageUri,
909
- resourceType: 'ecr-image',
910
- createdAt: now,
911
- lastUpdatedAt: now,
912
- project: this._inferProjectFromImageTag(tag),
913
- status: 'active',
914
- metadata: {
915
- repositoryName: 'ml-container-creator',
916
- imageTag: tag,
917
- region: profile.config.awsRegion,
918
- discoveredBy: 'scan'
919
- }
920
- });
921
- added++;
922
- } catch {
923
- skipped++;
924
- }
925
- }
926
- }
927
- } catch {
928
- console.log(' ⚠️ Could not query ECR images (credentials or API unavailable)');
929
- }
930
-
931
- // 3. Query CodeBuild for *-build-* projects
932
- try {
933
- console.log(' Checking CodeBuild projects...');
934
- const cbResult = this._execAws(
935
- `codebuild list-projects --region ${profile.config.awsRegion}`,
936
- profile.config.awsProfile
937
- );
938
-
939
- const projects = (cbResult.projects || []).filter(name => name.includes('-build-'));
940
- for (const projectName of projects) {
941
- discovered++;
942
- const arn = `arn:aws:codebuild:${profile.config.awsRegion}:${profile.config.accountId}:project/${projectName}`;
943
- const existing = assetManager.getResource(arn);
944
- if (existing) {
945
- skipped++;
946
- continue;
947
- }
948
-
949
- try {
950
- assetManager.addResource({
951
- resourceId: arn,
952
- resourceType: 'codebuild-project',
953
- createdAt: now,
954
- lastUpdatedAt: now,
955
- project: this._inferProjectFromCodeBuildName(projectName),
956
- status: 'active',
957
- metadata: {
958
- projectName,
959
- region: profile.config.awsRegion,
960
- discoveredBy: 'scan'
961
- }
962
- });
963
- added++;
964
- } catch {
965
- skipped++;
966
- }
967
- }
968
- } catch {
969
- console.log(' ⚠️ Could not query CodeBuild projects (credentials or API unavailable)');
970
- }
971
-
972
- // Display summary
973
- console.log(`\n Scan complete: ${discovered} discovered, ${added} added, ${skipped} skipped (duplicates or unsupported)`);
974
-
975
- if (discovered === 0) {
976
- console.log(' No MLCC-managed resources were discovered.');
977
- }
978
- }
979
-
980
- /**
981
- * Prune stale records from the manifest — removes entries with status
982
- * 'deleted' or 'unknown' that are no longer useful.
983
- */
984
- async _handlePrune() {
985
- const profile = this.config.getActiveProfile();
986
- if (!profile) {
987
- console.log('No active bootstrap profile found.');
988
- return;
989
- }
990
-
991
- const assetManager = new AssetManager(profile.name);
992
-
993
- if (!existsSync(assetManager.manifestPath)) {
994
- console.log('No deployment tracking data to prune.');
995
- return;
996
- }
997
-
998
- const before = assetManager.listResources();
999
- const toRemove = before.filter(r => r.status === 'deleted' || r.status === 'unknown');
1000
-
1001
- if (toRemove.length === 0) {
1002
- console.log('Nothing to prune — no deleted or unknown records found.');
1003
- return;
1004
- }
1005
-
1006
- console.log(`\n🧹 Pruning ${toRemove.length} stale record${toRemove.length === 1 ? '' : 's'}:\n`);
1007
-
1008
- for (const resource of toRemove) {
1009
- assetManager.removeResource(resource.resourceId);
1010
- console.log(` 🗑️ [${resource.status}] ${resource.resourceType}: ${resource.resourceId}`);
1011
- }
1012
-
1013
- const after = assetManager.listResources();
1014
- console.log(`\n Done. ${toRemove.length} removed, ${after.length} remaining.`);
1015
- }
1016
-
1017
- /**
1018
- * Handle sync-schemas subcommand: download service models and verify AWS CLI.
1019
- */
1020
- async _handleSyncSchemas() {
1021
- console.log('\n📦 Schema Sync — Downloading AWS service models...\n');
1022
-
1023
- // Verify AWS CLI is installed
1024
- try {
1025
- const version = execSync('aws --version', { encoding: 'utf8', stdio: ['pipe', 'pipe', 'pipe'] }).trim();
1026
- console.log(` AWS CLI: ${version}`);
1027
- } catch {
1028
- console.log(' ⚠️ AWS CLI not found.');
1029
- console.log(' Install: https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html');
1030
- console.log(' Continuing without AWS CLI verification...\n');
1031
- }
1032
-
1033
- // Dynamic import to avoid circular dependencies
1034
- const { syncSchemas } = await import('./schema-sync.js');
1035
- const result = await syncSchemas();
1036
-
1037
- if (result.success) {
1038
- console.log('\n ✅ Schema sync complete.');
1039
- } else {
1040
- console.log('\n ⚠️ Schema sync completed with errors (some services may be unavailable).');
1041
- }
1042
-
1043
- console.log(` Manifest written: lastSynced = ${result.manifest.lastSynced}\n`);
1044
- }
1045
-
1046
- /**
1047
- * Re-deploy bootstrap infrastructure using the active profile.
1048
- * No prompts — reads all config from the existing profile and re-applies
1049
- * the CloudFormation stack and optionally the CI CDK stack.
1050
- *
1051
- * @param {object} [options] - Parsed CLI options (e.g., --ci to force CI update)
1052
- */
1053
- async _handleUpdate(options = {}) {
1054
- const profile = this.config.getActiveProfile();
1055
- if (!profile) {
1056
- console.log('No active bootstrap profile found.');
1057
- console.log('Run `ml-container-creator bootstrap` to set up shared infrastructure first.');
1058
- return;
1059
- }
1060
-
1061
- const { name, config: profileConfig } = profile;
1062
- console.log(`\n🔄 Updating bootstrap infrastructure for profile "${name}"`);
1063
- console.log(` Region: ${profileConfig.awsRegion}`);
1064
- console.log(` Account: ${profileConfig.accountId}`);
1065
-
1066
- // Re-deploy the CloudFormation bootstrap stack
1067
- const stackName = profileConfig.stackName || `${STACK_NAME_PREFIX}-${name}`;
1068
- this._displayProgress('☁️', 'Updating bootstrap stack...');
1069
-
1070
- try {
1071
- const stackOutputs = this._deployStack(stackName, {
1072
- CreateS3Buckets: (profileConfig.asyncS3Bucket || profileConfig.batchS3Bucket) ? 'true' : 'false',
1073
- UseExistingRoleArn: ''
1074
- }, profileConfig.awsProfile, profileConfig.awsRegion);
1075
-
1076
- // Update profile with any new outputs
1077
- if (stackOutputs.RoleArn) profileConfig.roleArn = stackOutputs.RoleArn;
1078
- if (stackOutputs.EcrRepositoryName) profileConfig.ecrRepositoryName = stackOutputs.EcrRepositoryName;
1079
- if (stackOutputs.AsyncS3BucketName) profileConfig.asyncS3Bucket = stackOutputs.AsyncS3BucketName;
1080
- if (stackOutputs.BatchS3BucketName) profileConfig.batchS3Bucket = stackOutputs.BatchS3BucketName;
1081
- if (stackOutputs.BenchmarkS3BucketName) profileConfig.benchmarkS3Bucket = stackOutputs.BenchmarkS3BucketName;
1082
- profileConfig.stackName = stackName;
1083
-
1084
- console.log(' ✅ Bootstrap stack updated');
1085
- } catch (error) {
1086
- console.log(` ❌ Stack update failed: ${error.message}`);
1087
- }
1088
-
1089
- // Re-deploy CI stack if it was provisioned or --ci flag is set
1090
- const shouldUpdateCi = profileConfig.ciInfraProvisioned || options.ci;
1091
- if (shouldUpdateCi) {
1092
- this._displayProgress('🧪', 'Updating CI harness stack...');
1093
-
1094
- try {
1095
- const ciHarnessDir = path.resolve(__dirname, '../../infra/ci-harness');
1096
-
1097
- // CI harness source is not bundled in the npm package — only available from git clone
1098
- if (!existsSync(ciHarnessDir)) {
1099
- console.log(' ⏭️ CI harness source not available (npm install does not include infra/)');
1100
- console.log(' To update the CI stack, run from a git clone: git clone https://github.com/awslabs/ml-container-creator && cd ml-container-creator && npx cdk deploy -c region=REGION');
1101
- } else {
1102
- // Ensure dependencies are installed (handles cold starts / fresh clones)
1103
- execSync('npm install --silent', {
1104
- cwd: ciHarnessDir,
1105
- encoding: 'utf8',
1106
- stdio: ['pipe', 'pipe', 'pipe']
1107
- });
1108
-
1109
- execSync(
1110
- 'npx cdk deploy MlccCiHarnessStack --require-approval never',
1111
- {
1112
- cwd: ciHarnessDir,
1113
- encoding: 'utf8',
1114
- stdio: 'inherit',
1115
- env: {
1116
- ...process.env,
1117
- CDK_DEFAULT_REGION: profileConfig.awsRegion,
1118
- CDK_DEFAULT_ACCOUNT: profileConfig.accountId,
1119
- AWS_PROFILE: profileConfig.awsProfile
1120
- }
1121
- }
1122
- );
1123
- profileConfig.ciInfraProvisioned = true;
1124
- console.log(' ✅ CI harness stack updated');
1125
- }
1126
- } catch (error) {
1127
- console.log(` ❌ CI stack update failed: ${error.message}`);
1128
- }
1129
- } else {
1130
- console.log(' ⏭️ CI stack skipped (not provisioned — use --ci to force)');
1131
- }
1132
-
1133
- // Save updated profile
1134
- this.config.setProfile(name, profileConfig);
1135
- console.log(`\n✅ Update complete for profile "${name}"`);
1136
-
1137
- // Re-run post-setup chain after updating AWS resources
1138
- await this._runPostSetupChain(options);
1139
- }
1140
-
1141
- /**
1142
- * Run the post-setup chain: mcp init → registry sync-architectures → sync-schemas.
1143
- * Each step is independent — failures are collected and reported at the end.
1144
- *
1145
- * @param {object} options - Parsed CLI options (checks skipPostSetup)
1146
- */
1147
- async _runPostSetupChain(options = {}) {
1148
- if (options['skip-post-setup']) {
1149
- console.log('\n⏭️ Skipping post-setup chain (--skip-post-setup)');
1150
- return;
1151
- }
1152
-
1153
- console.log('\n🔗 Running post-setup configuration...\n');
1154
-
1155
- const failures = [];
1156
-
1157
- // 1. MCP init — register bundled MCP servers
1158
- console.log('📡 Registering MCP servers...');
1159
- try {
1160
- const generatorAdapter = {
1161
- destinationPath(...segments) {
1162
- return path.resolve(process.cwd(), ...segments);
1163
- }
1164
- };
1165
- const mcpHandler = new McpCommandHandler(generatorAdapter);
1166
- await mcpHandler.handle(['init'], {});
1167
- } catch (error) {
1168
- failures.push({ step: 'mcp init', error: error.message });
1169
- console.log(` ⚠️ mcp init failed: ${error.message}`);
1170
- }
1171
-
1172
- // 2. Registry sync-architectures — populate supportedModelTypes
1173
- console.log('\n📋 Syncing model architecture registry...');
1174
- try {
1175
- const registryHandler = new RegistryCommandHandler();
1176
- await registryHandler.handle(['sync-architectures'], {});
1177
- } catch (error) {
1178
- failures.push({ step: 'registry sync-architectures', error: error.message });
1179
- console.log(` ⚠️ registry sync-architectures failed: ${error.message}`);
1180
- }
1181
-
1182
- // 3. Schema sync — download AWS service models
1183
- console.log('\n📐 Syncing service schemas...');
1184
- try {
1185
- await this._handleSyncSchemas();
1186
- } catch (error) {
1187
- failures.push({ step: 'sync-schemas', error: error.message });
1188
- console.log(` ⚠️ sync-schemas failed: ${error.message}`);
1189
- }
1190
-
1191
- // Report results
1192
- if (failures.length === 0) {
1193
- console.log('\n✅ Bootstrap complete — all systems operational');
1194
- } else {
1195
- console.log(`\n⚠️ Bootstrap complete with ${failures.length} warning${failures.length === 1 ? '' : 's'}:`);
1196
- for (const { step, error } of failures) {
1197
- console.log(` • ${step}: ${error}`);
1198
- }
1199
- console.log('\n These steps can be re-run individually:');
1200
- console.log(' ml-container-creator mcp init');
1201
- console.log(' ml-container-creator registry sync-architectures');
1202
- console.log(' ml-container-creator bootstrap sync-schemas');
1203
- }
1204
- }
1205
-
1206
- /**
1207
- * Infer the resource type from an ARN.
1208
- * @param {string} arn - AWS ARN
1209
- * @returns {string|null} Resource type or null if not recognized
1210
- */
1211
- _inferResourceTypeFromArn(arn) {
1212
- if (arn.includes(':endpoint/')) return 'sagemaker-endpoint';
1213
- if (arn.includes(':endpoint-config/')) return 'sagemaker-endpoint-config';
1214
- if (arn.includes(':model/')) return 'sagemaker-model';
1215
- if (arn.includes(':inference-component/')) return 'sagemaker-inference-component';
1216
- if (arn.includes(':transform-job/')) return 'sagemaker-transform-job';
1217
- if (arn.includes(':project/')) return 'codebuild-project';
1218
- if (arn.includes(':role/')) return 'iam-role';
1219
- if (arn.includes(':topic')) return 'sns-topic';
1220
- return null;
1221
- }
883
+ _inferResourceTypeFromArn(arn) {
884
+ if (arn.includes(':endpoint/')) return 'sagemaker-endpoint';
885
+ if (arn.includes(':endpoint-config/')) return 'sagemaker-endpoint-config';
886
+ if (arn.includes(':model/')) return 'sagemaker-model';
887
+ if (arn.includes(':inference-component/')) return 'sagemaker-inference-component';
888
+ if (arn.includes(':transform-job/')) return 'sagemaker-transform-job';
889
+ if (arn.includes(':project/')) return 'codebuild-project';
890
+ if (arn.includes(':role/')) return 'iam-role';
891
+ if (arn.includes(':topic')) return 'sns-topic';
892
+ return null;
893
+ }
1222
894
 
1223
895
  /**
1224
896
  * Infer the project name from resource tags.
@@ -1307,396 +979,9 @@ export default class BootstrapCommandHandler {
1307
979
  return { accountId, region };
1308
980
  }
1309
981
 
1310
- /**
1311
- * Create or reuse the SageMaker execution IAM role.
1312
- * @param {object} options - Parsed CLI options
1313
- * @returns {Promise<string>} Role ARN
1314
- */
1315
- async _setupIamRole(_options) {
1316
- const roleName = 'mlcc-sagemaker-execution-role';
1317
-
1318
- // Define trust policy for SageMaker
1319
- const trustPolicy = {
1320
- Version: '2012-10-17',
1321
- Statement: [
1322
- {
1323
- Effect: 'Allow',
1324
- Principal: {
1325
- Service: 'sagemaker.amazonaws.com'
1326
- },
1327
- Action: 'sts:AssumeRole'
1328
- }
1329
- ]
1330
- };
1331
-
1332
- // Define execution policy with least-privilege permissions
1333
- const executionPolicy = {
1334
- Version: '2012-10-17',
1335
- Statement: [
1336
- {
1337
- Sid: 'SageMakerEndpoints',
1338
- Effect: 'Allow',
1339
- Action: [
1340
- 'sagemaker:CreateEndpoint',
1341
- 'sagemaker:CreateEndpointConfig',
1342
- 'sagemaker:CreateModel',
1343
- 'sagemaker:CreateInferenceComponent',
1344
- 'sagemaker:UpdateEndpoint',
1345
- 'sagemaker:UpdateEndpointWeightsAndCapacities',
1346
- 'sagemaker:UpdateInferenceComponent',
1347
- 'sagemaker:DeleteEndpoint',
1348
- 'sagemaker:DeleteEndpointConfig',
1349
- 'sagemaker:DeleteModel',
1350
- 'sagemaker:DeleteInferenceComponent',
1351
- 'sagemaker:DescribeEndpoint',
1352
- 'sagemaker:DescribeEndpointConfig',
1353
- 'sagemaker:DescribeModel',
1354
- 'sagemaker:DescribeInferenceComponent',
1355
- 'sagemaker:ListInferenceComponents',
1356
- 'sagemaker:InvokeEndpoint',
1357
- 'sagemaker:InvokeEndpointAsync'
1358
- ],
1359
- Resource: '*'
1360
- },
1361
- {
1362
- Sid: 'SageMakerBenchmarking',
1363
- Effect: 'Allow',
1364
- Action: [
1365
- 'sagemaker:CreateAIBenchmarkJob',
1366
- 'sagemaker:DescribeAIBenchmarkJob',
1367
- 'sagemaker:ListAIBenchmarkJobs',
1368
- 'sagemaker:StopAIBenchmarkJob',
1369
- 'sagemaker:DeleteAIBenchmarkJob',
1370
- 'sagemaker:CreateAIWorkloadConfig',
1371
- 'sagemaker:DescribeAIWorkloadConfig',
1372
- 'sagemaker:ListAIWorkloadConfigs',
1373
- 'sagemaker:DeleteAIWorkloadConfig'
1374
- ],
1375
- Resource: '*'
1376
- },
1377
- {
1378
- Sid: 'ECRPull',
1379
- Effect: 'Allow',
1380
- Action: [
1381
- 'ecr:GetAuthorizationToken',
1382
- 'ecr:BatchCheckLayerAvailability',
1383
- 'ecr:GetDownloadUrlForLayer',
1384
- 'ecr:BatchGetImage'
1385
- ],
1386
- Resource: 'arn:aws:ecr:*:*:repository/ml-container-creator'
1387
- },
1388
- {
1389
- Sid: 'ECRAuth',
1390
- Effect: 'Allow',
1391
- Action: 'ecr:GetAuthorizationToken',
1392
- Resource: '*'
1393
- },
1394
- {
1395
- Sid: 'CloudWatchLogs',
1396
- Effect: 'Allow',
1397
- Action: [
1398
- 'logs:CreateLogGroup',
1399
- 'logs:CreateLogStream',
1400
- 'logs:PutLogEvents'
1401
- ],
1402
- Resource: 'arn:aws:logs:*:*:*'
1403
- },
1404
- {
1405
- Sid: 'S3ModelRead',
1406
- Effect: 'Allow',
1407
- Action: [
1408
- 's3:GetObject',
1409
- 's3:PutObject',
1410
- 's3:AbortMultipartUpload',
1411
- 's3:ListBucket'
1412
- ],
1413
- Resource: [
1414
- 'arn:aws:s3:::ml-container-creator-*',
1415
- 'arn:aws:s3:::ml-container-creator-*/*'
1416
- ]
1417
- },
1418
- {
1419
- Sid: 'SNSPublish',
1420
- Effect: 'Allow',
1421
- Action: 'sns:Publish',
1422
- Resource: 'arn:aws:sns:*:*:ml-container-creator-*'
1423
- },
1424
- {
1425
- Sid: 'SecretsManagerBenchmark',
1426
- Effect: 'Allow',
1427
- Action: [
1428
- 'secretsmanager:CreateSecret',
1429
- 'secretsmanager:PutSecretValue',
1430
- 'secretsmanager:GetSecretValue',
1431
- 'secretsmanager:DescribeSecret'
1432
- ],
1433
- Resource: 'arn:aws:secretsmanager:*:*:secret:ml-container-creator/*'
1434
- },
1435
- {
1436
- Sid: 'QuotaAndAvailability',
1437
- Effect: 'Allow',
1438
- Action: [
1439
- 'service-quotas:GetServiceQuota',
1440
- 'service-quotas:ListServiceQuotas',
1441
- 'sagemaker:ListTrainingPlans',
1442
- 'sagemaker:DescribeTrainingPlan',
1443
- 'sagemaker:ListEndpoints'
1444
- ],
1445
- Resource: '*'
1446
- }
1447
- ]
1448
- };
1449
-
1450
- // Check if role already exists
1451
- const roleExists = this._resourceExists(
1452
- `iam get-role --role-name ${roleName}`,
1453
- this._currentProfile
1454
- );
1455
-
1456
- if (roleExists) {
1457
- const existingRole = this._execAws(
1458
- `iam get-role --role-name ${roleName}`,
1459
- this._currentProfile
1460
- );
1461
- const roleArn = existingRole.Role.Arn;
1462
- console.log(` ✅ IAM role "${roleName}" already exists — reused`);
1463
-
1464
- // Always update the inline policy and tags to ensure they're current
1465
- try {
1466
- const execPolicyFile = this._writeJsonTempFile(executionPolicy, 'exec-policy');
1467
- this._execAws(
1468
- `iam put-role-policy --role-name ${roleName} --policy-name mlcc-execution-policy --policy-document ${execPolicyFile}`,
1469
- this._currentProfile
1470
- );
1471
- console.log(' ✅ IAM policy "mlcc-execution-policy" — updated');
1472
- } catch (err) {
1473
- console.log(` ⚠️ Could not update inline policy: ${err.message}`);
1474
- }
1475
-
1476
- try {
1477
- const tags = this._buildResourceTags();
1478
- this._execAws(
1479
- `iam tag-role --role-name ${roleName} --tags ${this._formatTagsForCli(tags)}`,
1480
- this._currentProfile
1481
- );
1482
- console.log(' ✅ IAM role tags — updated');
1483
- } catch (err) {
1484
- console.log(` ⚠️ Could not update role tags: ${err.message}`);
1485
- }
1486
-
1487
- return roleArn;
1488
- }
1489
-
1490
- // Display policies to user before creation
1491
- console.log('\n Trust Policy:');
1492
- console.log(JSON.stringify(trustPolicy, null, 2));
1493
- console.log('\n Execution Policy:');
1494
- console.log(JSON.stringify(executionPolicy, null, 2));
1495
- console.log('');
1496
-
1497
- try {
1498
- // Create the IAM role — write policy to temp file to avoid shell escaping issues
1499
- const trustPolicyFile = this._writeJsonTempFile(trustPolicy, 'trust-policy');
1500
- const createRoleResult = this._execAws(
1501
- `iam create-role --role-name ${roleName} --assume-role-policy-document ${trustPolicyFile}`,
1502
- this._currentProfile
1503
- );
1504
- const roleArn = createRoleResult.Role.Arn;
1505
-
1506
- // Attach inline execution policy
1507
- const execPolicyFile = this._writeJsonTempFile(executionPolicy, 'exec-policy');
1508
- this._execAws(
1509
- `iam put-role-policy --role-name ${roleName} --policy-name mlcc-execution-policy --policy-document ${execPolicyFile}`,
1510
- this._currentProfile
1511
- );
1512
-
1513
- // Apply resource tags
1514
- const tags = this._buildResourceTags();
1515
- this._execAws(
1516
- `iam tag-role --role-name ${roleName} --tags ${this._formatTagsForCli(tags)}`,
1517
- this._currentProfile
1518
- );
1519
-
1520
- console.log(` ✅ IAM role "${roleName}" — created`);
1521
- return roleArn;
1522
- } catch (error) {
1523
- const errorMessage = error.message || '';
1524
- if (errorMessage.includes('AccessDenied') || errorMessage.includes('UnauthorizedAccess')) {
1525
- console.log(' ⚠️ Permission denied for iam:CreateRole. Please provide an existing role ARN.');
1526
- const { roleArn } = await this._promptFn([{
1527
- type: 'input',
1528
- name: 'roleArn',
1529
- message: 'Enter an existing IAM role ARN for SageMaker execution:'
1530
- }]);
1531
- return roleArn;
1532
- }
1533
- throw error;
1534
- }
1535
- }
1536
-
1537
- /**
1538
- * Create or reuse the ECR repository.
1539
- * @returns {Promise<string>} ECR repository name
1540
- */
1541
- async _setupEcrRepository() {
1542
- const repoName = 'ml-container-creator';
1543
-
1544
- // Check if repository already exists
1545
- const repoExists = this._resourceExists(
1546
- `ecr describe-repositories --repository-names ${repoName} --region ${this._currentRegion}`,
1547
- this._currentProfile
1548
- );
1549
-
1550
- if (repoExists) {
1551
- console.log(` ✅ ECR repository "${repoName}" already exists — reused`);
1552
- return repoName;
1553
- }
1554
-
1555
- // Build resource tags
1556
- const tags = this._buildResourceTags();
1557
-
1558
- // Create the ECR repository with image scanning and AES256 encryption
1559
- this._execAws(
1560
- `ecr create-repository --repository-name ${repoName} --image-scanning-configuration scanOnPush=true --encryption-configuration encryptionType=AES256 --region ${this._currentRegion} --tags ${this._formatTagsForCli(tags)}`,
1561
- this._currentProfile
1562
- );
1563
-
1564
- // Apply lifecycle policy to expire untagged images after 30 days
1565
- const lifecyclePolicy = {
1566
- rules: [
1567
- {
1568
- rulePriority: 1,
1569
- description: 'Expire untagged images after 30 days',
1570
- selection: {
1571
- tagStatus: 'untagged',
1572
- countType: 'sinceImagePushed',
1573
- countUnit: 'days',
1574
- countNumber: 30
1575
- },
1576
- action: {
1577
- type: 'expire'
1578
- }
1579
- }
1580
- ]
1581
- };
1582
-
1583
- const lifecyclePolicyFile = this._writeJsonTempFile(lifecyclePolicy, 'ecr-lifecycle');
1584
- this._execAws(
1585
- `ecr put-lifecycle-policy --repository-name ${repoName} --lifecycle-policy-text ${lifecyclePolicyFile} --region ${this._currentRegion}`,
1586
- this._currentProfile
1587
- );
1588
-
1589
- console.log(` ✅ ECR repository "${repoName}" — created`);
1590
- return repoName;
1591
- }
1592
-
1593
- /**
1594
- * Optionally create S3 buckets for async/batch deployments.
1595
- * Always creates the benchmark S3 bucket (unconditional).
1596
- * @returns {Promise<object|null>} Bucket names or null if skipped
1597
- */
1598
- async _setupS3Buckets() {
1599
- // Always create benchmark bucket (unconditional — avoids re-bootstrap when benchmarking is enabled later)
1600
- const benchmarkBucketName = `ml-container-creator-benchmark-${this._currentRegion}-${this._currentAccountId}`;
1601
- const tags = this._buildResourceTags();
1602
- const benchmarkS3Bucket = await this._createS3Bucket(benchmarkBucketName, tags);
1603
-
1604
- const { useS3 } = await this._promptFn([{
1605
- type: 'confirm',
1606
- name: 'useS3',
1607
- message: 'Will you use async inference or batch transform?',
1608
- default: false
1609
- }]);
1610
-
1611
- if (!useS3) {
1612
- return { benchmarkS3Bucket };
1613
- }
1614
-
1615
- const asyncBucketName = `ml-container-creator-async-${this._currentRegion}-${this._currentAccountId}`;
1616
- const batchBucketName = `ml-container-creator-batch-${this._currentRegion}-${this._currentAccountId}`;
1617
-
1618
- const asyncS3Bucket = await this._createS3Bucket(asyncBucketName, tags);
1619
- const batchS3Bucket = await this._createS3Bucket(batchBucketName, tags);
1620
-
1621
- return { asyncS3Bucket, batchS3Bucket, benchmarkS3Bucket };
1622
- }
1623
-
1624
- /**
1625
- * Create or reuse a single S3 bucket with versioning, encryption, and tags.
1626
- * @param {string} bucketName - S3 bucket name
1627
- * @param {Array<{Key: string, Value: string}>} tags - Resource tags
1628
- * @returns {Promise<string>} Bucket name
1629
- */
1630
- async _createS3Bucket(bucketName, tags) {
1631
- // Check if bucket already exists
1632
- const bucketExists = this._resourceExists(
1633
- `s3api head-bucket --bucket ${bucketName}`,
1634
- this._currentProfile
1635
- );
1636
-
1637
- if (bucketExists) {
1638
- console.log(` ✅ S3 bucket "${bucketName}" already exists — reused`);
1639
- return bucketName;
1640
- }
1641
-
1642
- // Build create-bucket command with region-appropriate configuration
1643
- let createCommand = `s3api create-bucket --bucket ${bucketName} --region ${this._currentRegion}`;
1644
- if (this._currentRegion !== 'us-east-1') {
1645
- createCommand += ` --create-bucket-configuration LocationConstraint=${this._currentRegion}`;
1646
- }
1647
-
1648
- this._execAws(createCommand, this._currentProfile);
1649
-
1650
- // Enable versioning
1651
- this._execAws(
1652
- `s3api put-bucket-versioning --bucket ${bucketName} --versioning-configuration Status=Enabled`,
1653
- this._currentProfile
1654
- );
1655
-
1656
- // Enable AES256 server-side encryption
1657
- const encryptionConfig = { Rules: [{ ApplyServerSideEncryptionByDefault: { SSEAlgorithm: 'AES256' } }] };
1658
- const encryptionFile = this._writeJsonTempFile(encryptionConfig, 's3-encryption');
1659
- this._execAws(
1660
- `s3api put-bucket-encryption --bucket ${bucketName} --server-side-encryption-configuration ${encryptionFile}`,
1661
- this._currentProfile
1662
- );
1663
-
1664
- // Apply resource tags
1665
- const tagging = { TagSet: tags };
1666
- const taggingFile = this._writeJsonTempFile(tagging, 's3-tagging');
1667
- this._execAws(
1668
- `s3api put-bucket-tagging --bucket ${bucketName} --tagging ${taggingFile}`,
1669
- this._currentProfile
1670
- );
1671
-
1672
- console.log(` ✅ S3 bucket "${bucketName}" — created`);
1673
- return bucketName;
1674
- }
1675
982
 
1676
983
  // ── AWS CLI helpers ─────────────────────────────────────────────
1677
984
 
1678
- /**
1679
- * Verify AWS CLI v2 is installed. Returns true if v2 is detected, false otherwise.
1680
- * Extracted as a method so tests can override it.
1681
- * @returns {boolean}
1682
- */
1683
- _verifyCliV2() {
1684
- try {
1685
- const versionOutput = execSync('aws --version', { encoding: 'utf8', stdio: ['pipe', 'pipe', 'pipe'] }).trim();
1686
- if (!versionOutput.includes('aws-cli/2')) {
1687
- console.log(` ❌ AWS CLI v2 is required. Detected: ${versionOutput.split(' ')[0]}`);
1688
- console.log(' Install: https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html');
1689
- console.log(' Some features (benchmarking, newer SageMaker APIs) require CLI v2.\n');
1690
- return false;
1691
- }
1692
- return true;
1693
- } catch {
1694
- console.log(' ❌ AWS CLI not found.');
1695
- console.log(' Install: https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html\n');
1696
- return false;
1697
- }
1698
- }
1699
-
1700
985
  /**
1701
986
  * Execute an AWS CLI command and return parsed JSON output.
1702
987
  * @param {string} command - AWS CLI command (without 'aws' prefix)
@@ -1716,6 +1001,12 @@ export default class BootstrapCommandHandler {
1716
1001
  /**
1717
1002
  * Deploy the bootstrap CloudFormation stack and return its outputs.
1718
1003
  *
1004
+ * Before deploying, checks for pre-existing S3 buckets that would cause
1005
+ * ResourceExistenceCheck failures. If the stack is in REVIEW_IN_PROGRESS
1006
+ * state (empty shell from a failed prior attempt), deletes it first.
1007
+ * If buckets exist but aren't managed by the stack, uses a CloudFormation
1008
+ * import changeset to adopt them before proceeding with the normal deploy.
1009
+ *
1719
1010
  * Uses `aws cloudformation deploy` which is idempotent — it creates the
1720
1011
  * stack on first run and updates it on subsequent runs. If the template
1721
1012
  * hasn't changed, it exits with "No changes to deploy" which we handle
@@ -1728,6 +1019,9 @@ export default class BootstrapCommandHandler {
1728
1019
  * @returns {object} Map of output key → output value
1729
1020
  */
1730
1021
  _deployStack(stackName, parameters, profile, region) {
1022
+ // Handle ghost stacks and pre-existing resources
1023
+ this._resolveStackConflicts(stackName, parameters, profile, region);
1024
+
1731
1025
  // Build parameter overrides string
1732
1026
  const paramOverrides = Object.entries(parameters)
1733
1027
  .map(([key, value]) => `${key}=${value}`)
@@ -1751,6 +1045,32 @@ export default class BootstrapCommandHandler {
1751
1045
  const stderr = error.stderr || error.message || '';
1752
1046
  if (stderr.includes('No changes to deploy')) {
1753
1047
  console.log(' ℹ️ Stack is up to date — no changes needed');
1048
+ } else if (stderr.includes('ResourceExistenceCheck')) {
1049
+ // Resources already exist outside the stack — attempt import and retry
1050
+ console.log(' ⚠️ Pre-existing resources detected — attempting import...');
1051
+ this._resolveStackConflicts(stackName, parameters, profile, region);
1052
+ // Rebuild deploy command with updated parameters (e.g., CreateS3Buckets may now be 'false')
1053
+ const retryParamOverrides = Object.entries(parameters)
1054
+ .map(([key, value]) => `${key}=${value}`)
1055
+ .join(' ');
1056
+ const retryDeployCommand = [
1057
+ 'aws cloudformation deploy',
1058
+ `--template-file ${STACK_TEMPLATE_PATH}`,
1059
+ `--stack-name ${stackName}`,
1060
+ '--capabilities CAPABILITY_NAMED_IAM',
1061
+ `--parameter-overrides ${retryParamOverrides}`,
1062
+ `--profile ${profile}`,
1063
+ `--region ${region}`
1064
+ ].join(' ');
1065
+ // Retry the deploy after import
1066
+ try {
1067
+ execSync(retryDeployCommand, { encoding: 'utf8', stdio: ['pipe', 'pipe', 'pipe'] });
1068
+ } catch (retryError) {
1069
+ const retryStderr = retryError.stderr || retryError.message || '';
1070
+ if (!retryStderr.includes('No changes to deploy')) {
1071
+ throw retryError;
1072
+ }
1073
+ }
1754
1074
  } else {
1755
1075
  throw error;
1756
1076
  }
@@ -1772,9 +1092,144 @@ export default class BootstrapCommandHandler {
1772
1092
  outputs[output.OutputKey] = output.OutputValue;
1773
1093
  }
1774
1094
 
1095
+ // If S3 buckets already existed (skipped creation), inject their names
1096
+ // into outputs so the profile config gets populated correctly.
1097
+ if (this._preExistingBuckets && this._preExistingBuckets.length > 0) {
1098
+ const bucketOutputMap = {
1099
+ 'AsyncS3Bucket': 'AsyncS3BucketName',
1100
+ 'BatchS3Bucket': 'BatchS3BucketName',
1101
+ 'AdapterS3Bucket': 'AdapterS3BucketName',
1102
+ 'BenchmarkS3Bucket': 'BenchmarkS3BucketName',
1103
+ 'TuneS3Bucket': 'TuneS3BucketName'
1104
+ };
1105
+ for (const bucket of this._preExistingBuckets) {
1106
+ const outputKey = bucketOutputMap[bucket.logicalId];
1107
+ if (outputKey && !outputs[outputKey]) {
1108
+ outputs[outputKey] = bucket.name;
1109
+ }
1110
+ }
1111
+ this._preExistingBuckets = null;
1112
+ }
1113
+
1775
1114
  return outputs;
1776
1115
  }
1777
1116
 
1117
+ /**
1118
+ * Resolve stack conflicts before deploying.
1119
+ *
1120
+ * Handles two scenarios that cause ResourceExistenceCheck failures:
1121
+ * 1. Ghost stacks (REVIEW_IN_PROGRESS) — delete them first
1122
+ * 2. Pre-existing S3 buckets not managed by the stack — import them
1123
+ *
1124
+ * @param {string} stackName - CloudFormation stack name
1125
+ * @param {object} parameters - Stack parameter key-value pairs
1126
+ * @param {string} profile - AWS CLI profile name
1127
+ * @param {string} region - AWS region
1128
+ */
1129
+ _resolveStackConflicts(stackName, parameters, profile, region) {
1130
+ // Check if stack exists and its status
1131
+ let stackStatus = null;
1132
+ let managedResources = [];
1133
+
1134
+ try {
1135
+ const describeResult = this._execAws(
1136
+ `cloudformation describe-stacks --stack-name ${stackName} --region ${region}`,
1137
+ profile
1138
+ );
1139
+ const stack = describeResult.Stacks && describeResult.Stacks[0];
1140
+ if (stack) {
1141
+ stackStatus = stack.StackStatus;
1142
+ }
1143
+ } catch (_) {
1144
+ // Stack doesn't exist — no conflicts possible
1145
+ return;
1146
+ }
1147
+
1148
+ // Handle ghost stacks (created but never successfully deployed)
1149
+ if (stackStatus === 'REVIEW_IN_PROGRESS') {
1150
+ console.log(' ⚠️ Found ghost stack (REVIEW_IN_PROGRESS) — deleting before redeploy...');
1151
+ try {
1152
+ execSync(
1153
+ `aws cloudformation delete-stack --stack-name ${stackName} --profile ${profile} --region ${region}`,
1154
+ { encoding: 'utf8', stdio: ['pipe', 'pipe', 'pipe'] }
1155
+ );
1156
+ execSync(
1157
+ `aws cloudformation wait stack-delete-complete --stack-name ${stackName} --profile ${profile} --region ${region}`,
1158
+ { encoding: 'utf8', stdio: ['pipe', 'pipe', 'pipe'], timeout: 60000 }
1159
+ );
1160
+ console.log(' ✅ Ghost stack deleted');
1161
+ } catch (err) {
1162
+ console.log(` ⚠️ Could not delete ghost stack: ${err.message || err}`);
1163
+ }
1164
+ // Don't return — fall through to check for pre-existing S3 buckets
1165
+ // that need to be imported on the fresh deploy. The ghost stack had
1166
+ // DeletionPolicy:Retain buckets that survive stack deletion.
1167
+ stackStatus = null;
1168
+ managedResources = [];
1169
+ }
1170
+
1171
+ // For active stacks (or post-ghost-deletion), check if S3 buckets exist but aren't managed
1172
+ if (parameters.CreateS3Buckets !== 'true') {
1173
+ return; // Not creating buckets — no conflict
1174
+ }
1175
+
1176
+ // Get list of resources currently managed by the stack (empty if stack was just deleted)
1177
+ if (stackStatus) {
1178
+ try {
1179
+ const resources = this._execAws(
1180
+ `cloudformation list-stack-resources --stack-name ${stackName} --region ${region}`,
1181
+ profile
1182
+ );
1183
+ managedResources = (resources.StackResourceSummaries || [])
1184
+ .map(r => r.LogicalResourceId);
1185
+ } catch (_) {
1186
+ // Stack doesn't exist or can't be queried — proceed with empty managedResources
1187
+ }
1188
+ }
1189
+
1190
+ // Check each S3 bucket that the template would create
1191
+ const accountId = this._currentAccountId;
1192
+ const bucketConfigs = [
1193
+ { logicalId: 'AsyncS3Bucket', name: `mlcc-async-${accountId}-${region}` },
1194
+ { logicalId: 'BatchS3Bucket', name: `mlcc-batch-${accountId}-${region}` },
1195
+ { logicalId: 'AdapterS3Bucket', name: `mlcc-adapters-${accountId}-${region}` },
1196
+ { logicalId: 'BenchmarkS3Bucket', name: `mlcc-benchmark-${accountId}-${region}` },
1197
+ { logicalId: 'TuneS3Bucket', name: `mlcc-tune-${accountId}-${region}` }
1198
+ ];
1199
+
1200
+ const bucketsToImport = [];
1201
+
1202
+ for (const bucket of bucketConfigs) {
1203
+ if (managedResources.includes(bucket.logicalId)) {
1204
+ continue; // Already managed by the stack — no conflict
1205
+ }
1206
+ // Check if bucket exists in AWS
1207
+ try {
1208
+ execSync(
1209
+ `aws s3api head-bucket --bucket ${bucket.name} --profile ${profile} --region ${region}`,
1210
+ { encoding: 'utf8', stdio: ['pipe', 'pipe', 'pipe'] }
1211
+ );
1212
+ // Bucket exists but not in stack — needs import
1213
+ bucketsToImport.push(bucket);
1214
+ } catch (_) {
1215
+ // Bucket doesn't exist — will be created normally
1216
+ }
1217
+ }
1218
+
1219
+ if (bucketsToImport.length > 0) {
1220
+ console.log(` ℹ️ ${bucketsToImport.length} pre-existing S3 bucket(s) detected — skipping S3 creation (buckets already exist)`);
1221
+
1222
+ // Pre-existing S3 buckets survive stack deletion (DeletionPolicy: Retain).
1223
+ // Rather than fighting CloudFormation's IMPORT limitations, just skip S3
1224
+ // creation and wire the existing bucket names into the profile config directly.
1225
+ // The naming convention is deterministic, so we know exactly what they are.
1226
+ this._preExistingBuckets = bucketsToImport;
1227
+
1228
+ // Modify the parameters to skip S3 bucket creation in the deploy
1229
+ parameters.CreateS3Buckets = 'false';
1230
+ }
1231
+ }
1232
+
1778
1233
  /**
1779
1234
  * Write a JSON object to a temp file and return the `file://` path.
1780
1235
  * Used for passing complex JSON to AWS CLI commands without shell escaping issues.
@@ -1808,20 +1263,123 @@ export default class BootstrapCommandHandler {
1808
1263
  }
1809
1264
  }
1810
1265
 
1811
- // ── Tag helpers ─────────────────────────────────────────────────
1266
+ /**
1267
+ * Get the AWS account ID from the caller's credentials.
1268
+ * Uses `sts get-caller-identity` to resolve the actual account.
1269
+ *
1270
+ * @param {string} awsProfile - AWS CLI profile name
1271
+ * @returns {string} The 12-digit AWS account ID
1272
+ */
1273
+ _getCallerAccount(awsProfile) {
1274
+ const identity = this._execAws('sts get-caller-identity', awsProfile);
1275
+ return identity.Account;
1276
+ }
1277
+
1278
+ /**
1279
+ * Scan all profiles to find one with ciInfraProvisioned=true,
1280
+ * excluding the given profile name.
1281
+ *
1282
+ * @param {string} excludeProfile - Profile name to exclude from the scan
1283
+ * @returns {{ name: string, config: Object }|null} The CI profile, or null if none found
1284
+ */
1285
+ _findExistingCiProfile(excludeProfile) {
1286
+ const config = this.config.read();
1287
+ if (!config || !config.profiles) return null;
1288
+
1289
+ for (const [name, profileConfig] of Object.entries(config.profiles)) {
1290
+ if (name === excludeProfile) continue;
1291
+ if (profileConfig.ciInfraProvisioned) {
1292
+ return { name, config: profileConfig };
1293
+ }
1294
+ }
1295
+ return null;
1296
+ }
1812
1297
 
1813
1298
  /**
1814
- * Build the standard resource tag set.
1815
- * @returns {Array<{Key: string, Value: string}>} Tag array
1299
+ * Ensure an MLCC-owned MLflow App exists for experiment tracking.
1300
+ * Creates one if it doesn't exist, using the tune S3 bucket as artifact store.
1301
+ *
1302
+ * @param {object} profileData - Bootstrap profile data (needs roleArn, awsRegion, accountId)
1303
+ * @param {string} awsProfile - AWS CLI profile name
1304
+ * @returns {string|null} MLflow App ARN or null if creation failed
1816
1305
  */
1817
- _buildResourceTags() {
1818
- const packageJsonPath = path.resolve(__dirname, '../../package.json');
1819
- const packageJson = JSON.parse(readFileSync(packageJsonPath, 'utf8'));
1820
- return [
1821
- { Key: 'mlcc:managed-by', Value: 'ml-container-creator' },
1822
- { Key: 'mlcc:created-by', Value: 'bootstrap' },
1823
- { Key: 'mlcc:version', Value: packageJson.version }
1824
- ];
1306
+ _ensureMlflowApp(profileData, awsProfile) {
1307
+ const region = profileData.awsRegion;
1308
+ const accountId = profileData.accountId;
1309
+ const roleArn = profileData.roleArn;
1310
+ const appName = 'mlcc-tune-tracking';
1311
+ const artifactBucket = `mlcc-tune-${accountId}-${region}`;
1312
+
1313
+ // Check if MLCC app already exists
1314
+ try {
1315
+ const apps = this._execAws(
1316
+ `sagemaker list-mlflow-apps --region ${region}`,
1317
+ awsProfile
1318
+ );
1319
+ const summaries = apps.Summaries || [];
1320
+ const existing = summaries.find(a => a.Name === appName);
1321
+ if (existing) {
1322
+ return existing.Arn;
1323
+ }
1324
+ } catch {
1325
+ // list-mlflow-apps may not be available in all CLI versions — proceed to create
1326
+ }
1327
+
1328
+ // Create the MLflow App
1329
+ console.log(` Creating MLflow App "${appName}" with artifact store s3://${artifactBucket}...`);
1330
+
1331
+ // Ensure the artifact bucket exists (it's the tune bucket from the stack)
1332
+ try {
1333
+ this._execAws(
1334
+ `s3api head-bucket --bucket ${artifactBucket} --region ${region}`,
1335
+ awsProfile
1336
+ );
1337
+ } catch {
1338
+ // Bucket doesn't exist — create it
1339
+ console.log(` Creating artifact bucket: ${artifactBucket}`);
1340
+ try {
1341
+ this._execAws(
1342
+ `s3api create-bucket --bucket ${artifactBucket} --region ${region} --create-bucket-configuration LocationConstraint=${region}`,
1343
+ awsProfile
1344
+ );
1345
+ } catch (bucketErr) {
1346
+ // May already exist or region doesn't need LocationConstraint (us-east-1)
1347
+ if (!bucketErr.message?.includes('BucketAlreadyOwnedByYou')) {
1348
+ try {
1349
+ this._execAws(
1350
+ `s3api create-bucket --bucket ${artifactBucket} --region ${region}`,
1351
+ awsProfile
1352
+ );
1353
+ } catch {
1354
+ // Bucket likely exists, continue
1355
+ }
1356
+ }
1357
+ }
1358
+ }
1359
+
1360
+ // Create the app
1361
+ try {
1362
+ const result = this._execAws(
1363
+ `sagemaker create-mlflow-app --name ${appName} --artifact-store-uri s3://${artifactBucket} --role-arn ${roleArn} --model-registration-mode AutoModelRegistrationEnabled --region ${region}`,
1364
+ awsProfile
1365
+ );
1366
+ return result.Arn;
1367
+ } catch (err) {
1368
+ // If app already exists (race condition), try to describe it
1369
+ if (err.message?.includes('ResourceLimitExceeded') || err.message?.includes('already exists')) {
1370
+ try {
1371
+ const apps = this._execAws(
1372
+ `sagemaker list-mlflow-apps --region ${region}`,
1373
+ awsProfile
1374
+ );
1375
+ const found = (apps.Summaries || []).find(a => a.Name === appName);
1376
+ if (found) return found.Arn;
1377
+ } catch {
1378
+ // Fall through
1379
+ }
1380
+ }
1381
+ throw err;
1382
+ }
1825
1383
  }
1826
1384
 
1827
1385
  /**
@@ -1861,6 +1419,8 @@ SUBCOMMANDS:
1861
1419
  scan Discover pre-existing MLCC-managed resources in AWS
1862
1420
  prune Remove deleted and unknown records from the deployment manifest
1863
1421
  update Re-deploy bootstrap stacks using active profile (no prompts)
1422
+ migrate Upgrade legacy profiles to current naming conventions
1423
+ sync-model-families Discover tune-eligible models from JumpStart Hub and update catalog
1864
1424
 
1865
1425
  SETUP OPTIONS:
1866
1426
  --non-interactive Run without interactive prompts
@@ -1889,6 +1449,8 @@ EXAMPLES:
1889
1449
  ml-container-creator bootstrap remove dev
1890
1450
  ml-container-creator bootstrap remove dev --force --delete-stack
1891
1451
  ml-container-creator bootstrap scan
1452
+ ml-container-creator bootstrap sync-model-families
1453
+ ml-container-creator bootstrap migrate
1892
1454
  ml-container-creator bootstrap --non-interactive --profile my-aws-profile --region us-west-2
1893
1455
  ml-container-creator bootstrap --non-interactive --profile my-aws-profile --role-arn arn:aws:iam::123456789012:role/MyRole --skip-s3
1894
1456
  ml-container-creator bootstrap --non-interactive --profile my-aws-profile --region us-west-2 --ci