@aws/ml-container-creator 0.10.0 → 0.12.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE-THIRD-PARTY +9304 -0
- package/bin/cli.js +2 -0
- package/config/bootstrap-e2e-stack.json +341 -0
- package/config/bootstrap-stack.json +40 -3
- package/config/parameter-schema-v2.json +33 -22
- package/config/tune-catalog.json +1781 -0
- package/infra/ci-harness/buildspec.yml +1 -0
- package/infra/ci-harness/lambda/path-prover/brain.ts +306 -0
- package/infra/ci-harness/lambda/path-prover/write-results.ts +152 -0
- package/infra/ci-harness/lib/ci-harness-stack.ts +851 -7
- package/infra/ci-harness/state-machines/path-prover.asl.json +496 -0
- package/package.json +53 -67
- package/servers/base-image-picker/index.js +121 -121
- package/servers/e2e-status/index.js +297 -0
- package/servers/e2e-status/manifest.json +14 -0
- package/servers/e2e-status/package.json +15 -0
- package/servers/endpoint-picker/LICENSE +202 -0
- package/servers/endpoint-picker/index.js +536 -0
- package/servers/endpoint-picker/manifest.json +14 -0
- package/servers/endpoint-picker/package.json +18 -0
- package/servers/hyperpod-cluster-picker/index.js +125 -125
- package/servers/instance-sizer/index.js +166 -153
- package/servers/instance-sizer/lib/instance-ranker.js +120 -76
- package/servers/instance-sizer/lib/model-resolver.js +61 -61
- package/servers/instance-sizer/lib/quota-resolver.js +113 -113
- package/servers/instance-sizer/lib/vram-estimator.js +31 -31
- package/servers/lib/bedrock-client.js +38 -38
- package/servers/lib/catalogs/instances.json +27 -0
- package/servers/lib/catalogs/model-servers.json +201 -3
- package/servers/lib/custom-validators.js +13 -13
- package/servers/lib/dynamic-resolver.js +4 -4
- package/servers/marketplace-picker/index.js +342 -0
- package/servers/marketplace-picker/manifest.json +14 -0
- package/servers/marketplace-picker/package.json +18 -0
- package/servers/model-picker/index.js +382 -382
- package/servers/region-picker/index.js +56 -56
- package/servers/workload-picker/LICENSE +202 -0
- package/servers/workload-picker/catalogs/workload-profiles.json +67 -0
- package/servers/workload-picker/index.js +171 -0
- package/servers/workload-picker/manifest.json +16 -0
- package/servers/workload-picker/package.json +16 -0
- package/src/app.js +12 -3
- package/src/lib/bootstrap-command-handler.js +609 -15
- package/src/lib/bootstrap-config.js +36 -0
- package/src/lib/bootstrap-profile-manager.js +48 -41
- package/src/lib/ci-register-helpers.js +74 -0
- package/src/lib/config-loader.js +3 -0
- package/src/lib/config-manager.js +7 -0
- package/src/lib/config-validator.js +1 -1
- package/src/lib/cuda-resolver.js +17 -8
- package/src/lib/generated/cli-options.js +319 -314
- package/src/lib/generated/parameter-matrix.js +672 -661
- package/src/lib/generated/validation-rules.js +76 -72
- package/src/lib/path-prover-brain.js +664 -0
- package/src/lib/prompts/infrastructure-prompts.js +2 -2
- package/src/lib/prompts/model-prompts.js +6 -0
- package/src/lib/prompts/project-prompts.js +12 -0
- package/src/lib/secrets-prompt-runner.js +4 -0
- package/src/lib/template-manager.js +1 -1
- package/src/lib/template-variable-resolver.js +87 -1
- package/src/lib/tune-catalog-validator.js +37 -4
- package/templates/Dockerfile +9 -0
- package/templates/code/adapter_sidecar.py +444 -0
- package/templates/code/serve +6 -0
- package/templates/code/serve.d/vllm.ejs +1 -1
- package/templates/do/.benchmark_writer.py +1476 -0
- package/templates/do/.tune_helper.py +982 -57
- package/templates/do/__pycache__/.benchmark_writer.cpython-312.pyc +0 -0
- package/templates/do/adapter +154 -0
- package/templates/do/benchmark +639 -85
- package/templates/do/build +5 -0
- package/templates/do/clean.d/async-inference.ejs +5 -0
- package/templates/do/clean.d/batch-transform.ejs +5 -0
- package/templates/do/clean.d/hyperpod-eks.ejs +5 -0
- package/templates/do/clean.d/managed-inference.ejs +5 -0
- package/templates/do/config +115 -45
- package/templates/do/deploy.d/async-inference.ejs +30 -3
- package/templates/do/deploy.d/batch-transform.ejs +29 -3
- package/templates/do/deploy.d/hyperpod-eks.ejs +4 -0
- package/templates/do/deploy.d/managed-inference.ejs +216 -14
- package/templates/do/lib/endpoint-config.sh +1 -1
- package/templates/do/lib/profile.sh +44 -0
- package/templates/do/optimize +106 -37
- package/templates/do/push +5 -0
- package/templates/do/register +94 -0
- package/templates/do/stage +567 -0
- package/templates/do/submit +7 -0
- package/templates/do/test +14 -0
- package/templates/do/tune +382 -59
- package/templates/do/validate +44 -4
|
@@ -62,6 +62,7 @@ export default class BootstrapCommandHandler {
|
|
|
62
62
|
_handleScan() { return this.profileManager._handleScan(); }
|
|
63
63
|
_handlePrune() { return this.profileManager._handlePrune(); }
|
|
64
64
|
_handleSyncSchemas() { return this.profileManager._handleSyncSchemas(); }
|
|
65
|
+
_handleSyncModelFamilies() { return this.profileManager._handleSyncModelFamilies(); }
|
|
65
66
|
|
|
66
67
|
/**
|
|
67
68
|
* Dispatch bootstrap subcommands.
|
|
@@ -69,8 +70,28 @@ export default class BootstrapCommandHandler {
|
|
|
69
70
|
* @param {object} options - Parsed CLI options
|
|
70
71
|
*/
|
|
71
72
|
async handle(args, options) {
|
|
73
|
+
// Commander.js with passThroughOptions() captures flags after positional
|
|
74
|
+
// arguments in args rather than options. Extract known flags from args.
|
|
75
|
+
const extractedOptions = { ...options };
|
|
76
|
+
const cleanArgs = [];
|
|
77
|
+
for (const arg of args) {
|
|
78
|
+
if (arg === '--ci') extractedOptions.ci = true;
|
|
79
|
+
else if (arg === '--benchmark-infra') extractedOptions.benchmarkInfra = true;
|
|
80
|
+
else if (arg === '--skip-ci') extractedOptions.skipCi = true;
|
|
81
|
+
else if (arg === '--skip-s3') extractedOptions.skipS3 = true;
|
|
82
|
+
else if (arg === '--skip-post-setup') extractedOptions.skipPostSetup = true;
|
|
83
|
+
else if (arg === '--force') extractedOptions.force = true;
|
|
84
|
+
else if (arg === '--verify') extractedOptions.verify = true;
|
|
85
|
+
else if (arg === '--delete-stack') extractedOptions.deleteStack = true;
|
|
86
|
+
else if (arg === '--non-interactive') extractedOptions.nonInteractive = true;
|
|
87
|
+
else if (arg === '--ignore-staleness') extractedOptions.ignoreStaleness = true;
|
|
88
|
+
else cleanArgs.push(arg);
|
|
89
|
+
}
|
|
90
|
+
args = cleanArgs;
|
|
91
|
+
options = extractedOptions;
|
|
92
|
+
|
|
72
93
|
// Handle legacy --sync-schemas flag for backward compatibility
|
|
73
|
-
if (options['sync-schemas']) {
|
|
94
|
+
if ((options['sync-schemas'] || options.syncSchemas)) {
|
|
74
95
|
await this._handleSyncSchemas();
|
|
75
96
|
if (args.length === 0) return;
|
|
76
97
|
}
|
|
@@ -107,6 +128,15 @@ export default class BootstrapCommandHandler {
|
|
|
107
128
|
case 'sync-schemas':
|
|
108
129
|
await this._handleSyncSchemas();
|
|
109
130
|
break;
|
|
131
|
+
case 'sync-model-families':
|
|
132
|
+
await this._handleSyncModelFamilies();
|
|
133
|
+
break;
|
|
134
|
+
// Migration path: upgrades legacy profiles to current naming conventions.
|
|
135
|
+
// Corrects stackName to mlcc-bootstrap-{profileName}, renames sharedStackFrom
|
|
136
|
+
// to sharedInfraFrom. Idempotent — safe to run multiple times.
|
|
137
|
+
case 'migrate':
|
|
138
|
+
await this._handleMigrate();
|
|
139
|
+
break;
|
|
110
140
|
default:
|
|
111
141
|
console.log(`Unknown bootstrap subcommand: ${subcommand}`);
|
|
112
142
|
this._showHelp();
|
|
@@ -119,7 +149,8 @@ export default class BootstrapCommandHandler {
|
|
|
119
149
|
* @param {object} options - Parsed CLI options
|
|
120
150
|
*/
|
|
121
151
|
async _handleInteractiveSetup(options) {
|
|
122
|
-
|
|
152
|
+
// Commander.js converts --non-interactive to options.nonInteractive (camelCase)
|
|
153
|
+
const nonInteractive = options['non-interactive'] || options.nonInteractive;
|
|
123
154
|
|
|
124
155
|
// Non-interactive mode: validate required flags upfront
|
|
125
156
|
if (nonInteractive) {
|
|
@@ -180,13 +211,13 @@ export default class BootstrapCommandHandler {
|
|
|
180
211
|
|
|
181
212
|
// Step 3: Determine stack parameters
|
|
182
213
|
let useExistingRoleArn = '';
|
|
183
|
-
if (nonInteractive && options['role-arn']) {
|
|
184
|
-
useExistingRoleArn = options['role-arn'];
|
|
185
|
-
console.log(` Using provided IAM role ARN: ${options['role-arn']}`);
|
|
214
|
+
if (nonInteractive && (options['role-arn'] || options.roleArn)) {
|
|
215
|
+
useExistingRoleArn = (options['role-arn'] || options.roleArn);
|
|
216
|
+
console.log(` Using provided IAM role ARN: ${(options['role-arn'] || options.roleArn)}`);
|
|
186
217
|
}
|
|
187
218
|
|
|
188
219
|
let createS3Buckets = false;
|
|
189
|
-
if (nonInteractive && options['skip-s3']) {
|
|
220
|
+
if (nonInteractive && (options['skip-s3'] || options.skipS3)) {
|
|
190
221
|
console.log(' ⏭️ Skipping S3 bucket creation (--skip-s3)');
|
|
191
222
|
} else if (nonInteractive) {
|
|
192
223
|
createS3Buckets = true;
|
|
@@ -231,7 +262,8 @@ export default class BootstrapCommandHandler {
|
|
|
231
262
|
|
|
232
263
|
profileData.roleArn = stackOutputs.RoleArn;
|
|
233
264
|
profileData.ecrRepositoryName = stackOutputs.EcrRepositoryName;
|
|
234
|
-
profileData.stackName =
|
|
265
|
+
profileData.stackName = stackName;
|
|
266
|
+
profileData.sharedInfraFrom = otherStack; // Track that this profile reuses another's stack
|
|
235
267
|
if (stackOutputs.AsyncS3BucketName) profileData.asyncS3Bucket = stackOutputs.AsyncS3BucketName;
|
|
236
268
|
if (stackOutputs.BatchS3BucketName) profileData.batchS3Bucket = stackOutputs.BatchS3BucketName;
|
|
237
269
|
if (stackOutputs.AdapterS3BucketName) profileData.adapterS3Bucket = stackOutputs.AdapterS3BucketName;
|
|
@@ -245,15 +277,45 @@ export default class BootstrapCommandHandler {
|
|
|
245
277
|
}
|
|
246
278
|
|
|
247
279
|
if (!profileData.stackName) {
|
|
280
|
+
// Pre-check: if IAM role already exists globally (from another region's deployment),
|
|
281
|
+
// pass its ARN so CloudFormation skips re-creation (account-level singleton)
|
|
282
|
+
if (!useExistingRoleArn) {
|
|
283
|
+
try {
|
|
284
|
+
const roleResult = this._execAws(
|
|
285
|
+
'iam get-role --role-name mlcc-sagemaker-execution-role',
|
|
286
|
+
awsProfile
|
|
287
|
+
);
|
|
288
|
+
const roleArn = roleResult && roleResult.Role && roleResult.Role.Arn;
|
|
289
|
+
if (roleArn && roleArn.startsWith('arn:aws:iam::')) {
|
|
290
|
+
useExistingRoleArn = roleArn;
|
|
291
|
+
console.log(` ℹ️ Reusing existing IAM role: ${roleArn}`);
|
|
292
|
+
}
|
|
293
|
+
} catch (_) {
|
|
294
|
+
// Role doesn't exist yet — will be created by the stack
|
|
295
|
+
}
|
|
296
|
+
}
|
|
297
|
+
|
|
248
298
|
try {
|
|
299
|
+
// Check if ECR repo already exists (avoid ResourceExistenceCheck failure)
|
|
300
|
+
let skipEcr = 'false';
|
|
301
|
+
try {
|
|
302
|
+
this._execAws(
|
|
303
|
+
`ecr describe-repositories --repository-names ml-container-creator --region ${region}`,
|
|
304
|
+
awsProfile
|
|
305
|
+
);
|
|
306
|
+
skipEcr = 'true';
|
|
307
|
+
console.log(' ℹ️ ECR repository already exists — skipping creation');
|
|
308
|
+
} catch (_) { /* doesn't exist — will be created */ }
|
|
309
|
+
|
|
249
310
|
const stackOutputs = this._deployStack(stackName, {
|
|
250
311
|
CreateS3Buckets: createS3Buckets ? 'true' : 'false',
|
|
251
|
-
UseExistingRoleArn: useExistingRoleArn
|
|
312
|
+
UseExistingRoleArn: useExistingRoleArn,
|
|
313
|
+
SkipEcrCreation: skipEcr
|
|
252
314
|
}, awsProfile, region);
|
|
253
315
|
|
|
254
316
|
// Read outputs into profile data
|
|
255
317
|
profileData.roleArn = stackOutputs.RoleArn;
|
|
256
|
-
profileData.ecrRepositoryName = stackOutputs.EcrRepositoryName;
|
|
318
|
+
profileData.ecrRepositoryName = stackOutputs.EcrRepositoryName || 'ml-container-creator';
|
|
257
319
|
profileData.stackName = stackName;
|
|
258
320
|
|
|
259
321
|
if (stackOutputs.AsyncS3BucketName) {
|
|
@@ -278,6 +340,23 @@ export default class BootstrapCommandHandler {
|
|
|
278
340
|
}
|
|
279
341
|
} // end if (!profileData.stackName)
|
|
280
342
|
|
|
343
|
+
// Step 4b: MLflow App for model customization experiment tracking
|
|
344
|
+
this._displayProgress('📊', 'MLflow App for experiment tracking...');
|
|
345
|
+
try {
|
|
346
|
+
if (!profileData.mlflowAppArn) {
|
|
347
|
+
const mlflowAppArn = this._ensureMlflowApp(profileData, awsProfile);
|
|
348
|
+
if (mlflowAppArn) {
|
|
349
|
+
profileData.mlflowAppArn = mlflowAppArn;
|
|
350
|
+
console.log(` ✅ MLflow App ready: ${mlflowAppArn}`);
|
|
351
|
+
}
|
|
352
|
+
} else {
|
|
353
|
+
console.log(` ✅ MLflow App already configured: ${profileData.mlflowAppArn}`);
|
|
354
|
+
}
|
|
355
|
+
} catch (error) {
|
|
356
|
+
console.log(` ⚠️ MLflow App setup skipped: ${error.message}`);
|
|
357
|
+
console.log(' Tune jobs will still work but experiment tracking may not be available.');
|
|
358
|
+
}
|
|
359
|
+
|
|
281
360
|
// Step 5: CI Infrastructure setup (separate CDK stack — unchanged)
|
|
282
361
|
this._displayProgress('🧪', 'CI Testing Infrastructure...');
|
|
283
362
|
try {
|
|
@@ -286,7 +365,7 @@ export default class BootstrapCommandHandler {
|
|
|
286
365
|
if (nonInteractive) {
|
|
287
366
|
if (options.ci) {
|
|
288
367
|
provisionCi = true;
|
|
289
|
-
} else if (options['skip-ci']) {
|
|
368
|
+
} else if ((options['skip-ci'] || options.skipCi)) {
|
|
290
369
|
console.log(' ⏭️ Skipping CI infrastructure (--skip-ci)');
|
|
291
370
|
provisionCi = false;
|
|
292
371
|
} else {
|
|
@@ -303,6 +382,21 @@ export default class BootstrapCommandHandler {
|
|
|
303
382
|
}
|
|
304
383
|
|
|
305
384
|
if (provisionCi) {
|
|
385
|
+
// --- CI single-region enforcement ---
|
|
386
|
+
const ciConflict = this._findExistingCiProfile(profileName);
|
|
387
|
+
if (ciConflict) {
|
|
388
|
+
console.log(`❌ CI infrastructure already deployed in region ${ciConflict.config.awsRegion} (profile: ${ciConflict.name}).`);
|
|
389
|
+
console.log(' CI can only be deployed in one region per account.');
|
|
390
|
+
provisionCi = false;
|
|
391
|
+
}
|
|
392
|
+
}
|
|
393
|
+
|
|
394
|
+
if (provisionCi) {
|
|
395
|
+
// Persist CI intent immediately so that `bootstrap update --ci` can
|
|
396
|
+
// retry if the CDK deploy fails. Don't wait for success.
|
|
397
|
+
profileData.ciInfraProvisioned = true;
|
|
398
|
+
profileData.ciTableName = profileData.ciTableName || 'mlcc-ci-table';
|
|
399
|
+
|
|
306
400
|
// Ensure CDK is bootstrapped in this account/region
|
|
307
401
|
const cdkBootstrapped = this._resourceExists(
|
|
308
402
|
`ssm get-parameter --name /cdk-bootstrap/hnb659fds/version --region ${profileData.awsRegion}`,
|
|
@@ -358,14 +452,39 @@ export default class BootstrapCommandHandler {
|
|
|
358
452
|
stdio: ['pipe', 'pipe', 'pipe']
|
|
359
453
|
});
|
|
360
454
|
|
|
455
|
+
// Warn if shell AWS_REGION differs from profile region
|
|
456
|
+
if (process.env.AWS_REGION && process.env.AWS_REGION !== profileData.awsRegion) {
|
|
457
|
+
console.log(` ⚠️ AWS_REGION env var (${process.env.AWS_REGION}) differs from profile region (${profileData.awsRegion}) — using profile region`);
|
|
458
|
+
}
|
|
459
|
+
|
|
460
|
+
// --no-rollback prevents rollback on AlreadyExists errors for IAM roles
|
|
461
|
+
// that may pre-exist from a prior deployment or another region.
|
|
462
|
+
// Check if benchmark bucket already exists (from a prior torn-down stack with RETAIN policy)
|
|
463
|
+
let importBucketCtx = '';
|
|
464
|
+
if (options.benchmarkInfra) {
|
|
465
|
+
try {
|
|
466
|
+
execSync(
|
|
467
|
+
`aws s3api head-bucket --bucket mlcc-benchmark-results-${profileData.accountId}-${profileData.awsRegion}${profileData.awsProfile ? ` --profile ${profileData.awsProfile}` : ''} --region ${profileData.awsRegion}`,
|
|
468
|
+
{ encoding: 'utf8', stdio: ['pipe', 'pipe', 'pipe'] }
|
|
469
|
+
);
|
|
470
|
+
importBucketCtx = ' -c importExistingBenchmarkBucket=true';
|
|
471
|
+
console.log(' ℹ️ Benchmark results bucket already exists — importing into stack');
|
|
472
|
+
} catch {
|
|
473
|
+
// Bucket doesn't exist — will be created fresh
|
|
474
|
+
}
|
|
475
|
+
}
|
|
476
|
+
const cdkDeployCmd = options.benchmarkInfra
|
|
477
|
+
? `npx cdk deploy MlccCiHarnessStack --require-approval never --no-rollback --parameters MlccCiHarnessStack:CreateBenchmarkInfra=true${importBucketCtx}`
|
|
478
|
+
: 'npx cdk deploy MlccCiHarnessStack --require-approval never --no-rollback';
|
|
361
479
|
execSync(
|
|
362
|
-
|
|
480
|
+
cdkDeployCmd,
|
|
363
481
|
{
|
|
364
482
|
cwd: ciHarnessDir,
|
|
365
483
|
encoding: 'utf8',
|
|
366
484
|
stdio: 'inherit',
|
|
367
485
|
env: {
|
|
368
486
|
...process.env,
|
|
487
|
+
AWS_REGION: profileData.awsRegion,
|
|
369
488
|
CDK_DEFAULT_REGION: profileData.awsRegion,
|
|
370
489
|
CDK_DEFAULT_ACCOUNT: profileData.accountId,
|
|
371
490
|
AWS_PROFILE: profileData.awsProfile
|
|
@@ -376,6 +495,11 @@ export default class BootstrapCommandHandler {
|
|
|
376
495
|
|
|
377
496
|
profileData.ciInfraProvisioned = true;
|
|
378
497
|
profileData.ciTableName = 'mlcc-ci-table';
|
|
498
|
+
if (options.benchmarkInfra) {
|
|
499
|
+
profileData.benchmarkInfraProvisioned = true;
|
|
500
|
+
profileData.ciGlueDatabase = 'mlcc_ci';
|
|
501
|
+
profileData.ciBenchmarkResultsBucket = `mlcc-benchmark-results-${profileData.accountId}-${profileData.awsRegion}`;
|
|
502
|
+
}
|
|
379
503
|
}
|
|
380
504
|
}
|
|
381
505
|
} catch (error) {
|
|
@@ -413,14 +537,80 @@ export default class BootstrapCommandHandler {
|
|
|
413
537
|
console.log(` Region: ${profileConfig.awsRegion}`);
|
|
414
538
|
console.log(` Account: ${profileConfig.accountId}`);
|
|
415
539
|
|
|
540
|
+
// --- SANITY CHECK 1: Account identity ---
|
|
541
|
+
const callerAccount = this._getCallerAccount(profileConfig.awsProfile);
|
|
542
|
+
if (callerAccount !== profileConfig.accountId) {
|
|
543
|
+
console.log(`❌ Account mismatch: profile expects ${profileConfig.accountId} but credentials resolve to ${callerAccount}`);
|
|
544
|
+
return;
|
|
545
|
+
}
|
|
546
|
+
|
|
416
547
|
// Re-deploy the CloudFormation bootstrap stack
|
|
417
548
|
const stackName = profileConfig.stackName || `${STACK_NAME_PREFIX}-${name}`;
|
|
549
|
+
|
|
550
|
+
// Sanity check: stack name consistency (warn-and-continue)
|
|
551
|
+
const expectedStackName = `${STACK_NAME_PREFIX}-${name}`;
|
|
552
|
+
if (profileConfig.stackName && profileConfig.stackName !== expectedStackName) {
|
|
553
|
+
console.log(`⚠️ Stack name mismatch: expected "${expectedStackName}" but profile has "${profileConfig.stackName}"`);
|
|
554
|
+
console.log(' Run `ml-container-creator bootstrap migrate` to fix.');
|
|
555
|
+
console.log(' Proceeding with stored stack name...');
|
|
556
|
+
}
|
|
557
|
+
|
|
558
|
+
// --- SANITY CHECK 3: Stack exists in target region ---
|
|
559
|
+
const stackExists = this._resourceExists(
|
|
560
|
+
`cloudformation describe-stacks --stack-name ${stackName} --region ${profileConfig.awsRegion}`,
|
|
561
|
+
profileConfig.awsProfile
|
|
562
|
+
);
|
|
563
|
+
if (!stackExists) {
|
|
564
|
+
console.log(`❌ Stack "${stackName}" not found in ${profileConfig.awsRegion}.`);
|
|
565
|
+
console.log(' Run `ml-container-creator bootstrap` to create it.');
|
|
566
|
+
return;
|
|
567
|
+
}
|
|
568
|
+
|
|
569
|
+
// --- CI single-region enforcement ---
|
|
570
|
+
if (options.ci) {
|
|
571
|
+
const ciConflict = this._findExistingCiProfile(name);
|
|
572
|
+
if (ciConflict) {
|
|
573
|
+
console.log(`❌ CI infrastructure already deployed in region ${ciConflict.config.awsRegion} (profile: ${ciConflict.name}).`);
|
|
574
|
+
console.log(' CI can only be deployed in one region per account.');
|
|
575
|
+
return;
|
|
576
|
+
}
|
|
577
|
+
}
|
|
578
|
+
|
|
418
579
|
this._displayProgress('☁️', 'Updating bootstrap stack...');
|
|
419
580
|
|
|
581
|
+
// Pre-check: if IAM role already exists globally (from another region's deployment),
|
|
582
|
+
// pass its ARN so CloudFormation skips re-creation (account-level singleton)
|
|
583
|
+
let useExistingRoleArn = profileConfig.roleArn || '';
|
|
584
|
+
if (!useExistingRoleArn) {
|
|
585
|
+
try {
|
|
586
|
+
const roleResult = this._execAws(
|
|
587
|
+
'iam get-role --role-name mlcc-sagemaker-execution-role',
|
|
588
|
+
profileConfig.awsProfile
|
|
589
|
+
);
|
|
590
|
+
const roleArn = roleResult && roleResult.Role && roleResult.Role.Arn;
|
|
591
|
+
if (roleArn && roleArn.startsWith('arn:aws:iam::')) {
|
|
592
|
+
useExistingRoleArn = roleArn;
|
|
593
|
+
}
|
|
594
|
+
} catch (_) {
|
|
595
|
+
// Role doesn't exist yet — will be created by the stack
|
|
596
|
+
}
|
|
597
|
+
}
|
|
598
|
+
|
|
420
599
|
try {
|
|
600
|
+
// Check if ECR repo already exists (avoid ResourceExistenceCheck failure)
|
|
601
|
+
let skipEcr = 'false';
|
|
602
|
+
try {
|
|
603
|
+
this._execAws(
|
|
604
|
+
`ecr describe-repositories --repository-names ml-container-creator --region ${profileConfig.awsRegion}`,
|
|
605
|
+
profileConfig.awsProfile
|
|
606
|
+
);
|
|
607
|
+
skipEcr = 'true';
|
|
608
|
+
} catch (_) { /* doesn't exist */ }
|
|
609
|
+
|
|
421
610
|
const stackOutputs = this._deployStack(stackName, {
|
|
422
611
|
CreateS3Buckets: (profileConfig.asyncS3Bucket || profileConfig.batchS3Bucket) ? 'true' : 'false',
|
|
423
|
-
UseExistingRoleArn:
|
|
612
|
+
UseExistingRoleArn: useExistingRoleArn,
|
|
613
|
+
SkipEcrCreation: skipEcr
|
|
424
614
|
}, profileConfig.awsProfile, profileConfig.awsRegion);
|
|
425
615
|
|
|
426
616
|
// Update profile with any new outputs
|
|
@@ -456,14 +646,34 @@ export default class BootstrapCommandHandler {
|
|
|
456
646
|
stdio: ['pipe', 'pipe', 'pipe']
|
|
457
647
|
});
|
|
458
648
|
|
|
649
|
+
// --no-rollback prevents rollback on AlreadyExists errors for IAM roles
|
|
650
|
+
// that may pre-exist from a prior deployment or another region.
|
|
651
|
+
// Check if benchmark bucket already exists (from a prior torn-down stack with RETAIN policy)
|
|
652
|
+
let updateImportBucketCtx = '';
|
|
653
|
+
if (options.benchmarkInfra || profileConfig.benchmarkInfraProvisioned) {
|
|
654
|
+
try {
|
|
655
|
+
execSync(
|
|
656
|
+
`aws s3api head-bucket --bucket mlcc-benchmark-results-${profileConfig.accountId}-${profileConfig.awsRegion}${profileConfig.awsProfile ? ` --profile ${profileConfig.awsProfile}` : ''} --region ${profileConfig.awsRegion}`,
|
|
657
|
+
{ encoding: 'utf8', stdio: ['pipe', 'pipe', 'pipe'] }
|
|
658
|
+
);
|
|
659
|
+
updateImportBucketCtx = ' -c importExistingBenchmarkBucket=true';
|
|
660
|
+
console.log(' ℹ️ Benchmark results bucket already exists — importing into stack');
|
|
661
|
+
} catch {
|
|
662
|
+
// Bucket doesn't exist — will be created fresh
|
|
663
|
+
}
|
|
664
|
+
}
|
|
665
|
+
const updateCdkCmd = (options.benchmarkInfra || profileConfig.benchmarkInfraProvisioned)
|
|
666
|
+
? `npx cdk deploy MlccCiHarnessStack --require-approval never --no-rollback --parameters MlccCiHarnessStack:CreateBenchmarkInfra=true${updateImportBucketCtx}`
|
|
667
|
+
: 'npx cdk deploy MlccCiHarnessStack --require-approval never --no-rollback';
|
|
459
668
|
execSync(
|
|
460
|
-
|
|
669
|
+
updateCdkCmd,
|
|
461
670
|
{
|
|
462
671
|
cwd: ciHarnessDir,
|
|
463
672
|
encoding: 'utf8',
|
|
464
673
|
stdio: 'inherit',
|
|
465
674
|
env: {
|
|
466
675
|
...process.env,
|
|
676
|
+
AWS_REGION: profileConfig.awsRegion,
|
|
467
677
|
CDK_DEFAULT_REGION: profileConfig.awsRegion,
|
|
468
678
|
CDK_DEFAULT_ACCOUNT: profileConfig.accountId,
|
|
469
679
|
AWS_PROFILE: profileConfig.awsProfile
|
|
@@ -471,6 +681,8 @@ export default class BootstrapCommandHandler {
|
|
|
471
681
|
}
|
|
472
682
|
);
|
|
473
683
|
profileConfig.ciInfraProvisioned = true;
|
|
684
|
+
profileConfig.ciGlueDatabase = profileConfig.ciGlueDatabase || 'mlcc_ci';
|
|
685
|
+
profileConfig.ciBenchmarkResultsBucket = profileConfig.ciBenchmarkResultsBucket || `mlcc-benchmark-results-${profileConfig.accountId}-${profileConfig.awsRegion}`;
|
|
474
686
|
console.log(' ✅ CI harness stack updated');
|
|
475
687
|
}
|
|
476
688
|
} catch (error) {
|
|
@@ -480,6 +692,18 @@ export default class BootstrapCommandHandler {
|
|
|
480
692
|
console.log(' ⏭️ CI stack skipped (not provisioned — use --ci to force)');
|
|
481
693
|
}
|
|
482
694
|
|
|
695
|
+
// Ensure MLflow App exists
|
|
696
|
+
this._displayProgress('📊', 'MLflow App for experiment tracking...');
|
|
697
|
+
try {
|
|
698
|
+
const mlflowAppArn = this._ensureMlflowApp(profileConfig, profileConfig.awsProfile);
|
|
699
|
+
if (mlflowAppArn) {
|
|
700
|
+
profileConfig.mlflowAppArn = mlflowAppArn;
|
|
701
|
+
console.log(` ✅ MLflow App ready: ${mlflowAppArn}`);
|
|
702
|
+
}
|
|
703
|
+
} catch (error) {
|
|
704
|
+
console.log(` ⚠️ MLflow App setup skipped: ${error.message}`);
|
|
705
|
+
}
|
|
706
|
+
|
|
483
707
|
// Save updated profile
|
|
484
708
|
this.config.setProfile(name, profileConfig);
|
|
485
709
|
console.log(`\n✅ Update complete for profile "${name}"`);
|
|
@@ -488,6 +712,82 @@ export default class BootstrapCommandHandler {
|
|
|
488
712
|
await this._runPostSetupChain(options);
|
|
489
713
|
}
|
|
490
714
|
|
|
715
|
+
/**
|
|
716
|
+
* Migrate legacy profiles to current naming conventions.
|
|
717
|
+
* Corrects stackName mismatches and renames sharedStackFrom → sharedInfraFrom.
|
|
718
|
+
* Displays a preview of all changes and requires confirmation before writing.
|
|
719
|
+
*/
|
|
720
|
+
async _handleMigrate() {
|
|
721
|
+
const config = this.config.read();
|
|
722
|
+
if (!config || !config.profiles) {
|
|
723
|
+
console.log('No profiles to migrate.');
|
|
724
|
+
return;
|
|
725
|
+
}
|
|
726
|
+
|
|
727
|
+
const changes = [];
|
|
728
|
+
|
|
729
|
+
for (const [name, profileConfig] of Object.entries(config.profiles)) {
|
|
730
|
+
const expected = `${STACK_NAME_PREFIX}-${name}`;
|
|
731
|
+
|
|
732
|
+
// Fix stackName mismatch
|
|
733
|
+
if (profileConfig.stackName && profileConfig.stackName !== expected) {
|
|
734
|
+
changes.push({
|
|
735
|
+
profile: name,
|
|
736
|
+
field: 'stackName',
|
|
737
|
+
from: profileConfig.stackName,
|
|
738
|
+
to: expected
|
|
739
|
+
});
|
|
740
|
+
}
|
|
741
|
+
|
|
742
|
+
// Rename sharedStackFrom → sharedInfraFrom
|
|
743
|
+
if (profileConfig.sharedStackFrom) {
|
|
744
|
+
changes.push({
|
|
745
|
+
profile: name,
|
|
746
|
+
field: 'sharedStackFrom → sharedInfraFrom',
|
|
747
|
+
from: profileConfig.sharedStackFrom,
|
|
748
|
+
to: profileConfig.sharedStackFrom
|
|
749
|
+
});
|
|
750
|
+
}
|
|
751
|
+
}
|
|
752
|
+
|
|
753
|
+
if (changes.length === 0) {
|
|
754
|
+
console.log('✅ All profiles already use current naming conventions.');
|
|
755
|
+
return;
|
|
756
|
+
}
|
|
757
|
+
|
|
758
|
+
// Display preview
|
|
759
|
+
console.log('📋 Migration Preview:\n');
|
|
760
|
+
for (const change of changes) {
|
|
761
|
+
console.log(` Profile "${change.profile}":`);
|
|
762
|
+
console.log(` ${change.field}: "${change.from}" → "${change.to}"`);
|
|
763
|
+
}
|
|
764
|
+
|
|
765
|
+
// Prompt for confirmation
|
|
766
|
+
const { confirm } = await this._promptFn([{
|
|
767
|
+
type: 'confirm',
|
|
768
|
+
name: 'confirm',
|
|
769
|
+
message: 'Apply these changes?',
|
|
770
|
+
default: true
|
|
771
|
+
}]);
|
|
772
|
+
|
|
773
|
+
if (!confirm) return;
|
|
774
|
+
|
|
775
|
+
// Apply changes
|
|
776
|
+
for (const [name, profileConfig] of Object.entries(config.profiles)) {
|
|
777
|
+
const expected = `${STACK_NAME_PREFIX}-${name}`;
|
|
778
|
+
if (profileConfig.stackName !== expected) {
|
|
779
|
+
profileConfig.stackName = expected;
|
|
780
|
+
}
|
|
781
|
+
if (profileConfig.sharedStackFrom) {
|
|
782
|
+
profileConfig.sharedInfraFrom = profileConfig.sharedStackFrom;
|
|
783
|
+
delete profileConfig.sharedStackFrom;
|
|
784
|
+
}
|
|
785
|
+
}
|
|
786
|
+
|
|
787
|
+
this.config.write(config);
|
|
788
|
+
console.log('✅ Migration complete.');
|
|
789
|
+
}
|
|
790
|
+
|
|
491
791
|
/**
|
|
492
792
|
* Run the post-setup chain: mcp init → registry sync-architectures → sync-schemas.
|
|
493
793
|
* Each step is independent — failures are collected and reported at the end.
|
|
@@ -495,7 +795,7 @@ export default class BootstrapCommandHandler {
|
|
|
495
795
|
* @param {object} options - Parsed CLI options (checks skipPostSetup)
|
|
496
796
|
*/
|
|
497
797
|
async _runPostSetupChain(options = {}) {
|
|
498
|
-
if (options['skip-post-setup']) {
|
|
798
|
+
if ((options['skip-post-setup'] || options.skipPostSetup)) {
|
|
499
799
|
console.log('\n⏭️ Skipping post-setup chain (--skip-post-setup)');
|
|
500
800
|
return;
|
|
501
801
|
}
|
|
@@ -717,7 +1017,8 @@ export default class BootstrapCommandHandler {
|
|
|
717
1017
|
* @returns {object} Parsed JSON output
|
|
718
1018
|
*/
|
|
719
1019
|
_execAws(command, profile) {
|
|
720
|
-
const
|
|
1020
|
+
const profileFlag = profile ? `--profile ${profile}` : '';
|
|
1021
|
+
const fullCommand = `aws ${command} ${profileFlag} --output json`.replace(/\s+/g, ' ').trim();
|
|
721
1022
|
const output = execSync(fullCommand, { encoding: 'utf8', stdio: ['pipe', 'pipe', 'pipe'] });
|
|
722
1023
|
const trimmed = output.trim();
|
|
723
1024
|
if (!trimmed) {
|
|
@@ -729,6 +1030,12 @@ export default class BootstrapCommandHandler {
|
|
|
729
1030
|
/**
|
|
730
1031
|
* Deploy the bootstrap CloudFormation stack and return its outputs.
|
|
731
1032
|
*
|
|
1033
|
+
* Before deploying, checks for pre-existing S3 buckets that would cause
|
|
1034
|
+
* ResourceExistenceCheck failures. If the stack is in REVIEW_IN_PROGRESS
|
|
1035
|
+
* state (empty shell from a failed prior attempt), deletes it first.
|
|
1036
|
+
* If buckets exist but aren't managed by the stack, uses a CloudFormation
|
|
1037
|
+
* import changeset to adopt them before proceeding with the normal deploy.
|
|
1038
|
+
*
|
|
732
1039
|
* Uses `aws cloudformation deploy` which is idempotent — it creates the
|
|
733
1040
|
* stack on first run and updates it on subsequent runs. If the template
|
|
734
1041
|
* hasn't changed, it exits with "No changes to deploy" which we handle
|
|
@@ -741,6 +1048,9 @@ export default class BootstrapCommandHandler {
|
|
|
741
1048
|
* @returns {object} Map of output key → output value
|
|
742
1049
|
*/
|
|
743
1050
|
_deployStack(stackName, parameters, profile, region) {
|
|
1051
|
+
// Handle ghost stacks and pre-existing resources
|
|
1052
|
+
this._resolveStackConflicts(stackName, parameters, profile, region);
|
|
1053
|
+
|
|
744
1054
|
// Build parameter overrides string
|
|
745
1055
|
const paramOverrides = Object.entries(parameters)
|
|
746
1056
|
.map(([key, value]) => `${key}=${value}`)
|
|
@@ -764,6 +1074,32 @@ export default class BootstrapCommandHandler {
|
|
|
764
1074
|
const stderr = error.stderr || error.message || '';
|
|
765
1075
|
if (stderr.includes('No changes to deploy')) {
|
|
766
1076
|
console.log(' ℹ️ Stack is up to date — no changes needed');
|
|
1077
|
+
} else if (stderr.includes('ResourceExistenceCheck')) {
|
|
1078
|
+
// Resources already exist outside the stack — attempt import and retry
|
|
1079
|
+
console.log(' ⚠️ Pre-existing resources detected — attempting import...');
|
|
1080
|
+
this._resolveStackConflicts(stackName, parameters, profile, region);
|
|
1081
|
+
// Rebuild deploy command with updated parameters (e.g., CreateS3Buckets may now be 'false')
|
|
1082
|
+
const retryParamOverrides = Object.entries(parameters)
|
|
1083
|
+
.map(([key, value]) => `${key}=${value}`)
|
|
1084
|
+
.join(' ');
|
|
1085
|
+
const retryDeployCommand = [
|
|
1086
|
+
'aws cloudformation deploy',
|
|
1087
|
+
`--template-file ${STACK_TEMPLATE_PATH}`,
|
|
1088
|
+
`--stack-name ${stackName}`,
|
|
1089
|
+
'--capabilities CAPABILITY_NAMED_IAM',
|
|
1090
|
+
`--parameter-overrides ${retryParamOverrides}`,
|
|
1091
|
+
`--profile ${profile}`,
|
|
1092
|
+
`--region ${region}`
|
|
1093
|
+
].join(' ');
|
|
1094
|
+
// Retry the deploy after import
|
|
1095
|
+
try {
|
|
1096
|
+
execSync(retryDeployCommand, { encoding: 'utf8', stdio: ['pipe', 'pipe', 'pipe'] });
|
|
1097
|
+
} catch (retryError) {
|
|
1098
|
+
const retryStderr = retryError.stderr || retryError.message || '';
|
|
1099
|
+
if (!retryStderr.includes('No changes to deploy')) {
|
|
1100
|
+
throw retryError;
|
|
1101
|
+
}
|
|
1102
|
+
}
|
|
767
1103
|
} else {
|
|
768
1104
|
throw error;
|
|
769
1105
|
}
|
|
@@ -785,9 +1121,144 @@ export default class BootstrapCommandHandler {
|
|
|
785
1121
|
outputs[output.OutputKey] = output.OutputValue;
|
|
786
1122
|
}
|
|
787
1123
|
|
|
1124
|
+
// If S3 buckets already existed (skipped creation), inject their names
|
|
1125
|
+
// into outputs so the profile config gets populated correctly.
|
|
1126
|
+
if (this._preExistingBuckets && this._preExistingBuckets.length > 0) {
|
|
1127
|
+
const bucketOutputMap = {
|
|
1128
|
+
'AsyncS3Bucket': 'AsyncS3BucketName',
|
|
1129
|
+
'BatchS3Bucket': 'BatchS3BucketName',
|
|
1130
|
+
'AdapterS3Bucket': 'AdapterS3BucketName',
|
|
1131
|
+
'BenchmarkS3Bucket': 'BenchmarkS3BucketName',
|
|
1132
|
+
'TuneS3Bucket': 'TuneS3BucketName'
|
|
1133
|
+
};
|
|
1134
|
+
for (const bucket of this._preExistingBuckets) {
|
|
1135
|
+
const outputKey = bucketOutputMap[bucket.logicalId];
|
|
1136
|
+
if (outputKey && !outputs[outputKey]) {
|
|
1137
|
+
outputs[outputKey] = bucket.name;
|
|
1138
|
+
}
|
|
1139
|
+
}
|
|
1140
|
+
this._preExistingBuckets = null;
|
|
1141
|
+
}
|
|
1142
|
+
|
|
788
1143
|
return outputs;
|
|
789
1144
|
}
|
|
790
1145
|
|
|
1146
|
+
/**
|
|
1147
|
+
* Resolve stack conflicts before deploying.
|
|
1148
|
+
*
|
|
1149
|
+
* Handles two scenarios that cause ResourceExistenceCheck failures:
|
|
1150
|
+
* 1. Ghost stacks (REVIEW_IN_PROGRESS) — delete them first
|
|
1151
|
+
* 2. Pre-existing S3 buckets not managed by the stack — skip bucket creation and reuse them
|
|
1152
|
+
*
|
|
1153
|
+
* @param {string} stackName - CloudFormation stack name
|
|
1154
|
+
* @param {object} parameters - Stack parameter key-value pairs
|
|
1155
|
+
* @param {string} profile - AWS CLI profile name
|
|
1156
|
+
* @param {string} region - AWS region
|
|
1157
|
+
*/
|
|
1158
|
+
_resolveStackConflicts(stackName, parameters, profile, region) {
|
|
1159
|
+
// Check if stack exists and its status
|
|
1160
|
+
let stackStatus = null;
|
|
1161
|
+
let managedResources = [];
|
|
1162
|
+
|
|
1163
|
+
try {
|
|
1164
|
+
const describeResult = this._execAws(
|
|
1165
|
+
`cloudformation describe-stacks --stack-name ${stackName} --region ${region}`,
|
|
1166
|
+
profile
|
|
1167
|
+
);
|
|
1168
|
+
const stack = describeResult.Stacks && describeResult.Stacks[0];
|
|
1169
|
+
if (stack) {
|
|
1170
|
+
stackStatus = stack.StackStatus;
|
|
1171
|
+
}
|
|
1172
|
+
} catch (_) {
|
|
1173
|
+
// Stack doesn't exist — no conflicts possible
|
|
1174
|
+
return;
|
|
1175
|
+
}
|
|
1176
|
+
|
|
1177
|
+
// Handle ghost stacks (created but never successfully deployed)
|
|
1178
|
+
if (stackStatus === 'REVIEW_IN_PROGRESS') {
|
|
1179
|
+
console.log(' ⚠️ Found ghost stack (REVIEW_IN_PROGRESS) — deleting before redeploy...');
|
|
1180
|
+
try {
|
|
1181
|
+
execSync(
|
|
1182
|
+
`aws cloudformation delete-stack --stack-name ${stackName} --profile ${profile} --region ${region}`,
|
|
1183
|
+
{ encoding: 'utf8', stdio: ['pipe', 'pipe', 'pipe'] }
|
|
1184
|
+
);
|
|
1185
|
+
execSync(
|
|
1186
|
+
`aws cloudformation wait stack-delete-complete --stack-name ${stackName} --profile ${profile} --region ${region}`,
|
|
1187
|
+
{ encoding: 'utf8', stdio: ['pipe', 'pipe', 'pipe'], timeout: 60000 }
|
|
1188
|
+
);
|
|
1189
|
+
console.log(' ✅ Ghost stack deleted');
|
|
1190
|
+
} catch (err) {
|
|
1191
|
+
console.log(` ⚠️ Could not delete ghost stack: ${err.message || err}`);
|
|
1192
|
+
}
|
|
1193
|
+
// Don't return — fall through to check for pre-existing S3 buckets
|
|
1194
|
+
// that need to be imported on the fresh deploy. The ghost stack had
|
|
1195
|
+
// DeletionPolicy:Retain buckets that survive stack deletion.
|
|
1196
|
+
stackStatus = null;
|
|
1197
|
+
managedResources = [];
|
|
1198
|
+
}
|
|
1199
|
+
|
|
1200
|
+
// For active stacks (or post-ghost-deletion), check if S3 buckets exist but aren't managed
|
|
1201
|
+
if (parameters.CreateS3Buckets !== 'true') {
|
|
1202
|
+
return; // Not creating buckets — no conflict
|
|
1203
|
+
}
|
|
1204
|
+
|
|
1205
|
+
// Get list of resources currently managed by the stack (empty if stack was just deleted)
|
|
1206
|
+
if (stackStatus) {
|
|
1207
|
+
try {
|
|
1208
|
+
const resources = this._execAws(
|
|
1209
|
+
`cloudformation list-stack-resources --stack-name ${stackName} --region ${region}`,
|
|
1210
|
+
profile
|
|
1211
|
+
);
|
|
1212
|
+
managedResources = (resources.StackResourceSummaries || [])
|
|
1213
|
+
.map(r => r.LogicalResourceId);
|
|
1214
|
+
} catch (_) {
|
|
1215
|
+
// Stack doesn't exist or can't be queried — proceed with empty managedResources
|
|
1216
|
+
}
|
|
1217
|
+
}
|
|
1218
|
+
|
|
1219
|
+
// Check each S3 bucket that the template would create
|
|
1220
|
+
const accountId = this._currentAccountId;
|
|
1221
|
+
const bucketConfigs = [
|
|
1222
|
+
{ logicalId: 'AsyncS3Bucket', name: `mlcc-async-${accountId}-${region}` },
|
|
1223
|
+
{ logicalId: 'BatchS3Bucket', name: `mlcc-batch-${accountId}-${region}` },
|
|
1224
|
+
{ logicalId: 'AdapterS3Bucket', name: `mlcc-adapters-${accountId}-${region}` },
|
|
1225
|
+
{ logicalId: 'BenchmarkS3Bucket', name: `mlcc-benchmark-${accountId}-${region}` },
|
|
1226
|
+
{ logicalId: 'TuneS3Bucket', name: `mlcc-tune-${accountId}-${region}` }
|
|
1227
|
+
];
|
|
1228
|
+
|
|
1229
|
+
const bucketsToImport = [];
|
|
1230
|
+
|
|
1231
|
+
for (const bucket of bucketConfigs) {
|
|
1232
|
+
if (managedResources.includes(bucket.logicalId)) {
|
|
1233
|
+
continue; // Already managed by the stack — no conflict
|
|
1234
|
+
}
|
|
1235
|
+
// Check if bucket exists in AWS
|
|
1236
|
+
try {
|
|
1237
|
+
execSync(
|
|
1238
|
+
`aws s3api head-bucket --bucket ${bucket.name} --profile ${profile} --region ${region}`,
|
|
1239
|
+
{ encoding: 'utf8', stdio: ['pipe', 'pipe', 'pipe'] }
|
|
1240
|
+
);
|
|
1241
|
+
// Bucket exists but not in stack — needs import
|
|
1242
|
+
bucketsToImport.push(bucket);
|
|
1243
|
+
} catch (_) {
|
|
1244
|
+
// Bucket doesn't exist — will be created normally
|
|
1245
|
+
}
|
|
1246
|
+
}
|
|
1247
|
+
|
|
1248
|
+
if (bucketsToImport.length > 0) {
|
|
1249
|
+
console.log(` ℹ️ ${bucketsToImport.length} pre-existing S3 bucket(s) detected — skipping S3 creation (buckets already exist)`);
|
|
1250
|
+
|
|
1251
|
+
// Pre-existing S3 buckets survive stack deletion (DeletionPolicy: Retain).
|
|
1252
|
+
// Rather than fighting CloudFormation's IMPORT limitations, just skip S3
|
|
1253
|
+
// creation and wire the existing bucket names into the profile config directly.
|
|
1254
|
+
// The naming convention is deterministic, so we know exactly what they are.
|
|
1255
|
+
this._preExistingBuckets = bucketsToImport;
|
|
1256
|
+
|
|
1257
|
+
// Modify the parameters to skip S3 bucket creation in the deploy
|
|
1258
|
+
parameters.CreateS3Buckets = 'false';
|
|
1259
|
+
}
|
|
1260
|
+
}
|
|
1261
|
+
|
|
791
1262
|
/**
|
|
792
1263
|
* Write a JSON object to a temp file and return the `file://` path.
|
|
793
1264
|
* Used for passing complex JSON to AWS CLI commands without shell escaping issues.
|
|
@@ -821,6 +1292,125 @@ export default class BootstrapCommandHandler {
|
|
|
821
1292
|
}
|
|
822
1293
|
}
|
|
823
1294
|
|
|
1295
|
+
/**
|
|
1296
|
+
* Get the AWS account ID from the caller's credentials.
|
|
1297
|
+
* Uses `sts get-caller-identity` to resolve the actual account.
|
|
1298
|
+
*
|
|
1299
|
+
* @param {string} awsProfile - AWS CLI profile name
|
|
1300
|
+
* @returns {string} The 12-digit AWS account ID
|
|
1301
|
+
*/
|
|
1302
|
+
_getCallerAccount(awsProfile) {
|
|
1303
|
+
const identity = this._execAws('sts get-caller-identity', awsProfile);
|
|
1304
|
+
return identity.Account;
|
|
1305
|
+
}
|
|
1306
|
+
|
|
1307
|
+
/**
|
|
1308
|
+
* Scan all profiles to find one with ciInfraProvisioned=true,
|
|
1309
|
+
* excluding the given profile name.
|
|
1310
|
+
*
|
|
1311
|
+
* @param {string} excludeProfile - Profile name to exclude from the scan
|
|
1312
|
+
* @returns {{ name: string, config: Object }|null} The CI profile, or null if none found
|
|
1313
|
+
*/
|
|
1314
|
+
_findExistingCiProfile(excludeProfile) {
|
|
1315
|
+
const config = this.config.read();
|
|
1316
|
+
if (!config || !config.profiles) return null;
|
|
1317
|
+
|
|
1318
|
+
for (const [name, profileConfig] of Object.entries(config.profiles)) {
|
|
1319
|
+
if (name === excludeProfile) continue;
|
|
1320
|
+
if (profileConfig.ciInfraProvisioned) {
|
|
1321
|
+
return { name, config: profileConfig };
|
|
1322
|
+
}
|
|
1323
|
+
}
|
|
1324
|
+
return null;
|
|
1325
|
+
}
|
|
1326
|
+
|
|
1327
|
+
/**
|
|
1328
|
+
* Ensure an MLCC-owned MLflow App exists for experiment tracking.
|
|
1329
|
+
* Creates one if it doesn't exist, using the tune S3 bucket as artifact store.
|
|
1330
|
+
*
|
|
1331
|
+
* @param {object} profileData - Bootstrap profile data (needs roleArn, awsRegion, accountId)
|
|
1332
|
+
* @param {string} awsProfile - AWS CLI profile name
|
|
1333
|
+
* @returns {string} MLflow App ARN; throws if the app can neither be created nor found
|
|
1334
|
+
*/
|
|
1335
|
+
_ensureMlflowApp(profileData, awsProfile) {
|
|
1336
|
+
const region = profileData.awsRegion;
|
|
1337
|
+
const accountId = profileData.accountId;
|
|
1338
|
+
const roleArn = profileData.roleArn;
|
|
1339
|
+
const appName = 'mlcc-tune-tracking';
|
|
1340
|
+
const artifactBucket = `mlcc-tune-${accountId}-${region}`;
|
|
1341
|
+
|
|
1342
|
+
// Check if MLCC app already exists
|
|
1343
|
+
try {
|
|
1344
|
+
const apps = this._execAws(
|
|
1345
|
+
`sagemaker list-mlflow-apps --region ${region}`,
|
|
1346
|
+
awsProfile
|
|
1347
|
+
);
|
|
1348
|
+
const summaries = apps.Summaries || [];
|
|
1349
|
+
const existing = summaries.find(a => a.Name === appName);
|
|
1350
|
+
if (existing) {
|
|
1351
|
+
return existing.Arn;
|
|
1352
|
+
}
|
|
1353
|
+
} catch {
|
|
1354
|
+
// list-mlflow-apps may not be available in all CLI versions — proceed to create
|
|
1355
|
+
}
|
|
1356
|
+
|
|
1357
|
+
// Create the MLflow App
|
|
1358
|
+
console.log(` Creating MLflow App "${appName}" with artifact store s3://${artifactBucket}...`);
|
|
1359
|
+
|
|
1360
|
+
// Ensure the artifact bucket exists (it's the tune bucket from the stack)
|
|
1361
|
+
try {
|
|
1362
|
+
this._execAws(
|
|
1363
|
+
`s3api head-bucket --bucket ${artifactBucket} --region ${region}`,
|
|
1364
|
+
awsProfile
|
|
1365
|
+
);
|
|
1366
|
+
} catch {
|
|
1367
|
+
// Bucket doesn't exist — create it
|
|
1368
|
+
console.log(` Creating artifact bucket: ${artifactBucket}`);
|
|
1369
|
+
try {
|
|
1370
|
+
this._execAws(
|
|
1371
|
+
`s3api create-bucket --bucket ${artifactBucket} --region ${region} --create-bucket-configuration LocationConstraint=${region}`,
|
|
1372
|
+
awsProfile
|
|
1373
|
+
);
|
|
1374
|
+
} catch (bucketErr) {
|
|
1375
|
+
// May already exist or region doesn't need LocationConstraint (us-east-1)
|
|
1376
|
+
if (!bucketErr.message?.includes('BucketAlreadyOwnedByYou')) {
|
|
1377
|
+
try {
|
|
1378
|
+
this._execAws(
|
|
1379
|
+
`s3api create-bucket --bucket ${artifactBucket} --region ${region}`,
|
|
1380
|
+
awsProfile
|
|
1381
|
+
);
|
|
1382
|
+
} catch {
|
|
1383
|
+
// Bucket likely exists, continue
|
|
1384
|
+
}
|
|
1385
|
+
}
|
|
1386
|
+
}
|
|
1387
|
+
}
|
|
1388
|
+
|
|
1389
|
+
// Create the app
|
|
1390
|
+
try {
|
|
1391
|
+
const result = this._execAws(
|
|
1392
|
+
`sagemaker create-mlflow-app --name ${appName} --artifact-store-uri s3://${artifactBucket} --role-arn ${roleArn} --model-registration-mode AutoModelRegistrationEnabled --region ${region}`,
|
|
1393
|
+
awsProfile
|
|
1394
|
+
);
|
|
1395
|
+
return result.Arn;
|
|
1396
|
+
} catch (err) {
|
|
1397
|
+
// If app already exists (race condition), try to describe it
|
|
1398
|
+
if (err.message?.includes('ResourceLimitExceeded') || err.message?.includes('already exists')) {
|
|
1399
|
+
try {
|
|
1400
|
+
const apps = this._execAws(
|
|
1401
|
+
`sagemaker list-mlflow-apps --region ${region}`,
|
|
1402
|
+
awsProfile
|
|
1403
|
+
);
|
|
1404
|
+
const found = (apps.Summaries || []).find(a => a.Name === appName);
|
|
1405
|
+
if (found) return found.Arn;
|
|
1406
|
+
} catch {
|
|
1407
|
+
// Fall through
|
|
1408
|
+
}
|
|
1409
|
+
}
|
|
1410
|
+
throw err;
|
|
1411
|
+
}
|
|
1412
|
+
}
|
|
1413
|
+
|
|
824
1414
|
/**
|
|
825
1415
|
* Format tags for the AWS CLI --tags parameter.
|
|
826
1416
|
* Writes tags to a temp file and returns the file:// reference
|
|
@@ -858,6 +1448,8 @@ SUBCOMMANDS:
|
|
|
858
1448
|
scan Discover pre-existing MLCC-managed resources in AWS
|
|
859
1449
|
prune Remove deleted and unknown records from the deployment manifest
|
|
860
1450
|
update Re-deploy bootstrap stacks using active profile (no prompts)
|
|
1451
|
+
migrate Upgrade legacy profiles to current naming conventions
|
|
1452
|
+
sync-model-families Discover tune-eligible models from JumpStart Hub and update catalog
|
|
861
1453
|
|
|
862
1454
|
SETUP OPTIONS:
|
|
863
1455
|
--non-interactive Run without interactive prompts
|
|
@@ -886,6 +1478,8 @@ EXAMPLES:
|
|
|
886
1478
|
ml-container-creator bootstrap remove dev
|
|
887
1479
|
ml-container-creator bootstrap remove dev --force --delete-stack
|
|
888
1480
|
ml-container-creator bootstrap scan
|
|
1481
|
+
ml-container-creator bootstrap sync-model-families
|
|
1482
|
+
ml-container-creator bootstrap migrate
|
|
889
1483
|
ml-container-creator bootstrap --non-interactive --profile my-aws-profile --region us-west-2
|
|
890
1484
|
ml-container-creator bootstrap --non-interactive --profile my-aws-profile --role-arn arn:aws:iam::123456789012:role/MyRole --skip-s3
|
|
891
1485
|
ml-container-creator bootstrap --non-interactive --profile my-aws-profile --region us-west-2 --ci
|