@aws/ml-container-creator 0.10.0 → 0.10.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE-THIRD-PARTY +9304 -0
- package/bin/cli.js +2 -0
- package/config/bootstrap-e2e-stack.json +341 -0
- package/config/bootstrap-stack.json +40 -3
- package/config/parameter-schema-v2.json +5 -21
- package/config/tune-catalog.json +1781 -0
- package/infra/ci-harness/buildspec.yml +1 -0
- package/infra/ci-harness/lambda/path-prover/brain.ts +306 -0
- package/infra/ci-harness/lambda/path-prover/write-results.ts +152 -0
- package/infra/ci-harness/lib/ci-harness-stack.ts +837 -7
- package/infra/ci-harness/state-machines/path-prover.asl.json +496 -0
- package/package.json +51 -66
- package/servers/base-image-picker/index.js +121 -121
- package/servers/e2e-status/index.js +297 -0
- package/servers/e2e-status/manifest.json +14 -0
- package/servers/e2e-status/package.json +15 -0
- package/servers/endpoint-picker/LICENSE +202 -0
- package/servers/endpoint-picker/index.js +536 -0
- package/servers/endpoint-picker/manifest.json +14 -0
- package/servers/endpoint-picker/package.json +18 -0
- package/servers/hyperpod-cluster-picker/index.js +125 -125
- package/servers/instance-sizer/index.js +138 -138
- package/servers/instance-sizer/lib/instance-ranker.js +76 -76
- package/servers/instance-sizer/lib/model-resolver.js +61 -61
- package/servers/instance-sizer/lib/quota-resolver.js +113 -113
- package/servers/instance-sizer/lib/vram-estimator.js +31 -31
- package/servers/lib/bedrock-client.js +38 -38
- package/servers/lib/catalogs/model-servers.json +201 -3
- package/servers/lib/custom-validators.js +13 -13
- package/servers/lib/dynamic-resolver.js +4 -4
- package/servers/marketplace-picker/index.js +342 -0
- package/servers/marketplace-picker/manifest.json +14 -0
- package/servers/marketplace-picker/package.json +18 -0
- package/servers/model-picker/index.js +382 -382
- package/servers/region-picker/index.js +56 -56
- package/servers/workload-picker/LICENSE +202 -0
- package/servers/workload-picker/catalogs/workload-profiles.json +67 -0
- package/servers/workload-picker/index.js +171 -0
- package/servers/workload-picker/manifest.json +16 -0
- package/servers/workload-picker/package.json +16 -0
- package/src/app.js +4 -2
- package/src/lib/bootstrap-command-handler.js +579 -14
- package/src/lib/bootstrap-config.js +36 -0
- package/src/lib/bootstrap-profile-manager.js +48 -41
- package/src/lib/ci-register-helpers.js +74 -0
- package/src/lib/config-loader.js +3 -0
- package/src/lib/config-manager.js +7 -0
- package/src/lib/cuda-resolver.js +17 -8
- package/src/lib/generated/cli-options.js +315 -315
- package/src/lib/generated/parameter-matrix.js +661 -661
- package/src/lib/generated/validation-rules.js +71 -71
- package/src/lib/path-prover-brain.js +607 -0
- package/src/lib/prompts/project-prompts.js +12 -0
- package/src/lib/template-variable-resolver.js +25 -1
- package/src/lib/tune-catalog-validator.js +37 -4
- package/templates/Dockerfile +9 -0
- package/templates/code/adapter_sidecar.py +444 -0
- package/templates/code/serve +6 -0
- package/templates/code/serve.d/vllm.ejs +1 -1
- package/templates/do/.benchmark_writer.py +1476 -0
- package/templates/do/.tune_helper.py +982 -57
- package/templates/do/__pycache__/.benchmark_writer.cpython-312.pyc +0 -0
- package/templates/do/adapter +149 -0
- package/templates/do/benchmark +639 -85
- package/templates/do/config +108 -5
- package/templates/do/deploy.d/managed-inference.ejs +192 -11
- package/templates/do/optimize +106 -37
- package/templates/do/register +89 -0
- package/templates/do/test +13 -0
- package/templates/do/tune +378 -59
- package/templates/do/validate +44 -4
|
@@ -62,6 +62,7 @@ export default class BootstrapCommandHandler {
|
|
|
62
62
|
_handleScan() { return this.profileManager._handleScan(); }
|
|
63
63
|
_handlePrune() { return this.profileManager._handlePrune(); }
|
|
64
64
|
_handleSyncSchemas() { return this.profileManager._handleSyncSchemas(); }
|
|
65
|
+
_handleSyncModelFamilies() { return this.profileManager._handleSyncModelFamilies(); }
|
|
65
66
|
|
|
66
67
|
/**
|
|
67
68
|
* Dispatch bootstrap subcommands.
|
|
@@ -69,8 +70,28 @@ export default class BootstrapCommandHandler {
|
|
|
69
70
|
* @param {object} options - Parsed CLI options
|
|
70
71
|
*/
|
|
71
72
|
async handle(args, options) {
|
|
73
|
+
// Commander.js with passThroughOptions() captures flags after positional
|
|
74
|
+
// arguments in args rather than options. Extract known flags from args.
|
|
75
|
+
const extractedOptions = { ...options };
|
|
76
|
+
const cleanArgs = [];
|
|
77
|
+
for (const arg of args) {
|
|
78
|
+
if (arg === '--ci') extractedOptions.ci = true;
|
|
79
|
+
else if (arg === '--benchmark-infra') extractedOptions.benchmarkInfra = true;
|
|
80
|
+
else if (arg === '--skip-ci') extractedOptions.skipCi = true;
|
|
81
|
+
else if (arg === '--skip-s3') extractedOptions.skipS3 = true;
|
|
82
|
+
else if (arg === '--skip-post-setup') extractedOptions.skipPostSetup = true;
|
|
83
|
+
else if (arg === '--force') extractedOptions.force = true;
|
|
84
|
+
else if (arg === '--verify') extractedOptions.verify = true;
|
|
85
|
+
else if (arg === '--delete-stack') extractedOptions.deleteStack = true;
|
|
86
|
+
else if (arg === '--non-interactive') extractedOptions.nonInteractive = true;
|
|
87
|
+
else if (arg === '--ignore-staleness') extractedOptions.ignoreStaleness = true;
|
|
88
|
+
else cleanArgs.push(arg);
|
|
89
|
+
}
|
|
90
|
+
args = cleanArgs;
|
|
91
|
+
options = extractedOptions;
|
|
92
|
+
|
|
72
93
|
// Handle legacy --sync-schemas flag for backward compatibility
|
|
73
|
-
if (options['sync-schemas']) {
|
|
94
|
+
if ((options['sync-schemas'] || options.syncSchemas)) {
|
|
74
95
|
await this._handleSyncSchemas();
|
|
75
96
|
if (args.length === 0) return;
|
|
76
97
|
}
|
|
@@ -107,6 +128,15 @@ export default class BootstrapCommandHandler {
|
|
|
107
128
|
case 'sync-schemas':
|
|
108
129
|
await this._handleSyncSchemas();
|
|
109
130
|
break;
|
|
131
|
+
case 'sync-model-families':
|
|
132
|
+
await this._handleSyncModelFamilies();
|
|
133
|
+
break;
|
|
134
|
+
// Migration path: upgrades legacy profiles to current naming conventions.
|
|
135
|
+
// Corrects stackName to mlcc-bootstrap-{profileName}, renames sharedStackFrom
|
|
136
|
+
// to sharedInfraFrom. Idempotent — safe to run multiple times.
|
|
137
|
+
case 'migrate':
|
|
138
|
+
await this._handleMigrate();
|
|
139
|
+
break;
|
|
110
140
|
default:
|
|
111
141
|
console.log(`Unknown bootstrap subcommand: ${subcommand}`);
|
|
112
142
|
this._showHelp();
|
|
@@ -119,7 +149,8 @@ export default class BootstrapCommandHandler {
|
|
|
119
149
|
* @param {object} options - Parsed CLI options
|
|
120
150
|
*/
|
|
121
151
|
async _handleInteractiveSetup(options) {
|
|
122
|
-
|
|
152
|
+
// Commander.js converts --non-interactive to options.nonInteractive (camelCase)
|
|
153
|
+
const nonInteractive = options['non-interactive'] || options.nonInteractive;
|
|
123
154
|
|
|
124
155
|
// Non-interactive mode: validate required flags upfront
|
|
125
156
|
if (nonInteractive) {
|
|
@@ -180,13 +211,13 @@ export default class BootstrapCommandHandler {
|
|
|
180
211
|
|
|
181
212
|
// Step 3: Determine stack parameters
|
|
182
213
|
let useExistingRoleArn = '';
|
|
183
|
-
if (nonInteractive && options['role-arn']) {
|
|
184
|
-
useExistingRoleArn = options['role-arn'];
|
|
185
|
-
console.log(` Using provided IAM role ARN: ${options['role-arn']}`);
|
|
214
|
+
if (nonInteractive && (options['role-arn'] || options.roleArn)) {
|
|
215
|
+
useExistingRoleArn = (options['role-arn'] || options.roleArn);
|
|
216
|
+
console.log(` Using provided IAM role ARN: ${(options['role-arn'] || options.roleArn)}`);
|
|
186
217
|
}
|
|
187
218
|
|
|
188
219
|
let createS3Buckets = false;
|
|
189
|
-
if (nonInteractive && options['skip-s3']) {
|
|
220
|
+
if (nonInteractive && (options['skip-s3'] || options.skipS3)) {
|
|
190
221
|
console.log(' ⏭️ Skipping S3 bucket creation (--skip-s3)');
|
|
191
222
|
} else if (nonInteractive) {
|
|
192
223
|
createS3Buckets = true;
|
|
@@ -231,7 +262,8 @@ export default class BootstrapCommandHandler {
|
|
|
231
262
|
|
|
232
263
|
profileData.roleArn = stackOutputs.RoleArn;
|
|
233
264
|
profileData.ecrRepositoryName = stackOutputs.EcrRepositoryName;
|
|
234
|
-
profileData.stackName =
|
|
265
|
+
profileData.stackName = stackName;
|
|
266
|
+
profileData.sharedInfraFrom = otherStack; // Track that this profile reuses another's stack
|
|
235
267
|
if (stackOutputs.AsyncS3BucketName) profileData.asyncS3Bucket = stackOutputs.AsyncS3BucketName;
|
|
236
268
|
if (stackOutputs.BatchS3BucketName) profileData.batchS3Bucket = stackOutputs.BatchS3BucketName;
|
|
237
269
|
if (stackOutputs.AdapterS3BucketName) profileData.adapterS3Bucket = stackOutputs.AdapterS3BucketName;
|
|
@@ -245,15 +277,45 @@ export default class BootstrapCommandHandler {
|
|
|
245
277
|
}
|
|
246
278
|
|
|
247
279
|
if (!profileData.stackName) {
|
|
280
|
+
// Pre-check: if IAM role already exists globally (from another region's deployment),
|
|
281
|
+
// pass its ARN so CloudFormation skips re-creation (account-level singleton)
|
|
282
|
+
if (!useExistingRoleArn) {
|
|
283
|
+
try {
|
|
284
|
+
const roleResult = this._execAws(
|
|
285
|
+
'iam get-role --role-name mlcc-sagemaker-execution-role',
|
|
286
|
+
awsProfile
|
|
287
|
+
);
|
|
288
|
+
const roleArn = roleResult && roleResult.Role && roleResult.Role.Arn;
|
|
289
|
+
if (roleArn && roleArn.startsWith('arn:aws:iam::')) {
|
|
290
|
+
useExistingRoleArn = roleArn;
|
|
291
|
+
console.log(` ℹ️ Reusing existing IAM role: ${roleArn}`);
|
|
292
|
+
}
|
|
293
|
+
} catch (_) {
|
|
294
|
+
// Role doesn't exist yet — will be created by the stack
|
|
295
|
+
}
|
|
296
|
+
}
|
|
297
|
+
|
|
248
298
|
try {
|
|
299
|
+
// Check if ECR repo already exists (avoid ResourceExistenceCheck failure)
|
|
300
|
+
let skipEcr = 'false';
|
|
301
|
+
try {
|
|
302
|
+
this._execAws(
|
|
303
|
+
`ecr describe-repositories --repository-names ml-container-creator --region ${region}`,
|
|
304
|
+
awsProfile
|
|
305
|
+
);
|
|
306
|
+
skipEcr = 'true';
|
|
307
|
+
console.log(' ℹ️ ECR repository already exists — skipping creation');
|
|
308
|
+
} catch (_) { /* doesn't exist — will be created */ }
|
|
309
|
+
|
|
249
310
|
const stackOutputs = this._deployStack(stackName, {
|
|
250
311
|
CreateS3Buckets: createS3Buckets ? 'true' : 'false',
|
|
251
|
-
UseExistingRoleArn: useExistingRoleArn
|
|
312
|
+
UseExistingRoleArn: useExistingRoleArn,
|
|
313
|
+
SkipEcrCreation: skipEcr
|
|
252
314
|
}, awsProfile, region);
|
|
253
315
|
|
|
254
316
|
// Read outputs into profile data
|
|
255
317
|
profileData.roleArn = stackOutputs.RoleArn;
|
|
256
|
-
profileData.ecrRepositoryName = stackOutputs.EcrRepositoryName;
|
|
318
|
+
profileData.ecrRepositoryName = stackOutputs.EcrRepositoryName || 'ml-container-creator';
|
|
257
319
|
profileData.stackName = stackName;
|
|
258
320
|
|
|
259
321
|
if (stackOutputs.AsyncS3BucketName) {
|
|
@@ -278,6 +340,23 @@ export default class BootstrapCommandHandler {
|
|
|
278
340
|
}
|
|
279
341
|
} // end if (!profileData.stackName)
|
|
280
342
|
|
|
343
|
+
// Step 4b: MLflow App for model customization experiment tracking
|
|
344
|
+
this._displayProgress('📊', 'MLflow App for experiment tracking...');
|
|
345
|
+
try {
|
|
346
|
+
if (!profileData.mlflowAppArn) {
|
|
347
|
+
const mlflowAppArn = this._ensureMlflowApp(profileData, awsProfile);
|
|
348
|
+
if (mlflowAppArn) {
|
|
349
|
+
profileData.mlflowAppArn = mlflowAppArn;
|
|
350
|
+
console.log(` ✅ MLflow App ready: ${mlflowAppArn}`);
|
|
351
|
+
}
|
|
352
|
+
} else {
|
|
353
|
+
console.log(` ✅ MLflow App already configured: ${profileData.mlflowAppArn}`);
|
|
354
|
+
}
|
|
355
|
+
} catch (error) {
|
|
356
|
+
console.log(` ⚠️ MLflow App setup skipped: ${error.message}`);
|
|
357
|
+
console.log(' Tune jobs will still work but experiment tracking may not be available.');
|
|
358
|
+
}
|
|
359
|
+
|
|
281
360
|
// Step 5: CI Infrastructure setup (separate CDK stack — unchanged)
|
|
282
361
|
this._displayProgress('🧪', 'CI Testing Infrastructure...');
|
|
283
362
|
try {
|
|
@@ -286,7 +365,7 @@ export default class BootstrapCommandHandler {
|
|
|
286
365
|
if (nonInteractive) {
|
|
287
366
|
if (options.ci) {
|
|
288
367
|
provisionCi = true;
|
|
289
|
-
} else if (options['skip-ci']) {
|
|
368
|
+
} else if ((options['skip-ci'] || options.skipCi)) {
|
|
290
369
|
console.log(' ⏭️ Skipping CI infrastructure (--skip-ci)');
|
|
291
370
|
provisionCi = false;
|
|
292
371
|
} else {
|
|
@@ -303,6 +382,21 @@ export default class BootstrapCommandHandler {
|
|
|
303
382
|
}
|
|
304
383
|
|
|
305
384
|
if (provisionCi) {
|
|
385
|
+
// --- CI single-region enforcement ---
|
|
386
|
+
const ciConflict = this._findExistingCiProfile(profileName);
|
|
387
|
+
if (ciConflict) {
|
|
388
|
+
console.log(`❌ CI infrastructure already deployed in region ${ciConflict.config.awsRegion} (profile: ${ciConflict.name}).`);
|
|
389
|
+
console.log(' CI can only be deployed in one region per account.');
|
|
390
|
+
provisionCi = false;
|
|
391
|
+
}
|
|
392
|
+
}
|
|
393
|
+
|
|
394
|
+
if (provisionCi) {
|
|
395
|
+
// Persist CI intent immediately so that `bootstrap update --ci` can
|
|
396
|
+
// retry if the CDK deploy fails. Don't wait for success.
|
|
397
|
+
profileData.ciInfraProvisioned = true;
|
|
398
|
+
profileData.ciTableName = profileData.ciTableName || 'mlcc-ci-table';
|
|
399
|
+
|
|
306
400
|
// Ensure CDK is bootstrapped in this account/region
|
|
307
401
|
const cdkBootstrapped = this._resourceExists(
|
|
308
402
|
`ssm get-parameter --name /cdk-bootstrap/hnb659fds/version --region ${profileData.awsRegion}`,
|
|
@@ -358,14 +452,25 @@ export default class BootstrapCommandHandler {
|
|
|
358
452
|
stdio: ['pipe', 'pipe', 'pipe']
|
|
359
453
|
});
|
|
360
454
|
|
|
455
|
+
// Warn if shell AWS_REGION differs from profile region
|
|
456
|
+
if (process.env.AWS_REGION && process.env.AWS_REGION !== profileData.awsRegion) {
|
|
457
|
+
console.log(` ⚠️ AWS_REGION env var (${process.env.AWS_REGION}) differs from profile region (${profileData.awsRegion}) — using profile region`);
|
|
458
|
+
}
|
|
459
|
+
|
|
460
|
+
// --no-rollback prevents rollback on AlreadyExists errors for IAM roles
|
|
461
|
+
// that may pre-exist from a prior deployment or another region.
|
|
462
|
+
const cdkDeployCmd = options.benchmarkInfra
|
|
463
|
+
? 'npx cdk deploy MlccCiHarnessStack --require-approval never --no-rollback --parameters MlccCiHarnessStack:CreateBenchmarkInfra=true'
|
|
464
|
+
: 'npx cdk deploy MlccCiHarnessStack --require-approval never --no-rollback';
|
|
361
465
|
execSync(
|
|
362
|
-
|
|
466
|
+
cdkDeployCmd,
|
|
363
467
|
{
|
|
364
468
|
cwd: ciHarnessDir,
|
|
365
469
|
encoding: 'utf8',
|
|
366
470
|
stdio: 'inherit',
|
|
367
471
|
env: {
|
|
368
472
|
...process.env,
|
|
473
|
+
AWS_REGION: profileData.awsRegion,
|
|
369
474
|
CDK_DEFAULT_REGION: profileData.awsRegion,
|
|
370
475
|
CDK_DEFAULT_ACCOUNT: profileData.accountId,
|
|
371
476
|
AWS_PROFILE: profileData.awsProfile
|
|
@@ -376,6 +481,11 @@ export default class BootstrapCommandHandler {
|
|
|
376
481
|
|
|
377
482
|
profileData.ciInfraProvisioned = true;
|
|
378
483
|
profileData.ciTableName = 'mlcc-ci-table';
|
|
484
|
+
if (options.benchmarkInfra) {
|
|
485
|
+
profileData.benchmarkInfraProvisioned = true;
|
|
486
|
+
profileData.ciGlueDatabase = 'mlcc_ci';
|
|
487
|
+
profileData.ciBenchmarkResultsBucket = `mlcc-benchmark-results-${profileData.accountId}-${profileData.awsRegion}`;
|
|
488
|
+
}
|
|
379
489
|
}
|
|
380
490
|
}
|
|
381
491
|
} catch (error) {
|
|
@@ -413,14 +523,80 @@ export default class BootstrapCommandHandler {
|
|
|
413
523
|
console.log(` Region: ${profileConfig.awsRegion}`);
|
|
414
524
|
console.log(` Account: ${profileConfig.accountId}`);
|
|
415
525
|
|
|
526
|
+
// --- SANITY CHECK 1: Account identity ---
|
|
527
|
+
const callerAccount = this._getCallerAccount(profileConfig.awsProfile);
|
|
528
|
+
if (callerAccount !== profileConfig.accountId) {
|
|
529
|
+
console.log(`❌ Account mismatch: profile expects ${profileConfig.accountId} but credentials resolve to ${callerAccount}`);
|
|
530
|
+
return;
|
|
531
|
+
}
|
|
532
|
+
|
|
416
533
|
// Re-deploy the CloudFormation bootstrap stack
|
|
417
534
|
const stackName = profileConfig.stackName || `${STACK_NAME_PREFIX}-${name}`;
|
|
535
|
+
|
|
536
|
+
// Sanity check: stack name consistency (warn-and-continue)
|
|
537
|
+
const expectedStackName = `${STACK_NAME_PREFIX}-${name}`;
|
|
538
|
+
if (profileConfig.stackName && profileConfig.stackName !== expectedStackName) {
|
|
539
|
+
console.log(`⚠️ Stack name mismatch: expected "${expectedStackName}" but profile has "${profileConfig.stackName}"`);
|
|
540
|
+
console.log(' Run `ml-container-creator bootstrap migrate` to fix.');
|
|
541
|
+
console.log(' Proceeding with stored stack name...');
|
|
542
|
+
}
|
|
543
|
+
|
|
544
|
+
// --- SANITY CHECK 3: Stack exists in target region ---
|
|
545
|
+
const stackExists = this._resourceExists(
|
|
546
|
+
`cloudformation describe-stacks --stack-name ${stackName} --region ${profileConfig.awsRegion}`,
|
|
547
|
+
profileConfig.awsProfile
|
|
548
|
+
);
|
|
549
|
+
if (!stackExists) {
|
|
550
|
+
console.log(`❌ Stack "${stackName}" not found in ${profileConfig.awsRegion}.`);
|
|
551
|
+
console.log(' Run `ml-container-creator bootstrap` to create it.');
|
|
552
|
+
return;
|
|
553
|
+
}
|
|
554
|
+
|
|
555
|
+
// --- CI single-region enforcement ---
|
|
556
|
+
if (options.ci) {
|
|
557
|
+
const ciConflict = this._findExistingCiProfile(name);
|
|
558
|
+
if (ciConflict) {
|
|
559
|
+
console.log(`❌ CI infrastructure already deployed in region ${ciConflict.config.awsRegion} (profile: ${ciConflict.name}).`);
|
|
560
|
+
console.log(' CI can only be deployed in one region per account.');
|
|
561
|
+
return;
|
|
562
|
+
}
|
|
563
|
+
}
|
|
564
|
+
|
|
418
565
|
this._displayProgress('☁️', 'Updating bootstrap stack...');
|
|
419
566
|
|
|
567
|
+
// Pre-check: if IAM role already exists globally (from another region's deployment),
|
|
568
|
+
// pass its ARN so CloudFormation skips re-creation (account-level singleton)
|
|
569
|
+
let useExistingRoleArn = profileConfig.roleArn || '';
|
|
570
|
+
if (!useExistingRoleArn) {
|
|
571
|
+
try {
|
|
572
|
+
const roleResult = this._execAws(
|
|
573
|
+
'iam get-role --role-name mlcc-sagemaker-execution-role',
|
|
574
|
+
profileConfig.awsProfile
|
|
575
|
+
);
|
|
576
|
+
const roleArn = roleResult && roleResult.Role && roleResult.Role.Arn;
|
|
577
|
+
if (roleArn && roleArn.startsWith('arn:aws:iam::')) {
|
|
578
|
+
useExistingRoleArn = roleArn;
|
|
579
|
+
}
|
|
580
|
+
} catch (_) {
|
|
581
|
+
// Role doesn't exist yet — will be created by the stack
|
|
582
|
+
}
|
|
583
|
+
}
|
|
584
|
+
|
|
420
585
|
try {
|
|
586
|
+
// Check if ECR repo already exists (avoid ResourceExistenceCheck failure)
|
|
587
|
+
let skipEcr = 'false';
|
|
588
|
+
try {
|
|
589
|
+
this._execAws(
|
|
590
|
+
`ecr describe-repositories --repository-names ml-container-creator --region ${profileConfig.awsRegion}`,
|
|
591
|
+
profileConfig.awsProfile
|
|
592
|
+
);
|
|
593
|
+
skipEcr = 'true';
|
|
594
|
+
} catch (_) { /* doesn't exist */ }
|
|
595
|
+
|
|
421
596
|
const stackOutputs = this._deployStack(stackName, {
|
|
422
597
|
CreateS3Buckets: (profileConfig.asyncS3Bucket || profileConfig.batchS3Bucket) ? 'true' : 'false',
|
|
423
|
-
UseExistingRoleArn:
|
|
598
|
+
UseExistingRoleArn: useExistingRoleArn,
|
|
599
|
+
SkipEcrCreation: skipEcr
|
|
424
600
|
}, profileConfig.awsProfile, profileConfig.awsRegion);
|
|
425
601
|
|
|
426
602
|
// Update profile with any new outputs
|
|
@@ -456,14 +632,20 @@ export default class BootstrapCommandHandler {
|
|
|
456
632
|
stdio: ['pipe', 'pipe', 'pipe']
|
|
457
633
|
});
|
|
458
634
|
|
|
635
|
+
// --no-rollback prevents rollback on AlreadyExists errors for IAM roles
|
|
636
|
+
// that may pre-exist from a prior deployment or another region.
|
|
637
|
+
const updateCdkCmd = (options.benchmarkInfra || profileConfig.benchmarkInfraProvisioned)
|
|
638
|
+
? 'npx cdk deploy MlccCiHarnessStack --require-approval never --no-rollback --parameters MlccCiHarnessStack:CreateBenchmarkInfra=true'
|
|
639
|
+
: 'npx cdk deploy MlccCiHarnessStack --require-approval never --no-rollback';
|
|
459
640
|
execSync(
|
|
460
|
-
|
|
641
|
+
updateCdkCmd,
|
|
461
642
|
{
|
|
462
643
|
cwd: ciHarnessDir,
|
|
463
644
|
encoding: 'utf8',
|
|
464
645
|
stdio: 'inherit',
|
|
465
646
|
env: {
|
|
466
647
|
...process.env,
|
|
648
|
+
AWS_REGION: profileConfig.awsRegion,
|
|
467
649
|
CDK_DEFAULT_REGION: profileConfig.awsRegion,
|
|
468
650
|
CDK_DEFAULT_ACCOUNT: profileConfig.accountId,
|
|
469
651
|
AWS_PROFILE: profileConfig.awsProfile
|
|
@@ -471,6 +653,8 @@ export default class BootstrapCommandHandler {
|
|
|
471
653
|
}
|
|
472
654
|
);
|
|
473
655
|
profileConfig.ciInfraProvisioned = true;
|
|
656
|
+
profileConfig.ciGlueDatabase = profileConfig.ciGlueDatabase || 'mlcc_ci';
|
|
657
|
+
profileConfig.ciBenchmarkResultsBucket = profileConfig.ciBenchmarkResultsBucket || `mlcc-benchmark-results-${profileConfig.accountId}-${profileConfig.awsRegion}`;
|
|
474
658
|
console.log(' ✅ CI harness stack updated');
|
|
475
659
|
}
|
|
476
660
|
} catch (error) {
|
|
@@ -480,6 +664,18 @@ export default class BootstrapCommandHandler {
|
|
|
480
664
|
console.log(' ⏭️ CI stack skipped (not provisioned — use --ci to force)');
|
|
481
665
|
}
|
|
482
666
|
|
|
667
|
+
// Ensure MLflow App exists
|
|
668
|
+
this._displayProgress('📊', 'MLflow App for experiment tracking...');
|
|
669
|
+
try {
|
|
670
|
+
const mlflowAppArn = this._ensureMlflowApp(profileConfig, profileConfig.awsProfile);
|
|
671
|
+
if (mlflowAppArn) {
|
|
672
|
+
profileConfig.mlflowAppArn = mlflowAppArn;
|
|
673
|
+
console.log(` ✅ MLflow App ready: ${mlflowAppArn}`);
|
|
674
|
+
}
|
|
675
|
+
} catch (error) {
|
|
676
|
+
console.log(` ⚠️ MLflow App setup skipped: ${error.message}`);
|
|
677
|
+
}
|
|
678
|
+
|
|
483
679
|
// Save updated profile
|
|
484
680
|
this.config.setProfile(name, profileConfig);
|
|
485
681
|
console.log(`\n✅ Update complete for profile "${name}"`);
|
|
@@ -488,6 +684,82 @@ export default class BootstrapCommandHandler {
|
|
|
488
684
|
await this._runPostSetupChain(options);
|
|
489
685
|
}
|
|
490
686
|
|
|
687
|
+
/**
|
|
688
|
+
* Migrate legacy profiles to current naming conventions.
|
|
689
|
+
* Corrects stackName mismatches and renames sharedStackFrom → sharedInfraFrom.
|
|
690
|
+
* Displays a preview of all changes and requires confirmation before writing.
|
|
691
|
+
*/
|
|
692
|
+
async _handleMigrate() {
|
|
693
|
+
const config = this.config.read();
|
|
694
|
+
if (!config || !config.profiles) {
|
|
695
|
+
console.log('No profiles to migrate.');
|
|
696
|
+
return;
|
|
697
|
+
}
|
|
698
|
+
|
|
699
|
+
const changes = [];
|
|
700
|
+
|
|
701
|
+
for (const [name, profileConfig] of Object.entries(config.profiles)) {
|
|
702
|
+
const expected = `${STACK_NAME_PREFIX}-${name}`;
|
|
703
|
+
|
|
704
|
+
// Fix stackName mismatch
|
|
705
|
+
if (profileConfig.stackName && profileConfig.stackName !== expected) {
|
|
706
|
+
changes.push({
|
|
707
|
+
profile: name,
|
|
708
|
+
field: 'stackName',
|
|
709
|
+
from: profileConfig.stackName,
|
|
710
|
+
to: expected
|
|
711
|
+
});
|
|
712
|
+
}
|
|
713
|
+
|
|
714
|
+
// Rename sharedStackFrom → sharedInfraFrom
|
|
715
|
+
if (profileConfig.sharedStackFrom) {
|
|
716
|
+
changes.push({
|
|
717
|
+
profile: name,
|
|
718
|
+
field: 'sharedStackFrom → sharedInfraFrom',
|
|
719
|
+
from: profileConfig.sharedStackFrom,
|
|
720
|
+
to: profileConfig.sharedStackFrom
|
|
721
|
+
});
|
|
722
|
+
}
|
|
723
|
+
}
|
|
724
|
+
|
|
725
|
+
if (changes.length === 0) {
|
|
726
|
+
console.log('✅ All profiles already use current naming conventions.');
|
|
727
|
+
return;
|
|
728
|
+
}
|
|
729
|
+
|
|
730
|
+
// Display preview
|
|
731
|
+
console.log('📋 Migration Preview:\n');
|
|
732
|
+
for (const change of changes) {
|
|
733
|
+
console.log(` Profile "${change.profile}":`);
|
|
734
|
+
console.log(` ${change.field}: "${change.from}" → "${change.to}"`);
|
|
735
|
+
}
|
|
736
|
+
|
|
737
|
+
// Prompt for confirmation
|
|
738
|
+
const { confirm } = await this._promptFn([{
|
|
739
|
+
type: 'confirm',
|
|
740
|
+
name: 'confirm',
|
|
741
|
+
message: 'Apply these changes?',
|
|
742
|
+
default: true
|
|
743
|
+
}]);
|
|
744
|
+
|
|
745
|
+
if (!confirm) return;
|
|
746
|
+
|
|
747
|
+
// Apply changes
|
|
748
|
+
for (const [name, profileConfig] of Object.entries(config.profiles)) {
|
|
749
|
+
const expected = `${STACK_NAME_PREFIX}-${name}`;
|
|
750
|
+
if (profileConfig.stackName !== expected) {
|
|
751
|
+
profileConfig.stackName = expected;
|
|
752
|
+
}
|
|
753
|
+
if (profileConfig.sharedStackFrom) {
|
|
754
|
+
profileConfig.sharedInfraFrom = profileConfig.sharedStackFrom;
|
|
755
|
+
delete profileConfig.sharedStackFrom;
|
|
756
|
+
}
|
|
757
|
+
}
|
|
758
|
+
|
|
759
|
+
this.config.write(config);
|
|
760
|
+
console.log('✅ Migration complete.');
|
|
761
|
+
}
|
|
762
|
+
|
|
491
763
|
/**
|
|
492
764
|
* Run the post-setup chain: mcp init → registry sync-architectures → sync-schemas.
|
|
493
765
|
* Each step is independent — failures are collected and reported at the end.
|
|
@@ -495,7 +767,7 @@ export default class BootstrapCommandHandler {
|
|
|
495
767
|
* @param {object} options - Parsed CLI options (checks skipPostSetup)
|
|
496
768
|
*/
|
|
497
769
|
async _runPostSetupChain(options = {}) {
|
|
498
|
-
if (options['skip-post-setup']) {
|
|
770
|
+
if ((options['skip-post-setup'] || options.skipPostSetup)) {
|
|
499
771
|
console.log('\n⏭️ Skipping post-setup chain (--skip-post-setup)');
|
|
500
772
|
return;
|
|
501
773
|
}
|
|
@@ -729,6 +1001,12 @@ export default class BootstrapCommandHandler {
|
|
|
729
1001
|
/**
|
|
730
1002
|
* Deploy the bootstrap CloudFormation stack and return its outputs.
|
|
731
1003
|
*
|
|
1004
|
+
* Before deploying, checks for pre-existing S3 buckets that would cause
|
|
1005
|
+
* ResourceExistenceCheck failures. If the stack is in REVIEW_IN_PROGRESS
|
|
1006
|
+
* state (empty shell from a failed prior attempt), deletes it first.
|
|
1007
|
+
* If buckets exist but aren't managed by the stack, uses a CloudFormation
|
|
1008
|
+
* import changeset to adopt them before proceeding with the normal deploy.
|
|
1009
|
+
*
|
|
732
1010
|
* Uses `aws cloudformation deploy` which is idempotent — it creates the
|
|
733
1011
|
* stack on first run and updates it on subsequent runs. If the template
|
|
734
1012
|
* hasn't changed, it exits with "No changes to deploy" which we handle
|
|
@@ -741,6 +1019,9 @@ export default class BootstrapCommandHandler {
|
|
|
741
1019
|
* @returns {object} Map of output key → output value
|
|
742
1020
|
*/
|
|
743
1021
|
_deployStack(stackName, parameters, profile, region) {
|
|
1022
|
+
// Handle ghost stacks and pre-existing resources
|
|
1023
|
+
this._resolveStackConflicts(stackName, parameters, profile, region);
|
|
1024
|
+
|
|
744
1025
|
// Build parameter overrides string
|
|
745
1026
|
const paramOverrides = Object.entries(parameters)
|
|
746
1027
|
.map(([key, value]) => `${key}=${value}`)
|
|
@@ -764,6 +1045,32 @@ export default class BootstrapCommandHandler {
|
|
|
764
1045
|
const stderr = error.stderr || error.message || '';
|
|
765
1046
|
if (stderr.includes('No changes to deploy')) {
|
|
766
1047
|
console.log(' ℹ️ Stack is up to date — no changes needed');
|
|
1048
|
+
} else if (stderr.includes('ResourceExistenceCheck')) {
|
|
1049
|
+
// Resources already exist outside the stack — attempt import and retry
|
|
1050
|
+
console.log(' ⚠️ Pre-existing resources detected — attempting import...');
|
|
1051
|
+
this._resolveStackConflicts(stackName, parameters, profile, region);
|
|
1052
|
+
// Rebuild deploy command with updated parameters (e.g., CreateS3Buckets may now be 'false')
|
|
1053
|
+
const retryParamOverrides = Object.entries(parameters)
|
|
1054
|
+
.map(([key, value]) => `${key}=${value}`)
|
|
1055
|
+
.join(' ');
|
|
1056
|
+
const retryDeployCommand = [
|
|
1057
|
+
'aws cloudformation deploy',
|
|
1058
|
+
`--template-file ${STACK_TEMPLATE_PATH}`,
|
|
1059
|
+
`--stack-name ${stackName}`,
|
|
1060
|
+
'--capabilities CAPABILITY_NAMED_IAM',
|
|
1061
|
+
`--parameter-overrides ${retryParamOverrides}`,
|
|
1062
|
+
`--profile ${profile}`,
|
|
1063
|
+
`--region ${region}`
|
|
1064
|
+
].join(' ');
|
|
1065
|
+
// Retry the deploy after import
|
|
1066
|
+
try {
|
|
1067
|
+
execSync(retryDeployCommand, { encoding: 'utf8', stdio: ['pipe', 'pipe', 'pipe'] });
|
|
1068
|
+
} catch (retryError) {
|
|
1069
|
+
const retryStderr = retryError.stderr || retryError.message || '';
|
|
1070
|
+
if (!retryStderr.includes('No changes to deploy')) {
|
|
1071
|
+
throw retryError;
|
|
1072
|
+
}
|
|
1073
|
+
}
|
|
767
1074
|
} else {
|
|
768
1075
|
throw error;
|
|
769
1076
|
}
|
|
@@ -785,9 +1092,144 @@ export default class BootstrapCommandHandler {
|
|
|
785
1092
|
outputs[output.OutputKey] = output.OutputValue;
|
|
786
1093
|
}
|
|
787
1094
|
|
|
1095
|
+
// If S3 buckets already existed (skipped creation), inject their names
|
|
1096
|
+
// into outputs so the profile config gets populated correctly.
|
|
1097
|
+
if (this._preExistingBuckets && this._preExistingBuckets.length > 0) {
|
|
1098
|
+
const bucketOutputMap = {
|
|
1099
|
+
'AsyncS3Bucket': 'AsyncS3BucketName',
|
|
1100
|
+
'BatchS3Bucket': 'BatchS3BucketName',
|
|
1101
|
+
'AdapterS3Bucket': 'AdapterS3BucketName',
|
|
1102
|
+
'BenchmarkS3Bucket': 'BenchmarkS3BucketName',
|
|
1103
|
+
'TuneS3Bucket': 'TuneS3BucketName'
|
|
1104
|
+
};
|
|
1105
|
+
for (const bucket of this._preExistingBuckets) {
|
|
1106
|
+
const outputKey = bucketOutputMap[bucket.logicalId];
|
|
1107
|
+
if (outputKey && !outputs[outputKey]) {
|
|
1108
|
+
outputs[outputKey] = bucket.name;
|
|
1109
|
+
}
|
|
1110
|
+
}
|
|
1111
|
+
this._preExistingBuckets = null;
|
|
1112
|
+
}
|
|
1113
|
+
|
|
788
1114
|
return outputs;
|
|
789
1115
|
}
|
|
790
1116
|
|
|
1117
|
+
/**
|
|
1118
|
+
* Resolve stack conflicts before deploying.
|
|
1119
|
+
*
|
|
1120
|
+
* Handles two scenarios that cause ResourceExistenceCheck failures:
|
|
1121
|
+
* 1. Ghost stacks (REVIEW_IN_PROGRESS) — delete them first
|
|
1122
|
+
* 2. Pre-existing S3 buckets not managed by the stack — import them
|
|
1123
|
+
*
|
|
1124
|
+
* @param {string} stackName - CloudFormation stack name
|
|
1125
|
+
* @param {object} parameters - Stack parameter key-value pairs
|
|
1126
|
+
* @param {string} profile - AWS CLI profile name
|
|
1127
|
+
* @param {string} region - AWS region
|
|
1128
|
+
*/
|
|
1129
|
+
_resolveStackConflicts(stackName, parameters, profile, region) {
|
|
1130
|
+
// Check if stack exists and its status
|
|
1131
|
+
let stackStatus = null;
|
|
1132
|
+
let managedResources = [];
|
|
1133
|
+
|
|
1134
|
+
try {
|
|
1135
|
+
const describeResult = this._execAws(
|
|
1136
|
+
`cloudformation describe-stacks --stack-name ${stackName} --region ${region}`,
|
|
1137
|
+
profile
|
|
1138
|
+
);
|
|
1139
|
+
const stack = describeResult.Stacks && describeResult.Stacks[0];
|
|
1140
|
+
if (stack) {
|
|
1141
|
+
stackStatus = stack.StackStatus;
|
|
1142
|
+
}
|
|
1143
|
+
} catch (_) {
|
|
1144
|
+
// Stack doesn't exist — no conflicts possible
|
|
1145
|
+
return;
|
|
1146
|
+
}
|
|
1147
|
+
|
|
1148
|
+
// Handle ghost stacks (created but never successfully deployed)
|
|
1149
|
+
if (stackStatus === 'REVIEW_IN_PROGRESS') {
|
|
1150
|
+
console.log(' ⚠️ Found ghost stack (REVIEW_IN_PROGRESS) — deleting before redeploy...');
|
|
1151
|
+
try {
|
|
1152
|
+
execSync(
|
|
1153
|
+
`aws cloudformation delete-stack --stack-name ${stackName} --profile ${profile} --region ${region}`,
|
|
1154
|
+
{ encoding: 'utf8', stdio: ['pipe', 'pipe', 'pipe'] }
|
|
1155
|
+
);
|
|
1156
|
+
execSync(
|
|
1157
|
+
`aws cloudformation wait stack-delete-complete --stack-name ${stackName} --profile ${profile} --region ${region}`,
|
|
1158
|
+
{ encoding: 'utf8', stdio: ['pipe', 'pipe', 'pipe'], timeout: 60000 }
|
|
1159
|
+
);
|
|
1160
|
+
console.log(' ✅ Ghost stack deleted');
|
|
1161
|
+
} catch (err) {
|
|
1162
|
+
console.log(` ⚠️ Could not delete ghost stack: ${err.message || err}`);
|
|
1163
|
+
}
|
|
1164
|
+
// Don't return — fall through to check for pre-existing S3 buckets
|
|
1165
|
+
// that need to be imported on the fresh deploy. The ghost stack had
|
|
1166
|
+
// DeletionPolicy:Retain buckets that survive stack deletion.
|
|
1167
|
+
stackStatus = null;
|
|
1168
|
+
managedResources = [];
|
|
1169
|
+
}
|
|
1170
|
+
|
|
1171
|
+
// For active stacks (or post-ghost-deletion), check if S3 buckets exist but aren't managed
|
|
1172
|
+
if (parameters.CreateS3Buckets !== 'true') {
|
|
1173
|
+
return; // Not creating buckets — no conflict
|
|
1174
|
+
}
|
|
1175
|
+
|
|
1176
|
+
// Get list of resources currently managed by the stack (empty if stack was just deleted)
|
|
1177
|
+
if (stackStatus) {
|
|
1178
|
+
try {
|
|
1179
|
+
const resources = this._execAws(
|
|
1180
|
+
`cloudformation list-stack-resources --stack-name ${stackName} --region ${region}`,
|
|
1181
|
+
profile
|
|
1182
|
+
);
|
|
1183
|
+
managedResources = (resources.StackResourceSummaries || [])
|
|
1184
|
+
.map(r => r.LogicalResourceId);
|
|
1185
|
+
} catch (_) {
|
|
1186
|
+
// Stack doesn't exist or can't be queried — proceed with empty managedResources
|
|
1187
|
+
}
|
|
1188
|
+
}
|
|
1189
|
+
|
|
1190
|
+
// Check each S3 bucket that the template would create
|
|
1191
|
+
const accountId = this._currentAccountId;
|
|
1192
|
+
const bucketConfigs = [
|
|
1193
|
+
{ logicalId: 'AsyncS3Bucket', name: `mlcc-async-${accountId}-${region}` },
|
|
1194
|
+
{ logicalId: 'BatchS3Bucket', name: `mlcc-batch-${accountId}-${region}` },
|
|
1195
|
+
{ logicalId: 'AdapterS3Bucket', name: `mlcc-adapters-${accountId}-${region}` },
|
|
1196
|
+
{ logicalId: 'BenchmarkS3Bucket', name: `mlcc-benchmark-${accountId}-${region}` },
|
|
1197
|
+
{ logicalId: 'TuneS3Bucket', name: `mlcc-tune-${accountId}-${region}` }
|
|
1198
|
+
];
|
|
1199
|
+
|
|
1200
|
+
const bucketsToImport = [];
|
|
1201
|
+
|
|
1202
|
+
for (const bucket of bucketConfigs) {
|
|
1203
|
+
if (managedResources.includes(bucket.logicalId)) {
|
|
1204
|
+
continue; // Already managed by the stack — no conflict
|
|
1205
|
+
}
|
|
1206
|
+
// Check if bucket exists in AWS
|
|
1207
|
+
try {
|
|
1208
|
+
execSync(
|
|
1209
|
+
`aws s3api head-bucket --bucket ${bucket.name} --profile ${profile} --region ${region}`,
|
|
1210
|
+
{ encoding: 'utf8', stdio: ['pipe', 'pipe', 'pipe'] }
|
|
1211
|
+
);
|
|
1212
|
+
// Bucket exists but not in stack — needs import
|
|
1213
|
+
bucketsToImport.push(bucket);
|
|
1214
|
+
} catch (_) {
|
|
1215
|
+
// Bucket doesn't exist — will be created normally
|
|
1216
|
+
}
|
|
1217
|
+
}
|
|
1218
|
+
|
|
1219
|
+
if (bucketsToImport.length > 0) {
|
|
1220
|
+
console.log(` ℹ️ ${bucketsToImport.length} pre-existing S3 bucket(s) detected — skipping S3 creation (buckets already exist)`);
|
|
1221
|
+
|
|
1222
|
+
// Pre-existing S3 buckets survive stack deletion (DeletionPolicy: Retain).
|
|
1223
|
+
// Rather than fighting CloudFormation's IMPORT limitations, just skip S3
|
|
1224
|
+
// creation and wire the existing bucket names into the profile config directly.
|
|
1225
|
+
// The naming convention is deterministic, so we know exactly what they are.
|
|
1226
|
+
this._preExistingBuckets = bucketsToImport;
|
|
1227
|
+
|
|
1228
|
+
// Modify the parameters to skip S3 bucket creation in the deploy
|
|
1229
|
+
parameters.CreateS3Buckets = 'false';
|
|
1230
|
+
}
|
|
1231
|
+
}
|
|
1232
|
+
|
|
791
1233
|
/**
|
|
792
1234
|
* Write a JSON object to a temp file and return the `file://` path.
|
|
793
1235
|
* Used for passing complex JSON to AWS CLI commands without shell escaping issues.
|
|
@@ -821,6 +1263,125 @@ export default class BootstrapCommandHandler {
|
|
|
821
1263
|
}
|
|
822
1264
|
}
|
|
823
1265
|
|
|
1266
|
+
/**
|
|
1267
|
+
* Get the AWS account ID from the caller's credentials.
|
|
1268
|
+
* Uses `sts get-caller-identity` to resolve the actual account.
|
|
1269
|
+
*
|
|
1270
|
+
* @param {string} awsProfile - AWS CLI profile name
|
|
1271
|
+
* @returns {string} The 12-digit AWS account ID
|
|
1272
|
+
*/
|
|
1273
|
+
_getCallerAccount(awsProfile) {
|
|
1274
|
+
const identity = this._execAws('sts get-caller-identity', awsProfile);
|
|
1275
|
+
return identity.Account;
|
|
1276
|
+
}
|
|
1277
|
+
|
|
1278
|
+
/**
|
|
1279
|
+
* Scan all profiles to find one with ciInfraProvisioned=true,
|
|
1280
|
+
* excluding the given profile name.
|
|
1281
|
+
*
|
|
1282
|
+
* @param {string} excludeProfile - Profile name to exclude from the scan
|
|
1283
|
+
* @returns {{ name: string, config: Object }|null} The CI profile, or null if none found
|
|
1284
|
+
*/
|
|
1285
|
+
_findExistingCiProfile(excludeProfile) {
|
|
1286
|
+
const config = this.config.read();
|
|
1287
|
+
if (!config || !config.profiles) return null;
|
|
1288
|
+
|
|
1289
|
+
for (const [name, profileConfig] of Object.entries(config.profiles)) {
|
|
1290
|
+
if (name === excludeProfile) continue;
|
|
1291
|
+
if (profileConfig.ciInfraProvisioned) {
|
|
1292
|
+
return { name, config: profileConfig };
|
|
1293
|
+
}
|
|
1294
|
+
}
|
|
1295
|
+
return null;
|
|
1296
|
+
}
|
|
1297
|
+
|
|
1298
|
+
/**
|
|
1299
|
+
* Ensure an MLCC-owned MLflow App exists for experiment tracking.
|
|
1300
|
+
* Creates one if it doesn't exist, using the tune S3 bucket as artifact store.
|
|
1301
|
+
*
|
|
1302
|
+
* @param {object} profileData - Bootstrap profile data (needs roleArn, awsRegion, accountId)
|
|
1303
|
+
* @param {string} awsProfile - AWS CLI profile name
|
|
1304
|
+
* @returns {string|null} MLflow App ARN or null if creation failed
|
|
1305
|
+
*/
|
|
1306
|
+
_ensureMlflowApp(profileData, awsProfile) {
|
|
1307
|
+
const region = profileData.awsRegion;
|
|
1308
|
+
const accountId = profileData.accountId;
|
|
1309
|
+
const roleArn = profileData.roleArn;
|
|
1310
|
+
const appName = 'mlcc-tune-tracking';
|
|
1311
|
+
const artifactBucket = `mlcc-tune-${accountId}-${region}`;
|
|
1312
|
+
|
|
1313
|
+
// Check if MLCC app already exists
|
|
1314
|
+
try {
|
|
1315
|
+
const apps = this._execAws(
|
|
1316
|
+
`sagemaker list-mlflow-apps --region ${region}`,
|
|
1317
|
+
awsProfile
|
|
1318
|
+
);
|
|
1319
|
+
const summaries = apps.Summaries || [];
|
|
1320
|
+
const existing = summaries.find(a => a.Name === appName);
|
|
1321
|
+
if (existing) {
|
|
1322
|
+
return existing.Arn;
|
|
1323
|
+
}
|
|
1324
|
+
} catch {
|
|
1325
|
+
// list-mlflow-apps may not be available in all CLI versions — proceed to create
|
|
1326
|
+
}
|
|
1327
|
+
|
|
1328
|
+
// Create the MLflow App
|
|
1329
|
+
console.log(` Creating MLflow App "${appName}" with artifact store s3://${artifactBucket}...`);
|
|
1330
|
+
|
|
1331
|
+
// Ensure the artifact bucket exists (it's the tune bucket from the stack)
|
|
1332
|
+
try {
|
|
1333
|
+
this._execAws(
|
|
1334
|
+
`s3api head-bucket --bucket ${artifactBucket} --region ${region}`,
|
|
1335
|
+
awsProfile
|
|
1336
|
+
);
|
|
1337
|
+
} catch {
|
|
1338
|
+
// Bucket doesn't exist — create it
|
|
1339
|
+
console.log(` Creating artifact bucket: ${artifactBucket}`);
|
|
1340
|
+
try {
|
|
1341
|
+
this._execAws(
|
|
1342
|
+
`s3api create-bucket --bucket ${artifactBucket} --region ${region} --create-bucket-configuration LocationConstraint=${region}`,
|
|
1343
|
+
awsProfile
|
|
1344
|
+
);
|
|
1345
|
+
} catch (bucketErr) {
|
|
1346
|
+
// May already exist or region doesn't need LocationConstraint (us-east-1)
|
|
1347
|
+
if (!bucketErr.message?.includes('BucketAlreadyOwnedByYou')) {
|
|
1348
|
+
try {
|
|
1349
|
+
this._execAws(
|
|
1350
|
+
`s3api create-bucket --bucket ${artifactBucket} --region ${region}`,
|
|
1351
|
+
awsProfile
|
|
1352
|
+
);
|
|
1353
|
+
} catch {
|
|
1354
|
+
// Bucket likely exists, continue
|
|
1355
|
+
}
|
|
1356
|
+
}
|
|
1357
|
+
}
|
|
1358
|
+
}
|
|
1359
|
+
|
|
1360
|
+
// Create the app
|
|
1361
|
+
try {
|
|
1362
|
+
const result = this._execAws(
|
|
1363
|
+
`sagemaker create-mlflow-app --name ${appName} --artifact-store-uri s3://${artifactBucket} --role-arn ${roleArn} --model-registration-mode AutoModelRegistrationEnabled --region ${region}`,
|
|
1364
|
+
awsProfile
|
|
1365
|
+
);
|
|
1366
|
+
return result.Arn;
|
|
1367
|
+
} catch (err) {
|
|
1368
|
+
// If app already exists (race condition), try to describe it
|
|
1369
|
+
if (err.message?.includes('ResourceLimitExceeded') || err.message?.includes('already exists')) {
|
|
1370
|
+
try {
|
|
1371
|
+
const apps = this._execAws(
|
|
1372
|
+
`sagemaker list-mlflow-apps --region ${region}`,
|
|
1373
|
+
awsProfile
|
|
1374
|
+
);
|
|
1375
|
+
const found = (apps.Summaries || []).find(a => a.Name === appName);
|
|
1376
|
+
if (found) return found.Arn;
|
|
1377
|
+
} catch {
|
|
1378
|
+
// Fall through
|
|
1379
|
+
}
|
|
1380
|
+
}
|
|
1381
|
+
throw err;
|
|
1382
|
+
}
|
|
1383
|
+
}
|
|
1384
|
+
|
|
824
1385
|
/**
|
|
825
1386
|
* Format tags for the AWS CLI --tags parameter.
|
|
826
1387
|
* Writes tags to a temp file and returns the file:// reference
|
|
@@ -858,6 +1419,8 @@ SUBCOMMANDS:
|
|
|
858
1419
|
scan Discover pre-existing MLCC-managed resources in AWS
|
|
859
1420
|
prune Remove deleted and unknown records from the deployment manifest
|
|
860
1421
|
update Re-deploy bootstrap stacks using active profile (no prompts)
|
|
1422
|
+
migrate Upgrade legacy profiles to current naming conventions
|
|
1423
|
+
sync-model-families Discover tune-eligible models from JumpStart Hub and update catalog
|
|
861
1424
|
|
|
862
1425
|
SETUP OPTIONS:
|
|
863
1426
|
--non-interactive Run without interactive prompts
|
|
@@ -886,6 +1449,8 @@ EXAMPLES:
|
|
|
886
1449
|
ml-container-creator bootstrap remove dev
|
|
887
1450
|
ml-container-creator bootstrap remove dev --force --delete-stack
|
|
888
1451
|
ml-container-creator bootstrap scan
|
|
1452
|
+
ml-container-creator bootstrap sync-model-families
|
|
1453
|
+
ml-container-creator bootstrap migrate
|
|
889
1454
|
ml-container-creator bootstrap --non-interactive --profile my-aws-profile --region us-west-2
|
|
890
1455
|
ml-container-creator bootstrap --non-interactive --profile my-aws-profile --role-arn arn:aws:iam::123456789012:role/MyRole --skip-s3
|
|
891
1456
|
ml-container-creator bootstrap --non-interactive --profile my-aws-profile --region us-west-2 --ci
|