@aws/ml-container-creator 0.9.1 → 0.10.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE-THIRD-PARTY +9304 -0
- package/bin/cli.js +2 -0
- package/config/bootstrap-e2e-stack.json +341 -0
- package/config/bootstrap-stack.json +40 -3
- package/config/parameter-schema-v2.json +2049 -0
- package/config/tune-catalog.json +1781 -0
- package/infra/ci-harness/buildspec.yml +1 -0
- package/infra/ci-harness/lambda/path-prover/brain.ts +306 -0
- package/infra/ci-harness/lambda/path-prover/write-results.ts +152 -0
- package/infra/ci-harness/lib/ci-harness-stack.ts +837 -7
- package/infra/ci-harness/state-machines/path-prover.asl.json +496 -0
- package/package.json +53 -68
- package/servers/base-image-picker/index.js +121 -121
- package/servers/e2e-status/index.js +297 -0
- package/servers/e2e-status/manifest.json +14 -0
- package/servers/e2e-status/package.json +15 -0
- package/servers/endpoint-picker/LICENSE +202 -0
- package/servers/endpoint-picker/index.js +536 -0
- package/servers/endpoint-picker/manifest.json +14 -0
- package/servers/endpoint-picker/package.json +18 -0
- package/servers/hyperpod-cluster-picker/index.js +125 -125
- package/servers/instance-sizer/index.js +138 -138
- package/servers/instance-sizer/lib/instance-ranker.js +76 -76
- package/servers/instance-sizer/lib/model-resolver.js +61 -61
- package/servers/instance-sizer/lib/quota-resolver.js +113 -113
- package/servers/instance-sizer/lib/vram-estimator.js +31 -31
- package/servers/lib/bedrock-client.js +38 -38
- package/servers/lib/catalogs/jumpstart-public.json +101 -16
- package/servers/lib/catalogs/model-servers.json +201 -3
- package/servers/lib/catalogs/models.json +182 -26
- package/servers/lib/custom-validators.js +13 -13
- package/servers/lib/dynamic-resolver.js +4 -4
- package/servers/marketplace-picker/index.js +342 -0
- package/servers/marketplace-picker/manifest.json +14 -0
- package/servers/marketplace-picker/package.json +18 -0
- package/servers/model-picker/index.js +382 -382
- package/servers/region-picker/index.js +56 -56
- package/servers/workload-picker/LICENSE +202 -0
- package/servers/workload-picker/catalogs/workload-profiles.json +67 -0
- package/servers/workload-picker/index.js +171 -0
- package/servers/workload-picker/manifest.json +16 -0
- package/servers/workload-picker/package.json +16 -0
- package/src/app.js +4 -390
- package/src/lib/bootstrap-command-handler.js +710 -1148
- package/src/lib/bootstrap-config.js +36 -0
- package/src/lib/bootstrap-profile-manager.js +641 -0
- package/src/lib/bootstrap-provisioners.js +421 -0
- package/src/lib/ci-register-helpers.js +74 -0
- package/src/lib/config-loader.js +408 -0
- package/src/lib/config-manager.js +66 -1685
- package/src/lib/config-mcp-client.js +118 -0
- package/src/lib/config-validator.js +634 -0
- package/src/lib/cuda-resolver.js +149 -0
- package/src/lib/e2e-catalog-validator.js +251 -3
- package/src/lib/e2e-ci-recorder.js +103 -0
- package/src/lib/generated/cli-options.js +315 -311
- package/src/lib/generated/parameter-matrix.js +671 -0
- package/src/lib/generated/validation-rules.js +71 -71
- package/src/lib/marketplace-flow.js +276 -0
- package/src/lib/mcp-query-runner.js +768 -0
- package/src/lib/parameter-schema-validator.js +62 -18
- package/src/lib/path-prover-brain.js +607 -0
- package/src/lib/prompt-runner.js +41 -1504
- package/src/lib/prompts/feature-prompts.js +172 -0
- package/src/lib/prompts/index.js +48 -0
- package/src/lib/prompts/infrastructure-prompts.js +690 -0
- package/src/lib/prompts/model-prompts.js +552 -0
- package/src/lib/prompts/project-prompts.js +82 -0
- package/src/lib/prompts.js +2 -1446
- package/src/lib/registry-command-handler.js +135 -3
- package/src/lib/secrets-prompt-runner.js +251 -0
- package/src/lib/template-variable-resolver.js +422 -0
- package/src/lib/tune-catalog-validator.js +37 -4
- package/templates/Dockerfile +9 -0
- package/templates/code/adapter_sidecar.py +444 -0
- package/templates/code/serve +6 -0
- package/templates/code/serve.d/vllm.ejs +1 -1
- package/templates/do/.benchmark_writer.py +1476 -0
- package/templates/do/.tune_helper.py +982 -57
- package/templates/do/__pycache__/.benchmark_writer.cpython-312.pyc +0 -0
- package/templates/do/adapter +149 -0
- package/templates/do/benchmark +639 -85
- package/templates/do/config +108 -5
- package/templates/do/deploy.d/managed-inference.ejs +192 -11
- package/templates/do/optimize +106 -37
- package/templates/do/register +89 -0
- package/templates/do/test +13 -0
- package/templates/do/tune +378 -59
- package/templates/do/validate +44 -4
- package/config/parameter-schema.json +0 -88
|
@@ -2,7 +2,9 @@ import * as cdk from 'aws-cdk-lib';
|
|
|
2
2
|
import * as dynamodb from 'aws-cdk-lib/aws-dynamodb';
|
|
3
3
|
import * as events from 'aws-cdk-lib/aws-events';
|
|
4
4
|
import * as targets from 'aws-cdk-lib/aws-events-targets';
|
|
5
|
+
import * as glue from 'aws-cdk-lib/aws-glue';
|
|
5
6
|
import * as iam from 'aws-cdk-lib/aws-iam';
|
|
7
|
+
import * as s3 from 'aws-cdk-lib/aws-s3';
|
|
6
8
|
import * as lambda from 'aws-cdk-lib/aws-lambda';
|
|
7
9
|
import { NodejsFunction } from 'aws-cdk-lib/aws-lambda-nodejs';
|
|
8
10
|
import * as logs from 'aws-cdk-lib/aws-logs';
|
|
@@ -117,8 +119,12 @@ export class MlccCiHarnessStack extends cdk.Stack {
|
|
|
117
119
|
});
|
|
118
120
|
|
|
119
121
|
// Scanner Lambda IAM role with least-privilege permissions
|
|
122
|
+
//
|
|
123
|
+
// RETAIN policy: IAM roles are retained on stack deletion to prevent conflicts
|
|
124
|
+
// during multi-region bootstrap. If the stack is re-created, existing roles will
|
|
125
|
+
// be reused via --no-rollback.
|
|
120
126
|
const scannerRole = new iam.Role(this, 'ScannerRole', {
|
|
121
|
-
roleName:
|
|
127
|
+
roleName: `mlcc-ci-scanner-role-${this.region}`,
|
|
122
128
|
assumedBy: new iam.ServicePrincipal('lambda.amazonaws.com'),
|
|
123
129
|
description: 'IAM role for the MLCC CI Scanner Lambda function',
|
|
124
130
|
});
|
|
@@ -178,16 +184,23 @@ export class MlccCiHarnessStack extends cdk.Stack {
|
|
|
178
184
|
|
|
179
185
|
// Step Functions Orchestrator IAM role
|
|
180
186
|
// Permissions for DynamoDB UpdateItem, Logs, and CodeBuild are defined here.
|
|
187
|
+
//
|
|
188
|
+
// RETAIN policy: IAM roles are retained on stack deletion to prevent conflicts
|
|
189
|
+
// during multi-region bootstrap. If the stack is re-created, existing roles will
|
|
190
|
+
// be reused via --no-rollback.
|
|
181
191
|
this.orchestratorRole = new iam.Role(this, 'OrchestratorRole', {
|
|
182
|
-
roleName:
|
|
183
|
-
assumedBy: new iam.
|
|
192
|
+
roleName: `mlcc-ci-orchestrator-role-${this.region}`,
|
|
193
|
+
assumedBy: new iam.CompositePrincipal(
|
|
194
|
+
new iam.ServicePrincipal('states.amazonaws.com'),
|
|
195
|
+
new iam.ServicePrincipal('events.amazonaws.com'),
|
|
196
|
+
),
|
|
184
197
|
description: 'IAM role for the MLCC CI Orchestrator Step Functions state machine',
|
|
185
198
|
});
|
|
186
199
|
|
|
187
200
|
// DynamoDB:UpdateItem on CI_Table for UpdateResults states
|
|
188
201
|
this.orchestratorRole.addToPolicy(new iam.PolicyStatement({
|
|
189
202
|
effect: iam.Effect.ALLOW,
|
|
190
|
-
actions: ['dynamodb:UpdateItem'],
|
|
203
|
+
actions: ['dynamodb:UpdateItem', 'dynamodb:GetItem'],
|
|
191
204
|
resources: [this.ciTable.tableArn],
|
|
192
205
|
}));
|
|
193
206
|
|
|
@@ -257,6 +270,14 @@ export class MlccCiHarnessStack extends cdk.Stack {
|
|
|
257
270
|
Value: this.ciLogGroup.logGroupName,
|
|
258
271
|
Type: 'PLAINTEXT',
|
|
259
272
|
},
|
|
273
|
+
{
|
|
274
|
+
// Benchmark concurrency levels (comma-separated string, e.g. "1,4,8")
|
|
275
|
+
// Set by the CI Scanner Lambda from DynamoDB benchmarkConcurrencyLevels field.
|
|
276
|
+
// Falls back to default [1,4,8] in do/benchmark if empty.
|
|
277
|
+
Name: 'BENCHMARK_CONCURRENCY_LEVELS',
|
|
278
|
+
'Value.$': '$.benchmarkConcurrencyLevels',
|
|
279
|
+
Type: 'PLAINTEXT',
|
|
280
|
+
},
|
|
260
281
|
],
|
|
261
282
|
},
|
|
262
283
|
ResultPath: '$.buildResult',
|
|
@@ -369,6 +390,270 @@ export class MlccCiHarnessStack extends cdk.Stack {
|
|
|
369
390
|
BackoffRate: 2.0,
|
|
370
391
|
},
|
|
371
392
|
],
|
|
393
|
+
},
|
|
394
|
+
});
|
|
395
|
+
|
|
396
|
+
// ─── Stage 2 Orchestration (Req 1.1, 1.2, 1.3) ──────────────────────────
|
|
397
|
+
// After Stage 1 completes, the orchestrator reads the DynamoDB record to
|
|
398
|
+
// determine if the build passed and if benchmark is enabled. Stage 2 runs
|
|
399
|
+
// benchmarks asynchronously and does NOT affect testStatus on failure.
|
|
400
|
+
|
|
401
|
+
// GetBenchmarkConfig: Read benchmarkEnabled flag from DynamoDB after Stage 1
|
|
402
|
+
const getBenchmarkConfig = new sfn.CustomState(this, 'GetBenchmarkConfig', {
|
|
403
|
+
stateJson: {
|
|
404
|
+
Type: 'Task',
|
|
405
|
+
Resource: 'arn:aws:states:::dynamodb:getItem',
|
|
406
|
+
Parameters: {
|
|
407
|
+
TableName: this.ciTable.tableName,
|
|
408
|
+
Key: {
|
|
409
|
+
configId: { 'S.$': '$.configId' },
|
|
410
|
+
},
|
|
411
|
+
ProjectionExpression: 'testStatus, benchmarkEnabled, benchmarkConcurrencyLevels',
|
|
412
|
+
ConsistentRead: true,
|
|
413
|
+
},
|
|
414
|
+
ResultPath: '$.dynamoResult',
|
|
415
|
+
Retry: [
|
|
416
|
+
{
|
|
417
|
+
ErrorEquals: ['States.ALL'],
|
|
418
|
+
IntervalSeconds: 2,
|
|
419
|
+
MaxAttempts: 3,
|
|
420
|
+
BackoffRate: 2.0,
|
|
421
|
+
},
|
|
422
|
+
],
|
|
423
|
+
},
|
|
424
|
+
});
|
|
425
|
+
|
|
426
|
+
// ExtractBenchmarkFlags: Extract benchmarkEnabled and testStatus into top-level fields
|
|
427
|
+
const extractBenchmarkFlags = new sfn.Pass(this, 'ExtractBenchmarkFlags', {
|
|
428
|
+
parameters: {
|
|
429
|
+
'configId.$': '$.configId',
|
|
430
|
+
'configJson.$': '$.configJson',
|
|
431
|
+
'buildStrategy.$': '$.buildStrategy',
|
|
432
|
+
'startTime.$': '$.startTime',
|
|
433
|
+
'buildStatus.$': '$.buildStatus',
|
|
434
|
+
'testStatus.$': '$.dynamoResult.Item.testStatus.S',
|
|
435
|
+
'benchmarkEnabled.$': '$.dynamoResult.Item.benchmarkEnabled.BOOL',
|
|
436
|
+
},
|
|
437
|
+
resultPath: '$',
|
|
438
|
+
});
|
|
439
|
+
|
|
440
|
+
// ExtractBenchmarkFlagsDefault: Fallback when benchmarkEnabled is not set in DynamoDB
|
|
441
|
+
// (backward-compatible — absence means disabled)
|
|
442
|
+
const extractBenchmarkFlagsDefault = new sfn.Pass(this, 'ExtractBenchmarkFlagsDefault', {
|
|
443
|
+
parameters: {
|
|
444
|
+
'configId.$': '$.configId',
|
|
445
|
+
'configJson.$': '$.configJson',
|
|
446
|
+
'buildStrategy.$': '$.buildStrategy',
|
|
447
|
+
'startTime.$': '$.startTime',
|
|
448
|
+
'buildStatus.$': '$.buildStatus',
|
|
449
|
+
'testStatus': 'unknown',
|
|
450
|
+
'benchmarkEnabled': false,
|
|
451
|
+
},
|
|
452
|
+
resultPath: '$',
|
|
453
|
+
});
|
|
454
|
+
|
|
455
|
+
// CheckDynamoItemHasBenchmarkField: determine if the DynamoDB response contains
|
|
456
|
+
// the benchmarkEnabled field. If not present, default to false.
|
|
457
|
+
const checkDynamoItemHasBenchmarkField = new sfn.Choice(this, 'CheckDynamoItemHasBenchmarkField')
|
|
458
|
+
.when(
|
|
459
|
+
sfn.Condition.isPresent('$.dynamoResult.Item.benchmarkEnabled'),
|
|
460
|
+
extractBenchmarkFlags,
|
|
461
|
+
)
|
|
462
|
+
.otherwise(extractBenchmarkFlagsDefault);
|
|
463
|
+
|
|
464
|
+
// CheckBenchmarkEnabled: Determine if Stage 1 passed (testStatus from DynamoDB read)
|
|
465
|
+
// If passed + benchmarkEnabled=true → Stage 2
|
|
466
|
+
// If passed + benchmarkEnabled=false → skip to End
|
|
467
|
+
// If failed → skip to End (do/clean already ran in CodeBuild's post_build)
|
|
468
|
+
const prepareStage2Input = new sfn.Pass(this, 'PrepareStage2Input', {
|
|
469
|
+
parameters: {
|
|
470
|
+
'configId.$': '$.configId',
|
|
471
|
+
'configJson.$': '$.configJson',
|
|
472
|
+
'buildStrategy.$': '$.buildStrategy',
|
|
473
|
+
},
|
|
474
|
+
});
|
|
475
|
+
|
|
476
|
+
const skipStage2 = new sfn.Succeed(this, 'SkipStage2');
|
|
477
|
+
|
|
478
|
+
const checkBenchmarkEnabled = new sfn.Choice(this, 'CheckBenchmarkEnabled')
|
|
479
|
+
.when(
|
|
480
|
+
sfn.Condition.and(
|
|
481
|
+
sfn.Condition.stringEquals('$.testStatus', 'pass'),
|
|
482
|
+
sfn.Condition.booleanEquals('$.benchmarkEnabled', true),
|
|
483
|
+
),
|
|
484
|
+
prepareStage2Input,
|
|
485
|
+
)
|
|
486
|
+
.otherwise(skipStage2);
|
|
487
|
+
|
|
488
|
+
// Stage2Benchmark: Run do/benchmark via CodeBuild
|
|
489
|
+
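// NOTE(review): Resource below is 'codebuild:startBuild' without the '.sync' suffix, so this state does not wait for build completion — confirm whether '.sync' was intended.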
|
|
490
|
+
const stage2Benchmark = new sfn.CustomState(this, 'Stage2Benchmark', {
|
|
491
|
+
stateJson: {
|
|
492
|
+
Type: 'Task',
|
|
493
|
+
Resource: 'arn:aws:states:::codebuild:startBuild',
|
|
494
|
+
Parameters: {
|
|
495
|
+
ProjectName: 'mlcc-ci-executor',
|
|
496
|
+
EnvironmentVariablesOverride: [
|
|
497
|
+
{
|
|
498
|
+
Name: 'CONFIG_ID',
|
|
499
|
+
'Value.$': '$.configId',
|
|
500
|
+
Type: 'PLAINTEXT',
|
|
501
|
+
},
|
|
502
|
+
{
|
|
503
|
+
Name: 'CONFIG_JSON',
|
|
504
|
+
'Value.$': '$.configJson',
|
|
505
|
+
Type: 'PLAINTEXT',
|
|
506
|
+
},
|
|
507
|
+
{
|
|
508
|
+
Name: 'CI_STAGE',
|
|
509
|
+
Value: 'stage2-benchmark',
|
|
510
|
+
Type: 'PLAINTEXT',
|
|
511
|
+
},
|
|
512
|
+
{
|
|
513
|
+
Name: 'CI_TABLE_NAME',
|
|
514
|
+
Value: this.ciTable.tableName,
|
|
515
|
+
Type: 'PLAINTEXT',
|
|
516
|
+
},
|
|
517
|
+
{
|
|
518
|
+
Name: 'CI_LOG_GROUP',
|
|
519
|
+
Value: this.ciLogGroup.logGroupName,
|
|
520
|
+
Type: 'PLAINTEXT',
|
|
521
|
+
},
|
|
522
|
+
],
|
|
523
|
+
},
|
|
524
|
+
ResultPath: '$.stage2BuildResult',
|
|
525
|
+
},
|
|
526
|
+
});
|
|
527
|
+
|
|
528
|
+
// Stage2RegisterBenchmark: Run do/register --benchmark-status via CodeBuild
|
|
529
|
+
const stage2RegisterBenchmark = new sfn.CustomState(this, 'Stage2RegisterBenchmark', {
|
|
530
|
+
stateJson: {
|
|
531
|
+
Type: 'Task',
|
|
532
|
+
Resource: 'arn:aws:states:::codebuild:startBuild',
|
|
533
|
+
Parameters: {
|
|
534
|
+
ProjectName: 'mlcc-ci-executor',
|
|
535
|
+
EnvironmentVariablesOverride: [
|
|
536
|
+
{
|
|
537
|
+
Name: 'CONFIG_ID',
|
|
538
|
+
'Value.$': '$.configId',
|
|
539
|
+
Type: 'PLAINTEXT',
|
|
540
|
+
},
|
|
541
|
+
{
|
|
542
|
+
Name: 'CONFIG_JSON',
|
|
543
|
+
'Value.$': '$.configJson',
|
|
544
|
+
Type: 'PLAINTEXT',
|
|
545
|
+
},
|
|
546
|
+
{
|
|
547
|
+
Name: 'CI_STAGE',
|
|
548
|
+
Value: 'stage2-register-benchmark',
|
|
549
|
+
Type: 'PLAINTEXT',
|
|
550
|
+
},
|
|
551
|
+
{
|
|
552
|
+
Name: 'BENCHMARK_STATUS',
|
|
553
|
+
Value: 'completed',
|
|
554
|
+
Type: 'PLAINTEXT',
|
|
555
|
+
},
|
|
556
|
+
{
|
|
557
|
+
Name: 'CI_TABLE_NAME',
|
|
558
|
+
Value: this.ciTable.tableName,
|
|
559
|
+
Type: 'PLAINTEXT',
|
|
560
|
+
},
|
|
561
|
+
],
|
|
562
|
+
},
|
|
563
|
+
ResultPath: '$.stage2RegisterResult',
|
|
564
|
+
},
|
|
565
|
+
});
|
|
566
|
+
|
|
567
|
+
// Stage2Clean: Run do/clean after benchmark stage completes (success path)
|
|
568
|
+
const stage2Clean = new sfn.CustomState(this, 'Stage2Clean', {
|
|
569
|
+
stateJson: {
|
|
570
|
+
Type: 'Task',
|
|
571
|
+
Resource: 'arn:aws:states:::codebuild:startBuild',
|
|
572
|
+
Parameters: {
|
|
573
|
+
ProjectName: 'mlcc-ci-executor',
|
|
574
|
+
EnvironmentVariablesOverride: [
|
|
575
|
+
{
|
|
576
|
+
Name: 'CONFIG_ID',
|
|
577
|
+
'Value.$': '$.configId',
|
|
578
|
+
Type: 'PLAINTEXT',
|
|
579
|
+
},
|
|
580
|
+
{
|
|
581
|
+
Name: 'CONFIG_JSON',
|
|
582
|
+
'Value.$': '$.configJson',
|
|
583
|
+
Type: 'PLAINTEXT',
|
|
584
|
+
},
|
|
585
|
+
{
|
|
586
|
+
Name: 'CI_STAGE',
|
|
587
|
+
Value: 'stage2-clean',
|
|
588
|
+
Type: 'PLAINTEXT',
|
|
589
|
+
},
|
|
590
|
+
],
|
|
591
|
+
},
|
|
592
|
+
ResultPath: '$.stage2CleanResult',
|
|
593
|
+
},
|
|
594
|
+
});
|
|
595
|
+
|
|
596
|
+
// Stage2FailureHandler: Handle Stage 2 failures without affecting testStatus.
|
|
597
|
+
// Records lastBenchmarkStatus=failed in DynamoDB, then proceeds to clean.
|
|
598
|
+
// Per Req 1.4: Stage 2 failure SHALL NOT change the DynamoDB testStatus.
|
|
599
|
+
// Uses SET expression targeting ONLY the 3 benchmark fields — never touches
|
|
600
|
+
// testStatus, configJson, or any other pre-existing field.
|
|
601
|
+
const stage2FailureHandler = new sfn.CustomState(this, 'Stage2FailureHandler', {
|
|
602
|
+
stateJson: {
|
|
603
|
+
Type: 'Task',
|
|
604
|
+
Resource: 'arn:aws:states:::dynamodb:updateItem',
|
|
605
|
+
Parameters: {
|
|
606
|
+
TableName: this.ciTable.tableName,
|
|
607
|
+
Key: {
|
|
608
|
+
configId: { 'S.$': '$.configId' },
|
|
609
|
+
},
|
|
610
|
+
UpdateExpression: 'SET lastBenchmarkRunId = :rid, lastBenchmarkTimestamp = :ts, lastBenchmarkStatus = :status',
|
|
611
|
+
ExpressionAttributeValues: {
|
|
612
|
+
':rid': {
|
|
613
|
+
'S.$': "States.Format('bmk-failure-{}', $.configId)",
|
|
614
|
+
},
|
|
615
|
+
':ts': { 'S.$': '$$.State.EnteredTime' },
|
|
616
|
+
':status': { 'S': 'failed' },
|
|
617
|
+
},
|
|
618
|
+
},
|
|
619
|
+
ResultPath: '$.stage2FailureUpdateResult',
|
|
620
|
+
Retry: [
|
|
621
|
+
{
|
|
622
|
+
ErrorEquals: ['States.ALL'],
|
|
623
|
+
IntervalSeconds: 2,
|
|
624
|
+
MaxAttempts: 3,
|
|
625
|
+
BackoffRate: 2.0,
|
|
626
|
+
},
|
|
627
|
+
],
|
|
628
|
+
},
|
|
629
|
+
});
|
|
630
|
+
|
|
631
|
+
// Stage2FailureClean: Clean up after a Stage 2 failure
|
|
632
|
+
const stage2FailureClean = new sfn.CustomState(this, 'Stage2FailureClean', {
|
|
633
|
+
stateJson: {
|
|
634
|
+
Type: 'Task',
|
|
635
|
+
Resource: 'arn:aws:states:::codebuild:startBuild',
|
|
636
|
+
Parameters: {
|
|
637
|
+
ProjectName: 'mlcc-ci-executor',
|
|
638
|
+
EnvironmentVariablesOverride: [
|
|
639
|
+
{
|
|
640
|
+
Name: 'CONFIG_ID',
|
|
641
|
+
'Value.$': '$.configId',
|
|
642
|
+
Type: 'PLAINTEXT',
|
|
643
|
+
},
|
|
644
|
+
{
|
|
645
|
+
Name: 'CONFIG_JSON',
|
|
646
|
+
'Value.$': '$.configJson',
|
|
647
|
+
Type: 'PLAINTEXT',
|
|
648
|
+
},
|
|
649
|
+
{
|
|
650
|
+
Name: 'CI_STAGE',
|
|
651
|
+
Value: 'stage2-clean',
|
|
652
|
+
Type: 'PLAINTEXT',
|
|
653
|
+
},
|
|
654
|
+
],
|
|
655
|
+
},
|
|
656
|
+
ResultPath: '$.stage2FailureCleanResult',
|
|
372
657
|
End: true,
|
|
373
658
|
},
|
|
374
659
|
});
|
|
@@ -403,11 +688,24 @@ export class MlccCiHarnessStack extends cdk.Stack {
|
|
|
403
688
|
},
|
|
404
689
|
});
|
|
405
690
|
|
|
691
|
+
// ─── Stage 2: Benchmark Error Handling ────────────────────────────────
|
|
692
|
+
// Stage 2 failure isolation: if benchmarking fails, we record the failure
|
|
693
|
+
// in the benchmark-specific fields (lastBenchmarkStatus=failed) without
|
|
694
|
+
// touching testStatus. Uses a Parallel wrapper with addCatch so that CDK
|
|
695
|
+
// properly includes the failure handler states in the definition graph.
|
|
696
|
+
// Requirements: 1.4, 7.3
|
|
697
|
+
|
|
406
698
|
// Wire up the state machine chain
|
|
407
699
|
// RecordStartTime → StartCodeBuild → WaitForBuild → PollBuildStatus → CheckTimestamp → CheckBuildStatus
|
|
408
700
|
// CheckBuildStatus branches:
|
|
409
|
-
// - SUCCEEDED/FAILED/STOPPED → SetBuildCompleteResult → UpdateResults
|
|
410
|
-
//
|
|
701
|
+
// - SUCCEEDED/FAILED/STOPPED → SetBuildCompleteResult → UpdateResults → GetBenchmarkConfig
|
|
702
|
+
// → CheckDynamoItemHasBenchmarkField → ExtractBenchmarkFlags → CheckBenchmarkEnabled
|
|
703
|
+
// CheckBenchmarkEnabled branches:
|
|
704
|
+
// - pass + benchmarkEnabled=true → PrepareStage2Input → Stage2Pipeline (Parallel)
|
|
705
|
+
// Success: Stage2Benchmark → Stage2RegisterBenchmark → Stage2Clean → End
|
|
706
|
+
// Failure: Stage2FailureHandler → Stage2FailureClean → End
|
|
707
|
+
// - pass + benchmarkEnabled=false OR failed → SkipStage2 → End
|
|
708
|
+
// - TIMED_OUT → HandleTimeout → UpdateResultsFromTimeout → End
|
|
411
709
|
// - IN_PROGRESS (otherwise) → WaitForBuild (loop)
|
|
412
710
|
recordStartTime.next(startCodeBuild);
|
|
413
711
|
startCodeBuild.next(waitForBuild);
|
|
@@ -417,6 +715,33 @@ export class MlccCiHarnessStack extends cdk.Stack {
|
|
|
417
715
|
setSuccessResult.next(updateResults);
|
|
418
716
|
handleTimeout.next(updateResultsFromTimeout);
|
|
419
717
|
|
|
718
|
+
// Stage 2 wiring: after UpdateResults, read DynamoDB for benchmark config
|
|
719
|
+
updateResults.next(getBenchmarkConfig);
|
|
720
|
+
getBenchmarkConfig.next(checkDynamoItemHasBenchmarkField);
|
|
721
|
+
extractBenchmarkFlags.next(checkBenchmarkEnabled);
|
|
722
|
+
extractBenchmarkFlagsDefault.next(checkBenchmarkEnabled);
|
|
723
|
+
|
|
724
|
+
// Stage 2 execution uses a Parallel state to enable proper CDK Catch handling.
|
|
725
|
+
// The Parallel has one branch (the success path), and addCatch routes errors
|
|
726
|
+
// to the failure handler chain.
|
|
727
|
+
const stage2Pipeline = new sfn.Parallel(this, 'Stage2Pipeline', {
|
|
728
|
+
resultPath: '$.stage2PipelineResult',
|
|
729
|
+
});
|
|
730
|
+
stage2Pipeline.branch(
|
|
731
|
+
stage2Benchmark
|
|
732
|
+
.next(stage2RegisterBenchmark)
|
|
733
|
+
.next(stage2Clean),
|
|
734
|
+
);
|
|
735
|
+
stage2Pipeline.addCatch(stage2FailureHandler, {
|
|
736
|
+
resultPath: '$.stage2Error',
|
|
737
|
+
});
|
|
738
|
+
|
|
739
|
+
// After PrepareStage2Input, enter the Stage2Pipeline parallel wrapper
|
|
740
|
+
prepareStage2Input.next(stage2Pipeline);
|
|
741
|
+
|
|
742
|
+
// Stage 2 failure path: FailureHandler → FailureClean → End
|
|
743
|
+
stage2FailureHandler.next(stage2FailureClean);
|
|
744
|
+
|
|
420
745
|
// Create the state machine
|
|
421
746
|
this.ciOrchestrator = new sfn.StateMachine(this, 'CiOrchestrator', {
|
|
422
747
|
stateMachineName: 'mlcc-ci-orchestrator',
|
|
@@ -442,8 +767,12 @@ export class MlccCiHarnessStack extends cdk.Stack {
|
|
|
442
767
|
this.scannerFunction.addEnvironment('STATE_MACHINE_ARN', this.ciOrchestrator.stateMachineArn);
|
|
443
768
|
|
|
444
769
|
// CodeBuild IAM role with permissions for lifecycle execution
|
|
770
|
+
//
|
|
771
|
+
// RETAIN policy: IAM roles are retained on stack deletion to prevent conflicts
|
|
772
|
+
// during multi-region bootstrap. If the stack is re-created, existing roles will
|
|
773
|
+
// be reused via --no-rollback.
|
|
445
774
|
const codebuildRole = new iam.Role(this, 'CodeBuildRole', {
|
|
446
|
-
roleName:
|
|
775
|
+
roleName: `mlcc-ci-codebuild-role-${this.region}`,
|
|
447
776
|
assumedBy: new iam.ServicePrincipal('codebuild.amazonaws.com'),
|
|
448
777
|
description: 'IAM role for the MLCC CI CodeBuild executor project',
|
|
449
778
|
});
|
|
@@ -605,5 +934,506 @@ export class MlccCiHarnessStack extends cdk.Stack {
|
|
|
605
934
|
],
|
|
606
935
|
resources: [this.ciCodeBuildProject.projectArn],
|
|
607
936
|
}));
|
|
937
|
+
|
|
938
|
+
// ─── Benchmark Infrastructure (opt-in) ────────────────────────────────
|
|
939
|
+
// Gated by the CreateBenchmarkInfra parameter. When enabled, provisions
|
|
940
|
+
// Glue database/table, S3 results bucket, IAM permissions for benchmark
|
|
941
|
+
// writes (S3, Glue, Athena), and stack outputs for downstream consumers.
|
|
942
|
+
// Idempotent: CloudFormation conditions ensure re-running `cdk deploy`
|
|
943
|
+
// with the same parameter value produces no changes. Resources are only
|
|
944
|
+
// created when CreateBenchmarkInfra=true; subsequent deploys with the
|
|
945
|
+
// same value are no-ops (CloudFormation handles existence checks natively).
|
|
946
|
+
// Requirements: 3.1, 3.2, 3.4, 3.5
|
|
947
|
+
|
|
948
|
+
const createBenchmarkInfra = new cdk.CfnParameter(this, 'CreateBenchmarkInfra', {
|
|
949
|
+
type: 'String',
|
|
950
|
+
default: 'false',
|
|
951
|
+
allowedValues: ['true', 'false'],
|
|
952
|
+
description: 'Whether to create benchmark infrastructure (S3 results bucket, Glue DB/table, IAM permissions). Opt-in.',
|
|
953
|
+
});
|
|
954
|
+
|
|
955
|
+
const benchmarkInfraCondition = new cdk.CfnCondition(this, 'BenchmarkInfraCondition', {
|
|
956
|
+
expression: cdk.Fn.conditionEquals(createBenchmarkInfra.valueAsString, 'true'),
|
|
957
|
+
});
|
|
958
|
+
|
|
959
|
+
// Glue Database: mlcc_ci
|
|
960
|
+
// CloudFormation manages create-or-skip via the condition — no duplicate
|
|
961
|
+
// resource error on re-deploy because the logical ID is stable.
|
|
962
|
+
const glueDatabase = new glue.CfnDatabase(this, 'CiGlueDatabase', {
|
|
963
|
+
catalogId: this.account,
|
|
964
|
+
databaseInput: {
|
|
965
|
+
name: 'mlcc_ci',
|
|
966
|
+
description: 'MCC CI benchmark results warehouse',
|
|
967
|
+
},
|
|
968
|
+
});
|
|
969
|
+
glueDatabase.cfnOptions.condition = benchmarkInfraCondition;
|
|
970
|
+
|
|
971
|
+
// Glue Table: benchmark_results — full DDL with all 28+ columns
|
|
972
|
+
// Partitioned by model/instance/target (see partitionKeys below).
|
|
973
|
+
// Dimension columns are well-separated (not composite keys) per Req 5.1.
|
|
974
|
+
const glueTable = new glue.CfnTable(this, 'BenchmarkResultsTable', {
|
|
975
|
+
catalogId: this.account,
|
|
976
|
+
databaseName: 'mlcc_ci',
|
|
977
|
+
tableInput: {
|
|
978
|
+
name: 'benchmark_results',
|
|
979
|
+
tableType: 'EXTERNAL_TABLE',
|
|
980
|
+
parameters: {
|
|
981
|
+
'classification': 'parquet',
|
|
982
|
+
'parquet.compression': 'SNAPPY',
|
|
983
|
+
},
|
|
984
|
+
storageDescriptor: {
|
|
985
|
+
columns: [
|
|
986
|
+
// Core dimensions
|
|
987
|
+
{ name: 'config_id', type: 'string', comment: 'SHA-256 hash (16 chars), join key with DynamoDB' },
|
|
988
|
+
{ name: 'model_name', type: 'string', comment: 'HuggingFace model ID (e.g., Qwen/Qwen3-4B)' },
|
|
989
|
+
{ name: 'model_family', type: 'string', comment: 'Derived: qwen3, llama3, deepseek-r1, etc.' },
|
|
990
|
+
{ name: 'instance_type', type: 'string', comment: 'SageMaker instance (e.g., ml.g5.xlarge)' },
|
|
991
|
+
{ name: 'instance_family', type: 'string', comment: 'Derived: g5, g6, g6e, p5, trn2, etc.' },
|
|
992
|
+
{ name: 'deployment_config', type: 'string', comment: 'Architecture-backend (e.g., transformers-vllm)' },
|
|
993
|
+
{ name: 'deployment_target', type: 'string', comment: 'realtime-inference, async-inference, etc.' },
|
|
994
|
+
{ name: 'run_timestamp', type: 'string', comment: 'When this benchmark ran (ISO 8601 UTC)' },
|
|
995
|
+
// Configuration dimensions
|
|
996
|
+
{ name: 'tensor_parallel_degree', type: 'int', comment: 'TP degree (1, 2, 4, 8)' },
|
|
997
|
+
{ name: 'quantization', type: 'string', comment: 'Quantization method (fp16, fp8, awq, gptq, none)' },
|
|
998
|
+
{ name: 'enable_lora', type: 'boolean', comment: 'Whether LoRA adapters were enabled' },
|
|
999
|
+
{ name: 'base_image', type: 'string', comment: 'Container base image (e.g., vllm/vllm-openai:v0.8.5)' },
|
|
1000
|
+
{ name: 'base_image_version', type: 'string', comment: 'Extracted tag from base image' },
|
|
1001
|
+
{ name: 'mcc_version', type: 'string', comment: 'MCC generator version that produced the project' },
|
|
1002
|
+
// Workload dimensions
|
|
1003
|
+
{ name: 'concurrency', type: 'int', comment: 'Number of concurrent requests in this measurement' },
|
|
1004
|
+
{ name: 'input_tokens_mean', type: 'int', comment: 'Mean input token count for workload' },
|
|
1005
|
+
{ name: 'output_tokens_mean', type: 'int', comment: 'Mean output token count for workload' },
|
|
1006
|
+
{ name: 'duration_seconds', type: 'int', comment: 'Benchmark duration in seconds' },
|
|
1007
|
+
// Result metrics
|
|
1008
|
+
{ name: 'ttft_p50_ms', type: 'double', comment: 'Time to first token, 50th percentile (ms)' },
|
|
1009
|
+
{ name: 'ttft_p99_ms', type: 'double', comment: 'Time to first token, 99th percentile (ms)' },
|
|
1010
|
+
{ name: 'itl_p50_ms', type: 'double', comment: 'Inter-token latency, 50th percentile (ms)' },
|
|
1011
|
+
{ name: 'itl_p99_ms', type: 'double', comment: 'Inter-token latency, 99th percentile (ms)' },
|
|
1012
|
+
{ name: 'throughput_rps', type: 'double', comment: 'Requests per second at this concurrency' },
|
|
1013
|
+
{ name: 'tokens_per_second', type: 'double', comment: 'Output tokens per second' },
|
|
1014
|
+
{ name: 'cost_per_1m_tokens', type: 'double', comment: 'Estimated cost per 1M output tokens (USD)' },
|
|
1015
|
+
{ name: 'error_rate', type: 'double', comment: 'Fraction of requests that failed (0.0-1.0)' },
|
|
1016
|
+
{ name: 'status', type: 'string', comment: 'completed, failed, timeout, unfeasible' },
|
|
1017
|
+
// Provenance
|
|
1018
|
+
{ name: 'run_type', type: 'string', comment: 'Source: ci, path_prove, optimization, manual' },
|
|
1019
|
+
{ name: 'ci_run_id', type: 'string', comment: 'Step Functions execution ID or CodeBuild build ID' },
|
|
1020
|
+
{ name: 'ci_stage', type: 'string', comment: 'stage2-benchmark' },
|
|
1021
|
+
{ name: 'benchmark_job_name', type: 'string', comment: 'SageMaker AI Benchmark job name' },
|
|
1022
|
+
{ name: 'account_id', type: 'string', comment: 'AWS account ID' },
|
|
1023
|
+
],
|
|
1024
|
+
location: `s3://mlcc-benchmark-results-${this.account}-${this.region}/results/`,
|
|
1025
|
+
inputFormat: 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat',
|
|
1026
|
+
outputFormat: 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat',
|
|
1027
|
+
serdeInfo: {
|
|
1028
|
+
serializationLibrary: 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe',
|
|
1029
|
+
parameters: {
|
|
1030
|
+
'serialization.format': '1',
|
|
1031
|
+
},
|
|
1032
|
+
},
|
|
1033
|
+
compressed: true,
|
|
1034
|
+
},
|
|
1035
|
+
partitionKeys: [
|
|
1036
|
+
{ name: 'model', type: 'string', comment: 'Model name with / replaced by _ (e.g., Qwen_Qwen3-0.6B)' },
|
|
1037
|
+
{ name: 'instance', type: 'string', comment: 'SageMaker instance type (e.g., ml.g5.xlarge)' },
|
|
1038
|
+
{ name: 'target', type: 'string', comment: 'Deployment target (realtime-inference, async-inference, etc.)' },
|
|
1039
|
+
],
|
|
1040
|
+
},
|
|
1041
|
+
});
|
|
1042
|
+
glueTable.addDependency(glueDatabase);
|
|
1043
|
+
glueTable.cfnOptions.condition = benchmarkInfraCondition;
|
|
1044
|
+
|
|
1045
|
+
// Configurable lifecycle parameters for the benchmark results bucket
|
|
1046
|
+
const benchmarkIaTransitionDays = new cdk.CfnParameter(this, 'BenchmarkIaTransitionDays', {
|
|
1047
|
+
type: 'Number',
|
|
1048
|
+
default: 90,
|
|
1049
|
+
description: 'Days before benchmark results transition to Infrequent Access storage',
|
|
1050
|
+
minValue: 30,
|
|
1051
|
+
maxValue: 365,
|
|
1052
|
+
});
|
|
1053
|
+
|
|
1054
|
+
const benchmarkExpirationDays = new cdk.CfnParameter(this, 'BenchmarkExpirationDays', {
|
|
1055
|
+
type: 'Number',
|
|
1056
|
+
default: 365,
|
|
1057
|
+
description: 'Days before benchmark results expire and are deleted',
|
|
1058
|
+
minValue: 90,
|
|
1059
|
+
maxValue: 3650,
|
|
1060
|
+
});
|
|
1061
|
+
|
|
1062
|
+
// S3 bucket for benchmark results (Parquet files partitioned by model/instance/target)
|
|
1063
|
+
const benchmarkResultsBucket = new s3.Bucket(this, 'BenchmarkResultsBucket', {
|
|
1064
|
+
bucketName: `mlcc-benchmark-results-${this.account}-${this.region}`,
|
|
1065
|
+
removalPolicy: cdk.RemovalPolicy.RETAIN,
|
|
1066
|
+
lifecycleRules: [
|
|
1067
|
+
{
|
|
1068
|
+
transitions: [
|
|
1069
|
+
{
|
|
1070
|
+
storageClass: s3.StorageClass.INFREQUENT_ACCESS,
|
|
1071
|
+
transitionAfter: cdk.Duration.days(benchmarkIaTransitionDays.valueAsNumber),
|
|
1072
|
+
},
|
|
1073
|
+
],
|
|
1074
|
+
expiration: cdk.Duration.days(benchmarkExpirationDays.valueAsNumber),
|
|
1075
|
+
},
|
|
1076
|
+
],
|
|
1077
|
+
});
|
|
1078
|
+
|
|
1079
|
+
// Apply the benchmark condition to the S3 bucket
|
|
1080
|
+
const cfnBenchmarkBucket = benchmarkResultsBucket.node.defaultChild as cdk.CfnResource;
|
|
1081
|
+
cfnBenchmarkBucket.cfnOptions.condition = benchmarkInfraCondition;
|
|
1082
|
+
|
|
1083
|
+
// Output the benchmark results bucket ARN (conditional)
|
|
1084
|
+
new cdk.CfnOutput(this, 'BenchmarkResultsBucketArn', {
|
|
1085
|
+
value: benchmarkResultsBucket.bucketArn,
|
|
1086
|
+
description: 'ARN of the S3 bucket storing benchmark results (Parquet)',
|
|
1087
|
+
condition: benchmarkInfraCondition,
|
|
1088
|
+
exportName: 'mlcc-ci-benchmark-results-bucket-arn',
|
|
1089
|
+
});
|
|
1090
|
+
|
|
1091
|
+
// Output the benchmark results bucket name (conditional)
|
|
1092
|
+
new cdk.CfnOutput(this, 'BenchmarkResultsBucketName', {
|
|
1093
|
+
value: benchmarkResultsBucket.bucketName,
|
|
1094
|
+
description: 'Name of the S3 bucket storing benchmark results (Parquet)',
|
|
1095
|
+
condition: benchmarkInfraCondition,
|
|
1096
|
+
});
|
|
1097
|
+
|
|
1098
|
+
// Output the Glue database name (conditional)
|
|
1099
|
+
new cdk.CfnOutput(this, 'CiGlueDatabaseName', {
|
|
1100
|
+
value: 'mlcc_ci',
|
|
1101
|
+
description: 'Name of the Glue database for benchmark results',
|
|
1102
|
+
condition: benchmarkInfraCondition,
|
|
1103
|
+
});
|
|
1104
|
+
|
|
1105
|
+
// S3 permissions for benchmark results bucket writes
|
|
1106
|
+
const benchmarkS3Policy = new iam.PolicyStatement({
|
|
1107
|
+
sid: 'BenchmarkResultsWrite',
|
|
1108
|
+
effect: iam.Effect.ALLOW,
|
|
1109
|
+
actions: [
|
|
1110
|
+
's3:PutObject',
|
|
1111
|
+
's3:GetObject',
|
|
1112
|
+
's3:ListBucket',
|
|
1113
|
+
],
|
|
1114
|
+
resources: [
|
|
1115
|
+
'arn:aws:s3:::mlcc-benchmark-results-*',
|
|
1116
|
+
'arn:aws:s3:::mlcc-benchmark-results-*/*',
|
|
1117
|
+
],
|
|
1118
|
+
});
|
|
1119
|
+
|
|
1120
|
+
// Glue permissions for partition management
|
|
1121
|
+
const benchmarkGluePolicy = new iam.PolicyStatement({
|
|
1122
|
+
sid: 'GlueCatalogAccess',
|
|
1123
|
+
effect: iam.Effect.ALLOW,
|
|
1124
|
+
actions: [
|
|
1125
|
+
'glue:GetDatabase',
|
|
1126
|
+
'glue:GetTable',
|
|
1127
|
+
'glue:GetPartitions',
|
|
1128
|
+
'glue:BatchCreatePartition',
|
|
1129
|
+
'glue:CreatePartition',
|
|
1130
|
+
],
|
|
1131
|
+
resources: [
|
|
1132
|
+
'arn:aws:glue:*:*:catalog',
|
|
1133
|
+
'arn:aws:glue:*:*:database/mlcc_ci',
|
|
1134
|
+
'arn:aws:glue:*:*:table/mlcc_ci/*',
|
|
1135
|
+
],
|
|
1136
|
+
});
|
|
1137
|
+
|
|
1138
|
+
// Athena permissions for partition repair (MSCK REPAIR TABLE).
// Athena queries are asynchronous: the caller starts the query, polls its
// status with GetQueryExecution until it finishes, and only then can read
// results. GetQueryExecution is therefore granted alongside the other two
// actions, matching the Athena action set used by the Path Prover Brain policy.
const benchmarkAthenaPolicy = new iam.PolicyStatement({
  sid: 'AthenaPartitionRepair',
  effect: iam.Effect.ALLOW,
  actions: [
    'athena:StartQueryExecution',
    'athena:GetQueryExecution',
    'athena:GetQueryResults',
  ],
  resources: ['*'],
});
|
|
1148
|
+
|
|
1149
|
+
// Create a standalone inline policy (AWS::IAM::Policy) for benchmark permissions so we can condition it
|
|
1150
|
+
const benchmarkPolicy = new iam.Policy(this, 'BenchmarkWritePolicy', {
|
|
1151
|
+
policyName: 'mlcc-ci-benchmark-write-policy',
|
|
1152
|
+
statements: [benchmarkS3Policy, benchmarkGluePolicy, benchmarkAthenaPolicy],
|
|
1153
|
+
});
|
|
1154
|
+
benchmarkPolicy.attachToRole(codebuildRole);
|
|
1155
|
+
|
|
1156
|
+
// Apply the condition to the policy's underlying CFN resource
|
|
1157
|
+
const cfnBenchmarkPolicy = benchmarkPolicy.node.defaultChild as cdk.CfnResource;
|
|
1158
|
+
cfnBenchmarkPolicy.cfnOptions.condition = benchmarkInfraCondition;
|
|
1159
|
+
|
|
1160
|
+
// ─── Path Prover Infrastructure (opt-in, separate from benchmark infra) ────
|
|
1161
|
+
// Gated by the CreatePathProver parameter. When enabled, provisions:
|
|
1162
|
+
// - Brain Lambda (getNextConfig, pickNext, classifyFailure)
|
|
1163
|
+
// - WriteResults Lambda (writes path_prove records to the benchmark results S3 bucket / Glue table that Athena queries)
|
|
1164
|
+
// - Step Functions state machine (path-prover orchestrator)
|
|
1165
|
+
// - EventBridge scheduled rule (disabled by default)
|
|
1166
|
+
// Requirements: 8.1, 8.7, 8.8
|
|
1167
|
+
|
|
1168
|
+
const createPathProver = new cdk.CfnParameter(this, 'CreatePathProver', {
|
|
1169
|
+
type: 'String',
|
|
1170
|
+
default: 'false',
|
|
1171
|
+
allowedValues: ['true', 'false'],
|
|
1172
|
+
description: 'Whether to create Path Prover infrastructure (state machine, Lambdas, EventBridge rule). Opt-in.',
|
|
1173
|
+
});
|
|
1174
|
+
|
|
1175
|
+
const pathProverCondition = new cdk.CfnCondition(this, 'PathProverCondition', {
|
|
1176
|
+
expression: cdk.Fn.conditionEquals(createPathProver.valueAsString, 'true'),
|
|
1177
|
+
});
|
|
1178
|
+
|
|
1179
|
+
// Path Prover Brain Lambda IAM role
|
|
1180
|
+
const pathProverBrainRole = new iam.Role(this, 'PathProverBrainRole', {
|
|
1181
|
+
roleName: 'mlcc-path-prover-brain-role',
|
|
1182
|
+
assumedBy: new iam.ServicePrincipal('lambda.amazonaws.com'),
|
|
1183
|
+
description: 'IAM role for the Path Prover Brain Lambda function',
|
|
1184
|
+
});
|
|
1185
|
+
(pathProverBrainRole.node.defaultChild as cdk.CfnResource).cfnOptions.condition = pathProverCondition;
|
|
1186
|
+
|
|
1187
|
+
// Brain Lambda: Athena read access for gap identification + substitution
|
|
1188
|
+
const brainAthenaPolicy = new iam.Policy(this, 'PathProverBrainAthenaPolicy', {
|
|
1189
|
+
policyName: 'mlcc-path-prover-brain-athena',
|
|
1190
|
+
statements: [
|
|
1191
|
+
new iam.PolicyStatement({
|
|
1192
|
+
effect: iam.Effect.ALLOW,
|
|
1193
|
+
actions: [
|
|
1194
|
+
'athena:StartQueryExecution',
|
|
1195
|
+
'athena:GetQueryExecution',
|
|
1196
|
+
'athena:GetQueryResults',
|
|
1197
|
+
],
|
|
1198
|
+
resources: ['*'],
|
|
1199
|
+
}),
|
|
1200
|
+
new iam.PolicyStatement({
|
|
1201
|
+
effect: iam.Effect.ALLOW,
|
|
1202
|
+
actions: ['glue:GetTable', 'glue:GetDatabase', 'glue:GetPartitions'],
|
|
1203
|
+
resources: [
|
|
1204
|
+
'arn:aws:glue:*:*:catalog',
|
|
1205
|
+
'arn:aws:glue:*:*:database/mlcc_ci',
|
|
1206
|
+
'arn:aws:glue:*:*:table/mlcc_ci/*',
|
|
1207
|
+
],
|
|
1208
|
+
}),
|
|
1209
|
+
new iam.PolicyStatement({
|
|
1210
|
+
effect: iam.Effect.ALLOW,
|
|
1211
|
+
actions: ['s3:GetObject', 's3:ListBucket', 's3:GetBucketLocation', 's3:PutObject'],
|
|
1212
|
+
resources: [
|
|
1213
|
+
'arn:aws:s3:::mlcc-benchmark-results-*',
|
|
1214
|
+
'arn:aws:s3:::mlcc-benchmark-results-*/*',
|
|
1215
|
+
],
|
|
1216
|
+
}),
|
|
1217
|
+
new iam.PolicyStatement({
|
|
1218
|
+
effect: iam.Effect.ALLOW,
|
|
1219
|
+
actions: ['logs:CreateLogStream', 'logs:PutLogEvents'],
|
|
1220
|
+
resources: [this.ciLogGroup.logGroupArn, `${this.ciLogGroup.logGroupArn}:*`],
|
|
1221
|
+
}),
|
|
1222
|
+
],
|
|
1223
|
+
});
|
|
1224
|
+
brainAthenaPolicy.attachToRole(pathProverBrainRole);
|
|
1225
|
+
(brainAthenaPolicy.node.defaultChild as cdk.CfnResource).cfnOptions.condition = pathProverCondition;
|
|
1226
|
+
|
|
1227
|
+
// Path Prover Brain Lambda function
|
|
1228
|
+
const pathProverBrainFunction = new NodejsFunction(this, 'PathProverBrainFunction', {
|
|
1229
|
+
functionName: 'mlcc-path-prover-brain',
|
|
1230
|
+
runtime: lambda.Runtime.NODEJS_20_X,
|
|
1231
|
+
memorySize: 512,
|
|
1232
|
+
timeout: cdk.Duration.seconds(120),
|
|
1233
|
+
entry: path.join(__dirname, '..', 'lambda', 'path-prover', 'brain.ts'),
|
|
1234
|
+
handler: 'handler',
|
|
1235
|
+
role: pathProverBrainRole,
|
|
1236
|
+
environment: {
|
|
1237
|
+
GLUE_DATABASE: 'mlcc_ci',
|
|
1238
|
+
GLUE_TABLE: 'benchmark_results',
|
|
1239
|
+
MAX_PROVES_PER_RUN: '10',
|
|
1240
|
+
MAX_COST_PER_RUN: '100',
|
|
1241
|
+
},
|
|
1242
|
+
logGroup: this.ciLogGroup,
|
|
1243
|
+
bundling: {
|
|
1244
|
+
minify: true,
|
|
1245
|
+
sourceMap: true,
|
|
1246
|
+
},
|
|
1247
|
+
});
|
|
1248
|
+
(pathProverBrainFunction.node.defaultChild as cdk.CfnResource).cfnOptions.condition = pathProverCondition;
|
|
1249
|
+
|
|
1250
|
+
// Path Prover Write Results Lambda IAM role
|
|
1251
|
+
const pathProverWriteRole = new iam.Role(this, 'PathProverWriteRole', {
|
|
1252
|
+
roleName: 'mlcc-path-prover-write-role',
|
|
1253
|
+
assumedBy: new iam.ServicePrincipal('lambda.amazonaws.com'),
|
|
1254
|
+
description: 'IAM role for the Path Prover Write Results Lambda function',
|
|
1255
|
+
});
|
|
1256
|
+
(pathProverWriteRole.node.defaultChild as cdk.CfnResource).cfnOptions.condition = pathProverCondition;
|
|
1257
|
+
|
|
1258
|
+
// Write Results Lambda: S3 + Glue write access
|
|
1259
|
+
const writeResultsPolicy = new iam.Policy(this, 'PathProverWriteResultsPolicy', {
|
|
1260
|
+
policyName: 'mlcc-path-prover-write-results',
|
|
1261
|
+
statements: [
|
|
1262
|
+
new iam.PolicyStatement({
|
|
1263
|
+
effect: iam.Effect.ALLOW,
|
|
1264
|
+
actions: ['s3:PutObject', 's3:GetObject'],
|
|
1265
|
+
resources: [
|
|
1266
|
+
'arn:aws:s3:::mlcc-benchmark-results-*/*',
|
|
1267
|
+
],
|
|
1268
|
+
}),
|
|
1269
|
+
new iam.PolicyStatement({
|
|
1270
|
+
effect: iam.Effect.ALLOW,
|
|
1271
|
+
actions: ['glue:BatchCreatePartition', 'glue:CreatePartition', 'glue:GetTable'],
|
|
1272
|
+
resources: [
|
|
1273
|
+
'arn:aws:glue:*:*:catalog',
|
|
1274
|
+
'arn:aws:glue:*:*:database/mlcc_ci',
|
|
1275
|
+
'arn:aws:glue:*:*:table/mlcc_ci/*',
|
|
1276
|
+
],
|
|
1277
|
+
}),
|
|
1278
|
+
new iam.PolicyStatement({
|
|
1279
|
+
effect: iam.Effect.ALLOW,
|
|
1280
|
+
actions: ['logs:CreateLogStream', 'logs:PutLogEvents'],
|
|
1281
|
+
resources: [this.ciLogGroup.logGroupArn, `${this.ciLogGroup.logGroupArn}:*`],
|
|
1282
|
+
}),
|
|
1283
|
+
],
|
|
1284
|
+
});
|
|
1285
|
+
writeResultsPolicy.attachToRole(pathProverWriteRole);
|
|
1286
|
+
(writeResultsPolicy.node.defaultChild as cdk.CfnResource).cfnOptions.condition = pathProverCondition;
|
|
1287
|
+
|
|
1288
|
+
// Path Prover Write Results Lambda function
|
|
1289
|
+
const pathProverWriteFunction = new NodejsFunction(this, 'PathProverWriteFunction', {
|
|
1290
|
+
functionName: 'mlcc-path-prover-write-results',
|
|
1291
|
+
runtime: lambda.Runtime.NODEJS_20_X,
|
|
1292
|
+
memorySize: 256,
|
|
1293
|
+
timeout: cdk.Duration.seconds(60),
|
|
1294
|
+
entry: path.join(__dirname, '..', 'lambda', 'path-prover', 'write-results.ts'),
|
|
1295
|
+
handler: 'handler',
|
|
1296
|
+
role: pathProverWriteRole,
|
|
1297
|
+
environment: {
|
|
1298
|
+
GLUE_DATABASE: 'mlcc_ci',
|
|
1299
|
+
GLUE_TABLE: 'benchmark_results',
|
|
1300
|
+
RESULTS_BUCKET: `mlcc-benchmark-results-${this.account}-${this.region}`,
|
|
1301
|
+
},
|
|
1302
|
+
logGroup: this.ciLogGroup,
|
|
1303
|
+
bundling: {
|
|
1304
|
+
minify: true,
|
|
1305
|
+
sourceMap: true,
|
|
1306
|
+
},
|
|
1307
|
+
});
|
|
1308
|
+
(pathProverWriteFunction.node.defaultChild as cdk.CfnResource).cfnOptions.condition = pathProverCondition;
|
|
1309
|
+
|
|
1310
|
+
// Path Prover Step Functions IAM role
|
|
1311
|
+
const pathProverOrchestratorRole = new iam.Role(this, 'PathProverOrchestratorRole', {
|
|
1312
|
+
roleName: 'mlcc-path-prover-orchestrator-role',
|
|
1313
|
+
assumedBy: new iam.ServicePrincipal('states.amazonaws.com'),
|
|
1314
|
+
description: 'IAM role for the Path Prover Step Functions state machine',
|
|
1315
|
+
});
|
|
1316
|
+
(pathProverOrchestratorRole.node.defaultChild as cdk.CfnResource).cfnOptions.condition = pathProverCondition;
|
|
1317
|
+
|
|
1318
|
+
// Orchestrator permissions: everything the Path Prover state machine role
// needs to invoke the Lambdas, drive CodeBuild, deliver execution logs and
// emit X-Ray traces. Attached to pathProverOrchestratorRole and gated by
// pathProverCondition below.
const pathProverOrchestratorPolicy = new iam.Policy(this, 'PathProverOrchestratorPolicy', {
  policyName: 'mlcc-path-prover-orchestrator-policy',
  statements: [
    // Lambda invoke for brain and write-results
    new iam.PolicyStatement({
      effect: iam.Effect.ALLOW,
      actions: ['lambda:InvokeFunction'],
      resources: [
        pathProverBrainFunction.functionArn,
        pathProverWriteFunction.functionArn,
      ],
    }),
    // CodeBuild start/poll for lifecycle stages
    new iam.PolicyStatement({
      effect: iam.Effect.ALLOW,
      actions: ['codebuild:StartBuild', 'codebuild:BatchGetBuilds', 'codebuild:StopBuild'],
      resources: [this.ciCodeBuildProject.projectArn],
    }),
    // CloudWatch Logs for execution logging.
    // The log-delivery actions do not support resource-level scoping, hence '*'.
    new iam.PolicyStatement({
      effect: iam.Effect.ALLOW,
      actions: [
        'logs:CreateLogDelivery',
        'logs:GetLogDelivery',
        'logs:UpdateLogDelivery',
        'logs:DeleteLogDelivery',
        'logs:ListLogDeliveries',
        'logs:PutResourcePolicy',
        'logs:DescribeResourcePolicies',
        'logs:DescribeLogGroups',
        'logs:PutLogEvents',
        'logs:CreateLogStream',
      ],
      resources: ['*'],
    }),
    // Events for the CodeBuild .sync integration.
    // Step Functions tracks synchronous CodeBuild builds through a service-managed
    // EventBridge rule with this exact, fixed name; any other rule name leaves the
    // .sync task unable to register its completion listener.
    new iam.PolicyStatement({
      effect: iam.Effect.ALLOW,
      actions: ['events:PutTargets', 'events:PutRule', 'events:DescribeRule'],
      resources: [`arn:aws:events:${this.region}:${this.account}:rule/StepFunctionsGetEventForCodeBuildStartBuildRule`],
    }),
    // X-Ray: the state machine is created with tracing enabled, and Step Functions
    // sends trace data using the execution role. X-Ray actions are not
    // resource-scoped, hence '*'.
    new iam.PolicyStatement({
      effect: iam.Effect.ALLOW,
      actions: [
        'xray:PutTraceSegments',
        'xray:PutTelemetryRecords',
        'xray:GetSamplingRules',
        'xray:GetSamplingTargets',
      ],
      resources: ['*'],
    }),
  ],
});
|
|
1362
|
+
pathProverOrchestratorPolicy.attachToRole(pathProverOrchestratorRole);
|
|
1363
|
+
(pathProverOrchestratorPolicy.node.defaultChild as cdk.CfnResource).cfnOptions.condition = pathProverCondition;
|
|
1364
|
+
|
|
1365
|
+
// Path Prover State Machine
|
|
1366
|
+
// Uses ASL definition from file with Fn::Sub for variable substitution.
|
|
1367
|
+
// We read the raw JSON and use cdk.Fn.sub to inject resource ARNs.
|
|
1368
|
+
const aslTemplate = JSON.stringify(require('../state-machines/path-prover.asl.json'));
|
|
1369
|
+
const pathProverDefinitionString = cdk.Fn.sub(aslTemplate, {
|
|
1370
|
+
BrainFunctionArn: pathProverBrainFunction.functionArn,
|
|
1371
|
+
WriteResultsFunctionArn: pathProverWriteFunction.functionArn,
|
|
1372
|
+
ClassifyFailureFunctionArn: pathProverBrainFunction.functionArn,
|
|
1373
|
+
CodeBuildProjectName: this.ciCodeBuildProject.projectName,
|
|
1374
|
+
});
|
|
1375
|
+
|
|
1376
|
+
const pathProverStateMachine = new sfn.CfnStateMachine(this, 'PathProverStateMachine', {
|
|
1377
|
+
stateMachineName: 'mlcc-path-prover',
|
|
1378
|
+
stateMachineType: 'STANDARD',
|
|
1379
|
+
definitionString: pathProverDefinitionString,
|
|
1380
|
+
roleArn: pathProverOrchestratorRole.roleArn,
|
|
1381
|
+
loggingConfiguration: {
|
|
1382
|
+
destinations: [{
|
|
1383
|
+
cloudWatchLogsLogGroup: {
|
|
1384
|
+
logGroupArn: this.ciLogGroup.logGroupArn,
|
|
1385
|
+
},
|
|
1386
|
+
}],
|
|
1387
|
+
level: 'ALL',
|
|
1388
|
+
includeExecutionData: true,
|
|
1389
|
+
},
|
|
1390
|
+
tracingConfiguration: {
|
|
1391
|
+
enabled: true,
|
|
1392
|
+
},
|
|
1393
|
+
});
|
|
1394
|
+
pathProverStateMachine.cfnOptions.condition = pathProverCondition;
|
|
1395
|
+
|
|
1396
|
+
// EventBridge scheduled rule for Path Prover (disabled by default)
|
|
1397
|
+
// Can be enabled via the EnablePathProverSchedule parameter
|
|
1398
|
+
const enablePathProverSchedule = new cdk.CfnParameter(this, 'EnablePathProverSchedule', {
|
|
1399
|
+
type: 'String',
|
|
1400
|
+
default: 'DISABLED',
|
|
1401
|
+
allowedValues: ['ENABLED', 'DISABLED'],
|
|
1402
|
+
description: 'Whether to enable the Path Prover scheduled EventBridge rule. Default: DISABLED.',
|
|
1403
|
+
});
|
|
1404
|
+
|
|
1405
|
+
// Dedicated role for the schedule target. EventBridge assumes the target role
// itself, so the role must trust events.amazonaws.com and allow
// states:StartExecution on the state machine. The orchestrator role cannot be
// reused here: it trusts only states.amazonaws.com and grants no StartExecution.
const pathProverScheduleRole = new iam.Role(this, 'PathProverScheduleRole', {
  assumedBy: new iam.ServicePrincipal('events.amazonaws.com'),
  description: 'IAM role assumed by EventBridge to start the Path Prover state machine',
  // Inline policy keeps everything in one AWS::IAM::Role resource, so a single
  // condition assignment gates the whole thing.
  inlinePolicies: {
    StartPathProverExecution: new iam.PolicyDocument({
      statements: [
        new iam.PolicyStatement({
          effect: iam.Effect.ALLOW,
          actions: ['states:StartExecution'],
          resources: [pathProverStateMachine.attrArn],
        }),
      ],
    }),
  },
});
(pathProverScheduleRole.node.defaultChild as cdk.CfnResource).cfnOptions.condition = pathProverCondition;

// Scheduled trigger for the Path Prover. The rule state comes from the
// EnablePathProverSchedule parameter (DISABLED by default).
const pathProverScheduleRule = new events.CfnRule(this, 'PathProverScheduleRule', {
  name: 'mlcc-path-prover-schedule',
  description: 'Triggers the Path Prover state machine on a schedule to fill coverage gaps',
  scheduleExpression: 'rate(6 hours)',
  state: enablePathProverSchedule.valueAsString,
  targets: [{
    // Reference the resource attribute instead of a hand-built ARN string so
    // CloudFormation creates the state machine before the rule.
    arn: pathProverStateMachine.attrArn,
    id: 'PathProverTarget',
    roleArn: pathProverScheduleRole.roleArn,
    // Initial execution input for a fresh scheduled run.
    input: JSON.stringify({
      iteration: 0,
      budgetSpent: 0,
      maxProvesPerRun: 10,
      maxCostPerRun: 100,
      previousResults: [],
    }),
  }],
});
|
|
1423
|
+
pathProverScheduleRule.cfnOptions.condition = pathProverCondition;
|
|
1424
|
+
|
|
1425
|
+
// Output Path Prover state machine ARN
|
|
1426
|
+
new cdk.CfnOutput(this, 'PathProverStateMachineArn', {
|
|
1427
|
+
value: `arn:aws:states:${this.region}:${this.account}:stateMachine:mlcc-path-prover`,
|
|
1428
|
+
description: 'ARN of the Path Prover Step Functions state machine',
|
|
1429
|
+
condition: pathProverCondition,
|
|
1430
|
+
});
|
|
1431
|
+
|
|
1432
|
+
// Output Brain Lambda ARN
|
|
1433
|
+
new cdk.CfnOutput(this, 'PathProverBrainFunctionArn', {
|
|
1434
|
+
value: pathProverBrainFunction.functionArn,
|
|
1435
|
+
description: 'ARN of the Path Prover Brain Lambda function',
|
|
1436
|
+
condition: pathProverCondition,
|
|
1437
|
+
});
|
|
608
1438
|
}
|
|
609
1439
|
}
|