@aws/ml-container-creator 0.10.0 → 0.12.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. package/LICENSE-THIRD-PARTY +9304 -0
  2. package/bin/cli.js +2 -0
  3. package/config/bootstrap-e2e-stack.json +341 -0
  4. package/config/bootstrap-stack.json +40 -3
  5. package/config/parameter-schema-v2.json +33 -22
  6. package/config/tune-catalog.json +1781 -0
  7. package/infra/ci-harness/buildspec.yml +1 -0
  8. package/infra/ci-harness/lambda/path-prover/brain.ts +306 -0
  9. package/infra/ci-harness/lambda/path-prover/write-results.ts +152 -0
  10. package/infra/ci-harness/lib/ci-harness-stack.ts +851 -7
  11. package/infra/ci-harness/state-machines/path-prover.asl.json +496 -0
  12. package/package.json +53 -67
  13. package/servers/base-image-picker/index.js +121 -121
  14. package/servers/e2e-status/index.js +297 -0
  15. package/servers/e2e-status/manifest.json +14 -0
  16. package/servers/e2e-status/package.json +15 -0
  17. package/servers/endpoint-picker/LICENSE +202 -0
  18. package/servers/endpoint-picker/index.js +536 -0
  19. package/servers/endpoint-picker/manifest.json +14 -0
  20. package/servers/endpoint-picker/package.json +18 -0
  21. package/servers/hyperpod-cluster-picker/index.js +125 -125
  22. package/servers/instance-sizer/index.js +166 -153
  23. package/servers/instance-sizer/lib/instance-ranker.js +120 -76
  24. package/servers/instance-sizer/lib/model-resolver.js +61 -61
  25. package/servers/instance-sizer/lib/quota-resolver.js +113 -113
  26. package/servers/instance-sizer/lib/vram-estimator.js +31 -31
  27. package/servers/lib/bedrock-client.js +38 -38
  28. package/servers/lib/catalogs/instances.json +27 -0
  29. package/servers/lib/catalogs/model-servers.json +201 -3
  30. package/servers/lib/custom-validators.js +13 -13
  31. package/servers/lib/dynamic-resolver.js +4 -4
  32. package/servers/marketplace-picker/index.js +342 -0
  33. package/servers/marketplace-picker/manifest.json +14 -0
  34. package/servers/marketplace-picker/package.json +18 -0
  35. package/servers/model-picker/index.js +382 -382
  36. package/servers/region-picker/index.js +56 -56
  37. package/servers/workload-picker/LICENSE +202 -0
  38. package/servers/workload-picker/catalogs/workload-profiles.json +67 -0
  39. package/servers/workload-picker/index.js +171 -0
  40. package/servers/workload-picker/manifest.json +16 -0
  41. package/servers/workload-picker/package.json +16 -0
  42. package/src/app.js +12 -3
  43. package/src/lib/bootstrap-command-handler.js +609 -15
  44. package/src/lib/bootstrap-config.js +36 -0
  45. package/src/lib/bootstrap-profile-manager.js +48 -41
  46. package/src/lib/ci-register-helpers.js +74 -0
  47. package/src/lib/config-loader.js +3 -0
  48. package/src/lib/config-manager.js +7 -0
  49. package/src/lib/config-validator.js +1 -1
  50. package/src/lib/cuda-resolver.js +17 -8
  51. package/src/lib/generated/cli-options.js +319 -314
  52. package/src/lib/generated/parameter-matrix.js +672 -661
  53. package/src/lib/generated/validation-rules.js +76 -72
  54. package/src/lib/path-prover-brain.js +664 -0
  55. package/src/lib/prompts/infrastructure-prompts.js +2 -2
  56. package/src/lib/prompts/model-prompts.js +6 -0
  57. package/src/lib/prompts/project-prompts.js +12 -0
  58. package/src/lib/secrets-prompt-runner.js +4 -0
  59. package/src/lib/template-manager.js +1 -1
  60. package/src/lib/template-variable-resolver.js +87 -1
  61. package/src/lib/tune-catalog-validator.js +37 -4
  62. package/templates/Dockerfile +9 -0
  63. package/templates/code/adapter_sidecar.py +444 -0
  64. package/templates/code/serve +6 -0
  65. package/templates/code/serve.d/vllm.ejs +1 -1
  66. package/templates/do/.benchmark_writer.py +1476 -0
  67. package/templates/do/.tune_helper.py +982 -57
  68. package/templates/do/__pycache__/.benchmark_writer.cpython-312.pyc +0 -0
  69. package/templates/do/adapter +154 -0
  70. package/templates/do/benchmark +639 -85
  71. package/templates/do/build +5 -0
  72. package/templates/do/clean.d/async-inference.ejs +5 -0
  73. package/templates/do/clean.d/batch-transform.ejs +5 -0
  74. package/templates/do/clean.d/hyperpod-eks.ejs +5 -0
  75. package/templates/do/clean.d/managed-inference.ejs +5 -0
  76. package/templates/do/config +115 -45
  77. package/templates/do/deploy.d/async-inference.ejs +30 -3
  78. package/templates/do/deploy.d/batch-transform.ejs +29 -3
  79. package/templates/do/deploy.d/hyperpod-eks.ejs +4 -0
  80. package/templates/do/deploy.d/managed-inference.ejs +216 -14
  81. package/templates/do/lib/endpoint-config.sh +1 -1
  82. package/templates/do/lib/profile.sh +44 -0
  83. package/templates/do/optimize +106 -37
  84. package/templates/do/push +5 -0
  85. package/templates/do/register +94 -0
  86. package/templates/do/stage +567 -0
  87. package/templates/do/submit +7 -0
  88. package/templates/do/test +14 -0
  89. package/templates/do/tune +382 -59
  90. package/templates/do/validate +44 -4
@@ -2,7 +2,9 @@ import * as cdk from 'aws-cdk-lib';
2
2
  import * as dynamodb from 'aws-cdk-lib/aws-dynamodb';
3
3
  import * as events from 'aws-cdk-lib/aws-events';
4
4
  import * as targets from 'aws-cdk-lib/aws-events-targets';
5
+ import * as glue from 'aws-cdk-lib/aws-glue';
5
6
  import * as iam from 'aws-cdk-lib/aws-iam';
7
+ import * as s3 from 'aws-cdk-lib/aws-s3';
6
8
  import * as lambda from 'aws-cdk-lib/aws-lambda';
7
9
  import { NodejsFunction } from 'aws-cdk-lib/aws-lambda-nodejs';
8
10
  import * as logs from 'aws-cdk-lib/aws-logs';
@@ -117,8 +119,12 @@ export class MlccCiHarnessStack extends cdk.Stack {
117
119
  });
118
120
 
119
121
  // Scanner Lambda IAM role with least-privilege permissions
122
+ //
123
+ // RETAIN policy: IAM roles are retained on stack deletion to prevent conflicts
124
+ // during multi-region bootstrap. If the stack is re-created, existing roles will
125
+ // be reused via --no-rollback.
120
126
  const scannerRole = new iam.Role(this, 'ScannerRole', {
121
- roleName: 'mlcc-ci-scanner-role',
127
+ roleName: `mlcc-ci-scanner-role-${this.region}`,
122
128
  assumedBy: new iam.ServicePrincipal('lambda.amazonaws.com'),
123
129
  description: 'IAM role for the MLCC CI Scanner Lambda function',
124
130
  });
@@ -178,16 +184,23 @@ export class MlccCiHarnessStack extends cdk.Stack {
178
184
 
179
185
  // Step Functions Orchestrator IAM role
180
186
  // Permissions for DynamoDB UpdateItem, Logs, and CodeBuild are defined here.
187
+ //
188
+ // RETAIN policy: IAM roles are retained on stack deletion to prevent conflicts
189
+ // during multi-region bootstrap. If the stack is re-created, existing roles will
190
+ // be reused via --no-rollback.
181
191
  this.orchestratorRole = new iam.Role(this, 'OrchestratorRole', {
182
- roleName: 'mlcc-ci-orchestrator-role',
183
- assumedBy: new iam.ServicePrincipal('states.amazonaws.com'),
192
+ roleName: `mlcc-ci-orchestrator-role-${this.region}`,
193
+ assumedBy: new iam.CompositePrincipal(
194
+ new iam.ServicePrincipal('states.amazonaws.com'),
195
+ new iam.ServicePrincipal('events.amazonaws.com'),
196
+ ),
184
197
  description: 'IAM role for the MLCC CI Orchestrator Step Functions state machine',
185
198
  });
186
199
 
187
200
  // DynamoDB:UpdateItem on CI_Table for UpdateResults states
188
201
  this.orchestratorRole.addToPolicy(new iam.PolicyStatement({
189
202
  effect: iam.Effect.ALLOW,
190
- actions: ['dynamodb:UpdateItem'],
203
+ actions: ['dynamodb:UpdateItem', 'dynamodb:GetItem'],
191
204
  resources: [this.ciTable.tableArn],
192
205
  }));
193
206
 
@@ -257,6 +270,14 @@ export class MlccCiHarnessStack extends cdk.Stack {
257
270
  Value: this.ciLogGroup.logGroupName,
258
271
  Type: 'PLAINTEXT',
259
272
  },
273
+ {
274
+ // Benchmark concurrency levels (comma-separated string, e.g. "1,4,8")
275
+ // Set by the CI Scanner Lambda from DynamoDB benchmarkConcurrencyLevels field.
276
+ // Falls back to default [1,4,8] in do/benchmark if empty.
277
+ Name: 'BENCHMARK_CONCURRENCY_LEVELS',
278
+ 'Value.$': '$.benchmarkConcurrencyLevels',
279
+ Type: 'PLAINTEXT',
280
+ },
260
281
  ],
261
282
  },
262
283
  ResultPath: '$.buildResult',
@@ -369,6 +390,270 @@ export class MlccCiHarnessStack extends cdk.Stack {
369
390
  BackoffRate: 2.0,
370
391
  },
371
392
  ],
393
+ },
394
+ });
395
+
396
+ // ─── Stage 2 Orchestration (Req 1.1, 1.2, 1.3) ──────────────────────────
397
+ // After Stage 1 completes, the orchestrator reads the DynamoDB record to
398
+ // determine if the build passed and if benchmark is enabled. Stage 2 runs
399
+ // benchmarks asynchronously and does NOT affect testStatus on failure.
400
+
401
+ // GetBenchmarkConfig: Read benchmarkEnabled flag from DynamoDB after Stage 1
402
+ const getBenchmarkConfig = new sfn.CustomState(this, 'GetBenchmarkConfig', {
403
+ stateJson: {
404
+ Type: 'Task',
405
+ Resource: 'arn:aws:states:::dynamodb:getItem',
406
+ Parameters: {
407
+ TableName: this.ciTable.tableName,
408
+ Key: {
409
+ configId: { 'S.$': '$.configId' },
410
+ },
411
+ ProjectionExpression: 'testStatus, benchmarkEnabled, benchmarkConcurrencyLevels',
412
+ ConsistentRead: true,
413
+ },
414
+ ResultPath: '$.dynamoResult',
415
+ Retry: [
416
+ {
417
+ ErrorEquals: ['States.ALL'],
418
+ IntervalSeconds: 2,
419
+ MaxAttempts: 3,
420
+ BackoffRate: 2.0,
421
+ },
422
+ ],
423
+ },
424
+ });
425
+
426
+ // ExtractBenchmarkFlags: Extract benchmarkEnabled and testStatus into top-level fields
427
+ const extractBenchmarkFlags = new sfn.Pass(this, 'ExtractBenchmarkFlags', {
428
+ parameters: {
429
+ 'configId.$': '$.configId',
430
+ 'configJson.$': '$.configJson',
431
+ 'buildStrategy.$': '$.buildStrategy',
432
+ 'startTime.$': '$.startTime',
433
+ 'buildStatus.$': '$.buildStatus',
434
+ 'testStatus.$': '$.dynamoResult.Item.testStatus.S',
435
+ 'benchmarkEnabled.$': '$.dynamoResult.Item.benchmarkEnabled.BOOL',
436
+ },
437
+ resultPath: '$',
438
+ });
439
+
440
+ // ExtractBenchmarkFlagsDefault: Fallback when benchmarkEnabled is not set in DynamoDB
441
+ // (backward-compatible — absence means disabled)
442
+ const extractBenchmarkFlagsDefault = new sfn.Pass(this, 'ExtractBenchmarkFlagsDefault', {
443
+ parameters: {
444
+ 'configId.$': '$.configId',
445
+ 'configJson.$': '$.configJson',
446
+ 'buildStrategy.$': '$.buildStrategy',
447
+ 'startTime.$': '$.startTime',
448
+ 'buildStatus.$': '$.buildStatus',
449
+ 'testStatus': 'unknown',
450
+ 'benchmarkEnabled': false,
451
+ },
452
+ resultPath: '$',
453
+ });
454
+
455
+ // CheckDynamoItemHasBenchmarkField: determine if the DynamoDB response contains
456
+ // the benchmarkEnabled field. If not present, default to false.
457
+ const checkDynamoItemHasBenchmarkField = new sfn.Choice(this, 'CheckDynamoItemHasBenchmarkField')
458
+ .when(
459
+ sfn.Condition.isPresent('$.dynamoResult.Item.benchmarkEnabled'),
460
+ extractBenchmarkFlags,
461
+ )
462
+ .otherwise(extractBenchmarkFlagsDefault);
463
+
464
+ // CheckStage1Passed: Determine if Stage 1 passed (testStatus from DynamoDB read)
465
+ // If passed + benchmarkEnabled=true → Stage 2
466
+ // If passed + benchmarkEnabled=false → skip to End
467
+ // If failed → skip to End (do/clean already ran in CodeBuild's post_build)
468
+ const prepareStage2Input = new sfn.Pass(this, 'PrepareStage2Input', {
469
+ parameters: {
470
+ 'configId.$': '$.configId',
471
+ 'configJson.$': '$.configJson',
472
+ 'buildStrategy.$': '$.buildStrategy',
473
+ },
474
+ });
475
+
476
+ const skipStage2 = new sfn.Succeed(this, 'SkipStage2');
477
+
478
+ const checkBenchmarkEnabled = new sfn.Choice(this, 'CheckBenchmarkEnabled')
479
+ .when(
480
+ sfn.Condition.and(
481
+ sfn.Condition.stringEquals('$.testStatus', 'pass'),
482
+ sfn.Condition.booleanEquals('$.benchmarkEnabled', true),
483
+ ),
484
+ prepareStage2Input,
485
+ )
486
+ .otherwise(skipStage2);
487
+
488
+ // Stage2Benchmark: Run do/benchmark via CodeBuild
489
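+ // NOTE: the Resource below is 'codebuild:startBuild' (no '.sync' suffix), so this state only starts the build; waiting for build completion requires 'codebuild:startBuild.sync'.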
490
+ const stage2Benchmark = new sfn.CustomState(this, 'Stage2Benchmark', {
491
+ stateJson: {
492
+ Type: 'Task',
493
+ Resource: 'arn:aws:states:::codebuild:startBuild',
494
+ Parameters: {
495
+ ProjectName: 'mlcc-ci-executor',
496
+ EnvironmentVariablesOverride: [
497
+ {
498
+ Name: 'CONFIG_ID',
499
+ 'Value.$': '$.configId',
500
+ Type: 'PLAINTEXT',
501
+ },
502
+ {
503
+ Name: 'CONFIG_JSON',
504
+ 'Value.$': '$.configJson',
505
+ Type: 'PLAINTEXT',
506
+ },
507
+ {
508
+ Name: 'CI_STAGE',
509
+ Value: 'stage2-benchmark',
510
+ Type: 'PLAINTEXT',
511
+ },
512
+ {
513
+ Name: 'CI_TABLE_NAME',
514
+ Value: this.ciTable.tableName,
515
+ Type: 'PLAINTEXT',
516
+ },
517
+ {
518
+ Name: 'CI_LOG_GROUP',
519
+ Value: this.ciLogGroup.logGroupName,
520
+ Type: 'PLAINTEXT',
521
+ },
522
+ ],
523
+ },
524
+ ResultPath: '$.stage2BuildResult',
525
+ },
526
+ });
527
+
528
+ // Stage2RegisterBenchmark: Run do/register --benchmark-status via CodeBuild
529
+ const stage2RegisterBenchmark = new sfn.CustomState(this, 'Stage2RegisterBenchmark', {
530
+ stateJson: {
531
+ Type: 'Task',
532
+ Resource: 'arn:aws:states:::codebuild:startBuild',
533
+ Parameters: {
534
+ ProjectName: 'mlcc-ci-executor',
535
+ EnvironmentVariablesOverride: [
536
+ {
537
+ Name: 'CONFIG_ID',
538
+ 'Value.$': '$.configId',
539
+ Type: 'PLAINTEXT',
540
+ },
541
+ {
542
+ Name: 'CONFIG_JSON',
543
+ 'Value.$': '$.configJson',
544
+ Type: 'PLAINTEXT',
545
+ },
546
+ {
547
+ Name: 'CI_STAGE',
548
+ Value: 'stage2-register-benchmark',
549
+ Type: 'PLAINTEXT',
550
+ },
551
+ {
552
+ Name: 'BENCHMARK_STATUS',
553
+ Value: 'completed',
554
+ Type: 'PLAINTEXT',
555
+ },
556
+ {
557
+ Name: 'CI_TABLE_NAME',
558
+ Value: this.ciTable.tableName,
559
+ Type: 'PLAINTEXT',
560
+ },
561
+ ],
562
+ },
563
+ ResultPath: '$.stage2RegisterResult',
564
+ },
565
+ });
566
+
567
+ // Stage2Clean: Run do/clean after benchmark stage completes (success path)
568
+ const stage2Clean = new sfn.CustomState(this, 'Stage2Clean', {
569
+ stateJson: {
570
+ Type: 'Task',
571
+ Resource: 'arn:aws:states:::codebuild:startBuild',
572
+ Parameters: {
573
+ ProjectName: 'mlcc-ci-executor',
574
+ EnvironmentVariablesOverride: [
575
+ {
576
+ Name: 'CONFIG_ID',
577
+ 'Value.$': '$.configId',
578
+ Type: 'PLAINTEXT',
579
+ },
580
+ {
581
+ Name: 'CONFIG_JSON',
582
+ 'Value.$': '$.configJson',
583
+ Type: 'PLAINTEXT',
584
+ },
585
+ {
586
+ Name: 'CI_STAGE',
587
+ Value: 'stage2-clean',
588
+ Type: 'PLAINTEXT',
589
+ },
590
+ ],
591
+ },
592
+ ResultPath: '$.stage2CleanResult',
593
+ },
594
+ });
595
+
596
+ // Stage2FailureHandler: Handle Stage 2 failures without affecting testStatus.
597
+ // Records lastBenchmarkStatus=failed in DynamoDB, then proceeds to clean.
598
+ // Per Req 1.4: Stage 2 failure SHALL NOT change the DynamoDB testStatus.
599
+ // Uses SET expression targeting ONLY the 3 benchmark fields — never touches
600
+ // testStatus, configJson, or any other pre-existing field.
601
+ const stage2FailureHandler = new sfn.CustomState(this, 'Stage2FailureHandler', {
602
+ stateJson: {
603
+ Type: 'Task',
604
+ Resource: 'arn:aws:states:::dynamodb:updateItem',
605
+ Parameters: {
606
+ TableName: this.ciTable.tableName,
607
+ Key: {
608
+ configId: { 'S.$': '$.configId' },
609
+ },
610
+ UpdateExpression: 'SET lastBenchmarkRunId = :rid, lastBenchmarkTimestamp = :ts, lastBenchmarkStatus = :status',
611
+ ExpressionAttributeValues: {
612
+ ':rid': {
613
+ 'S.$': "States.Format('bmk-failure-{}', $.configId)",
614
+ },
615
+ ':ts': { 'S.$': '$$.State.EnteredTime' },
616
+ ':status': { 'S': 'failed' },
617
+ },
618
+ },
619
+ ResultPath: '$.stage2FailureUpdateResult',
620
+ Retry: [
621
+ {
622
+ ErrorEquals: ['States.ALL'],
623
+ IntervalSeconds: 2,
624
+ MaxAttempts: 3,
625
+ BackoffRate: 2.0,
626
+ },
627
+ ],
628
+ },
629
+ });
630
+
631
+ // Stage2FailureClean: Clean up after a Stage 2 failure
632
+ const stage2FailureClean = new sfn.CustomState(this, 'Stage2FailureClean', {
633
+ stateJson: {
634
+ Type: 'Task',
635
+ Resource: 'arn:aws:states:::codebuild:startBuild',
636
+ Parameters: {
637
+ ProjectName: 'mlcc-ci-executor',
638
+ EnvironmentVariablesOverride: [
639
+ {
640
+ Name: 'CONFIG_ID',
641
+ 'Value.$': '$.configId',
642
+ Type: 'PLAINTEXT',
643
+ },
644
+ {
645
+ Name: 'CONFIG_JSON',
646
+ 'Value.$': '$.configJson',
647
+ Type: 'PLAINTEXT',
648
+ },
649
+ {
650
+ Name: 'CI_STAGE',
651
+ Value: 'stage2-clean',
652
+ Type: 'PLAINTEXT',
653
+ },
654
+ ],
655
+ },
656
+ ResultPath: '$.stage2FailureCleanResult',
372
657
  End: true,
373
658
  },
374
659
  });
@@ -403,11 +688,24 @@ export class MlccCiHarnessStack extends cdk.Stack {
403
688
  },
404
689
  });
405
690
 
691
+ // ─── Stage 2: Benchmark Error Handling ────────────────────────────────
692
+ // Stage 2 failure isolation: if benchmarking fails, we record the failure
693
+ // in the benchmark-specific fields (lastBenchmarkStatus=failed) without
694
+ // touching testStatus. Uses a Parallel wrapper with addCatch so that CDK
695
+ // properly includes the failure handler states in the definition graph.
696
+ // Requirements: 1.4, 7.3
697
+
406
698
  // Wire up the state machine chain
407
699
  // RecordStartTime → StartCodeBuild → WaitForBuild → PollBuildStatus → CheckTimestamp → CheckBuildStatus
408
700
  // CheckBuildStatus branches:
409
- // - SUCCEEDED/FAILED/STOPPED → SetBuildCompleteResult → UpdateResults
410
- // - TIMED_OUT → HandleTimeout → UpdateResultsFromTimeout
701
+ // - SUCCEEDED/FAILED/STOPPED → SetBuildCompleteResult → UpdateResults → GetBenchmarkConfig
702
+ // → CheckDynamoItemHasBenchmarkField → ExtractBenchmarkFlags → CheckBenchmarkEnabled
703
+ // CheckBenchmarkEnabled branches:
704
+ // - pass + benchmarkEnabled=true → PrepareStage2Input → Stage2Pipeline (Parallel)
705
+ // Success: Stage2Benchmark → Stage2RegisterBenchmark → Stage2Clean → End
706
+ // Failure: Stage2FailureHandler → Stage2FailureClean → End
707
+ // - pass + benchmarkEnabled=false OR failed → SkipStage2 → End
708
+ // - TIMED_OUT → HandleTimeout → UpdateResultsFromTimeout → End
411
709
  // - IN_PROGRESS (otherwise) → WaitForBuild (loop)
412
710
  recordStartTime.next(startCodeBuild);
413
711
  startCodeBuild.next(waitForBuild);
@@ -417,6 +715,33 @@ export class MlccCiHarnessStack extends cdk.Stack {
417
715
  setSuccessResult.next(updateResults);
418
716
  handleTimeout.next(updateResultsFromTimeout);
419
717
 
718
+ // Stage 2 wiring: after UpdateResults, read DynamoDB for benchmark config
719
+ updateResults.next(getBenchmarkConfig);
720
+ getBenchmarkConfig.next(checkDynamoItemHasBenchmarkField);
721
+ extractBenchmarkFlags.next(checkBenchmarkEnabled);
722
+ extractBenchmarkFlagsDefault.next(checkBenchmarkEnabled);
723
+
724
+ // Stage 2 execution uses a Parallel state to enable proper CDK Catch handling.
725
+ // The Parallel has one branch (the success path), and addCatch routes errors
726
+ // to the failure handler chain.
727
+ const stage2Pipeline = new sfn.Parallel(this, 'Stage2Pipeline', {
728
+ resultPath: '$.stage2PipelineResult',
729
+ });
730
+ stage2Pipeline.branch(
731
+ stage2Benchmark
732
+ .next(stage2RegisterBenchmark)
733
+ .next(stage2Clean),
734
+ );
735
+ stage2Pipeline.addCatch(stage2FailureHandler, {
736
+ resultPath: '$.stage2Error',
737
+ });
738
+
739
+ // After PrepareStage2Input, enter the Stage2Pipeline parallel wrapper
740
+ prepareStage2Input.next(stage2Pipeline);
741
+
742
+ // Stage 2 failure path: FailureHandler → FailureClean → End
743
+ stage2FailureHandler.next(stage2FailureClean);
744
+
420
745
  // Create the state machine
421
746
  this.ciOrchestrator = new sfn.StateMachine(this, 'CiOrchestrator', {
422
747
  stateMachineName: 'mlcc-ci-orchestrator',
@@ -442,8 +767,12 @@ export class MlccCiHarnessStack extends cdk.Stack {
442
767
  this.scannerFunction.addEnvironment('STATE_MACHINE_ARN', this.ciOrchestrator.stateMachineArn);
443
768
 
444
769
  // CodeBuild IAM role with permissions for lifecycle execution
770
+ //
771
+ // RETAIN policy: IAM roles are retained on stack deletion to prevent conflicts
772
+ // during multi-region bootstrap. If the stack is re-created, existing roles will
773
+ // be reused via --no-rollback.
445
774
  const codebuildRole = new iam.Role(this, 'CodeBuildRole', {
446
- roleName: 'mlcc-ci-codebuild-role',
775
+ roleName: `mlcc-ci-codebuild-role-${this.region}`,
447
776
  assumedBy: new iam.ServicePrincipal('codebuild.amazonaws.com'),
448
777
  description: 'IAM role for the MLCC CI CodeBuild executor project',
449
778
  });
@@ -605,5 +934,520 @@ export class MlccCiHarnessStack extends cdk.Stack {
605
934
  ],
606
935
  resources: [this.ciCodeBuildProject.projectArn],
607
936
  }));
937
+
938
+ // ─── Benchmark Infrastructure (opt-in) ────────────────────────────────
939
+ // Gated by the CreateBenchmarkInfra parameter. When enabled, provisions
940
+ // Glue database/table, S3 results bucket, IAM permissions for benchmark
941
+ // writes (S3, Glue, Athena), and stack outputs for downstream consumers.
942
+ // Idempotent: CloudFormation conditions ensure re-running `cdk deploy`
943
+ // with the same parameter value produces no changes. Resources are only
944
+ // created when CreateBenchmarkInfra=true; subsequent deploys with the
945
+ // same value are no-ops (CloudFormation handles existence checks natively).
946
+ // Requirements: 3.1, 3.2, 3.4, 3.5
947
+
948
+ const createBenchmarkInfra = new cdk.CfnParameter(this, 'CreateBenchmarkInfra', {
949
+ type: 'String',
950
+ default: 'false',
951
+ allowedValues: ['true', 'false'],
952
+ description: 'Whether to create benchmark infrastructure (S3 results bucket, Glue DB/table, IAM permissions). Opt-in.',
953
+ });
954
+
955
+ const benchmarkInfraCondition = new cdk.CfnCondition(this, 'BenchmarkInfraCondition', {
956
+ expression: cdk.Fn.conditionEquals(createBenchmarkInfra.valueAsString, 'true'),
957
+ });
958
+
959
+ // Glue Database: mlcc_ci
960
+ // CloudFormation manages create-or-skip via the condition — no duplicate
961
+ // resource error on re-deploy because the logical ID is stable.
962
+ const glueDatabase = new glue.CfnDatabase(this, 'CiGlueDatabase', {
963
+ catalogId: this.account,
964
+ databaseInput: {
965
+ name: 'mlcc_ci',
966
+ description: 'MCC CI benchmark results warehouse',
967
+ },
968
+ });
969
+ glueDatabase.cfnOptions.condition = benchmarkInfraCondition;
970
+
971
+ // Glue Table: benchmark_results — full DDL with all 28+ columns
972
+ // Partitioned by model/instance/target (see partitionKeys below); time-range filters rely on the run_timestamp column.
973
+ // Dimension columns are well-separated (not composite keys) per Req 5.1.
974
+ const glueTable = new glue.CfnTable(this, 'BenchmarkResultsTable', {
975
+ catalogId: this.account,
976
+ databaseName: 'mlcc_ci',
977
+ tableInput: {
978
+ name: 'benchmark_results',
979
+ tableType: 'EXTERNAL_TABLE',
980
+ parameters: {
981
+ 'classification': 'parquet',
982
+ 'parquet.compression': 'SNAPPY',
983
+ },
984
+ storageDescriptor: {
985
+ columns: [
986
+ // Identity & config (matches Parquet writer get_parquet_schema() exactly)
987
+ { name: 'project_name', type: 'string', comment: 'MCC project name' },
988
+ { name: 'model_name', type: 'string', comment: 'HuggingFace model ID' },
989
+ { name: 'model_family', type: 'string', comment: 'Derived: qwen3, llama3, deepseek-r1' },
990
+ { name: 'instance_type', type: 'string', comment: 'SageMaker instance type' },
991
+ { name: 'deployment_config', type: 'string', comment: 'Architecture-backend' },
992
+ { name: 'deployment_target', type: 'string', comment: 'Deployment target' },
993
+ { name: 'quantization', type: 'string', comment: 'none, fp8, awq, gptq' },
994
+ { name: 'tensor_parallel_degree', type: 'int', comment: 'TP degree' },
995
+ { name: 'serving_config', type: 'string', comment: 'Full serving config JSON blob' },
996
+ { name: 'workload', type: 'string', comment: 'Named workload profile' },
997
+ { name: 'concurrency', type: 'int', comment: 'Concurrent requests' },
998
+ { name: 'input_tokens_mean', type: 'int', comment: 'Mean input tokens' },
999
+ { name: 'output_tokens_mean', type: 'int', comment: 'Mean output tokens' },
1000
+ { name: 'streaming', type: 'boolean', comment: 'Streaming enabled' },
1001
+ { name: 'duration_seconds', type: 'int', comment: 'Duration in seconds' },
1002
+ // Throughput metrics
1003
+ { name: 'request_throughput_rps', type: 'double', comment: 'Requests/sec' },
1004
+ { name: 'total_token_throughput_tps', type: 'double', comment: 'Total tokens/sec (in+out)' },
1005
+ { name: 'output_token_throughput_tps', type: 'double', comment: 'Output tokens/sec' },
1006
+ { name: 'request_count', type: 'double', comment: 'Total requests in run' },
1007
+ // Latency metrics (avg/p50/p90/p99)
1008
+ { name: 'ttft_avg_ms', type: 'double', comment: 'TTFT average (ms)' },
1009
+ { name: 'ttft_p50_ms', type: 'double', comment: 'TTFT p50 (ms)' },
1010
+ { name: 'ttft_p90_ms', type: 'double', comment: 'TTFT p90 (ms)' },
1011
+ { name: 'ttft_p99_ms', type: 'double', comment: 'TTFT p99 (ms)' },
1012
+ { name: 'itl_avg_ms', type: 'double', comment: 'ITL average (ms)' },
1013
+ { name: 'itl_p50_ms', type: 'double', comment: 'ITL p50 (ms)' },
1014
+ { name: 'itl_p90_ms', type: 'double', comment: 'ITL p90 (ms)' },
1015
+ { name: 'itl_p99_ms', type: 'double', comment: 'ITL p99 (ms)' },
1016
+ { name: 'e2e_latency_avg_ms', type: 'double', comment: 'E2E latency average (ms)' },
1017
+ { name: 'e2e_latency_p50_ms', type: 'double', comment: 'E2E latency p50 (ms)' },
1018
+ { name: 'e2e_latency_p90_ms', type: 'double', comment: 'E2E latency p90 (ms)' },
1019
+ { name: 'e2e_latency_p99_ms', type: 'double', comment: 'E2E latency p99 (ms)' },
1020
+ { name: 'prefill_tps_avg', type: 'double', comment: 'Prefill throughput avg (tokens/sec)' },
1021
+ { name: 'prefill_tps_p50', type: 'double', comment: 'Prefill throughput p50' },
1022
+ { name: 'output_token_tps_avg', type: 'double', comment: 'Per-user output TPS avg' },
1023
+ { name: 'output_token_tps_p50', type: 'double', comment: 'Per-user output TPS p50' },
1024
+ { name: 'output_token_tps_p90', type: 'double', comment: 'Per-user output TPS p90' },
1025
+ { name: 'ttst_p50_ms', type: 'double', comment: 'Time to second token p50 (ms)' },
1026
+ { name: 'ttst_p90_ms', type: 'double', comment: 'Time to second token p90 (ms)' },
1027
+ { name: 'output_sequence_length_avg', type: 'double', comment: 'Avg output sequence length' },
1028
+ { name: 'input_sequence_length_avg', type: 'double', comment: 'Avg input sequence length' },
1029
+ { name: 'error_rate', type: 'double', comment: 'Error rate (0.0-1.0)' },
1030
+ { name: 'benchmark_duration_sec', type: 'double', comment: 'Wall-clock duration (sec)' },
1031
+ // Provenance
1032
+ { name: 'run_type', type: 'string', comment: 'ci, path_prove, manual' },
1033
+ { name: 'benchmark_job_name', type: 'string', comment: 'SageMaker benchmark job name' },
1034
+ { name: 'mcc_version', type: 'string', comment: 'MCC version' },
1035
+ { name: 'run_timestamp', type: 'string', comment: 'ISO 8601 UTC timestamp' },
1036
+ { name: 'region', type: 'string', comment: 'AWS region' },
1037
+ ],
1038
+ location: `s3://mlcc-benchmark-results-${this.account}-${this.region}/results/`,
1039
+ inputFormat: 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat',
1040
+ outputFormat: 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat',
1041
+ serdeInfo: {
1042
+ serializationLibrary: 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe',
1043
+ parameters: {
1044
+ 'serialization.format': '1',
1045
+ },
1046
+ },
1047
+ compressed: true,
1048
+ },
1049
+ partitionKeys: [
1050
+ { name: 'model', type: 'string', comment: 'Model name with / replaced by _ (e.g., Qwen_Qwen3-0.6B)' },
1051
+ { name: 'instance', type: 'string', comment: 'SageMaker instance type (e.g., ml.g5.xlarge)' },
1052
+ { name: 'target', type: 'string', comment: 'Deployment target (realtime-inference, async-inference, etc.)' },
1053
+ ],
1054
+ },
1055
+ });
1056
+ glueTable.addDependency(glueDatabase);
1057
+ glueTable.cfnOptions.condition = benchmarkInfraCondition;
1058
+
1059
+ // Configurable lifecycle parameters for the benchmark results bucket
1060
+ const benchmarkIaTransitionDays = new cdk.CfnParameter(this, 'BenchmarkIaTransitionDays', {
1061
+ type: 'Number',
1062
+ default: 90,
1063
+ description: 'Days before benchmark results transition to Infrequent Access storage',
1064
+ minValue: 30,
1065
+ maxValue: 365,
1066
+ });
1067
+
1068
+ const benchmarkExpirationDays = new cdk.CfnParameter(this, 'BenchmarkExpirationDays', {
1069
+ type: 'Number',
1070
+ default: 365,
1071
+ description: 'Days before benchmark results expire and are deleted',
1072
+ minValue: 90,
1073
+ maxValue: 3650,
1074
+ });
1075
+
1076
+ // S3 bucket for benchmark results (Parquet files; the Glue table above declares model/instance/target partition keys)
1077
+ const benchmarkResultsBucket = new s3.Bucket(this, 'BenchmarkResultsBucket', {
1078
+ bucketName: `mlcc-benchmark-results-${this.account}-${this.region}`,
1079
+ removalPolicy: cdk.RemovalPolicy.RETAIN,
1080
+ lifecycleRules: [
1081
+ {
1082
+ transitions: [
1083
+ {
1084
+ storageClass: s3.StorageClass.INFREQUENT_ACCESS,
1085
+ transitionAfter: cdk.Duration.days(benchmarkIaTransitionDays.valueAsNumber),
1086
+ },
1087
+ ],
1088
+ expiration: cdk.Duration.days(benchmarkExpirationDays.valueAsNumber),
1089
+ },
1090
+ ],
1091
+ });
1092
+
1093
+ // Apply the benchmark condition to the S3 bucket
1094
+ const cfnBenchmarkBucket = benchmarkResultsBucket.node.defaultChild as cdk.CfnResource;
1095
+ cfnBenchmarkBucket.cfnOptions.condition = benchmarkInfraCondition;
1096
+
1097
+ // Output the benchmark results bucket ARN (conditional)
1098
+ new cdk.CfnOutput(this, 'BenchmarkResultsBucketArn', {
1099
+ value: benchmarkResultsBucket.bucketArn,
1100
+ description: 'ARN of the S3 bucket storing benchmark results (Parquet)',
1101
+ condition: benchmarkInfraCondition,
1102
+ exportName: 'mlcc-ci-benchmark-results-bucket-arn',
1103
+ });
1104
+
1105
+ // Output the benchmark results bucket name (conditional)
1106
+ new cdk.CfnOutput(this, 'BenchmarkResultsBucketName', {
1107
+ value: benchmarkResultsBucket.bucketName,
1108
+ description: 'Name of the S3 bucket storing benchmark results (Parquet)',
1109
+ condition: benchmarkInfraCondition,
1110
+ });
1111
+
1112
+ // Output the Glue database name (conditional)
1113
+ new cdk.CfnOutput(this, 'CiGlueDatabaseName', {
1114
+ value: 'mlcc_ci',
1115
+ description: 'Name of the Glue database for benchmark results',
1116
+ condition: benchmarkInfraCondition,
1117
+ });
1118
+
1119
+ // S3 permissions for benchmark results bucket writes
1120
+ const benchmarkS3Policy = new iam.PolicyStatement({
1121
+ sid: 'BenchmarkResultsWrite',
1122
+ effect: iam.Effect.ALLOW,
1123
+ actions: [
1124
+ 's3:PutObject',
1125
+ 's3:GetObject',
1126
+ 's3:ListBucket',
1127
+ ],
1128
+ resources: [
1129
+ 'arn:aws:s3:::mlcc-benchmark-results-*',
1130
+ 'arn:aws:s3:::mlcc-benchmark-results-*/*',
1131
+ ],
1132
+ });
1133
+
1134
+ // Glue permissions for partition management
1135
+ const benchmarkGluePolicy = new iam.PolicyStatement({
1136
+ sid: 'GlueCatalogAccess',
1137
+ effect: iam.Effect.ALLOW,
1138
+ actions: [
1139
+ 'glue:GetDatabase',
1140
+ 'glue:GetTable',
1141
+ 'glue:GetPartitions',
1142
+ 'glue:BatchCreatePartition',
1143
+ 'glue:CreatePartition',
1144
+ ],
1145
+ resources: [
1146
+ 'arn:aws:glue:*:*:catalog',
1147
+ 'arn:aws:glue:*:*:database/mlcc_ci',
1148
+ 'arn:aws:glue:*:*:table/mlcc_ci/*',
1149
+ ],
1150
+ });
1151
+
1152
+ // Athena permissions for partition repair (MSCK REPAIR TABLE)
1153
+ const benchmarkAthenaPolicy = new iam.PolicyStatement({
1154
+ sid: 'AthenaPartitionRepair',
1155
+ effect: iam.Effect.ALLOW,
1156
+ actions: [
1157
+ 'athena:StartQueryExecution',
1158
+ 'athena:GetQueryResults',
1159
+ ],
1160
+ resources: ['*'],
1161
+ });
1162
+
1163
+ // Create a standalone inline policy (iam.Policy) for benchmark permissions so we can condition it
1164
+ const benchmarkPolicy = new iam.Policy(this, 'BenchmarkWritePolicy', {
1165
+ policyName: 'mlcc-ci-benchmark-write-policy',
1166
+ statements: [benchmarkS3Policy, benchmarkGluePolicy, benchmarkAthenaPolicy],
1167
+ });
1168
+ benchmarkPolicy.attachToRole(codebuildRole);
1169
+
1170
+ // Apply the condition to the policy's underlying CFN resource
1171
+ const cfnBenchmarkPolicy = benchmarkPolicy.node.defaultChild as cdk.CfnResource;
1172
+ cfnBenchmarkPolicy.cfnOptions.condition = benchmarkInfraCondition;
1173
+
1174
+ // ─── Path Prover Infrastructure (opt-in, separate from benchmark infra) ────
1175
+ // Gated by the CreatePathProver parameter. When enabled, provisions:
1176
+ // - Brain Lambda (getNextConfig, pickNext, classifyFailure)
1177
+ // - WriteResults Lambda (persists path_prove records; granted S3 + Glue write access so they are queryable via Athena)
1178
+ // - Step Functions state machine (path-prover orchestrator)
1179
+ // - EventBridge scheduled rule (disabled by default)
1180
+ // Requirements: 8.1, 8.7, 8.8
1181
+
1182
+ const createPathProver = new cdk.CfnParameter(this, 'CreatePathProver', {
1183
+ type: 'String',
1184
+ default: 'false',
1185
+ allowedValues: ['true', 'false'],
1186
+ description: 'Whether to create Path Prover infrastructure (state machine, Lambdas, EventBridge rule). Opt-in.',
1187
+ });
1188
+
1189
+ const pathProverCondition = new cdk.CfnCondition(this, 'PathProverCondition', {
1190
+ expression: cdk.Fn.conditionEquals(createPathProver.valueAsString, 'true'),
1191
+ });
1192
+
1193
+ // Path Prover Brain Lambda IAM role
1194
+ const pathProverBrainRole = new iam.Role(this, 'PathProverBrainRole', {
1195
+ roleName: 'mlcc-path-prover-brain-role',
1196
+ assumedBy: new iam.ServicePrincipal('lambda.amazonaws.com'),
1197
+ description: 'IAM role for the Path Prover Brain Lambda function',
1198
+ });
1199
+ (pathProverBrainRole.node.defaultChild as cdk.CfnResource).cfnOptions.condition = pathProverCondition;
1200
+
1201
+ // Brain Lambda: Athena query access (plus Glue catalog reads, results-bucket S3 access, and log writes) for gap identification + substitution
1202
+ const brainAthenaPolicy = new iam.Policy(this, 'PathProverBrainAthenaPolicy', {
1203
+ policyName: 'mlcc-path-prover-brain-athena',
1204
+ statements: [
1205
+ new iam.PolicyStatement({
1206
+ effect: iam.Effect.ALLOW,
1207
+ actions: [
1208
+ 'athena:StartQueryExecution',
1209
+ 'athena:GetQueryExecution',
1210
+ 'athena:GetQueryResults',
1211
+ ],
1212
+ resources: ['*'],
1213
+ }),
1214
+ new iam.PolicyStatement({
1215
+ effect: iam.Effect.ALLOW,
1216
+ actions: ['glue:GetTable', 'glue:GetDatabase', 'glue:GetPartitions'],
1217
+ resources: [
1218
+ 'arn:aws:glue:*:*:catalog',
1219
+ 'arn:aws:glue:*:*:database/mlcc_ci',
1220
+ 'arn:aws:glue:*:*:table/mlcc_ci/*',
1221
+ ],
1222
+ }),
1223
+ new iam.PolicyStatement({
1224
+ effect: iam.Effect.ALLOW,
1225
+ actions: ['s3:GetObject', 's3:ListBucket', 's3:GetBucketLocation', 's3:PutObject'],
1226
+ resources: [
1227
+ 'arn:aws:s3:::mlcc-benchmark-results-*',
1228
+ 'arn:aws:s3:::mlcc-benchmark-results-*/*',
1229
+ ],
1230
+ }),
1231
+ new iam.PolicyStatement({
1232
+ effect: iam.Effect.ALLOW,
1233
+ actions: ['logs:CreateLogStream', 'logs:PutLogEvents'],
1234
+ resources: [this.ciLogGroup.logGroupArn, `${this.ciLogGroup.logGroupArn}:*`],
1235
+ }),
1236
+ ],
1237
+ });
1238
+ brainAthenaPolicy.attachToRole(pathProverBrainRole);
1239
+ (brainAthenaPolicy.node.defaultChild as cdk.CfnResource).cfnOptions.condition = pathProverCondition;
1240
+
1241
+ // Path Prover Brain Lambda function
1242
+ const pathProverBrainFunction = new NodejsFunction(this, 'PathProverBrainFunction', {
1243
+ functionName: 'mlcc-path-prover-brain',
1244
+ runtime: lambda.Runtime.NODEJS_20_X,
1245
+ memorySize: 512,
1246
+ timeout: cdk.Duration.seconds(120),
1247
+ entry: path.join(__dirname, '..', 'lambda', 'path-prover', 'brain.ts'),
1248
+ handler: 'handler',
1249
+ role: pathProverBrainRole,
1250
+ environment: {
1251
+ GLUE_DATABASE: 'mlcc_ci',
1252
+ GLUE_TABLE: 'benchmark_results',
1253
+ MAX_PROVES_PER_RUN: '10',
1254
+ MAX_COST_PER_RUN: '100',
1255
+ },
1256
+ logGroup: this.ciLogGroup,
1257
+ bundling: {
1258
+ minify: true,
1259
+ sourceMap: true,
1260
+ },
1261
+ });
1262
+ (pathProverBrainFunction.node.defaultChild as cdk.CfnResource).cfnOptions.condition = pathProverCondition;
1263
+
1264
+ // Path Prover Write Results Lambda IAM role
1265
+ const pathProverWriteRole = new iam.Role(this, 'PathProverWriteRole', {
1266
+ roleName: 'mlcc-path-prover-write-role',
1267
+ assumedBy: new iam.ServicePrincipal('lambda.amazonaws.com'),
1268
+ description: 'IAM role for the Path Prover Write Results Lambda function',
1269
+ });
1270
+ (pathProverWriteRole.node.defaultChild as cdk.CfnResource).cfnOptions.condition = pathProverCondition;
1271
+
1272
+ // Write Results Lambda: S3 + Glue write access
1273
+ const writeResultsPolicy = new iam.Policy(this, 'PathProverWriteResultsPolicy', {
1274
+ policyName: 'mlcc-path-prover-write-results',
1275
+ statements: [
1276
+ new iam.PolicyStatement({
1277
+ effect: iam.Effect.ALLOW,
1278
+ actions: ['s3:PutObject', 's3:GetObject'],
1279
+ resources: [
1280
+ 'arn:aws:s3:::mlcc-benchmark-results-*/*',
1281
+ ],
1282
+ }),
1283
+ new iam.PolicyStatement({
1284
+ effect: iam.Effect.ALLOW,
1285
+ actions: ['glue:BatchCreatePartition', 'glue:CreatePartition', 'glue:GetTable'],
1286
+ resources: [
1287
+ 'arn:aws:glue:*:*:catalog',
1288
+ 'arn:aws:glue:*:*:database/mlcc_ci',
1289
+ 'arn:aws:glue:*:*:table/mlcc_ci/*',
1290
+ ],
1291
+ }),
1292
+ new iam.PolicyStatement({
1293
+ effect: iam.Effect.ALLOW,
1294
+ actions: ['logs:CreateLogStream', 'logs:PutLogEvents'],
1295
+ resources: [this.ciLogGroup.logGroupArn, `${this.ciLogGroup.logGroupArn}:*`],
1296
+ }),
1297
+ ],
1298
+ });
1299
+ writeResultsPolicy.attachToRole(pathProverWriteRole);
1300
+ (writeResultsPolicy.node.defaultChild as cdk.CfnResource).cfnOptions.condition = pathProverCondition;
1301
+
1302
+ // Path Prover Write Results Lambda function
1303
+ const pathProverWriteFunction = new NodejsFunction(this, 'PathProverWriteFunction', {
1304
+ functionName: 'mlcc-path-prover-write-results',
1305
+ runtime: lambda.Runtime.NODEJS_20_X,
1306
+ memorySize: 256,
1307
+ timeout: cdk.Duration.seconds(60),
1308
+ entry: path.join(__dirname, '..', 'lambda', 'path-prover', 'write-results.ts'),
1309
+ handler: 'handler',
1310
+ role: pathProverWriteRole,
1311
+ environment: {
1312
+ GLUE_DATABASE: 'mlcc_ci',
1313
+ GLUE_TABLE: 'benchmark_results',
1314
+ RESULTS_BUCKET: `mlcc-benchmark-results-${this.account}-${this.region}`,
1315
+ },
1316
+ logGroup: this.ciLogGroup,
1317
+ bundling: {
1318
+ minify: true,
1319
+ sourceMap: true,
1320
+ },
1321
+ });
1322
+ (pathProverWriteFunction.node.defaultChild as cdk.CfnResource).cfnOptions.condition = pathProverCondition;
1323
+
1324
+ // Path Prover Step Functions IAM role
1325
+ const pathProverOrchestratorRole = new iam.Role(this, 'PathProverOrchestratorRole', {
1326
+ roleName: 'mlcc-path-prover-orchestrator-role',
1327
+ assumedBy: new iam.ServicePrincipal('states.amazonaws.com'),
1328
+ description: 'IAM role for the Path Prover Step Functions state machine',
1329
+ });
1330
+ (pathProverOrchestratorRole.node.defaultChild as cdk.CfnResource).cfnOptions.condition = pathProverCondition;
1331
+
1332
+ // Orchestrator permissions
1333
+ const pathProverOrchestratorPolicy = new iam.Policy(this, 'PathProverOrchestratorPolicy', {
1334
+ policyName: 'mlcc-path-prover-orchestrator-policy',
1335
+ statements: [
1336
+ // Lambda invoke for brain and write-results
1337
+ new iam.PolicyStatement({
1338
+ effect: iam.Effect.ALLOW,
1339
+ actions: ['lambda:InvokeFunction'],
1340
+ resources: [
1341
+ pathProverBrainFunction.functionArn,
1342
+ pathProverWriteFunction.functionArn,
1343
+ ],
1344
+ }),
1345
+ // CodeBuild start/poll for lifecycle stages
1346
+ new iam.PolicyStatement({
1347
+ effect: iam.Effect.ALLOW,
1348
+ actions: ['codebuild:StartBuild', 'codebuild:BatchGetBuilds', 'codebuild:StopBuild'],
1349
+ resources: [this.ciCodeBuildProject.projectArn],
1350
+ }),
1351
+ // CloudWatch Logs for execution logging
1352
+ new iam.PolicyStatement({
1353
+ effect: iam.Effect.ALLOW,
1354
+ actions: [
1355
+ 'logs:CreateLogDelivery',
1356
+ 'logs:GetLogDelivery',
1357
+ 'logs:UpdateLogDelivery',
1358
+ 'logs:DeleteLogDelivery',
1359
+ 'logs:ListLogDeliveries',
1360
+ 'logs:PutResourcePolicy',
1361
+ 'logs:DescribeResourcePolicies',
1362
+ 'logs:DescribeLogGroups',
1363
+ 'logs:PutLogEvents',
1364
+ 'logs:CreateLogStream',
1365
+ ],
1366
+ resources: ['*'],
1367
+ }),
1368
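+ // EventBridge managed-rule permissions for the CodeBuild .sync integration. NOTE(review): AWS docs name this rule StepFunctionsGetEventForCodeBuildStartBuildRule; confirm the pattern below matches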
1369
+ new iam.PolicyStatement({
1370
+ effect: iam.Effect.ALLOW,
1371
+ actions: ['events:PutTargets', 'events:PutRule', 'events:DescribeRule'],
1372
+ resources: [`arn:aws:events:${this.region}:${this.account}:rule/StepFunctionsGetBuildStatusRule-*`],
1373
+ }),
1374
+ ],
1375
+ });
1376
+ pathProverOrchestratorPolicy.attachToRole(pathProverOrchestratorRole);
1377
+ (pathProverOrchestratorPolicy.node.defaultChild as cdk.CfnResource).cfnOptions.condition = pathProverCondition;
1378
+
1379
+ // Path Prover State Machine
1380
+ // Uses ASL definition from file with Fn::Sub for variable substitution.
1381
+ // We read the raw JSON and use cdk.Fn.sub to inject resource ARNs.
1382
+ const aslTemplate = JSON.stringify(require('../state-machines/path-prover.asl.json'));
1383
+ const pathProverDefinitionString = cdk.Fn.sub(aslTemplate, {
1384
+ BrainFunctionArn: pathProverBrainFunction.functionArn,
1385
+ WriteResultsFunctionArn: pathProverWriteFunction.functionArn,
1386
+ ClassifyFailureFunctionArn: pathProverBrainFunction.functionArn,
1387
+ CodeBuildProjectName: this.ciCodeBuildProject.projectName,
1388
+ });
1389
+
1390
+ const pathProverStateMachine = new sfn.CfnStateMachine(this, 'PathProverStateMachine', {
1391
+ stateMachineName: 'mlcc-path-prover',
1392
+ stateMachineType: 'STANDARD',
1393
+ definitionString: pathProverDefinitionString,
1394
+ roleArn: pathProverOrchestratorRole.roleArn,
1395
+ loggingConfiguration: {
1396
+ destinations: [{
1397
+ cloudWatchLogsLogGroup: {
1398
+ logGroupArn: this.ciLogGroup.logGroupArn,
1399
+ },
1400
+ }],
1401
+ level: 'ALL',
1402
+ includeExecutionData: true,
1403
+ },
1404
+ tracingConfiguration: {
1405
+ enabled: true,
1406
+ },
1407
+ });
1408
+ pathProverStateMachine.cfnOptions.condition = pathProverCondition;
1409
+
1410
+ // EventBridge scheduled rule for Path Prover (disabled by default)
1411
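+ // Can be enabled via the EnablePathProverSchedule parameter. NOTE(review): the target reuses the orchestrator role, which trusts states.amazonaws.com and has no states:StartExecution grant; confirm EventBridge can assume it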
1412
+ const enablePathProverSchedule = new cdk.CfnParameter(this, 'EnablePathProverSchedule', {
1413
+ type: 'String',
1414
+ default: 'DISABLED',
1415
+ allowedValues: ['ENABLED', 'DISABLED'],
1416
+ description: 'Whether to enable the Path Prover scheduled EventBridge rule. Default: DISABLED.',
1417
+ });
1418
+
1419
+ const pathProverScheduleRule = new events.CfnRule(this, 'PathProverScheduleRule', {
1420
+ name: 'mlcc-path-prover-schedule',
1421
+ description: 'Triggers the Path Prover state machine on a schedule to fill coverage gaps',
1422
+ scheduleExpression: 'rate(6 hours)',
1423
+ state: enablePathProverSchedule.valueAsString,
1424
+ targets: [{
1425
+ arn: `arn:aws:states:${this.region}:${this.account}:stateMachine:mlcc-path-prover`,
1426
+ id: 'PathProverTarget',
1427
+ roleArn: pathProverOrchestratorRole.roleArn,
1428
+ input: JSON.stringify({
1429
+ iteration: 0,
1430
+ budgetSpent: 0,
1431
+ maxProvesPerRun: 10,
1432
+ maxCostPerRun: 100,
1433
+ previousResults: [],
1434
+ }),
1435
+ }],
1436
+ });
1437
+ pathProverScheduleRule.cfnOptions.condition = pathProverCondition;
1438
+
1439
+ // Output Path Prover state machine ARN
1440
+ new cdk.CfnOutput(this, 'PathProverStateMachineArn', {
1441
+ value: `arn:aws:states:${this.region}:${this.account}:stateMachine:mlcc-path-prover`,
1442
+ description: 'ARN of the Path Prover Step Functions state machine',
1443
+ condition: pathProverCondition,
1444
+ });
1445
+
1446
+ // Output Brain Lambda ARN
1447
+ new cdk.CfnOutput(this, 'PathProverBrainFunctionArn', {
1448
+ value: pathProverBrainFunction.functionArn,
1449
+ description: 'ARN of the Path Prover Brain Lambda function',
1450
+ condition: pathProverCondition,
1451
+ });
608
1452
  }
609
1453
  }