npm - @aws/ml-container-creator - Versions diffs - 0.9.1 → 0.10.3 - Mend

@aws/ml-container-creator 0.9.1 → 0.10.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (90)

package/LICENSE-THIRD-PARTY +9304 -0
package/bin/cli.js +2 -0
package/config/bootstrap-e2e-stack.json +341 -0
package/config/bootstrap-stack.json +40 -3
package/config/parameter-schema-v2.json +2049 -0
package/config/tune-catalog.json +1781 -0
package/infra/ci-harness/buildspec.yml +1 -0
package/infra/ci-harness/lambda/path-prover/brain.ts +306 -0
package/infra/ci-harness/lambda/path-prover/write-results.ts +152 -0
package/infra/ci-harness/lib/ci-harness-stack.ts +837 -7
package/infra/ci-harness/state-machines/path-prover.asl.json +496 -0
package/package.json +53 -68
package/servers/base-image-picker/index.js +121 -121
package/servers/e2e-status/index.js +297 -0
package/servers/e2e-status/manifest.json +14 -0
package/servers/e2e-status/package.json +15 -0
package/servers/endpoint-picker/LICENSE +202 -0
package/servers/endpoint-picker/index.js +536 -0
package/servers/endpoint-picker/manifest.json +14 -0
package/servers/endpoint-picker/package.json +18 -0
package/servers/hyperpod-cluster-picker/index.js +125 -125
package/servers/instance-sizer/index.js +138 -138
package/servers/instance-sizer/lib/instance-ranker.js +76 -76
package/servers/instance-sizer/lib/model-resolver.js +61 -61
package/servers/instance-sizer/lib/quota-resolver.js +113 -113
package/servers/instance-sizer/lib/vram-estimator.js +31 -31
package/servers/lib/bedrock-client.js +38 -38
package/servers/lib/catalogs/jumpstart-public.json +101 -16
package/servers/lib/catalogs/model-servers.json +201 -3
package/servers/lib/catalogs/models.json +182 -26
package/servers/lib/custom-validators.js +13 -13
package/servers/lib/dynamic-resolver.js +4 -4
package/servers/marketplace-picker/index.js +342 -0
package/servers/marketplace-picker/manifest.json +14 -0
package/servers/marketplace-picker/package.json +18 -0
package/servers/model-picker/index.js +382 -382
package/servers/region-picker/index.js +56 -56
package/servers/workload-picker/LICENSE +202 -0
package/servers/workload-picker/catalogs/workload-profiles.json +67 -0
package/servers/workload-picker/index.js +171 -0
package/servers/workload-picker/manifest.json +16 -0
package/servers/workload-picker/package.json +16 -0
package/src/app.js +4 -390
package/src/lib/bootstrap-command-handler.js +710 -1148
package/src/lib/bootstrap-config.js +36 -0
package/src/lib/bootstrap-profile-manager.js +641 -0
package/src/lib/bootstrap-provisioners.js +421 -0
package/src/lib/ci-register-helpers.js +74 -0
package/src/lib/config-loader.js +408 -0
package/src/lib/config-manager.js +66 -1685
package/src/lib/config-mcp-client.js +118 -0
package/src/lib/config-validator.js +634 -0
package/src/lib/cuda-resolver.js +149 -0
package/src/lib/e2e-catalog-validator.js +251 -3
package/src/lib/e2e-ci-recorder.js +103 -0
package/src/lib/generated/cli-options.js +315 -311
package/src/lib/generated/parameter-matrix.js +671 -0
package/src/lib/generated/validation-rules.js +71 -71
package/src/lib/marketplace-flow.js +276 -0
package/src/lib/mcp-query-runner.js +768 -0
package/src/lib/parameter-schema-validator.js +62 -18
package/src/lib/path-prover-brain.js +607 -0
package/src/lib/prompt-runner.js +41 -1504
package/src/lib/prompts/feature-prompts.js +172 -0
package/src/lib/prompts/index.js +48 -0
package/src/lib/prompts/infrastructure-prompts.js +690 -0
package/src/lib/prompts/model-prompts.js +552 -0
package/src/lib/prompts/project-prompts.js +82 -0
package/src/lib/prompts.js +2 -1446
package/src/lib/registry-command-handler.js +135 -3
package/src/lib/secrets-prompt-runner.js +251 -0
package/src/lib/template-variable-resolver.js +422 -0
package/src/lib/tune-catalog-validator.js +37 -4
package/templates/Dockerfile +9 -0
package/templates/code/adapter_sidecar.py +444 -0
package/templates/code/serve +6 -0
package/templates/code/serve.d/vllm.ejs +1 -1
package/templates/do/.benchmark_writer.py +1476 -0
package/templates/do/.tune_helper.py +982 -57
package/templates/do/__pycache__/.benchmark_writer.cpython-312.pyc +0 -0
package/templates/do/adapter +149 -0
package/templates/do/benchmark +639 -85
package/templates/do/config +108 -5
package/templates/do/deploy.d/managed-inference.ejs +192 -11
package/templates/do/optimize +106 -37
package/templates/do/register +89 -0
package/templates/do/test +13 -0
package/templates/do/tune +378 -59
package/templates/do/validate +44 -4
package/config/parameter-schema.json +0 -88

package/infra/ci-harness/lib/ci-harness-stack.ts CHANGED

@@ -2,7 +2,9 @@ import * as cdk from 'aws-cdk-lib';
 import * as dynamodb from 'aws-cdk-lib/aws-dynamodb';
 import * as events from 'aws-cdk-lib/aws-events';
 import * as targets from 'aws-cdk-lib/aws-events-targets';
+import * as glue from 'aws-cdk-lib/aws-glue';
 import * as iam from 'aws-cdk-lib/aws-iam';
+import * as s3 from 'aws-cdk-lib/aws-s3';
 import * as lambda from 'aws-cdk-lib/aws-lambda';
 import { NodejsFunction } from 'aws-cdk-lib/aws-lambda-nodejs';
 import * as logs from 'aws-cdk-lib/aws-logs';
@@ -117,8 +119,12 @@ export class MlccCiHarnessStack extends cdk.Stack {
         });
         // Scanner Lambda IAM role with least-privilege permissions
+        //
+        // RETAIN policy: IAM roles are retained on stack deletion to prevent conflicts
+        // during multi-region bootstrap. If the stack is re-created, existing roles will
+        // be reused via --no-rollback.
         const scannerRole = new iam.Role(this, 'ScannerRole', {
-            roleName: 'mlcc-ci-scanner-role',
+            roleName: `mlcc-ci-scanner-role-${this.region}`,
             assumedBy: new iam.ServicePrincipal('lambda.amazonaws.com'),
             description: 'IAM role for the MLCC CI Scanner Lambda function',
         });
@@ -178,16 +184,23 @@ export class MlccCiHarnessStack extends cdk.Stack {
         // Step Functions Orchestrator IAM role
         // Permissions for DynamoDB UpdateItem, Logs, and CodeBuild are defined here.
+        //
+        // RETAIN policy: IAM roles are retained on stack deletion to prevent conflicts
+        // during multi-region bootstrap. If the stack is re-created, existing roles will
+        // be reused via --no-rollback.
         this.orchestratorRole = new iam.Role(this, 'OrchestratorRole', {
-            roleName: 'mlcc-ci-orchestrator-role',
-            assumedBy: new iam.ServicePrincipal('states.amazonaws.com'),
+            roleName: `mlcc-ci-orchestrator-role-${this.region}`,
+            assumedBy: new iam.CompositePrincipal(
+                new iam.ServicePrincipal('states.amazonaws.com'),
+                new iam.ServicePrincipal('events.amazonaws.com'),
+            ),
             description: 'IAM role for the MLCC CI Orchestrator Step Functions state machine',
         });
         // DynamoDB:UpdateItem on CI_Table for UpdateResults states
         this.orchestratorRole.addToPolicy(new iam.PolicyStatement({
             effect: iam.Effect.ALLOW,
-            actions: ['dynamodb:UpdateItem'],
+            actions: ['dynamodb:UpdateItem', 'dynamodb:GetItem'],
             resources: [this.ciTable.tableArn],
         }));
@@ -257,6 +270,14 @@ export class MlccCiHarnessStack extends cdk.Stack {
                             Value: this.ciLogGroup.logGroupName,
                             Type: 'PLAINTEXT',
                         },
+                        {
+                            // Benchmark concurrency levels (comma-separated string, e.g. "1,4,8")
+                            // Set by the CI Scanner Lambda from DynamoDB benchmarkConcurrencyLevels field.
+                            // Falls back to default [1,4,8] in do/benchmark if empty.
+                            Name: 'BENCHMARK_CONCURRENCY_LEVELS',
+                            'Value.$': '$.benchmarkConcurrencyLevels',
+                            Type: 'PLAINTEXT',
+                        },
                     ],
                 },
                 ResultPath: '$.buildResult',
@@ -369,6 +390,270 @@ export class MlccCiHarnessStack extends cdk.Stack {
                         BackoffRate: 2.0,
                     },
                 ],
+            },
+        });
+        // ─── Stage 2 Orchestration (Req 1.1, 1.2, 1.3) ──────────────────────────
+        // After Stage 1 completes, the orchestrator reads the DynamoDB record to
+        // determine if the build passed and if benchmark is enabled. Stage 2 runs
+        // benchmarks asynchronously and does NOT affect testStatus on failure.
+        // GetBenchmarkConfig: Read benchmarkEnabled flag from DynamoDB after Stage 1
+        const getBenchmarkConfig = new sfn.CustomState(this, 'GetBenchmarkConfig', {
+            stateJson: {
+                Type: 'Task',
+                Resource: 'arn:aws:states:::dynamodb:getItem',
+                Parameters: {
+                    TableName: this.ciTable.tableName,
+                    Key: {
+                        configId: { 'S.$': '$.configId' },
+                    },
+                    ProjectionExpression: 'testStatus, benchmarkEnabled, benchmarkConcurrencyLevels',
+                    ConsistentRead: true,
+                },
+                ResultPath: '$.dynamoResult',
+                Retry: [
+                    {
+                        ErrorEquals: ['States.ALL'],
+                        IntervalSeconds: 2,
+                        MaxAttempts: 3,
+                        BackoffRate: 2.0,
+                    },
+                ],
+            },
+        });
+        // ExtractBenchmarkFlags: Extract benchmarkEnabled and testStatus into top-level fields
+        const extractBenchmarkFlags = new sfn.Pass(this, 'ExtractBenchmarkFlags', {
+            parameters: {
+                'configId.$': '$.configId',
+                'configJson.$': '$.configJson',
+                'buildStrategy.$': '$.buildStrategy',
+                'startTime.$': '$.startTime',
+                'buildStatus.$': '$.buildStatus',
+                'testStatus.$': '$.dynamoResult.Item.testStatus.S',
+                'benchmarkEnabled.$': '$.dynamoResult.Item.benchmarkEnabled.BOOL',
+            },
+            resultPath: '$',
+        });
+        // ExtractBenchmarkFlagsDefault: Fallback when benchmarkEnabled is not set in DynamoDB
+        // (backward-compatible — absence means disabled)
+        const extractBenchmarkFlagsDefault = new sfn.Pass(this, 'ExtractBenchmarkFlagsDefault', {
+            parameters: {
+                'configId.$': '$.configId',
+                'configJson.$': '$.configJson',
+                'buildStrategy.$': '$.buildStrategy',
+                'startTime.$': '$.startTime',
+                'buildStatus.$': '$.buildStatus',
+                'testStatus': 'unknown',
+                'benchmarkEnabled': false,
+            },
+            resultPath: '$',
+        });
+        // CheckDynamoItemHasBenchmarkField: determine if the DynamoDB response contains
+        // the benchmarkEnabled field. If not present, default to false.
+        const checkDynamoItemHasBenchmarkField = new sfn.Choice(this, 'CheckDynamoItemHasBenchmarkField')
+            .when(
+                sfn.Condition.isPresent('$.dynamoResult.Item.benchmarkEnabled'),
+                extractBenchmarkFlags,
+            )
+            .otherwise(extractBenchmarkFlagsDefault);
+        // CheckBenchmarkEnabled: Determine if Stage 1 passed (testStatus from DynamoDB read)
+        // If passed + benchmarkEnabled=true → Stage 2
+        // If passed + benchmarkEnabled=false → skip to End
+        // If failed → skip to End (do/clean already ran in CodeBuild's post_build)
+        const prepareStage2Input = new sfn.Pass(this, 'PrepareStage2Input', {
+            parameters: {
+                'configId.$': '$.configId',
+                'configJson.$': '$.configJson',
+                'buildStrategy.$': '$.buildStrategy',
+            },
+        });
+        const skipStage2 = new sfn.Succeed(this, 'SkipStage2');
+        const checkBenchmarkEnabled = new sfn.Choice(this, 'CheckBenchmarkEnabled')
+            .when(
+                sfn.Condition.and(
+                    sfn.Condition.stringEquals('$.testStatus', 'pass'),
+                    sfn.Condition.booleanEquals('$.benchmarkEnabled', true),
+                ),
+                prepareStage2Input,
+            )
+            .otherwise(skipStage2);
+        // Stage2Benchmark: Run do/benchmark via CodeBuild
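+        // NOTE(review): the Resource below is codebuild:startBuild (request-response, no .sync suffix), so this state does not wait for build completion — confirm intent.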
+        const stage2Benchmark = new sfn.CustomState(this, 'Stage2Benchmark', {
+            stateJson: {
+                Type: 'Task',
+                Resource: 'arn:aws:states:::codebuild:startBuild',
+                Parameters: {
+                    ProjectName: 'mlcc-ci-executor',
+                    EnvironmentVariablesOverride: [
+                        {
+                            Name: 'CONFIG_ID',
+                            'Value.$': '$.configId',
+                            Type: 'PLAINTEXT',
+                        },
+                        {
+                            Name: 'CONFIG_JSON',
+                            'Value.$': '$.configJson',
+                            Type: 'PLAINTEXT',
+                        },
+                        {
+                            Name: 'CI_STAGE',
+                            Value: 'stage2-benchmark',
+                            Type: 'PLAINTEXT',
+                        },
+                        {
+                            Name: 'CI_TABLE_NAME',
+                            Value: this.ciTable.tableName,
+                            Type: 'PLAINTEXT',
+                        },
+                        {
+                            Name: 'CI_LOG_GROUP',
+                            Value: this.ciLogGroup.logGroupName,
+                            Type: 'PLAINTEXT',
+                        },
+                    ],
+                },
+                ResultPath: '$.stage2BuildResult',
+            },
+        });
+        // Stage2RegisterBenchmark: Run do/register --benchmark-status via CodeBuild
+        const stage2RegisterBenchmark = new sfn.CustomState(this, 'Stage2RegisterBenchmark', {
+            stateJson: {
+                Type: 'Task',
+                Resource: 'arn:aws:states:::codebuild:startBuild',
+                Parameters: {
+                    ProjectName: 'mlcc-ci-executor',
+                    EnvironmentVariablesOverride: [
+                        {
+                            Name: 'CONFIG_ID',
+                            'Value.$': '$.configId',
+                            Type: 'PLAINTEXT',
+                        },
+                        {
+                            Name: 'CONFIG_JSON',
+                            'Value.$': '$.configJson',
+                            Type: 'PLAINTEXT',
+                        },
+                        {
+                            Name: 'CI_STAGE',
+                            Value: 'stage2-register-benchmark',
+                            Type: 'PLAINTEXT',
+                        },
+                        {
+                            Name: 'BENCHMARK_STATUS',
+                            Value: 'completed',
+                            Type: 'PLAINTEXT',
+                        },
+                        {
+                            Name: 'CI_TABLE_NAME',
+                            Value: this.ciTable.tableName,
+                            Type: 'PLAINTEXT',
+                        },
+                    ],
+                },
+                ResultPath: '$.stage2RegisterResult',
+            },
+        });
+        // Stage2Clean: Run do/clean after benchmark stage completes (success path)
+        const stage2Clean = new sfn.CustomState(this, 'Stage2Clean', {
+            stateJson: {
+                Type: 'Task',
+                Resource: 'arn:aws:states:::codebuild:startBuild',
+                Parameters: {
+                    ProjectName: 'mlcc-ci-executor',
+                    EnvironmentVariablesOverride: [
+                        {
+                            Name: 'CONFIG_ID',
+                            'Value.$': '$.configId',
+                            Type: 'PLAINTEXT',
+                        },
+                        {
+                            Name: 'CONFIG_JSON',
+                            'Value.$': '$.configJson',
+                            Type: 'PLAINTEXT',
+                        },
+                        {
+                            Name: 'CI_STAGE',
+                            Value: 'stage2-clean',
+                            Type: 'PLAINTEXT',
+                        },
+                    ],
+                },
+                ResultPath: '$.stage2CleanResult',
+            },
+        });
+        // Stage2FailureHandler: Handle Stage 2 failures without affecting testStatus.
+        // Records lastBenchmarkStatus=failed in DynamoDB, then proceeds to clean.
+        // Per Req 1.4: Stage 2 failure SHALL NOT change the DynamoDB testStatus.
+        // Uses SET expression targeting ONLY the 3 benchmark fields — never touches
+        // testStatus, configJson, or any other pre-existing field.
+        const stage2FailureHandler = new sfn.CustomState(this, 'Stage2FailureHandler', {
+            stateJson: {
+                Type: 'Task',
+                Resource: 'arn:aws:states:::dynamodb:updateItem',
+                Parameters: {
+                    TableName: this.ciTable.tableName,
+                    Key: {
+                        configId: { 'S.$': '$.configId' },
+                    },
+                    UpdateExpression: 'SET lastBenchmarkRunId = :rid, lastBenchmarkTimestamp = :ts, lastBenchmarkStatus = :status',
+                    ExpressionAttributeValues: {
+                        ':rid': {
+                            'S.$': "States.Format('bmk-failure-{}', $.configId)",
+                        },
+                        ':ts': { 'S.$': '$$.State.EnteredTime' },
+                        ':status': { 'S': 'failed' },
+                    },
+                },
+                ResultPath: '$.stage2FailureUpdateResult',
+                Retry: [
+                    {
+                        ErrorEquals: ['States.ALL'],
+                        IntervalSeconds: 2,
+                        MaxAttempts: 3,
+                        BackoffRate: 2.0,
+                    },
+                ],
+            },
+        });
+        // Stage2FailureClean: Clean up after a Stage 2 failure
+        const stage2FailureClean = new sfn.CustomState(this, 'Stage2FailureClean', {
+            stateJson: {
+                Type: 'Task',
+                Resource: 'arn:aws:states:::codebuild:startBuild',
+                Parameters: {
+                    ProjectName: 'mlcc-ci-executor',
+                    EnvironmentVariablesOverride: [
+                        {
+                            Name: 'CONFIG_ID',
+                            'Value.$': '$.configId',
+                            Type: 'PLAINTEXT',
+                        },
+                        {
+                            Name: 'CONFIG_JSON',
+                            'Value.$': '$.configJson',
+                            Type: 'PLAINTEXT',
+                        },
+                        {
+                            Name: 'CI_STAGE',
+                            Value: 'stage2-clean',
+                            Type: 'PLAINTEXT',
+                        },
+                    ],
+                },
+                ResultPath: '$.stage2FailureCleanResult',
                 End: true,
             },
         });
@@ -403,11 +688,24 @@ export class MlccCiHarnessStack extends cdk.Stack {
             },
         });
+        // ─── Stage 2: Benchmark Error Handling ────────────────────────────────
+        // Stage 2 failure isolation: if benchmarking fails, we record the failure
+        // in the benchmark-specific fields (lastBenchmarkStatus=failed) without
+        // touching testStatus. Uses a Parallel wrapper with addCatch so that CDK
+        // properly includes the failure handler states in the definition graph.
+        // Requirements: 1.4, 7.3
         // Wire up the state machine chain
         // RecordStartTime → StartCodeBuild → WaitForBuild → PollBuildStatus → CheckTimestamp → CheckBuildStatus
         // CheckBuildStatus branches:
-        //   - SUCCEEDED/FAILED/STOPPED → SetBuildCompleteResult → UpdateResults
-        //   - TIMED_OUT → HandleTimeout → UpdateResultsFromTimeout
+        //   - SUCCEEDED/FAILED/STOPPED → SetBuildCompleteResult → UpdateResults → GetBenchmarkConfig
+        //     → CheckDynamoItemHasBenchmarkField → ExtractBenchmarkFlags → CheckBenchmarkEnabled
+        //     CheckBenchmarkEnabled branches:
+        //       - pass + benchmarkEnabled=true → PrepareStage2Input → Stage2Pipeline (Parallel)
+        //         Success: Stage2Benchmark → Stage2RegisterBenchmark → Stage2Clean → End
+        //         Failure: Stage2FailureHandler → Stage2FailureClean → End
+        //       - pass + benchmarkEnabled=false OR failed → SkipStage2 → End
+        //   - TIMED_OUT → HandleTimeout → UpdateResultsFromTimeout → End
         //   - IN_PROGRESS (otherwise) → WaitForBuild (loop)
         recordStartTime.next(startCodeBuild);
         startCodeBuild.next(waitForBuild);
@@ -417,6 +715,33 @@ export class MlccCiHarnessStack extends cdk.Stack {
         setSuccessResult.next(updateResults);
         handleTimeout.next(updateResultsFromTimeout);
+        // Stage 2 wiring: after UpdateResults, read DynamoDB for benchmark config
+        updateResults.next(getBenchmarkConfig);
+        getBenchmarkConfig.next(checkDynamoItemHasBenchmarkField);
+        extractBenchmarkFlags.next(checkBenchmarkEnabled);
+        extractBenchmarkFlagsDefault.next(checkBenchmarkEnabled);
+        // Stage 2 execution uses a Parallel state to enable proper CDK Catch handling.
+        // The Parallel has one branch (the success path), and addCatch routes errors
+        // to the failure handler chain.
+        const stage2Pipeline = new sfn.Parallel(this, 'Stage2Pipeline', {
+            resultPath: '$.stage2PipelineResult',
+        });
+        stage2Pipeline.branch(
+            stage2Benchmark
+                .next(stage2RegisterBenchmark)
+                .next(stage2Clean),
+        );
+        stage2Pipeline.addCatch(stage2FailureHandler, {
+            resultPath: '$.stage2Error',
+        });
+        // After PrepareStage2Input, enter the Stage2Pipeline parallel wrapper
+        prepareStage2Input.next(stage2Pipeline);
+        // Stage 2 failure path: FailureHandler → FailureClean → End
+        stage2FailureHandler.next(stage2FailureClean);
         // Create the state machine
         this.ciOrchestrator = new sfn.StateMachine(this, 'CiOrchestrator', {
             stateMachineName: 'mlcc-ci-orchestrator',
@@ -442,8 +767,12 @@ export class MlccCiHarnessStack extends cdk.Stack {
         this.scannerFunction.addEnvironment('STATE_MACHINE_ARN', this.ciOrchestrator.stateMachineArn);
         // CodeBuild IAM role with permissions for lifecycle execution
+        //
+        // RETAIN policy: IAM roles are retained on stack deletion to prevent conflicts
+        // during multi-region bootstrap. If the stack is re-created, existing roles will
+        // be reused via --no-rollback.
         const codebuildRole = new iam.Role(this, 'CodeBuildRole', {
-            roleName: 'mlcc-ci-codebuild-role',
+            roleName: `mlcc-ci-codebuild-role-${this.region}`,
             assumedBy: new iam.ServicePrincipal('codebuild.amazonaws.com'),
             description: 'IAM role for the MLCC CI CodeBuild executor project',
         });
@@ -605,5 +934,506 @@ export class MlccCiHarnessStack extends cdk.Stack {
             ],
             resources: [this.ciCodeBuildProject.projectArn],
         }));
+        // ─── Benchmark Infrastructure (opt-in) ────────────────────────────────
+        // Gated by the CreateBenchmarkInfra parameter. When enabled, provisions
+        // Glue database/table, S3 results bucket, IAM permissions for benchmark
+        // writes (S3, Glue, Athena), and stack outputs for downstream consumers.
+        // Idempotent: CloudFormation conditions ensure re-running `cdk deploy`
+        // with the same parameter value produces no changes. Resources are only
+        // created when CreateBenchmarkInfra=true; subsequent deploys with the
+        // same value are no-ops (CloudFormation handles existence checks natively).
+        // Requirements: 3.1, 3.2, 3.4, 3.5
+        const createBenchmarkInfra = new cdk.CfnParameter(this, 'CreateBenchmarkInfra', {
+            type: 'String',
+            default: 'false',
+            allowedValues: ['true', 'false'],
+            description: 'Whether to create benchmark infrastructure (S3 results bucket, Glue DB/table, IAM permissions). Opt-in.',
+        });
+        const benchmarkInfraCondition = new cdk.CfnCondition(this, 'BenchmarkInfraCondition', {
+            expression: cdk.Fn.conditionEquals(createBenchmarkInfra.valueAsString, 'true'),
+        });
+        // Glue Database: mlcc_ci
+        // CloudFormation manages create-or-skip via the condition — no duplicate
+        // resource error on re-deploy because the logical ID is stable.
+        const glueDatabase = new glue.CfnDatabase(this, 'CiGlueDatabase', {
+            catalogId: this.account,
+            databaseInput: {
+                name: 'mlcc_ci',
+                description: 'MCC CI benchmark results warehouse',
+            },
+        });
+        glueDatabase.cfnOptions.condition = benchmarkInfraCondition;
+        // Glue Table: benchmark_results — full DDL with all 28+ columns
+        // Partitioned by model/instance/target (see partitionKeys below).
+        // Dimension columns are well-separated (not composite keys) per Req 5.1.
+        const glueTable = new glue.CfnTable(this, 'BenchmarkResultsTable', {
+            catalogId: this.account,
+            databaseName: 'mlcc_ci',
+            tableInput: {
+                name: 'benchmark_results',
+                tableType: 'EXTERNAL_TABLE',
+                parameters: {
+                    'classification': 'parquet',
+                    'parquet.compression': 'SNAPPY',
+                },
+                storageDescriptor: {
+                    columns: [
+                        // Core dimensions
+                        { name: 'config_id', type: 'string', comment: 'SHA-256 hash (16 chars), join key with DynamoDB' },
+                        { name: 'model_name', type: 'string', comment: 'HuggingFace model ID (e.g., Qwen/Qwen3-4B)' },
+                        { name: 'model_family', type: 'string', comment: 'Derived: qwen3, llama3, deepseek-r1, etc.' },
+                        { name: 'instance_type', type: 'string', comment: 'SageMaker instance (e.g., ml.g5.xlarge)' },
+                        { name: 'instance_family', type: 'string', comment: 'Derived: g5, g6, g6e, p5, trn2, etc.' },
+                        { name: 'deployment_config', type: 'string', comment: 'Architecture-backend (e.g., transformers-vllm)' },
+                        { name: 'deployment_target', type: 'string', comment: 'realtime-inference, async-inference, etc.' },
+                        { name: 'run_timestamp', type: 'string', comment: 'When this benchmark ran (ISO 8601 UTC)' },
+                        // Configuration dimensions
+                        { name: 'tensor_parallel_degree', type: 'int', comment: 'TP degree (1, 2, 4, 8)' },
+                        { name: 'quantization', type: 'string', comment: 'Quantization method (fp16, fp8, awq, gptq, none)' },
+                        { name: 'enable_lora', type: 'boolean', comment: 'Whether LoRA adapters were enabled' },
+                        { name: 'base_image', type: 'string', comment: 'Container base image (e.g., vllm/vllm-openai:v0.8.5)' },
+                        { name: 'base_image_version', type: 'string', comment: 'Extracted tag from base image' },
+                        { name: 'mcc_version', type: 'string', comment: 'MCC generator version that produced the project' },
+                        // Workload dimensions
+                        { name: 'concurrency', type: 'int', comment: 'Number of concurrent requests in this measurement' },
+                        { name: 'input_tokens_mean', type: 'int', comment: 'Mean input token count for workload' },
+                        { name: 'output_tokens_mean', type: 'int', comment: 'Mean output token count for workload' },
+                        { name: 'duration_seconds', type: 'int', comment: 'Benchmark duration in seconds' },
+                        // Result metrics
+                        { name: 'ttft_p50_ms', type: 'double', comment: 'Time to first token, 50th percentile (ms)' },
+                        { name: 'ttft_p99_ms', type: 'double', comment: 'Time to first token, 99th percentile (ms)' },
+                        { name: 'itl_p50_ms', type: 'double', comment: 'Inter-token latency, 50th percentile (ms)' },
+                        { name: 'itl_p99_ms', type: 'double', comment: 'Inter-token latency, 99th percentile (ms)' },
+                        { name: 'throughput_rps', type: 'double', comment: 'Requests per second at this concurrency' },
+                        { name: 'tokens_per_second', type: 'double', comment: 'Output tokens per second' },
+                        { name: 'cost_per_1m_tokens', type: 'double', comment: 'Estimated cost per 1M output tokens (USD)' },
+                        { name: 'error_rate', type: 'double', comment: 'Fraction of requests that failed (0.0-1.0)' },
+                        { name: 'status', type: 'string', comment: 'completed, failed, timeout, unfeasible' },
+                        // Provenance
+                        { name: 'run_type', type: 'string', comment: 'Source: ci, path_prove, optimization, manual' },
+                        { name: 'ci_run_id', type: 'string', comment: 'Step Functions execution ID or CodeBuild build ID' },
+                        { name: 'ci_stage', type: 'string', comment: 'stage2-benchmark' },
+                        { name: 'benchmark_job_name', type: 'string', comment: 'SageMaker AI Benchmark job name' },
+                        { name: 'account_id', type: 'string', comment: 'AWS account ID' },
+                    ],
+                    location: `s3://mlcc-benchmark-results-${this.account}-${this.region}/results/`,
+                    inputFormat: 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat',
+                    outputFormat: 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat',
+                    serdeInfo: {
+                        serializationLibrary: 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe',
+                        parameters: {
+                            'serialization.format': '1',
+                        },
+                    },
+                    compressed: true,
+                },
+                partitionKeys: [
+                    { name: 'model', type: 'string', comment: 'Model name with / replaced by _ (e.g., Qwen_Qwen3-0.6B)' },
+                    { name: 'instance', type: 'string', comment: 'SageMaker instance type (e.g., ml.g5.xlarge)' },
+                    { name: 'target', type: 'string', comment: 'Deployment target (realtime-inference, async-inference, etc.)' },
+                ],
+            },
+        });
+        glueTable.addDependency(glueDatabase);
+        glueTable.cfnOptions.condition = benchmarkInfraCondition;
+        // Configurable lifecycle parameters for the benchmark results bucket
+        const benchmarkIaTransitionDays = new cdk.CfnParameter(this, 'BenchmarkIaTransitionDays', {
+            type: 'Number',
+            default: 90,
+            description: 'Days before benchmark results transition to Infrequent Access storage',
+            minValue: 30,
+            maxValue: 365,
+        });
+        const benchmarkExpirationDays = new cdk.CfnParameter(this, 'BenchmarkExpirationDays', {
+            type: 'Number',
+            default: 365,
+            description: 'Days before benchmark results expire and are deleted',
+            minValue: 90,
+            maxValue: 3650,
+        });
+        // S3 bucket for benchmark results (Parquet files under results/, which the Glue table partitions by model/instance/target)
+        const benchmarkResultsBucket = new s3.Bucket(this, 'BenchmarkResultsBucket', {
+            bucketName: `mlcc-benchmark-results-${this.account}-${this.region}`,
+            removalPolicy: cdk.RemovalPolicy.RETAIN,
+            lifecycleRules: [
+                {
+                    transitions: [
+                        {
+                            storageClass: s3.StorageClass.INFREQUENT_ACCESS,
+                            transitionAfter: cdk.Duration.days(benchmarkIaTransitionDays.valueAsNumber),
+                        },
+                    ],
+                    expiration: cdk.Duration.days(benchmarkExpirationDays.valueAsNumber),
+                },
+            ],
+        });
+        // Apply the benchmark condition to the S3 bucket
+        const cfnBenchmarkBucket = benchmarkResultsBucket.node.defaultChild as cdk.CfnResource;
+        cfnBenchmarkBucket.cfnOptions.condition = benchmarkInfraCondition;
+        // Output the benchmark results bucket ARN (conditional)
+        new cdk.CfnOutput(this, 'BenchmarkResultsBucketArn', {
+            value: benchmarkResultsBucket.bucketArn,
+            description: 'ARN of the S3 bucket storing benchmark results (Parquet)',
+            condition: benchmarkInfraCondition,
+            exportName: 'mlcc-ci-benchmark-results-bucket-arn',
+        });
+        // Output the benchmark results bucket name (conditional)
+        new cdk.CfnOutput(this, 'BenchmarkResultsBucketName', {
+            value: benchmarkResultsBucket.bucketName,
+            description: 'Name of the S3 bucket storing benchmark results (Parquet)',
+            condition: benchmarkInfraCondition,
+        });
+        // Output the Glue database name (conditional)
+        new cdk.CfnOutput(this, 'CiGlueDatabaseName', {
+            value: 'mlcc_ci',
+            description: 'Name of the Glue database for benchmark results',
+            condition: benchmarkInfraCondition,
+        });
+        // S3 permissions for benchmark results bucket writes
+        const benchmarkS3Policy = new iam.PolicyStatement({
+            sid: 'BenchmarkResultsWrite',
+            effect: iam.Effect.ALLOW,
+            actions: [
+                's3:PutObject',
+                's3:GetObject',
+                's3:ListBucket',
+            ],
+            resources: [
+                'arn:aws:s3:::mlcc-benchmark-results-*',
+                'arn:aws:s3:::mlcc-benchmark-results-*/*',
+            ],
+        });
+        // Glue permissions for partition management
+        const benchmarkGluePolicy = new iam.PolicyStatement({
+            sid: 'GlueCatalogAccess',
+            effect: iam.Effect.ALLOW,
+            actions: [
+                'glue:GetDatabase',
+                'glue:GetTable',
+                'glue:GetPartitions',
+                'glue:BatchCreatePartition',
+                'glue:CreatePartition',
+            ],
+            resources: [
+                'arn:aws:glue:*:*:catalog',
+                'arn:aws:glue:*:*:database/mlcc_ci',
+                'arn:aws:glue:*:*:table/mlcc_ci/*',
+            ],
+        });
+        // Athena permissions for partition repair (MSCK REPAIR TABLE)
+        const benchmarkAthenaPolicy = new iam.PolicyStatement({
+            sid: 'AthenaPartitionRepair',
+            effect: iam.Effect.ALLOW,
+            actions: [
+                'athena:StartQueryExecution',
+                'athena:GetQueryResults',
+            ],
+            resources: ['*'],
+        });
+        // Create a standalone inline policy (iam.Policy, i.e. AWS::IAM::Policy — not a managed policy) for benchmark permissions so we can condition it
+        const benchmarkPolicy = new iam.Policy(this, 'BenchmarkWritePolicy', {
+            policyName: 'mlcc-ci-benchmark-write-policy',
+            statements: [benchmarkS3Policy, benchmarkGluePolicy, benchmarkAthenaPolicy],
+        });
+        benchmarkPolicy.attachToRole(codebuildRole);
+        // Apply the condition to the policy's underlying CFN resource
+        const cfnBenchmarkPolicy = benchmarkPolicy.node.defaultChild as cdk.CfnResource;
+        cfnBenchmarkPolicy.cfnOptions.condition = benchmarkInfraCondition;
+        // ─── Path Prover Infrastructure (opt-in, separate from benchmark infra) ────
+        // Gated by the CreatePathProver parameter. When enabled, provisions:
+        // - Brain Lambda (getNextConfig, pickNext, classifyFailure)
+        // - WriteResults Lambda (writes path_prove records to the S3/Glue results store that Athena queries)
+        // - Step Functions state machine (path-prover orchestrator)
+        // - EventBridge scheduled rule (disabled by default)
+        // Requirements: 8.1, 8.7, 8.8
+        const createPathProver = new cdk.CfnParameter(this, 'CreatePathProver', {
+            type: 'String',
+            default: 'false',
+            allowedValues: ['true', 'false'],
+            description: 'Whether to create Path Prover infrastructure (state machine, Lambdas, EventBridge rule). Opt-in.',
+        });
+        const pathProverCondition = new cdk.CfnCondition(this, 'PathProverCondition', {
+            expression: cdk.Fn.conditionEquals(createPathProver.valueAsString, 'true'),
+        });
+        // Path Prover Brain Lambda IAM role
+        const pathProverBrainRole = new iam.Role(this, 'PathProverBrainRole', {
+            roleName: 'mlcc-path-prover-brain-role',
+            assumedBy: new iam.ServicePrincipal('lambda.amazonaws.com'),
+            description: 'IAM role for the Path Prover Brain Lambda function',
+        });
+        (pathProverBrainRole.node.defaultChild as cdk.CfnResource).cfnOptions.condition = pathProverCondition;
+        // Brain Lambda: Athena read access for gap identification + substitution
+        const brainAthenaPolicy = new iam.Policy(this, 'PathProverBrainAthenaPolicy', {
+            policyName: 'mlcc-path-prover-brain-athena',
+            statements: [
+                new iam.PolicyStatement({
+                    effect: iam.Effect.ALLOW,
+                    actions: [
+                        'athena:StartQueryExecution',
+                        'athena:GetQueryExecution',
+                        'athena:GetQueryResults',
+                    ],
+                    resources: ['*'],
+                }),
+                new iam.PolicyStatement({
+                    effect: iam.Effect.ALLOW,
+                    actions: ['glue:GetTable', 'glue:GetDatabase', 'glue:GetPartitions'],
+                    resources: [
+                        'arn:aws:glue:*:*:catalog',
+                        'arn:aws:glue:*:*:database/mlcc_ci',
+                        'arn:aws:glue:*:*:table/mlcc_ci/*',
+                    ],
+                }),
+                new iam.PolicyStatement({
+                    effect: iam.Effect.ALLOW,
+                    actions: ['s3:GetObject', 's3:ListBucket', 's3:GetBucketLocation', 's3:PutObject'],
+                    resources: [
+                        'arn:aws:s3:::mlcc-benchmark-results-*',
+                        'arn:aws:s3:::mlcc-benchmark-results-*/*',
+                    ],
+                }),
+                new iam.PolicyStatement({
+                    effect: iam.Effect.ALLOW,
+                    actions: ['logs:CreateLogStream', 'logs:PutLogEvents'],
+                    resources: [this.ciLogGroup.logGroupArn, `${this.ciLogGroup.logGroupArn}:*`],
+                }),
+            ],
+        });
+        brainAthenaPolicy.attachToRole(pathProverBrainRole);
+        (brainAthenaPolicy.node.defaultChild as cdk.CfnResource).cfnOptions.condition = pathProverCondition;
+        // Path Prover Brain Lambda function
+        const pathProverBrainFunction = new NodejsFunction(this, 'PathProverBrainFunction', {
+            functionName: 'mlcc-path-prover-brain',
+            runtime: lambda.Runtime.NODEJS_20_X,
+            memorySize: 512,
+            timeout: cdk.Duration.seconds(120),
+            entry: path.join(__dirname, '..', 'lambda', 'path-prover', 'brain.ts'),
+            handler: 'handler',
+            role: pathProverBrainRole,
+            environment: {
+                GLUE_DATABASE: 'mlcc_ci',
+                GLUE_TABLE: 'benchmark_results',
+                MAX_PROVES_PER_RUN: '10',
+                MAX_COST_PER_RUN: '100',
+            },
+            logGroup: this.ciLogGroup,
+            bundling: {
+                minify: true,
+                sourceMap: true,
+            },
+        });
+        (pathProverBrainFunction.node.defaultChild as cdk.CfnResource).cfnOptions.condition = pathProverCondition;
+        // Path Prover Write Results Lambda IAM role
+        const pathProverWriteRole = new iam.Role(this, 'PathProverWriteRole', {
+            roleName: 'mlcc-path-prover-write-role',
+            assumedBy: new iam.ServicePrincipal('lambda.amazonaws.com'),
+            description: 'IAM role for the Path Prover Write Results Lambda function',
+        });
+        (pathProverWriteRole.node.defaultChild as cdk.CfnResource).cfnOptions.condition = pathProverCondition;
+        // Write Results Lambda: S3 + Glue write access
+        const writeResultsPolicy = new iam.Policy(this, 'PathProverWriteResultsPolicy', {
+            policyName: 'mlcc-path-prover-write-results',
+            statements: [
+                new iam.PolicyStatement({
+                    effect: iam.Effect.ALLOW,
+                    actions: ['s3:PutObject', 's3:GetObject'],
+                    resources: [
+                        'arn:aws:s3:::mlcc-benchmark-results-*/*',
+                    ],
+                }),
+                new iam.PolicyStatement({
+                    effect: iam.Effect.ALLOW,
+                    actions: ['glue:BatchCreatePartition', 'glue:CreatePartition', 'glue:GetTable'],
+                    resources: [
+                        'arn:aws:glue:*:*:catalog',
+                        'arn:aws:glue:*:*:database/mlcc_ci',
+                        'arn:aws:glue:*:*:table/mlcc_ci/*',
+                    ],
+                }),
+                new iam.PolicyStatement({
+                    effect: iam.Effect.ALLOW,
+                    actions: ['logs:CreateLogStream', 'logs:PutLogEvents'],
+                    resources: [this.ciLogGroup.logGroupArn, `${this.ciLogGroup.logGroupArn}:*`],
+                }),
+            ],
+        });
+        writeResultsPolicy.attachToRole(pathProverWriteRole);
+        (writeResultsPolicy.node.defaultChild as cdk.CfnResource).cfnOptions.condition = pathProverCondition;
+        // Path Prover Write Results Lambda function
+        const pathProverWriteFunction = new NodejsFunction(this, 'PathProverWriteFunction', {
+            functionName: 'mlcc-path-prover-write-results',
+            runtime: lambda.Runtime.NODEJS_20_X,
+            memorySize: 256,
+            timeout: cdk.Duration.seconds(60),
+            entry: path.join(__dirname, '..', 'lambda', 'path-prover', 'write-results.ts'),
+            handler: 'handler',
+            role: pathProverWriteRole,
+            environment: {
+                GLUE_DATABASE: 'mlcc_ci',
+                GLUE_TABLE: 'benchmark_results',
+                RESULTS_BUCKET: `mlcc-benchmark-results-${this.account}-${this.region}`,
+            },
+            logGroup: this.ciLogGroup,
+            bundling: {
+                minify: true,
+                sourceMap: true,
+            },
+        });
+        (pathProverWriteFunction.node.defaultChild as cdk.CfnResource).cfnOptions.condition = pathProverCondition;
+        // Path Prover Step Functions IAM role
+        const pathProverOrchestratorRole = new iam.Role(this, 'PathProverOrchestratorRole', {
+            roleName: 'mlcc-path-prover-orchestrator-role',
+            assumedBy: new iam.ServicePrincipal('states.amazonaws.com'),
+            description: 'IAM role for the Path Prover Step Functions state machine',
+        });
+        (pathProverOrchestratorRole.node.defaultChild as cdk.CfnResource).cfnOptions.condition = pathProverCondition;
+        // Orchestrator permissions
+        const pathProverOrchestratorPolicy = new iam.Policy(this, 'PathProverOrchestratorPolicy', {
+            policyName: 'mlcc-path-prover-orchestrator-policy',
+            statements: [
+                // Lambda invoke for brain and write-results
+                new iam.PolicyStatement({
+                    effect: iam.Effect.ALLOW,
+                    actions: ['lambda:InvokeFunction'],
+                    resources: [
+                        pathProverBrainFunction.functionArn,
+                        pathProverWriteFunction.functionArn,
+                    ],
+                }),
+                // CodeBuild start/poll for lifecycle stages
+                new iam.PolicyStatement({
+                    effect: iam.Effect.ALLOW,
+                    actions: ['codebuild:StartBuild', 'codebuild:BatchGetBuilds', 'codebuild:StopBuild'],
+                    resources: [this.ciCodeBuildProject.projectArn],
+                }),
+                // CloudWatch Logs for execution logging
+                new iam.PolicyStatement({
+                    effect: iam.Effect.ALLOW,
+                    actions: [
+                        'logs:CreateLogDelivery',
+                        'logs:GetLogDelivery',
+                        'logs:UpdateLogDelivery',
+                        'logs:DeleteLogDelivery',
+                        'logs:ListLogDeliveries',
+                        'logs:PutResourcePolicy',
+                        'logs:DescribeResourcePolicies',
+                        'logs:DescribeLogGroups',
+                        'logs:PutLogEvents',
+                        'logs:CreateLogStream',
+                    ],
+                    resources: ['*'],
+                }),
+                // Events for .sync integration
+                new iam.PolicyStatement({
+                    effect: iam.Effect.ALLOW,
+                    actions: ['events:PutTargets', 'events:PutRule', 'events:DescribeRule'],
+                    resources: [`arn:aws:events:${this.region}:${this.account}:rule/StepFunctionsGetBuildStatusRule-*`],
+                }),
+            ],
+        });
+        pathProverOrchestratorPolicy.attachToRole(pathProverOrchestratorRole);
+        (pathProverOrchestratorPolicy.node.defaultChild as cdk.CfnResource).cfnOptions.condition = pathProverCondition;
+        // Path Prover State Machine
+        // Uses ASL definition from file with Fn::Sub for variable substitution.
+        // We read the raw JSON and use cdk.Fn.sub to inject resource ARNs and the CodeBuild project name.
+        const aslTemplate = JSON.stringify(require('../state-machines/path-prover.asl.json'));
+        const pathProverDefinitionString = cdk.Fn.sub(aslTemplate, {
+            BrainFunctionArn: pathProverBrainFunction.functionArn,
+            WriteResultsFunctionArn: pathProverWriteFunction.functionArn,
+            ClassifyFailureFunctionArn: pathProverBrainFunction.functionArn,
+            CodeBuildProjectName: this.ciCodeBuildProject.projectName,
+        });
+        const pathProverStateMachine = new sfn.CfnStateMachine(this, 'PathProverStateMachine', {
+            stateMachineName: 'mlcc-path-prover',
+            stateMachineType: 'STANDARD',
+            definitionString: pathProverDefinitionString,
+            roleArn: pathProverOrchestratorRole.roleArn,
+            loggingConfiguration: {
+                destinations: [{
+                    cloudWatchLogsLogGroup: {
+                        logGroupArn: this.ciLogGroup.logGroupArn,
+                    },
+                }],
+                level: 'ALL',
+                includeExecutionData: true,
+            },
+            tracingConfiguration: {
+                enabled: true,
+            },
+        });
+        pathProverStateMachine.cfnOptions.condition = pathProverCondition;
+        // EventBridge scheduled rule for Path Prover (disabled by default)
+        // Can be enabled via the EnablePathProverSchedule parameter
+        const enablePathProverSchedule = new cdk.CfnParameter(this, 'EnablePathProverSchedule', {
+            type: 'String',
+            default: 'DISABLED',
+            allowedValues: ['ENABLED', 'DISABLED'],
+            description: 'Whether to enable the Path Prover scheduled EventBridge rule. Default: DISABLED.',
+        });
+        const pathProverScheduleRule = new events.CfnRule(this, 'PathProverScheduleRule', {
+            name: 'mlcc-path-prover-schedule',
+            description: 'Triggers the Path Prover state machine on a schedule to fill coverage gaps',
+            scheduleExpression: 'rate(6 hours)',
+            state: enablePathProverSchedule.valueAsString,
+            targets: [{
+                arn: `arn:aws:states:${this.region}:${this.account}:stateMachine:mlcc-path-prover`,
+                id: 'PathProverTarget',
+                roleArn: pathProverOrchestratorRole.roleArn,
+                input: JSON.stringify({
+                    iteration: 0,
+                    budgetSpent: 0,
+                    maxProvesPerRun: 10,
+                    maxCostPerRun: 100,
+                    previousResults: [],
+                }),
+            }],
+        });
+        pathProverScheduleRule.cfnOptions.condition = pathProverCondition;
+        // Output Path Prover state machine ARN
+        new cdk.CfnOutput(this, 'PathProverStateMachineArn', {
+            value: `arn:aws:states:${this.region}:${this.account}:stateMachine:mlcc-path-prover`,
+            description: 'ARN of the Path Prover Step Functions state machine',
+            condition: pathProverCondition,
+        });
+        // Output Brain Lambda ARN
+        new cdk.CfnOutput(this, 'PathProverBrainFunctionArn', {
+            value: pathProverBrainFunction.functionArn,
+            description: 'ARN of the Path Prover Brain Lambda function',
+            condition: pathProverCondition,
+        });
     }
 }