@aws/ml-container-creator 0.10.0 → 0.12.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE-THIRD-PARTY +9304 -0
- package/bin/cli.js +2 -0
- package/config/bootstrap-e2e-stack.json +341 -0
- package/config/bootstrap-stack.json +40 -3
- package/config/parameter-schema-v2.json +33 -22
- package/config/tune-catalog.json +1781 -0
- package/infra/ci-harness/buildspec.yml +1 -0
- package/infra/ci-harness/lambda/path-prover/brain.ts +306 -0
- package/infra/ci-harness/lambda/path-prover/write-results.ts +152 -0
- package/infra/ci-harness/lib/ci-harness-stack.ts +851 -7
- package/infra/ci-harness/state-machines/path-prover.asl.json +496 -0
- package/package.json +53 -67
- package/servers/base-image-picker/index.js +121 -121
- package/servers/e2e-status/index.js +297 -0
- package/servers/e2e-status/manifest.json +14 -0
- package/servers/e2e-status/package.json +15 -0
- package/servers/endpoint-picker/LICENSE +202 -0
- package/servers/endpoint-picker/index.js +536 -0
- package/servers/endpoint-picker/manifest.json +14 -0
- package/servers/endpoint-picker/package.json +18 -0
- package/servers/hyperpod-cluster-picker/index.js +125 -125
- package/servers/instance-sizer/index.js +166 -153
- package/servers/instance-sizer/lib/instance-ranker.js +120 -76
- package/servers/instance-sizer/lib/model-resolver.js +61 -61
- package/servers/instance-sizer/lib/quota-resolver.js +113 -113
- package/servers/instance-sizer/lib/vram-estimator.js +31 -31
- package/servers/lib/bedrock-client.js +38 -38
- package/servers/lib/catalogs/instances.json +27 -0
- package/servers/lib/catalogs/model-servers.json +201 -3
- package/servers/lib/custom-validators.js +13 -13
- package/servers/lib/dynamic-resolver.js +4 -4
- package/servers/marketplace-picker/index.js +342 -0
- package/servers/marketplace-picker/manifest.json +14 -0
- package/servers/marketplace-picker/package.json +18 -0
- package/servers/model-picker/index.js +382 -382
- package/servers/region-picker/index.js +56 -56
- package/servers/workload-picker/LICENSE +202 -0
- package/servers/workload-picker/catalogs/workload-profiles.json +67 -0
- package/servers/workload-picker/index.js +171 -0
- package/servers/workload-picker/manifest.json +16 -0
- package/servers/workload-picker/package.json +16 -0
- package/src/app.js +12 -3
- package/src/lib/bootstrap-command-handler.js +609 -15
- package/src/lib/bootstrap-config.js +36 -0
- package/src/lib/bootstrap-profile-manager.js +48 -41
- package/src/lib/ci-register-helpers.js +74 -0
- package/src/lib/config-loader.js +3 -0
- package/src/lib/config-manager.js +7 -0
- package/src/lib/config-validator.js +1 -1
- package/src/lib/cuda-resolver.js +17 -8
- package/src/lib/generated/cli-options.js +319 -314
- package/src/lib/generated/parameter-matrix.js +672 -661
- package/src/lib/generated/validation-rules.js +76 -72
- package/src/lib/path-prover-brain.js +664 -0
- package/src/lib/prompts/infrastructure-prompts.js +2 -2
- package/src/lib/prompts/model-prompts.js +6 -0
- package/src/lib/prompts/project-prompts.js +12 -0
- package/src/lib/secrets-prompt-runner.js +4 -0
- package/src/lib/template-manager.js +1 -1
- package/src/lib/template-variable-resolver.js +87 -1
- package/src/lib/tune-catalog-validator.js +37 -4
- package/templates/Dockerfile +9 -0
- package/templates/code/adapter_sidecar.py +444 -0
- package/templates/code/serve +6 -0
- package/templates/code/serve.d/vllm.ejs +1 -1
- package/templates/do/.benchmark_writer.py +1476 -0
- package/templates/do/.tune_helper.py +982 -57
- package/templates/do/__pycache__/.benchmark_writer.cpython-312.pyc +0 -0
- package/templates/do/adapter +154 -0
- package/templates/do/benchmark +639 -85
- package/templates/do/build +5 -0
- package/templates/do/clean.d/async-inference.ejs +5 -0
- package/templates/do/clean.d/batch-transform.ejs +5 -0
- package/templates/do/clean.d/hyperpod-eks.ejs +5 -0
- package/templates/do/clean.d/managed-inference.ejs +5 -0
- package/templates/do/config +115 -45
- package/templates/do/deploy.d/async-inference.ejs +30 -3
- package/templates/do/deploy.d/batch-transform.ejs +29 -3
- package/templates/do/deploy.d/hyperpod-eks.ejs +4 -0
- package/templates/do/deploy.d/managed-inference.ejs +216 -14
- package/templates/do/lib/endpoint-config.sh +1 -1
- package/templates/do/lib/profile.sh +44 -0
- package/templates/do/optimize +106 -37
- package/templates/do/push +5 -0
- package/templates/do/register +94 -0
- package/templates/do/stage +567 -0
- package/templates/do/submit +7 -0
- package/templates/do/test +14 -0
- package/templates/do/tune +382 -59
- package/templates/do/validate +44 -4
package/bin/cli.js
CHANGED
|
@@ -168,11 +168,13 @@ program
|
|
|
168
168
|
.option('--region <region>', 'AWS region')
|
|
169
169
|
.option('--role-arn <arn>', 'Existing IAM role ARN to use')
|
|
170
170
|
.option('--non-interactive', 'Run without prompts (requires --profile and --region)')
|
|
171
|
+
.option('--name <name>', 'Bootstrap profile name (default: "default")')
|
|
171
172
|
.option('--force', 'Force removal without confirmation')
|
|
172
173
|
.option('--verify', 'Verify resources exist (for status)')
|
|
173
174
|
.option('--delete-stack', 'Delete CloudFormation stack on remove')
|
|
174
175
|
.option('--ignore-staleness', 'Suppress schema staleness warnings')
|
|
175
176
|
.option('--ci', 'Provision CI integration infrastructure')
|
|
177
|
+
.option('--benchmark-infra', 'Provision Athena/Glue benchmark infrastructure (requires --ci)')
|
|
176
178
|
.option('--skip-ci', 'Skip CI infrastructure provisioning')
|
|
177
179
|
.option('--skip-s3', 'Skip S3 bucket creation')
|
|
178
180
|
.option('--skip-post-setup', 'Skip post-setup chain (mcp init, sync-architectures, sync-schemas)')
|
|
@@ -0,0 +1,341 @@
|
|
|
1
|
+
{
|
|
2
|
+
"AWSTemplateFormatVersion": "2010-09-09",
|
|
3
|
+
"Description": "ML Container Creator — E2E validation infrastructure (CodeBuild project, EventBridge schedules, S3 results bucket, SNS notifications). Separate lifecycle from main bootstrap stack.",
|
|
4
|
+
|
|
5
|
+
"Parameters": {
|
|
6
|
+
"SourceType": {
|
|
7
|
+
"Type": "String",
|
|
8
|
+
"Default": "NO_SOURCE",
|
|
9
|
+
"AllowedValues": ["NO_SOURCE", "CODECOMMIT", "GITHUB", "S3"],
|
|
10
|
+
"Description": "Source provider for the CodeBuild project. Use NO_SOURCE if buildspec handles checkout."
|
|
11
|
+
},
|
|
12
|
+
"SourceLocation": {
|
|
13
|
+
"Type": "String",
|
|
14
|
+
"Default": "",
|
|
15
|
+
"Description": "Source location (repo URL or S3 path). Leave empty for NO_SOURCE."
|
|
16
|
+
}
|
|
17
|
+
},
|
|
18
|
+
|
|
19
|
+
"Conditions": {
|
|
20
|
+
"HasSource": { "Fn::Not": [{ "Fn::Equals": [{ "Ref": "SourceType" }, "NO_SOURCE"] }] }
|
|
21
|
+
},
|
|
22
|
+
|
|
23
|
+
"Resources": {
|
|
24
|
+
"E2ECodeBuildRole": {
|
|
25
|
+
"Type": "AWS::IAM::Role",
|
|
26
|
+
"Properties": {
|
|
27
|
+
"RoleName": "mlcc-e2e-codebuild-role",
|
|
28
|
+
"AssumeRolePolicyDocument": {
|
|
29
|
+
"Version": "2012-10-17",
|
|
30
|
+
"Statement": [
|
|
31
|
+
{
|
|
32
|
+
"Effect": "Allow",
|
|
33
|
+
"Principal": { "Service": "codebuild.amazonaws.com" },
|
|
34
|
+
"Action": "sts:AssumeRole"
|
|
35
|
+
}
|
|
36
|
+
]
|
|
37
|
+
},
|
|
38
|
+
"Policies": [
|
|
39
|
+
{
|
|
40
|
+
"PolicyName": "mlcc-e2e-codebuild-policy",
|
|
41
|
+
"PolicyDocument": {
|
|
42
|
+
"Version": "2012-10-17",
|
|
43
|
+
"Statement": [
|
|
44
|
+
{
|
|
45
|
+
"Sid": "CloudWatchLogs",
|
|
46
|
+
"Effect": "Allow",
|
|
47
|
+
"Action": [
|
|
48
|
+
"logs:CreateLogGroup",
|
|
49
|
+
"logs:CreateLogStream",
|
|
50
|
+
"logs:PutLogEvents"
|
|
51
|
+
],
|
|
52
|
+
"Resource": { "Fn::Sub": "arn:aws:logs:${AWS::Region}:${AWS::AccountId}:log-group:/aws/codebuild/ml-container-creator-e2e*" }
|
|
53
|
+
},
|
|
54
|
+
{
|
|
55
|
+
"Sid": "SageMakerAccess",
|
|
56
|
+
"Effect": "Allow",
|
|
57
|
+
"Action": [
|
|
58
|
+
"sagemaker:CreateEndpoint",
|
|
59
|
+
"sagemaker:CreateEndpointConfig",
|
|
60
|
+
"sagemaker:CreateModel",
|
|
61
|
+
"sagemaker:DeleteEndpoint",
|
|
62
|
+
"sagemaker:DeleteEndpointConfig",
|
|
63
|
+
"sagemaker:DeleteModel",
|
|
64
|
+
"sagemaker:DescribeEndpoint",
|
|
65
|
+
"sagemaker:DescribeEndpointConfig",
|
|
66
|
+
"sagemaker:DescribeModel",
|
|
67
|
+
"sagemaker:InvokeEndpoint",
|
|
68
|
+
"sagemaker:ListEndpoints"
|
|
69
|
+
],
|
|
70
|
+
"Resource": "*"
|
|
71
|
+
},
|
|
72
|
+
{
|
|
73
|
+
"Sid": "ECRAccess",
|
|
74
|
+
"Effect": "Allow",
|
|
75
|
+
"Action": [
|
|
76
|
+
"ecr:GetAuthorizationToken",
|
|
77
|
+
"ecr:BatchCheckLayerAvailability",
|
|
78
|
+
"ecr:GetDownloadUrlForLayer",
|
|
79
|
+
"ecr:BatchGetImage",
|
|
80
|
+
"ecr:PutImage",
|
|
81
|
+
"ecr:InitiateLayerUpload",
|
|
82
|
+
"ecr:UploadLayerPart",
|
|
83
|
+
"ecr:CompleteLayerUpload",
|
|
84
|
+
"ecr:CreateRepository",
|
|
85
|
+
"ecr:DescribeRepositories"
|
|
86
|
+
],
|
|
87
|
+
"Resource": "*"
|
|
88
|
+
},
|
|
89
|
+
{
|
|
90
|
+
"Sid": "S3ResultsAccess",
|
|
91
|
+
"Effect": "Allow",
|
|
92
|
+
"Action": [
|
|
93
|
+
"s3:GetObject",
|
|
94
|
+
"s3:PutObject",
|
|
95
|
+
"s3:ListBucket"
|
|
96
|
+
],
|
|
97
|
+
"Resource": [
|
|
98
|
+
{ "Fn::Sub": "arn:aws:s3:::mlcc-e2e-results-${AWS::AccountId}-${AWS::Region}" },
|
|
99
|
+
{ "Fn::Sub": "arn:aws:s3:::mlcc-e2e-results-${AWS::AccountId}-${AWS::Region}/*" }
|
|
100
|
+
]
|
|
101
|
+
},
|
|
102
|
+
{
|
|
103
|
+
"Sid": "SNSPublish",
|
|
104
|
+
"Effect": "Allow",
|
|
105
|
+
"Action": "sns:Publish",
|
|
106
|
+
"Resource": { "Fn::Sub": "arn:aws:sns:${AWS::Region}:${AWS::AccountId}:mlcc-e2e-notifications" }
|
|
107
|
+
},
|
|
108
|
+
{
|
|
109
|
+
"Sid": "IAMPassRole",
|
|
110
|
+
"Effect": "Allow",
|
|
111
|
+
"Action": "iam:PassRole",
|
|
112
|
+
"Resource": { "Fn::Sub": "arn:aws:iam::${AWS::AccountId}:role/mlcc-sagemaker-execution-role" },
|
|
113
|
+
"Condition": {
|
|
114
|
+
"StringEquals": {
|
|
115
|
+
"iam:PassedToService": "sagemaker.amazonaws.com"
|
|
116
|
+
}
|
|
117
|
+
}
|
|
118
|
+
},
|
|
119
|
+
{
|
|
120
|
+
"Sid": "ServiceQuotas",
|
|
121
|
+
"Effect": "Allow",
|
|
122
|
+
"Action": [
|
|
123
|
+
"service-quotas:GetServiceQuota",
|
|
124
|
+
"service-quotas:ListServiceQuotas"
|
|
125
|
+
],
|
|
126
|
+
"Resource": "*"
|
|
127
|
+
}
|
|
128
|
+
]
|
|
129
|
+
}
|
|
130
|
+
}
|
|
131
|
+
],
|
|
132
|
+
"Tags": [
|
|
133
|
+
{ "Key": "mlcc:managed-by", "Value": "ml-container-creator" },
|
|
134
|
+
{ "Key": "mlcc:created-by", "Value": "bootstrap-e2e" }
|
|
135
|
+
]
|
|
136
|
+
}
|
|
137
|
+
},
|
|
138
|
+
|
|
139
|
+
"E2ECodeBuildProject": {
|
|
140
|
+
"Type": "AWS::CodeBuild::Project",
|
|
141
|
+
"Properties": {
|
|
142
|
+
"Name": "ml-container-creator-e2e",
|
|
143
|
+
"Description": "E2E validation runner for ML Container Creator — runs catalog configs through full lifecycle",
|
|
144
|
+
"ServiceRole": { "Fn::GetAtt": ["E2ECodeBuildRole", "Arn"] },
|
|
145
|
+
"TimeoutInMinutes": 480,
|
|
146
|
+
"Environment": {
|
|
147
|
+
"Type": "LINUX_CONTAINER",
|
|
148
|
+
"ComputeType": "BUILD_GENERAL1_LARGE",
|
|
149
|
+
"Image": "aws/codebuild/standard:7.0",
|
|
150
|
+
"PrivilegedMode": true,
|
|
151
|
+
"EnvironmentVariables": [
|
|
152
|
+
{
|
|
153
|
+
"Name": "TIER",
|
|
154
|
+
"Value": "ci",
|
|
155
|
+
"Type": "PLAINTEXT"
|
|
156
|
+
}
|
|
157
|
+
]
|
|
158
|
+
},
|
|
159
|
+
"Source": {
|
|
160
|
+
"Type": { "Ref": "SourceType" },
|
|
161
|
+
"BuildSpec": "version: 0.2\nphases:\n install:\n runtime-versions:\n nodejs: 20\n build:\n commands:\n - npm ci\n - node scripts/e2e-runner.js --tier $TIER\n"
|
|
162
|
+
},
|
|
163
|
+
"Artifacts": {
|
|
164
|
+
"Type": "NO_ARTIFACTS"
|
|
165
|
+
},
|
|
166
|
+
"LogsConfig": {
|
|
167
|
+
"CloudWatchLogs": {
|
|
168
|
+
"Status": "ENABLED",
|
|
169
|
+
"GroupName": "/aws/codebuild/ml-container-creator-e2e"
|
|
170
|
+
}
|
|
171
|
+
},
|
|
172
|
+
"Tags": [
|
|
173
|
+
{ "Key": "mlcc:managed-by", "Value": "ml-container-creator" },
|
|
174
|
+
{ "Key": "mlcc:created-by", "Value": "bootstrap-e2e" }
|
|
175
|
+
]
|
|
176
|
+
}
|
|
177
|
+
},
|
|
178
|
+
|
|
179
|
+
"E2EEventBridgeRole": {
|
|
180
|
+
"Type": "AWS::IAM::Role",
|
|
181
|
+
"Properties": {
|
|
182
|
+
"RoleName": "mlcc-e2e-eventbridge-role",
|
|
183
|
+
"AssumeRolePolicyDocument": {
|
|
184
|
+
"Version": "2012-10-17",
|
|
185
|
+
"Statement": [
|
|
186
|
+
{
|
|
187
|
+
"Effect": "Allow",
|
|
188
|
+
"Principal": { "Service": "events.amazonaws.com" },
|
|
189
|
+
"Action": "sts:AssumeRole"
|
|
190
|
+
}
|
|
191
|
+
]
|
|
192
|
+
},
|
|
193
|
+
"Policies": [
|
|
194
|
+
{
|
|
195
|
+
"PolicyName": "mlcc-e2e-eventbridge-policy",
|
|
196
|
+
"PolicyDocument": {
|
|
197
|
+
"Version": "2012-10-17",
|
|
198
|
+
"Statement": [
|
|
199
|
+
{
|
|
200
|
+
"Effect": "Allow",
|
|
201
|
+
"Action": "codebuild:StartBuild",
|
|
202
|
+
"Resource": { "Fn::GetAtt": ["E2ECodeBuildProject", "Arn"] }
|
|
203
|
+
}
|
|
204
|
+
]
|
|
205
|
+
}
|
|
206
|
+
}
|
|
207
|
+
],
|
|
208
|
+
"Tags": [
|
|
209
|
+
{ "Key": "mlcc:managed-by", "Value": "ml-container-creator" },
|
|
210
|
+
{ "Key": "mlcc:created-by", "Value": "bootstrap-e2e" }
|
|
211
|
+
]
|
|
212
|
+
}
|
|
213
|
+
},
|
|
214
|
+
|
|
215
|
+
"E2ECIDailyRule": {
|
|
216
|
+
"Type": "AWS::Events::Rule",
|
|
217
|
+
"Metadata": {
|
|
218
|
+
"MigrationNote": "Renamed from E2ECIHourlyRule to E2ECIDailyRule. Renaming the logical resource ID causes CloudFormation to delete the old rule and create a new one. This is intentional as the schedule changed from hourly to daily."
|
|
219
|
+
},
|
|
220
|
+
"Properties": {
|
|
221
|
+
"Name": "e2e-ci-daily",
|
|
222
|
+
"Description": "Triggers E2E CI tier validation daily at 6am UTC",
|
|
223
|
+
"ScheduleExpression": "cron(0 6 * * ? *)",
|
|
224
|
+
"State": "ENABLED",
|
|
225
|
+
"Targets": [
|
|
226
|
+
{
|
|
227
|
+
"Id": "e2e-codebuild-ci",
|
|
228
|
+
"Arn": { "Fn::GetAtt": ["E2ECodeBuildProject", "Arn"] },
|
|
229
|
+
"RoleArn": { "Fn::GetAtt": ["E2EEventBridgeRole", "Arn"] },
|
|
230
|
+
"Input": "{\"environmentVariablesOverride\":[{\"name\":\"TIER\",\"value\":\"ci\",\"type\":\"PLAINTEXT\"}]}"
|
|
231
|
+
}
|
|
232
|
+
]
|
|
233
|
+
}
|
|
234
|
+
},
|
|
235
|
+
|
|
236
|
+
"E2ENightlyRule": {
|
|
237
|
+
"Type": "AWS::Events::Rule",
|
|
238
|
+
"Properties": {
|
|
239
|
+
"Name": "e2e-nightly",
|
|
240
|
+
"Description": "Triggers E2E nightly tier validation at 2am UTC daily",
|
|
241
|
+
"ScheduleExpression": "cron(0 2 * * ? *)",
|
|
242
|
+
"State": "ENABLED",
|
|
243
|
+
"Targets": [
|
|
244
|
+
{
|
|
245
|
+
"Id": "e2e-codebuild-nightly",
|
|
246
|
+
"Arn": { "Fn::GetAtt": ["E2ECodeBuildProject", "Arn"] },
|
|
247
|
+
"RoleArn": { "Fn::GetAtt": ["E2EEventBridgeRole", "Arn"] },
|
|
248
|
+
"Input": "{\"environmentVariablesOverride\":[{\"name\":\"TIER\",\"value\":\"nightly\",\"type\":\"PLAINTEXT\"}]}"
|
|
249
|
+
}
|
|
250
|
+
]
|
|
251
|
+
}
|
|
252
|
+
},
|
|
253
|
+
|
|
254
|
+
"E2EWeeklyRule": {
|
|
255
|
+
"Type": "AWS::Events::Rule",
|
|
256
|
+
"Properties": {
|
|
257
|
+
"Name": "e2e-weekly",
|
|
258
|
+
"Description": "Triggers E2E weekly tier validation at 2am UTC every Sunday",
|
|
259
|
+
"ScheduleExpression": "cron(0 2 ? * SUN *)",
|
|
260
|
+
"State": "ENABLED",
|
|
261
|
+
"Targets": [
|
|
262
|
+
{
|
|
263
|
+
"Id": "e2e-codebuild-weekly",
|
|
264
|
+
"Arn": { "Fn::GetAtt": ["E2ECodeBuildProject", "Arn"] },
|
|
265
|
+
"RoleArn": { "Fn::GetAtt": ["E2EEventBridgeRole", "Arn"] },
|
|
266
|
+
"Input": "{\"environmentVariablesOverride\":[{\"name\":\"TIER\",\"value\":\"weekly\",\"type\":\"PLAINTEXT\"}]}"
|
|
267
|
+
}
|
|
268
|
+
]
|
|
269
|
+
}
|
|
270
|
+
},
|
|
271
|
+
|
|
272
|
+
"E2EResultsBucket": {
|
|
273
|
+
"Type": "AWS::S3::Bucket",
|
|
274
|
+
"DeletionPolicy": "Retain",
|
|
275
|
+
"UpdateReplacePolicy": "Retain",
|
|
276
|
+
"Properties": {
|
|
277
|
+
"BucketName": { "Fn::Sub": "mlcc-e2e-results-${AWS::AccountId}-${AWS::Region}" },
|
|
278
|
+
"VersioningConfiguration": { "Status": "Enabled" },
|
|
279
|
+
"BucketEncryption": {
|
|
280
|
+
"ServerSideEncryptionConfiguration": [
|
|
281
|
+
{ "ServerSideEncryptionByDefault": { "SSEAlgorithm": "AES256" } }
|
|
282
|
+
]
|
|
283
|
+
},
|
|
284
|
+
"LifecycleConfiguration": {
|
|
285
|
+
"Rules": [
|
|
286
|
+
{
|
|
287
|
+
"Id": "ExpireOldResults",
|
|
288
|
+
"Status": "Enabled",
|
|
289
|
+
"ExpirationInDays": 90
|
|
290
|
+
}
|
|
291
|
+
]
|
|
292
|
+
},
|
|
293
|
+
"Tags": [
|
|
294
|
+
{ "Key": "mlcc:managed-by", "Value": "ml-container-creator" },
|
|
295
|
+
{ "Key": "mlcc:created-by", "Value": "bootstrap-e2e" },
|
|
296
|
+
{ "Key": "mlcc:purpose", "Value": "e2e-validation-results" }
|
|
297
|
+
]
|
|
298
|
+
}
|
|
299
|
+
},
|
|
300
|
+
|
|
301
|
+
"E2ENotificationsTopic": {
|
|
302
|
+
"Type": "AWS::SNS::Topic",
|
|
303
|
+
"Properties": {
|
|
304
|
+
"TopicName": "mlcc-e2e-notifications",
|
|
305
|
+
"DisplayName": "ML Container Creator E2E Validation Alerts",
|
|
306
|
+
"Tags": [
|
|
307
|
+
{ "Key": "mlcc:managed-by", "Value": "ml-container-creator" },
|
|
308
|
+
{ "Key": "mlcc:created-by", "Value": "bootstrap-e2e" },
|
|
309
|
+
{ "Key": "mlcc:purpose", "Value": "e2e-failure-alerts" }
|
|
310
|
+
]
|
|
311
|
+
}
|
|
312
|
+
}
|
|
313
|
+
},
|
|
314
|
+
|
|
315
|
+
"Outputs": {
|
|
316
|
+
"CodeBuildProjectName": {
|
|
317
|
+
"Description": "E2E CodeBuild project name",
|
|
318
|
+
"Value": { "Ref": "E2ECodeBuildProject" }
|
|
319
|
+
},
|
|
320
|
+
"CodeBuildProjectArn": {
|
|
321
|
+
"Description": "E2E CodeBuild project ARN",
|
|
322
|
+
"Value": { "Fn::GetAtt": ["E2ECodeBuildProject", "Arn"] }
|
|
323
|
+
},
|
|
324
|
+
"ResultsBucketName": {
|
|
325
|
+
"Description": "S3 bucket for E2E validation results",
|
|
326
|
+
"Value": { "Ref": "E2EResultsBucket" }
|
|
327
|
+
},
|
|
328
|
+
"NotificationsTopicArn": {
|
|
329
|
+
"Description": "SNS topic ARN for E2E failure notifications",
|
|
330
|
+
"Value": { "Ref": "E2ENotificationsTopic" }
|
|
331
|
+
},
|
|
332
|
+
"CodeBuildRoleArn": {
|
|
333
|
+
"Description": "IAM role ARN used by the E2E CodeBuild project",
|
|
334
|
+
"Value": { "Fn::GetAtt": ["E2ECodeBuildRole", "Arn"] }
|
|
335
|
+
},
|
|
336
|
+
"StackVersion": {
|
|
337
|
+
"Description": "Bootstrap E2E stack template version",
|
|
338
|
+
"Value": "2026-06-01"
|
|
339
|
+
}
|
|
340
|
+
}
|
|
341
|
+
}
|
|
@@ -13,12 +13,19 @@
|
|
|
13
13
|
"Type": "String",
|
|
14
14
|
"Default": "",
|
|
15
15
|
"Description": "ARN of an existing IAM role to use instead of creating one. Leave empty to create a new role."
|
|
16
|
+
},
|
|
17
|
+
"SkipEcrCreation": {
|
|
18
|
+
"Type": "String",
|
|
19
|
+
"Default": "false",
|
|
20
|
+
"AllowedValues": ["true", "false"],
|
|
21
|
+
"Description": "Skip ECR repository creation when it already exists. Set to 'true' to avoid ResourceExistenceCheck failures."
|
|
16
22
|
}
|
|
17
23
|
},
|
|
18
24
|
|
|
19
25
|
"Conditions": {
|
|
20
26
|
"ShouldCreateS3Buckets": { "Fn::Equals": [{ "Ref": "CreateS3Buckets" }, "true"] },
|
|
21
|
-
"ShouldCreateRole": { "Fn::Equals": [{ "Ref": "UseExistingRoleArn" }, ""] }
|
|
27
|
+
"ShouldCreateRole": { "Fn::Equals": [{ "Ref": "UseExistingRoleArn" }, ""] },
|
|
28
|
+
"ShouldCreateEcr": { "Fn::Equals": [{ "Ref": "SkipEcrCreation" }, "false"] }
|
|
22
29
|
},
|
|
23
30
|
|
|
24
31
|
"Resources": {
|
|
@@ -77,6 +84,11 @@
|
|
|
77
84
|
"sagemaker:ListAIBenchmarkJobs",
|
|
78
85
|
"sagemaker:StopAIBenchmarkJob",
|
|
79
86
|
"sagemaker:DeleteAIBenchmarkJob",
|
|
87
|
+
"sagemaker:CreateAIRecommendationJob",
|
|
88
|
+
"sagemaker:DescribeAIRecommendationJob",
|
|
89
|
+
"sagemaker:ListAIRecommendationJobs",
|
|
90
|
+
"sagemaker:StopAIRecommendationJob",
|
|
91
|
+
"sagemaker:DeleteAIRecommendationJob",
|
|
80
92
|
"sagemaker:CreateAIWorkloadConfig",
|
|
81
93
|
"sagemaker:DescribeAIWorkloadConfig",
|
|
82
94
|
"sagemaker:ListAIWorkloadConfigs",
|
|
@@ -201,6 +213,9 @@
|
|
|
201
213
|
"sagemaker:DescribeModelPackage",
|
|
202
214
|
"sagemaker:DescribeModelPackageGroup",
|
|
203
215
|
"sagemaker:ListModelPackages",
|
|
216
|
+
"sagemaker:ListHubContents",
|
|
217
|
+
"sagemaker:DescribeHubContent",
|
|
218
|
+
"sagemaker:DescribeHub",
|
|
204
219
|
"sagemaker:CallMlflowAppApi"
|
|
205
220
|
],
|
|
206
221
|
"Resource": "*"
|
|
@@ -208,9 +223,28 @@
|
|
|
208
223
|
{
|
|
209
224
|
"Sid": "SageMakerMLflow",
|
|
210
225
|
"Effect": "Allow",
|
|
211
|
-
"Action":
|
|
226
|
+
"Action": [
|
|
227
|
+
"sagemaker:ListMlflowApps",
|
|
228
|
+
"sagemaker:ListMlflowTrackingServers",
|
|
229
|
+
"sagemaker:DescribeMlflowTrackingServer",
|
|
230
|
+
"sagemaker:CreatePresignedMlflowTrackingServerUrl",
|
|
231
|
+
"sagemaker:DescribeApp",
|
|
232
|
+
"sagemaker:ListApps"
|
|
233
|
+
],
|
|
212
234
|
"Resource": "*"
|
|
213
235
|
},
|
|
236
|
+
{
|
|
237
|
+
"Sid": "SageMakerMlflowAppAccess",
|
|
238
|
+
"Effect": "Allow",
|
|
239
|
+
"Action": [
|
|
240
|
+
"sagemaker:UpdateMlflowApp",
|
|
241
|
+
"sagemaker:DescribeMlflowApp",
|
|
242
|
+
"sagemaker:CreatePresignedMlflowAppUrl",
|
|
243
|
+
"sagemaker:CallMlflowAppApi",
|
|
244
|
+
"sagemaker-mlflow:*"
|
|
245
|
+
],
|
|
246
|
+
"Resource": { "Fn::Sub": "arn:aws:sagemaker:*:${AWS::AccountId}:mlflow-app/*" }
|
|
247
|
+
},
|
|
214
248
|
{
|
|
215
249
|
"Sid": "LambdaInvokeForReward",
|
|
216
250
|
"Effect": "Allow",
|
|
@@ -230,6 +264,7 @@
|
|
|
230
264
|
|
|
231
265
|
"EcrRepository": {
|
|
232
266
|
"Type": "AWS::ECR::Repository",
|
|
267
|
+
"Condition": "ShouldCreateEcr",
|
|
233
268
|
"Properties": {
|
|
234
269
|
"RepositoryName": "ml-container-creator",
|
|
235
270
|
"ImageScanningConfiguration": { "ScanOnPush": true },
|
|
@@ -361,10 +396,12 @@
|
|
|
361
396
|
}
|
|
362
397
|
},
|
|
363
398
|
"EcrRepositoryName": {
|
|
399
|
+
"Condition": "ShouldCreateEcr",
|
|
364
400
|
"Description": "ECR repository name",
|
|
365
401
|
"Value": { "Ref": "EcrRepository" }
|
|
366
402
|
},
|
|
367
403
|
"EcrRepositoryUri": {
|
|
404
|
+
"Condition": "ShouldCreateEcr",
|
|
368
405
|
"Description": "ECR repository URI",
|
|
369
406
|
"Value": { "Fn::GetAtt": ["EcrRepository", "RepositoryUri"] }
|
|
370
407
|
},
|
|
@@ -395,7 +432,7 @@
|
|
|
395
432
|
},
|
|
396
433
|
"StackVersion": {
|
|
397
434
|
"Description": "Bootstrap stack template version for forward compatibility tracking",
|
|
398
|
-
"Value": "2026-05-
|
|
435
|
+
"Value": "2026-05-28"
|
|
399
436
|
}
|
|
400
437
|
}
|
|
401
438
|
}
|
|
@@ -174,7 +174,7 @@
|
|
|
174
174
|
"configKey": "instanceType",
|
|
175
175
|
"default": null,
|
|
176
176
|
"validation": {
|
|
177
|
-
"pattern": "^ml\\.[a-z0-9]+\\.[a-z0-9]+$"
|
|
177
|
+
"pattern": "^ml\\.[a-z0-9-]+\\.[a-z0-9]+$"
|
|
178
178
|
},
|
|
179
179
|
"phase": "infrastructure",
|
|
180
180
|
"group": "infrastructure",
|
|
@@ -423,7 +423,7 @@
|
|
|
423
423
|
},
|
|
424
424
|
"includeBenchmark": {
|
|
425
425
|
"type": "boolean",
|
|
426
|
-
"description": "Include SageMaker AI Benchmarking",
|
|
426
|
+
"description": "Include SageMaker AI Benchmarking scripts (do/benchmark, do/optimize). Workload configuration is specified at runtime via --workload flag.",
|
|
427
427
|
"cliFlag": "--include-benchmark",
|
|
428
428
|
"cliArgName": null,
|
|
429
429
|
"envVar": "ML_INCLUDE_BENCHMARK",
|
|
@@ -483,11 +483,7 @@
|
|
|
483
483
|
"inputType": "number",
|
|
484
484
|
"placeholder": "10"
|
|
485
485
|
},
|
|
486
|
-
"prompt":
|
|
487
|
-
"message": "Benchmark concurrency?",
|
|
488
|
-
"type": "number",
|
|
489
|
-
"when": "includeBenchmark === true"
|
|
490
|
-
},
|
|
486
|
+
"prompt": null,
|
|
491
487
|
"deprecated": false,
|
|
492
488
|
"since": "0.6.0"
|
|
493
489
|
},
|
|
@@ -520,11 +516,7 @@
|
|
|
520
516
|
"inputType": "number",
|
|
521
517
|
"placeholder": "550"
|
|
522
518
|
},
|
|
523
|
-
"prompt":
|
|
524
|
-
"message": "Mean input tokens?",
|
|
525
|
-
"type": "number",
|
|
526
|
-
"when": "includeBenchmark === true"
|
|
527
|
-
},
|
|
519
|
+
"prompt": null,
|
|
528
520
|
"deprecated": false,
|
|
529
521
|
"since": "0.6.0"
|
|
530
522
|
},
|
|
@@ -557,11 +549,7 @@
|
|
|
557
549
|
"inputType": "number",
|
|
558
550
|
"placeholder": "150"
|
|
559
551
|
},
|
|
560
|
-
"prompt":
|
|
561
|
-
"message": "Mean output tokens?",
|
|
562
|
-
"type": "number",
|
|
563
|
-
"when": "includeBenchmark === true"
|
|
564
|
-
},
|
|
552
|
+
"prompt": null,
|
|
565
553
|
"deprecated": false,
|
|
566
554
|
"since": "0.6.0"
|
|
567
555
|
},
|
|
@@ -590,11 +578,7 @@
|
|
|
590
578
|
"section": "features",
|
|
591
579
|
"inputType": "checkbox"
|
|
592
580
|
},
|
|
593
|
-
"prompt":
|
|
594
|
-
"message": "Enable streaming in benchmark?",
|
|
595
|
-
"type": "confirm",
|
|
596
|
-
"when": "includeBenchmark === true"
|
|
597
|
-
},
|
|
581
|
+
"prompt": null,
|
|
598
582
|
"deprecated": false,
|
|
599
583
|
"since": "0.6.0"
|
|
600
584
|
},
|
|
@@ -1254,6 +1238,33 @@
|
|
|
1254
1238
|
"deprecated": false,
|
|
1255
1239
|
"since": "0.4.0"
|
|
1256
1240
|
},
|
|
1241
|
+
"capacityReservationArn": {
|
|
1242
|
+
"type": "string",
|
|
1243
|
+
"description": "Capacity reservation ARN (FTP or ODCR) for reserved instance deployment",
|
|
1244
|
+
"cliFlag": "--capacity-reservation-arn",
|
|
1245
|
+
"cliArgName": "arn",
|
|
1246
|
+
"envVar": "ML_CAPACITY_RESERVATION_ARN",
|
|
1247
|
+
"templateVar": "capacityReservationArn",
|
|
1248
|
+
"configKey": "capacityReservationArn",
|
|
1249
|
+
"default": null,
|
|
1250
|
+
"validation": {
|
|
1251
|
+
"pattern": "^arn:aws:sagemaker:"
|
|
1252
|
+
},
|
|
1253
|
+
"phase": "infrastructure",
|
|
1254
|
+
"group": "endpoint",
|
|
1255
|
+
"appliesTo": {
|
|
1256
|
+
"deploymentTargets": [
|
|
1257
|
+
"managed-inference"
|
|
1258
|
+
],
|
|
1259
|
+
"architectures": [
|
|
1260
|
+
"*"
|
|
1261
|
+
]
|
|
1262
|
+
},
|
|
1263
|
+
"widget": null,
|
|
1264
|
+
"prompt": null,
|
|
1265
|
+
"deprecated": false,
|
|
1266
|
+
"since": "0.11.0"
|
|
1267
|
+
},
|
|
1257
1268
|
"icCpuCount": {
|
|
1258
1269
|
"type": "number",
|
|
1259
1270
|
"description": "vCPUs allocated to the inference component",
|