@aws/ml-container-creator 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +202 -0
- package/LICENSE-THIRD-PARTY +68620 -0
- package/NOTICE +2 -0
- package/README.md +106 -0
- package/bin/cli.js +365 -0
- package/config/defaults.json +32 -0
- package/config/presets/transformers-djl.json +26 -0
- package/config/presets/transformers-gpu.json +24 -0
- package/config/presets/transformers-lmi.json +27 -0
- package/package.json +129 -0
- package/servers/README.md +419 -0
- package/servers/base-image-picker/catalogs/model-servers.json +1191 -0
- package/servers/base-image-picker/catalogs/python-slim.json +38 -0
- package/servers/base-image-picker/catalogs/triton-backends.json +51 -0
- package/servers/base-image-picker/catalogs/triton.json +38 -0
- package/servers/base-image-picker/index.js +495 -0
- package/servers/base-image-picker/manifest.json +17 -0
- package/servers/base-image-picker/package.json +15 -0
- package/servers/hyperpod-cluster-picker/LICENSE +202 -0
- package/servers/hyperpod-cluster-picker/index.js +424 -0
- package/servers/hyperpod-cluster-picker/manifest.json +14 -0
- package/servers/hyperpod-cluster-picker/package.json +17 -0
- package/servers/instance-recommender/LICENSE +202 -0
- package/servers/instance-recommender/catalogs/instances.json +852 -0
- package/servers/instance-recommender/index.js +284 -0
- package/servers/instance-recommender/manifest.json +16 -0
- package/servers/instance-recommender/package.json +15 -0
- package/servers/lib/LICENSE +202 -0
- package/servers/lib/bedrock-client.js +160 -0
- package/servers/lib/custom-validators.js +46 -0
- package/servers/lib/dynamic-resolver.js +36 -0
- package/servers/lib/package.json +11 -0
- package/servers/lib/schemas/image-catalog.schema.json +185 -0
- package/servers/lib/schemas/instances.schema.json +124 -0
- package/servers/lib/schemas/manifest.schema.json +64 -0
- package/servers/lib/schemas/model-catalog.schema.json +91 -0
- package/servers/lib/schemas/regions.schema.json +26 -0
- package/servers/lib/schemas/triton-backends.schema.json +51 -0
- package/servers/model-picker/catalogs/jumpstart-public.json +66 -0
- package/servers/model-picker/catalogs/popular-diffusors.json +88 -0
- package/servers/model-picker/catalogs/popular-transformers.json +226 -0
- package/servers/model-picker/index.js +1693 -0
- package/servers/model-picker/manifest.json +18 -0
- package/servers/model-picker/package.json +20 -0
- package/servers/region-picker/LICENSE +202 -0
- package/servers/region-picker/catalogs/regions.json +263 -0
- package/servers/region-picker/index.js +230 -0
- package/servers/region-picker/manifest.json +16 -0
- package/servers/region-picker/package.json +15 -0
- package/src/app.js +1007 -0
- package/src/copy-tpl.js +77 -0
- package/src/lib/accelerator-validator.js +39 -0
- package/src/lib/asset-manager.js +385 -0
- package/src/lib/aws-profile-parser.js +181 -0
- package/src/lib/bootstrap-command-handler.js +1647 -0
- package/src/lib/bootstrap-config.js +238 -0
- package/src/lib/ci-register-helpers.js +124 -0
- package/src/lib/ci-report-helpers.js +158 -0
- package/src/lib/ci-stage-helpers.js +268 -0
- package/src/lib/cli-handler.js +529 -0
- package/src/lib/comment-generator.js +544 -0
- package/src/lib/community-reports-validator.js +91 -0
- package/src/lib/config-manager.js +2106 -0
- package/src/lib/configuration-exporter.js +204 -0
- package/src/lib/configuration-manager.js +695 -0
- package/src/lib/configuration-matcher.js +221 -0
- package/src/lib/cpu-validator.js +36 -0
- package/src/lib/cuda-validator.js +57 -0
- package/src/lib/deployment-config-resolver.js +103 -0
- package/src/lib/deployment-entry-schema.js +125 -0
- package/src/lib/deployment-registry.js +598 -0
- package/src/lib/docker-introspection-validator.js +51 -0
- package/src/lib/engine-prefix-resolver.js +60 -0
- package/src/lib/huggingface-client.js +172 -0
- package/src/lib/key-value-parser.js +37 -0
- package/src/lib/known-flags-validator.js +200 -0
- package/src/lib/manifest-cli.js +280 -0
- package/src/lib/mcp-client.js +303 -0
- package/src/lib/mcp-command-handler.js +532 -0
- package/src/lib/neuron-validator.js +80 -0
- package/src/lib/parameter-schema-validator.js +284 -0
- package/src/lib/prompt-runner.js +1349 -0
- package/src/lib/prompts.js +1138 -0
- package/src/lib/registry-command-handler.js +519 -0
- package/src/lib/registry-loader.js +198 -0
- package/src/lib/rocm-validator.js +80 -0
- package/src/lib/schema-validator.js +157 -0
- package/src/lib/sensitive-redactor.js +59 -0
- package/src/lib/template-engine.js +156 -0
- package/src/lib/template-manager.js +341 -0
- package/src/lib/validation-engine.js +314 -0
- package/src/prompt-adapter.js +63 -0
- package/templates/Dockerfile +300 -0
- package/templates/IAM_PERMISSIONS.md +84 -0
- package/templates/MIGRATION.md +488 -0
- package/templates/PROJECT_README.md +439 -0
- package/templates/TEMPLATE_SYSTEM.md +243 -0
- package/templates/buildspec.yml +64 -0
- package/templates/code/chat_template.jinja +1 -0
- package/templates/code/flask/gunicorn_config.py +35 -0
- package/templates/code/flask/wsgi.py +10 -0
- package/templates/code/model_handler.py +387 -0
- package/templates/code/serve +300 -0
- package/templates/code/serve.py +175 -0
- package/templates/code/serving.properties +105 -0
- package/templates/code/start_server.py +39 -0
- package/templates/code/start_server.sh +39 -0
- package/templates/diffusors/Dockerfile +72 -0
- package/templates/diffusors/patch_image_api.py +35 -0
- package/templates/diffusors/serve +115 -0
- package/templates/diffusors/start_server.sh +114 -0
- package/templates/do/.gitkeep +1 -0
- package/templates/do/README.md +541 -0
- package/templates/do/build +83 -0
- package/templates/do/ci +681 -0
- package/templates/do/clean +811 -0
- package/templates/do/config +260 -0
- package/templates/do/deploy +1560 -0
- package/templates/do/export +306 -0
- package/templates/do/logs +319 -0
- package/templates/do/manifest +12 -0
- package/templates/do/push +119 -0
- package/templates/do/register +580 -0
- package/templates/do/run +113 -0
- package/templates/do/submit +417 -0
- package/templates/do/test +1147 -0
- package/templates/hyperpod/configmap.yaml +24 -0
- package/templates/hyperpod/deployment.yaml +71 -0
- package/templates/hyperpod/pvc.yaml +42 -0
- package/templates/hyperpod/service.yaml +17 -0
- package/templates/nginx-diffusors.conf +74 -0
- package/templates/nginx-predictors.conf +47 -0
- package/templates/nginx-tensorrt.conf +74 -0
- package/templates/requirements.txt +61 -0
- package/templates/sample_model/test_inference.py +123 -0
- package/templates/sample_model/train_abalone.py +252 -0
- package/templates/test/test_endpoint.sh +79 -0
- package/templates/test/test_local_image.sh +80 -0
- package/templates/test/test_model_handler.py +180 -0
- package/templates/triton/Dockerfile +128 -0
- package/templates/triton/config.pbtxt +163 -0
- package/templates/triton/model.py +130 -0
- package/templates/triton/requirements.txt +11 -0
|
@@ -0,0 +1,541 @@
|
|
|
1
|
+
# do-framework Scripts
|
|
2
|
+
|
|
3
|
+
This directory contains standardized scripts for managing the container lifecycle of your ML deployment project. These scripts follow the [do-framework](https://github.com/iankoulski/do-framework) conventions for consistent, predictable container operations.
|
|
4
|
+
|
|
5
|
+
## Quick Start
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
# Build Docker image
|
|
9
|
+
./do/build
|
|
10
|
+
|
|
11
|
+
# Test locally
|
|
12
|
+
./do/run
|
|
13
|
+
|
|
14
|
+
# Push to Amazon ECR
|
|
15
|
+
./do/push
|
|
16
|
+
|
|
17
|
+
# Deploy to SageMaker
|
|
18
|
+
export ROLE_ARN=arn:aws:iam::ACCOUNT_ID:role/YOUR_ROLE
|
|
19
|
+
./do/deploy
|
|
20
|
+
|
|
21
|
+
# Test the endpoint
|
|
22
|
+
./do/test <endpoint-name>
|
|
23
|
+
|
|
24
|
+
# Clean up resources
|
|
25
|
+
./do/clean all
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
## Project Configuration
|
|
29
|
+
|
|
30
|
+
**Deployment Configuration**: `<%= deploymentConfig %>`
|
|
31
|
+
- Framework: `<%= framework %>`
|
|
32
|
+
- Model Server: `<%= modelServer %>`
|
|
33
|
+
- AWS Region: `<%= awsRegion %>`
|
|
34
|
+
- Instance Type: `<%= instanceType %>`
|
|
35
|
+
- Build Target: `<%= buildTarget %>`
|
|
36
|
+
|
|
37
|
+
All configuration is centralized in `do/config`. You can override any setting by exporting environment variables before running scripts.
|
|
38
|
+
|
|
39
|
+
## Available Commands
|
|
40
|
+
|
|
41
|
+
### `./do/build`
|
|
42
|
+
|
|
43
|
+
Build the Docker image for your ML model.
|
|
44
|
+
|
|
45
|
+
**What it does:**
|
|
46
|
+
- Validates Docker is installed
|
|
47
|
+
- Handles framework-specific authentication (e.g., NGC for TensorRT-LLM)
|
|
48
|
+
- Builds Docker image with appropriate base image (CPU or GPU)
|
|
49
|
+
- Tags image with project name and timestamp
|
|
50
|
+
|
|
51
|
+
**Usage:**
|
|
52
|
+
```bash
|
|
53
|
+
./do/build
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
<% if (modelServer === 'tensorrt-llm') { %>
|
|
57
|
+
**TensorRT-LLM Requirements:**
|
|
58
|
+
```bash
|
|
59
|
+
# Set NGC API key before building
|
|
60
|
+
export NGC_API_KEY=your_ngc_api_key
|
|
61
|
+
./do/build
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
Get your NGC API key from [NVIDIA NGC](https://ngc.nvidia.com/).
|
|
65
|
+
<% } %>
|
|
66
|
+
|
|
67
|
+
**Output:**
|
|
68
|
+
- Docker image: `<%= projectName %>:latest`
|
|
69
|
+
- Tagged image: `<%= projectName %>:YYYYMMDD-HHMMSS`
|
|
70
|
+
|
|
71
|
+
---
|
|
72
|
+
|
|
73
|
+
### `./do/push`
|
|
74
|
+
|
|
75
|
+
Push the Docker image to Amazon Elastic Container Registry (ECR).
|
|
76
|
+
|
|
77
|
+
**What it does:**
|
|
78
|
+
- Validates AWS credentials
|
|
79
|
+
- Authenticates with ECR
|
|
80
|
+
- Creates ECR repository if it doesn't exist
|
|
81
|
+
- Pushes all image tags to ECR
|
|
82
|
+
- Displays pushed image URIs
|
|
83
|
+
|
|
84
|
+
**Prerequisites:**
|
|
85
|
+
- AWS credentials configured (`aws configure`)
|
|
86
|
+
- Docker image built (`./do/build`)
|
|
87
|
+
- IAM permissions for ECR operations
|
|
88
|
+
|
|
89
|
+
**Usage:**
|
|
90
|
+
```bash
|
|
91
|
+
./do/push
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
**Output:**
|
|
95
|
+
- Image URI: `ACCOUNT_ID.dkr.ecr.<%= awsRegion %>.amazonaws.com/ml-container-creator:<%= projectName %>-latest`
|
|
96
|
+
|
|
97
|
+
---
|
|
98
|
+
|
|
99
|
+
### `./do/deploy`
|
|
100
|
+
|
|
101
|
+
Deploy the container to AWS SageMaker as a managed endpoint.
|
|
102
|
+
|
|
103
|
+
**What it does:**
|
|
104
|
+
- Validates AWS credentials and execution role
|
|
105
|
+
- Verifies ECR image exists
|
|
106
|
+
- Creates SageMaker model
|
|
107
|
+
- Creates endpoint configuration
|
|
108
|
+
- Creates the endpoint and waits for it to reach InService status
|
|
109
|
+
- Displays endpoint details and test command
|
|
110
|
+
|
|
111
|
+
**Prerequisites:**
|
|
112
|
+
- AWS credentials configured
|
|
113
|
+
- Docker image pushed to ECR (`./do/push`<% if (buildTarget === 'codebuild') { %> or `./do/submit`<% } %>)
|
|
114
|
+
- SageMaker execution role ARN
|
|
115
|
+
|
|
116
|
+
**Usage:**
|
|
117
|
+
```bash
|
|
118
|
+
export ROLE_ARN=arn:aws:iam::ACCOUNT_ID:role/YOUR_SAGEMAKER_ROLE
|
|
119
|
+
./do/deploy
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
Or set `ROLE_ARN` in `do/config` to avoid exporting each time.
|
|
123
|
+
|
|
124
|
+
**Required IAM Permissions:**
|
|
125
|
+
|
|
126
|
+
The execution role must have:
|
|
127
|
+
- SageMaker model and endpoint management
|
|
128
|
+
- ECR image access
|
|
129
|
+
- S3 access (if using model artifacts)
|
|
130
|
+
- CloudWatch Logs write access
|
|
131
|
+
|
|
132
|
+
**Output:**
|
|
133
|
+
- Endpoint name: `<%= projectName %>-endpoint-TIMESTAMP`
|
|
134
|
+
- Endpoint status: InService
|
|
135
|
+
- Test command: `./do/test <endpoint-name>`
|
|
136
|
+
|
|
137
|
+
**Deployment Time:** Typically 5-10 minutes for the endpoint to reach InService status.
|
|
138
|
+
|
|
139
|
+
---
|
|
140
|
+
|
|
141
|
+
### `./do/run`
|
|
142
|
+
|
|
143
|
+
Run the container locally for testing before deployment.
|
|
144
|
+
|
|
145
|
+
**What it does:**
|
|
146
|
+
- Detects if GPU support is needed based on deployment configuration
|
|
147
|
+
- Starts Docker container with port 8080 exposed
|
|
148
|
+
- Mounts model directory if specified
|
|
149
|
+
- Streams container logs to console
|
|
150
|
+
|
|
151
|
+
**Prerequisites:**
|
|
152
|
+
- Docker image built (`./do/build`)
|
|
153
|
+
<% if (framework === 'transformers') { %>- NVIDIA Docker runtime (for GPU support)
|
|
154
|
+
<% } %>
|
|
155
|
+
|
|
156
|
+
**Usage:**
|
|
157
|
+
```bash
|
|
158
|
+
./do/run
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
<% if (framework === 'transformers') { %>
|
|
162
|
+
**GPU Requirements:**
|
|
163
|
+
This deployment configuration requires GPU support. Ensure you have:
|
|
164
|
+
- NVIDIA GPU with appropriate drivers
|
|
165
|
+
- NVIDIA Container Toolkit installed
|
|
166
|
+
- Docker configured to use NVIDIA runtime
|
|
167
|
+
<% } %>
|
|
168
|
+
|
|
169
|
+
**Testing the local container:**
|
|
170
|
+
```bash
|
|
171
|
+
# In another terminal, test the endpoints
|
|
172
|
+
./do/test
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
**Stop the container:** Press `Ctrl+C`
|
|
176
|
+
|
|
177
|
+
---
|
|
178
|
+
|
|
179
|
+
### `./do/test`
|
|
180
|
+
|
|
181
|
+
Test the container or SageMaker endpoint with sample requests.
|
|
182
|
+
|
|
183
|
+
**What it does:**
|
|
184
|
+
- Sends health check request to `/ping` endpoint
|
|
185
|
+
- Sends sample inference request to `/invocations` endpoint
|
|
186
|
+
- Validates responses and displays results
|
|
187
|
+
- Supports both local container and SageMaker endpoint testing
|
|
188
|
+
|
|
189
|
+
**Usage:**
|
|
190
|
+
|
|
191
|
+
Test local container:
|
|
192
|
+
```bash
|
|
193
|
+
./do/test
|
|
194
|
+
```
|
|
195
|
+
|
|
196
|
+
Test SageMaker endpoint:
|
|
197
|
+
```bash
|
|
198
|
+
./do/test <endpoint-name>
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
**Test Payloads:**
|
|
202
|
+
|
|
203
|
+
<% if (framework === 'sklearn' || framework === 'xgboost' || framework === 'tensorflow') { %>
|
|
204
|
+
Traditional ML models expect JSON with feature vectors:
|
|
205
|
+
```json
|
|
206
|
+
{
|
|
207
|
+
"instances": [[1.0, 2.0, 3.0, 4.0]]
|
|
208
|
+
}
|
|
209
|
+
```
|
|
210
|
+
<% } else if (framework === 'transformers') { %>
|
|
211
|
+
Transformer models expect text generation requests:
|
|
212
|
+
```json
|
|
213
|
+
{
|
|
214
|
+
"inputs": "What is machine learning?",
|
|
215
|
+
"parameters": {
|
|
216
|
+
"max_new_tokens": 50,
|
|
217
|
+
"temperature": 0.7
|
|
218
|
+
}
|
|
219
|
+
}
|
|
220
|
+
```
|
|
221
|
+
<% } %>
|
|
222
|
+
|
|
223
|
+
**Exit Codes:**
|
|
224
|
+
- `0`: All tests passed
|
|
225
|
+
- `1`: Test failed (connection error, HTTP error, or validation error)
|
|
226
|
+
|
|
227
|
+
---
|
|
228
|
+
|
|
229
|
+
### `./do/clean`
|
|
230
|
+
|
|
231
|
+
Clean up Docker images and AWS resources.
|
|
232
|
+
|
|
233
|
+
**What it does:**
|
|
234
|
+
- Removes local Docker images
|
|
235
|
+
- Deletes images from ECR
|
|
236
|
+
- Deletes SageMaker endpoints, configurations, and models
|
|
237
|
+
- Prompts for confirmation before destructive operations
|
|
238
|
+
|
|
239
|
+
**Usage:**
|
|
240
|
+
|
|
241
|
+
Clean local Docker images:
|
|
242
|
+
```bash
|
|
243
|
+
./do/clean local
|
|
244
|
+
```
|
|
245
|
+
|
|
246
|
+
Clean ECR images:
|
|
247
|
+
```bash
|
|
248
|
+
./do/clean ecr
|
|
249
|
+
```
|
|
250
|
+
|
|
251
|
+
Clean SageMaker endpoint and related resources:
|
|
252
|
+
```bash
|
|
253
|
+
./do/clean endpoint
|
|
254
|
+
```
|
|
255
|
+
|
|
256
|
+
Clean everything:
|
|
257
|
+
```bash
|
|
258
|
+
./do/clean all
|
|
259
|
+
```
|
|
260
|
+
|
|
261
|
+
**Warning:** Cleaning operations are destructive and cannot be undone. Always confirm you want to delete resources.
|
|
262
|
+
|
|
263
|
+
---
|
|
264
|
+
|
|
265
|
+
<% if (buildTarget === 'codebuild') { %>
|
|
266
|
+
### `./do/submit`
|
|
267
|
+
|
|
268
|
+
Submit a build job to AWS CodeBuild. This command is available only when the build target is `codebuild`.
|
|
269
|
+
|
|
270
|
+
**What it does:**
|
|
271
|
+
- Creates CodeBuild project if it doesn't exist
|
|
272
|
+
- Creates IAM service role for CodeBuild if needed
|
|
273
|
+
- Uploads source code to S3
|
|
274
|
+
- Starts CodeBuild job that builds AND pushes image to ECR
|
|
275
|
+
- Monitors build progress
|
|
276
|
+
- Displays ECR image URI on success
|
|
277
|
+
|
|
278
|
+
**Prerequisites:**
|
|
279
|
+
- AWS credentials configured
|
|
280
|
+
- IAM permissions for CodeBuild, S3, and IAM operations
|
|
281
|
+
|
|
282
|
+
**Usage:**
|
|
283
|
+
```bash
|
|
284
|
+
./do/submit
|
|
285
|
+
```
|
|
286
|
+
|
|
287
|
+
**Important:** When using CodeBuild deployment, `./do/submit` replaces both `./do/build` and `./do/push`. The buildspec.yml handles building the Docker image and pushing it to ECR in the AWS environment.
|
|
288
|
+
|
|
289
|
+
**Workflow Comparison:**
|
|
290
|
+
|
|
291
|
+
Local/SageMaker deployment:
|
|
292
|
+
```bash
|
|
293
|
+
./do/build # Build locally
|
|
294
|
+
./do/push # Push to ECR
|
|
295
|
+
./do/deploy # Deploy to SageMaker
|
|
296
|
+
```
|
|
297
|
+
|
|
298
|
+
CodeBuild deployment:
|
|
299
|
+
```bash
|
|
300
|
+
./do/submit # Build + push via CodeBuild
|
|
301
|
+
./do/deploy # Deploy to SageMaker
|
|
302
|
+
```
|
|
303
|
+
|
|
304
|
+
**Build Time:** Typically 5-15 minutes depending on image size and complexity.
|
|
305
|
+
|
|
306
|
+
---
|
|
307
|
+
|
|
308
|
+
<% } %>
|
|
309
|
+
## Configuration Reference
|
|
310
|
+
|
|
311
|
+
All scripts source configuration from `do/config`. Key variables:
|
|
312
|
+
|
|
313
|
+
| Variable | Description | Current Value |
|
|
314
|
+
|----------|-------------|---------------|
|
|
315
|
+
| `PROJECT_NAME` | Project identifier | `<%= projectName %>` |
|
|
316
|
+
| `DEPLOYMENT_CONFIG` | Framework-server combination | `<%= deploymentConfig %>` |
|
|
317
|
+
| `FRAMEWORK` | ML framework | `<%= framework %>` |
|
|
318
|
+
| `MODEL_SERVER` | Model serving framework | `<%= modelServer %>` |
|
|
319
|
+
| `AWS_REGION` | AWS region for deployment | `<%= awsRegion %>` |
|
|
320
|
+
| `ECR_REPOSITORY_NAME` | ECR repository name | `ml-container-creator` |
|
|
321
|
+
| `INSTANCE_TYPE` | SageMaker instance type | `<%= instanceType %>` |
|
|
322
|
+
| `BUILD_TARGET` | Build target | `<%= buildTarget %>` |
|
|
323
|
+
<% if (framework === 'transformers') { %>| `MODEL_NAME` | HuggingFace model name | `<%= modelName %>` |
|
|
324
|
+
<% } %><% if (modelFormat) { %>| `MODEL_FORMAT` | Model file format | `<%= modelFormat %>` |
|
|
325
|
+
<% } %>
|
|
326
|
+
|
|
327
|
+
### Environment Variable Overrides
|
|
328
|
+
|
|
329
|
+
You can override any configuration variable by exporting it before running scripts:
|
|
330
|
+
|
|
331
|
+
```bash
|
|
332
|
+
# Override AWS region
|
|
333
|
+
export AWS_REGION=us-west-2
|
|
334
|
+
./do/deploy
|
|
335
|
+
|
|
336
|
+
# Override instance type
|
|
337
|
+
export INSTANCE_TYPE=ml.g5.2xlarge
|
|
338
|
+
./do/deploy
|
|
339
|
+
|
|
340
|
+
# Override ECR repository name
|
|
341
|
+
export ECR_REPOSITORY_NAME=my-custom-repo
|
|
342
|
+
./do/push
|
|
343
|
+
```
|
|
344
|
+
|
|
345
|
+
## Troubleshooting
|
|
346
|
+
|
|
347
|
+
### Build Issues
|
|
348
|
+
|
|
349
|
+
**Docker not found:**
|
|
350
|
+
```
|
|
351
|
+
❌ Docker is not installed
|
|
352
|
+
```
|
|
353
|
+
Install Docker from [https://docs.docker.com/get-docker/](https://docs.docker.com/get-docker/)
|
|
354
|
+
|
|
355
|
+
<% if (modelServer === 'tensorrt-llm') { %>
|
|
356
|
+
**NGC authentication failed:**
|
|
357
|
+
```
|
|
358
|
+
❌ NGC_API_KEY environment variable not set
|
|
359
|
+
```
|
|
360
|
+
Get your NGC API key from [https://ngc.nvidia.com/](https://ngc.nvidia.com/) and export it:
|
|
361
|
+
```bash
|
|
362
|
+
export NGC_API_KEY=your_key_here
|
|
363
|
+
```
|
|
364
|
+
<% } %>
|
|
365
|
+
|
|
366
|
+
### Push Issues
|
|
367
|
+
|
|
368
|
+
**AWS credentials not configured:**
|
|
369
|
+
```
|
|
370
|
+
❌ AWS credentials not configured
|
|
371
|
+
```
|
|
372
|
+
Run `aws configure` or set `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment variables.
|
|
373
|
+
|
|
374
|
+
**ECR authentication failed:**
|
|
375
|
+
```
|
|
376
|
+
❌ Failed to authenticate with ECR
|
|
377
|
+
```
|
|
378
|
+
Ensure your IAM user/role has `ecr:GetAuthorizationToken` permission.
|
|
379
|
+
|
|
380
|
+
### Deploy Issues
|
|
381
|
+
|
|
382
|
+
**Execution role not provided:**
|
|
383
|
+
```
|
|
384
|
+
❌ Execution role ARN not provided
|
|
385
|
+
```
|
|
386
|
+
Export the role ARN:
|
|
387
|
+
```bash
|
|
388
|
+
export ROLE_ARN=arn:aws:iam::ACCOUNT_ID:role/YOUR_ROLE
|
|
389
|
+
```
|
|
390
|
+
|
|
391
|
+
**ECR image not found:**
|
|
392
|
+
```
|
|
393
|
+
❌ ECR image not found
|
|
394
|
+
```
|
|
395
|
+
<% if (buildTarget === 'codebuild') { %>Run `./do/submit` to build and push the image via CodeBuild.
|
|
396
|
+
<% } else { %>Run `./do/build` and `./do/push` to build and push the image.
|
|
397
|
+
<% } %>
|
|
398
|
+
|
|
399
|
+
**Endpoint creation failed:**
|
|
400
|
+
```
|
|
401
|
+
❌ Failed to create endpoint
|
|
402
|
+
```
|
|
403
|
+
Check:
|
|
404
|
+
- Instance type is available in your region
|
|
405
|
+
- You have sufficient service quota for the instance type
|
|
406
|
+
- The execution role has correct permissions
|
|
407
|
+
- CloudWatch Logs for detailed error messages
|
|
408
|
+
|
|
409
|
+
### Test Issues
|
|
410
|
+
|
|
411
|
+
**Local container not responding:**
|
|
412
|
+
```
|
|
413
|
+
❌ Could not connect to local container
|
|
414
|
+
```
|
|
415
|
+
Ensure the container is running: `./do/run`
|
|
416
|
+
|
|
417
|
+
**SageMaker endpoint not InService:**
|
|
418
|
+
```
|
|
419
|
+
❌ Endpoint is not InService
|
|
420
|
+
```
|
|
421
|
+
Wait for the endpoint to finish deploying. Check its status with:
|
|
422
|
+
```bash
|
|
423
|
+
aws sagemaker describe-endpoint --endpoint-name <endpoint-name> --region <%= awsRegion %>
|
|
424
|
+
```
|
|
425
|
+
|
|
426
|
+
<% if (framework === 'transformers') { %>
|
|
427
|
+
### GPU Issues
|
|
428
|
+
|
|
429
|
+
**NVIDIA runtime not found:**
|
|
430
|
+
```
|
|
431
|
+
❌ NVIDIA Container Toolkit not installed
|
|
432
|
+
```
|
|
433
|
+
Install NVIDIA Container Toolkit: [https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html)
|
|
434
|
+
|
|
435
|
+
**Out of GPU memory:**
|
|
436
|
+
```
|
|
437
|
+
❌ CUDA out of memory
|
|
438
|
+
```
|
|
439
|
+
Try:
|
|
440
|
+
- Using a larger instance type with more GPU memory
|
|
441
|
+
- Reducing batch size or model size
|
|
442
|
+
- Using model quantization
|
|
443
|
+
<% } %>
|
|
444
|
+
|
|
445
|
+
## Workflow Examples
|
|
446
|
+
|
|
447
|
+
### Development Workflow
|
|
448
|
+
|
|
449
|
+
1. **Build and test locally:**
|
|
450
|
+
```bash
|
|
451
|
+
./do/build
|
|
452
|
+
./do/run &   # runs in the background; wait for the server to start before testing
|
|
453
|
+
./do/test
|
|
454
|
+
```
|
|
455
|
+
|
|
456
|
+
2. **Deploy to SageMaker:**
|
|
457
|
+
```bash
|
|
458
|
+
<% if (buildTarget === 'codebuild') { %>./do/submit<% } else { %>./do/push<% } %>
|
|
459
|
+
export ROLE_ARN=arn:aws:iam::ACCOUNT_ID:role/YOUR_ROLE
|
|
460
|
+
./do/deploy
|
|
461
|
+
```
|
|
462
|
+
|
|
463
|
+
3. **Test the endpoint:**
|
|
464
|
+
```bash
|
|
465
|
+
./do/test <endpoint-name>
|
|
466
|
+
```
|
|
467
|
+
|
|
468
|
+
4. **Clean up when done:**
|
|
469
|
+
```bash
|
|
470
|
+
./do/clean endpoint
|
|
471
|
+
```
|
|
472
|
+
|
|
473
|
+
### CI/CD Workflow
|
|
474
|
+
|
|
475
|
+
<% if (buildTarget === 'codebuild') { %>
|
|
476
|
+
```bash
|
|
477
|
+
# In your CI/CD pipeline
|
|
478
|
+
./do/submit # Build and push via CodeBuild
|
|
479
|
+
./do/deploy # Deploy to SageMaker
|
|
480
|
+
./do/test <endpoint-name> # Validate deployment
|
|
481
|
+
```
|
|
482
|
+
<% } else { %>
|
|
483
|
+
```bash
|
|
484
|
+
# In your CI/CD pipeline
|
|
485
|
+
./do/build # Build image
|
|
486
|
+
./do/push # Push to ECR
|
|
487
|
+
./do/deploy # Deploy to SageMaker
|
|
488
|
+
./do/test <endpoint-name> # Validate deployment
|
|
489
|
+
```
|
|
490
|
+
<% } %>
|
|
491
|
+
|
|
492
|
+
### Iterative Development
|
|
493
|
+
|
|
494
|
+
```bash
|
|
495
|
+
# Make code changes
|
|
496
|
+
vim code/model_handler.py
|
|
497
|
+
|
|
498
|
+
# Rebuild and test
|
|
499
|
+
./do/build
|
|
500
|
+
./do/run &   # runs in the background; wait for the server to start before testing
|
|
501
|
+
./do/test
|
|
502
|
+
|
|
503
|
+
# Deploy updated version
|
|
504
|
+
<% if (buildTarget === 'codebuild') { %>./do/submit<% } else { %>./do/push<% } %>
|
|
505
|
+
./do/deploy
|
|
506
|
+
```
|
|
507
|
+
|
|
508
|
+
## Relationship to Legacy Scripts
|
|
509
|
+
|
|
510
|
+
The `deploy/` directory contains legacy wrapper scripts for backward compatibility:
|
|
511
|
+
|
|
512
|
+
| Legacy Script | do-framework Equivalent | Status |
|
|
513
|
+
|---------------|------------------------|--------|
|
|
514
|
+
| `deploy/build_and_push.sh` | `./do/build && ./do/push` | Deprecated |
|
|
515
|
+
| `deploy/deploy.sh` | `./do/deploy` | Deprecated |
|
|
516
|
+
<% if (buildTarget === 'codebuild') { %>| `deploy/submit_build.sh` | `./do/submit` | Deprecated |
|
|
517
|
+
<% } %>
|
|
518
|
+
|
|
519
|
+
**Migration:** The legacy scripts display deprecation warnings and forward to do-framework scripts. Update your workflows to use `do/` scripts directly.
|
|
520
|
+
|
|
521
|
+
See [MIGRATION.md](../MIGRATION.md) for detailed migration instructions.
|
|
522
|
+
|
|
523
|
+
## Additional Resources
|
|
524
|
+
|
|
525
|
+
- **Main Project README**: [../README.md](../README.md)
|
|
526
|
+
- **Migration Guide**: [../MIGRATION.md](../MIGRATION.md)
|
|
527
|
+
- **do-framework**: [https://github.com/iankoulski/do-framework](https://github.com/iankoulski/do-framework)
|
|
528
|
+
- **AWS SageMaker BYOC**: [https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms.html](https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms.html)
|
|
529
|
+
- **Docker Documentation**: [https://docs.docker.com/](https://docs.docker.com/)
|
|
530
|
+
|
|
531
|
+
## Getting Help
|
|
532
|
+
|
|
533
|
+
If you encounter issues:
|
|
534
|
+
|
|
535
|
+
1. Check the troubleshooting section above
|
|
536
|
+
2. Review CloudWatch Logs for SageMaker endpoints
|
|
537
|
+
3. Verify IAM permissions and AWS credentials
|
|
538
|
+
4. Ensure prerequisites are installed and configured
|
|
539
|
+
5. Check the main project README for additional guidance
|
|
540
|
+
|
|
541
|
+
For bugs or feature requests, please open an issue in the project repository.
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
#!/bin/bash
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Build the project's Docker image and tag it twice:
#   ${PROJECT_NAME}:latest and ${PROJECT_NAME}:<YYYYMMDD-HHMMSS>.
#
# Usage:
#   ./do/build
#
# Environment:
#   NGC_API_KEY   Required when DEPLOYMENT_CONFIG is transformers-tensorrt-llm.
#   PROJECT_NAME, DEPLOYMENT_CONFIG, FRAMEWORK and MODEL_SERVER are read after
#   sourcing do/config; the script aborts (set -u) if any of them is unset.
#
# Exit codes:
#   2      a required tool (docker, or aws for LMI/DJL builds) is missing
#   3      NGC_API_KEY is not set, or DEPLOYMENT_CONFIG is not recognised
#   other  a docker/aws command failed (propagated by set -e / pipefail)

set -e
set -u
set -o pipefail

# Source configuration
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "${SCRIPT_DIR}/config"

# The Dockerfile sits one level above do/. Resolving the build context from
# the script location (instead of ".") lets the build work from any directory.
build_context="$(cd "${SCRIPT_DIR}/.." && pwd)"

# Print a diagnostic line to stderr so it is not mixed with regular output.
err() {
  printf '%s\n' "$*" >&2
}

echo "🚀 Building Docker image for ${PROJECT_NAME}"
echo " Deployment config: ${DEPLOYMENT_CONFIG}"
echo " Framework: ${FRAMEWORK}"
echo " Model server: ${MODEL_SERVER}"

# Validate prerequisites
if ! command -v docker &> /dev/null; then
  err "❌ Docker is not installed"
  err " Install from: https://docs.docker.com/get-docker/"
  exit 2
fi

# Framework-specific preparation (registry authentication and status message).
# The actual `docker build` is identical for every configuration and runs once
# after this case statement.
case "${DEPLOYMENT_CONFIG}" in
  transformers-tensorrt-llm)
    echo "🔐 TensorRT-LLM requires NGC authentication"
    if [ -z "${NGC_API_KEY:-}" ]; then
      err "❌ NGC_API_KEY environment variable not set"
      err ""
      err "To build TensorRT-LLM containers, you need an NVIDIA NGC API key:"
      err "1. Create account at: https://ngc.nvidia.com/"
      err "2. Generate API key in account settings"
      err "3. Export key: export NGC_API_KEY=your_key_here"
      err "4. Run this script again"
      exit 3
    fi

    echo "🔑 Authenticating with NVIDIA NGC..."
    # Single quotes are deliberate: the username is the literal string
    # $oauthtoken and must not be expanded by the shell.
    # The key is passed on stdin so it never appears in the process list.
    # shellcheck disable=SC2016
    printf '%s\n' "${NGC_API_KEY}" \
      | docker login nvcr.io --username '$oauthtoken' --password-stdin

    echo "🏗️ Building GPU-enabled image with TensorRT-LLM..."
    ;;

  transformers-vllm|transformers-sglang)
    echo "🏗️ Building GPU-enabled image..."
    ;;

  transformers-lmi|transformers-djl)
    echo "🔐 LMI/DJL base images require AWS ECR authentication"
    if ! command -v aws &> /dev/null; then
      err "❌ AWS CLI is not installed"
      exit 2
    fi

    echo "🔑 Authenticating with AWS Deep Learning Container ECR..."
    # NOTE(review): registry account and region are hard-coded; confirm they
    # match the base image referenced by the Dockerfile.
    aws ecr get-login-password --region us-east-1 \
      | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-east-1.amazonaws.com

    echo "🏗️ Building GPU-enabled image..."
    ;;

  sklearn-*|xgboost-*|tensorflow-*)
    echo "🏗️ Building CPU-optimized image..."
    ;;

  *)
    err "❌ Unknown deployment configuration: ${DEPLOYMENT_CONFIG}"
    exit 3
    ;;
esac

docker build -t "${PROJECT_NAME}:latest" "${build_context}"

# Tag with timestamp
TIMESTAMP=$(date +%Y%m%d-%H%M%S)
docker tag "${PROJECT_NAME}:latest" "${PROJECT_NAME}:${TIMESTAMP}"

echo "✅ Build complete!"
echo " Image: ${PROJECT_NAME}:latest"
echo " Tagged: ${PROJECT_NAME}:${TIMESTAMP}"
echo ""
echo "Next steps:"
echo " • Test locally: ./do/run"
echo " • Push to ECR: ./do/push"
echo " • Deploy to SageMaker: ./do/deploy"
|