@aws/ml-container-creator 0.3.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/cli.js +5 -2
- package/config/bootstrap-stack.json +86 -7
- package/config/defaults.json +1 -1
- package/infra/ci-harness/buildspec.yml +60 -0
- package/package.json +3 -1
- package/servers/README.md +41 -1
- package/servers/instance-sizer/index.js +42 -2
- package/servers/instance-sizer/lib/instance-ranker.js +114 -10
- package/servers/instance-sizer/lib/quota-resolver.js +368 -0
- package/servers/instance-sizer/package.json +2 -0
- package/servers/lib/catalogs/instances.json +527 -12
- package/servers/lib/catalogs/model-servers.json +15 -15
- package/servers/lib/catalogs/model-sizes.json +27 -0
- package/servers/lib/catalogs/models.json +71 -0
- package/servers/lib/schemas/image-catalog.schema.json +9 -1
- package/src/app.js +109 -3
- package/src/lib/bootstrap-command-handler.js +96 -3
- package/src/lib/cli-handler.js +2 -2
- package/src/lib/config-manager.js +117 -1
- package/src/lib/deployment-entry-schema.js +16 -0
- package/src/lib/prompt-runner.js +270 -12
- package/src/lib/prompts.js +288 -6
- package/src/lib/registry-command-handler.js +12 -0
- package/src/lib/schema-sync.js +31 -0
- package/src/lib/template-manager.js +49 -1
- package/src/lib/validate-runner.js +125 -2
- package/templates/Dockerfile +22 -2
- package/templates/code/cuda_compat.sh +22 -0
- package/templates/code/serve +3 -0
- package/templates/code/serving.properties +14 -0
- package/templates/code/start_server.sh +3 -0
- package/templates/diffusors/Dockerfile +2 -1
- package/templates/diffusors/serve +3 -0
- package/templates/do/README.md +33 -0
- package/templates/do/adapter +1214 -0
- package/templates/do/adapters/.gitkeep +2 -0
- package/templates/do/add-ic +130 -0
- package/templates/do/benchmark +718 -0
- package/templates/do/clean +593 -17
- package/templates/do/config +49 -4
- package/templates/do/deploy +513 -362
- package/templates/do/ic/default.conf +32 -0
- package/templates/do/lib/endpoint-config.sh +216 -0
- package/templates/do/lib/inference-component.sh +167 -0
- package/templates/do/lib/secrets.sh +44 -0
- package/templates/do/lib/wait.sh +131 -0
- package/templates/do/logs +107 -27
- package/templates/do/optimize +528 -0
- package/templates/do/register +119 -2
- package/templates/do/status +337 -0
- package/templates/do/test +80 -28
- package/templates/triton/Dockerfile +5 -0
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Per-IC configuration: default
# This file defines the primary inference component for the project.
# It is sourced by do/lib/inference-component.sh during deployment.
#
# After deployment, IC_DEPLOYED_NAME and IC_DEPLOYED_AT will be appended
# by the deploy script to track the active inference component.

# Container image tag within the project's ECR repository.
export IC_IMAGE_TAG="<%= projectName %>-latest"
# Accelerator devices per IC copy (falls back to 1 when not set at render time).
export IC_GPU_COUNT=<%= (typeof icGpuCount !== 'undefined' && icGpuCount != null) ? icGpuCount : 1 %>
# Number of IC copies to run.
export IC_COPY_COUNT=1
# Minimum memory reservation, in megabytes.
export IC_MIN_MEMORY_MB=1024
# Container startup health-check timeout, in seconds.
export IC_STARTUP_TIMEOUT=900
<% if (typeof instancePoolSpecs !== 'undefined' && instancePoolSpecs && instancePoolSpecs.length > 1) { %>

# Multi-spec IC configuration (auto-generated from instance pool selections)
# When the endpoint uses instance pools, the IC uses Specifications (plural)
# with per-instance-type compute resource requirements.
export IC_MULTI_SPEC=true
export IC_SPEC_COUNT=<%= instancePoolSpecs.length %>
<% instancePoolSpecs.forEach(function(spec, idx) { %>
export IC_SPEC_<%= idx + 1 %>_INSTANCE_TYPE="<%= spec.instanceType %>"
export IC_SPEC_<%= idx + 1 %>_GPU_COUNT=<%= spec.gpuCount %>
export IC_SPEC_<%= idx + 1 %>_MIN_MEMORY_MB=<%= spec.minMemoryMb %>
<% }); %>
<% } %>

# Optional overrides:
# export IC_MODEL_NAME="my-model-v2"
# export IC_CONTAINER_ENV_EXTRA='"KEY":"value"'
|
|
@@ -0,0 +1,216 @@
|
|
|
1
|
+
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
#
|
|
4
|
+
# Shared helper: create SageMaker endpoint configuration.
|
|
5
|
+
# Sourced by do/deploy — expects PROJECT_NAME, AWS_REGION to be set by the caller.
|
|
6
|
+
# One of INSTANCE_TYPE or INSTANCE_POOLS must be set (mutually exclusive).
|
|
7
|
+
# Optional: ROLE_ARN, INFERENCE_AMI_VERSION, CAPACITY_RESERVATION_ARN, ASYNC_INFERENCE_CONFIG,
|
|
8
|
+
# POOL_TIMEOUT (default: 1200), POOL_INSTANCE_COUNT (default: 1), MODEL_NAME_SM.
|
|
9
|
+
|
|
10
|
+
# _validate_instance_pools()
|
|
11
|
+
# Validates that all instance types in INSTANCE_POOLS are compatible:
|
|
12
|
+
# - All types must share the same accelerator generation (same CUDA/AMI requirements)
|
|
13
|
+
# - Cannot mix CUDA and Neuron accelerator types
|
|
14
|
+
# - Unknown instance types produce a warning but do not block deployment
|
|
15
|
+
#
|
|
16
|
+
# Uses a hardcoded map of instance type prefixes to their generation/AMI compatibility:
|
|
17
|
+
# cuda-11 (AMI 2-x): g4dn, g5, g5g, p3, p4d, p4de
|
|
18
|
+
# cuda-12 (AMI 3-x): g6, g6e, p5
|
|
19
|
+
# cuda-next (AMI 4-x): p6, g7e
|
|
20
|
+
# neuron: inf1, inf2, trn1
|
|
21
|
+
#
|
|
22
|
+
# Exits with error if incompatible types are detected.
|
|
23
|
+
_validate_instance_pools() {
  # Map instance family prefixes to their accelerator generation.
  # Format: "family_prefix=generation". The trailing dot in each prefix is
  # load-bearing: it keeps "ml.g5." from matching ml.g5g.* and "ml.p4d."
  # from matching ml.p4de.*.
  local -a GENERATION_MAP=(
    "ml.g4dn.=cuda-11"
    "ml.g5.=cuda-11"
    "ml.g5g.=cuda-11"
    "ml.p3.=cuda-11"
    "ml.p4d.=cuda-11"
    "ml.p4de.=cuda-11"
    "ml.g6.=cuda-12"
    "ml.g6e.=cuda-12"
    "ml.p5.=cuda-12"
    "ml.p5e.=cuda-12"
    "ml.p5en.=cuda-12"
    "ml.p6.=cuda-next"
    "ml.g7e.=cuda-next"
    "ml.inf1.=neuron"
    "ml.inf2.=neuron"
    "ml.trn1.=neuron"
  )

  # Extract instance types from the INSTANCE_POOLS JSON.
  # INSTANCE_POOLS format: [{"InstanceType":"ml.g6e.48xlarge","Priority":1},...]
  # Fix: use POSIX [[:space:]] instead of \s — \s is a GNU extension that is
  # not supported by BSD grep/sed (macOS), where it would silently match
  # nothing and skip validation entirely.
  local pool_types=""
  pool_types=$(echo "${INSTANCE_POOLS}" | grep -oE '"InstanceType"[[:space:]]*:[[:space:]]*"[^"]+"' | sed 's/"InstanceType"[[:space:]]*:[[:space:]]*"//;s/"$//' || true)

  # Empty/unparseable pools: nothing to validate here; a truly malformed
  # document is rejected later by the AWS API.
  if [ -z "${pool_types}" ]; then
    return 0
  fi

  local first_generation=""
  local first_type=""

  while IFS= read -r instance_type; do
    [ -z "${instance_type}" ] && continue

    # Resolve this type's generation via prefix lookup.
    local generation=""
    for entry in "${GENERATION_MAP[@]}"; do
      local prefix="${entry%%=*}"
      local gen="${entry##*=}"
      # Unquoted ${prefix}* on the RHS: deliberate glob-style prefix match.
      if [[ "${instance_type}" == ${prefix}* ]]; then
        generation="${gen}"
        break
      fi
    done

    # Unknown families warn but do not block deployment.
    if [ -z "${generation}" ]; then
      echo " ⚠️ Unknown instance type in pool: ${instance_type} — skipping validation for this type"
      continue
    fi

    # All known types must share the first seen generation (same CUDA/AMI).
    if [ -z "${first_generation}" ]; then
      first_generation="${generation}"
      first_type="${instance_type}"
    elif [ "${generation}" != "${first_generation}" ]; then
      echo "❌ Cannot mix ${first_type} (${first_generation}) and ${instance_type} (${generation}) in same pool — different CUDA/AMI requirements"
      echo " All instance types in a pool must share the same InferenceAmiVersion."
      echo ""
      echo " Generation groupings:"
      echo " cuda-11 (AMI 2-x): ml.g4dn.*, ml.g5.*, ml.p3.*, ml.p4d.*"
      echo " cuda-12 (AMI 3-x): ml.g6.*, ml.g6e.*, ml.p5.*"
      echo " neuron: ml.inf1.*, ml.inf2.*, ml.trn1.*"
      echo ""
      echo " Fix: use instance types from the same generation in your pool."
      exit 1
    fi
  done <<< "${pool_types}"
}
|
|
95
|
+
|
|
96
|
+
# create_endpoint_config()
|
|
97
|
+
# Builds a ProductionVariant JSON and calls `aws sagemaker create-endpoint-config`.
|
|
98
|
+
# Sets the global ENDPOINT_CONFIG_NAME variable for downstream use.
|
|
99
|
+
#
|
|
100
|
+
# Behavior:
|
|
101
|
+
# - INSTANCE_POOLS set: uses InstancePools array, RoutingConfig, VariantInstanceProvisionTimeoutInSeconds
|
|
102
|
+
# Omits InstanceType entirely (mutually exclusive with pools)
|
|
103
|
+
# - INSTANCE_POOLS not set: uses single INSTANCE_TYPE (standard path)
|
|
104
|
+
# - INFERENCE_AMI_VERSION: appended to variant when set
|
|
105
|
+
# - CAPACITY_RESERVATION_ARN: appended to variant when set (only for single instance type path)
|
|
106
|
+
# - ASYNC_INFERENCE_CONFIG: passes --async-inference-config when set
|
|
107
|
+
# - ROLE_ARN + no MODEL_NAME_SM: passes --execution-role-arn (IC-based real-time flow)
|
|
108
|
+
# - MODEL_NAME_SM set: omits --execution-role-arn (model-based async flow)
|
|
109
|
+
create_endpoint_config() {
  # Capacity reservations pin one specific instance type, while pools exist to
  # fall back across several — they cannot coexist. Prefer the reservation.
  if [ -n "${INSTANCE_POOLS:-}" ] && [ -n "${CAPACITY_RESERVATION_ARN:-}" ]; then
    echo "⚠️ Capacity reservations and instance pools are mutually exclusive. Using capacity reservation."
    unset INSTANCE_POOLS
  fi

  # Unique config name published globally for downstream steps.
  local epoch
  epoch=$(date +%s)
  ENDPOINT_CONFIG_NAME="${PROJECT_NAME}-epc-${epoch}"

  local pv_json

  if [ -n "${INSTANCE_POOLS:-}" ]; then
    # Reject incompatible accelerator generations before touching the API.
    _validate_instance_pools

    # Pools path: heterogeneous instance types with priority-based fallback.
    echo " Instance pools: enabled"

    # The local config uses "ModelName" for readability; the SageMaker API
    # field is "ModelNameOverride" (sibling of InstanceType and Priority).
    local pool_doc="${INSTANCE_POOLS}"
    if echo "${pool_doc}" | grep -q '"ModelName"'; then
      pool_doc=$(echo "${pool_doc}" | sed 's/"ModelName"/"ModelNameOverride"/g')
      echo " ModelNameOverride: per-pool model names detected"
    fi

    # Note: InstanceType is omitted on purpose — mutually exclusive with pools.
    pv_json="[{\"VariantName\":\"AllTraffic\""
    pv_json+=",\"InstancePools\":${pool_doc}"
    pv_json+=",\"InitialInstanceCount\":${POOL_INSTANCE_COUNT:-1}"
    pv_json+=",\"VariantInstanceProvisionTimeoutInSeconds\":${POOL_TIMEOUT:-1200}"
    pv_json+=",\"RoutingConfig\":{\"RoutingStrategy\":\"LEAST_OUTSTANDING_REQUESTS\"}"

    if [ -n "${INFERENCE_AMI_VERSION:-}" ]; then
      pv_json+=",\"InferenceAmiVersion\":\"${INFERENCE_AMI_VERSION}\""
      echo " AMI version: ${INFERENCE_AMI_VERSION}"
    fi

    pv_json+="}]"
  else
    # Standard path: a single instance type.
    pv_json="[{\"VariantName\":\"AllTraffic\",\"InstanceType\":\"${INSTANCE_TYPE}\",\"InitialInstanceCount\":1"

    if [ -n "${INFERENCE_AMI_VERSION:-}" ]; then
      pv_json+=",\"InferenceAmiVersion\":\"${INFERENCE_AMI_VERSION}\""
      echo " AMI version: ${INFERENCE_AMI_VERSION}"
    fi

    # Capacity reservation only applies on the single-type path.
    if [ -n "${CAPACITY_RESERVATION_ARN:-}" ]; then
      pv_json+=",\"CapacityReservationConfig\":{\"CapacityReservationPreference\":\"capacity-reservations-only\",\"MlReservationArn\":\"${CAPACITY_RESERVATION_ARN}\"}"
      echo " ⚠️ Capacity reservation (experimental): ${CAPACITY_RESERVATION_ARN}"
    fi

    pv_json+="}]"
  fi

  # Assemble the CLI invocation as an argv array (no eval, no word-splitting).
  local -a create_args=(
    aws sagemaker create-endpoint-config
    --endpoint-config-name "${ENDPOINT_CONFIG_NAME}"
  )

  # The IC-based real-time flow passes the execution role here; the
  # model-based async flow (MODEL_NAME_SM set) attaches the role elsewhere.
  if [ -n "${ROLE_ARN:-}" ] && [ -z "${MODEL_NAME_SM:-}" ]; then
    create_args+=(--execution-role-arn "${ROLE_ARN}")
  fi

  create_args+=(--production-variants "${pv_json}")

  if [ -n "${ASYNC_INFERENCE_CONFIG:-}" ]; then
    create_args+=(--async-inference-config "${ASYNC_INFERENCE_CONFIG}")
  fi

  create_args+=(--region "${AWS_REGION}")

  echo "⚙️ Creating endpoint configuration: ${ENDPOINT_CONFIG_NAME}"
  if ! "${create_args[@]}"; then
    echo "❌ Failed to create endpoint configuration"
    echo " Check that:"
    if [ -n "${ROLE_ARN:-}" ] && [ -z "${MODEL_NAME_SM:-}" ]; then
      echo " • The execution role ARN is valid"
    fi
    if [ -n "${INSTANCE_POOLS:-}" ]; then
      echo " • The instance pool types are valid and available in region: ${AWS_REGION}"
      echo " • You have sufficient service quota for the pool instance types"
    else
      echo " • The instance type is valid: ${INSTANCE_TYPE}"
      echo " • The instance type is available in region: ${AWS_REGION}"
      echo " • You have sufficient service quota for the instance type"
    fi
    if [ -n "${ASYNC_INFERENCE_CONFIG:-}" ]; then
      echo " • The async inference config is valid JSON"
      echo " • The S3 output path and SNS topics are accessible"
    fi
    exit 4
  fi

  echo "✅ Endpoint configuration created: ${ENDPOINT_CONFIG_NAME}"
}
|
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
#
|
|
4
|
+
# Shared helper: create SageMaker inference components.
|
|
5
|
+
# Sourced by do/deploy — expects the following to be set by the caller:
|
|
6
|
+
# PROJECT_NAME, ENDPOINT_NAME, ECR_REPOSITORY, AWS_REGION, CONTAINER_ENV_JSON
|
|
7
|
+
# Also expects _update_config_var() to be available (from wait.sh).
|
|
8
|
+
|
|
9
|
+
# create_inference_component <ic_config_file>
|
|
10
|
+
# Creates an inference component from a per-IC config file.
|
|
11
|
+
#
|
|
12
|
+
# The config file is sourced and should export:
|
|
13
|
+
# IC_IMAGE_TAG — container image tag (default: ${PROJECT_NAME}-latest)
|
|
14
|
+
# IC_GPU_COUNT — number of accelerator devices (default: 1)
|
|
15
|
+
# IC_COPY_COUNT — number of IC copies (default: 1)
|
|
16
|
+
# IC_MIN_MEMORY_MB — minimum memory in MB (default: 1024)
|
|
17
|
+
# IC_STARTUP_TIMEOUT — container startup health check timeout in seconds (default: 900)
|
|
18
|
+
# IC_CONTAINER_ENV_EXTRA — optional extra env vars in "KEY":"value" format
|
|
19
|
+
#
|
|
20
|
+
# Multi-spec support (for heterogeneous instance pools):
|
|
21
|
+
# IC_MULTI_SPEC — set to "true" to use Specifications (plural) array
|
|
22
|
+
# IC_SPEC_COUNT — number of spec entries (e.g., 2)
|
|
23
|
+
# IC_SPEC_N_INSTANCE_TYPE — instance type for spec entry N
|
|
24
|
+
# IC_SPEC_N_GPU_COUNT — GPU count for spec entry N
|
|
25
|
+
# IC_SPEC_N_MIN_MEMORY_MB — minimum memory for spec entry N
|
|
26
|
+
#
|
|
27
|
+
# Sets IC_DEPLOYED_NAME in the caller's scope (for use by wait_ic).
|
|
28
|
+
# Persists IC_DEPLOYED_NAME and IC_DEPLOYED_AT back to the IC config file.
|
|
29
|
+
# Echoes the IC name as return value.
|
|
30
|
+
create_inference_component() {
  local ic_conf="$1"

  if [ ! -f "${ic_conf}" ]; then
    echo "❌ IC config file not found: ${ic_conf}"
    exit 4
  fi

  # Source the IC config to get per-IC settings (IC_IMAGE_TAG, IC_GPU_COUNT,
  # IC_COPY_COUNT, IC_MIN_MEMORY_MB, IC_STARTUP_TIMEOUT, IC_CONTAINER_ENV_EXTRA,
  # and the IC_MULTI_SPEC / IC_SPEC_* family).
  # NOTE(review): sourcing executes arbitrary shell from the config file and
  # leaks its variables into the caller's scope — confirm configs are trusted.
  source "${ic_conf}"

  # Unique, traceable IC name: <project>-<config basename>-<epoch seconds>.
  local ic_timestamp
  ic_timestamp=$(date +%s)
  local ic_basename
  ic_basename=$(basename "${ic_conf}" .conf)
  local ic_name="${PROJECT_NAME}-${ic_basename}-${ic_timestamp}"

  # Build container spec JSON
  local container_spec="{\"Image\":\"${ECR_REPOSITORY}:${IC_IMAGE_TAG:-${PROJECT_NAME}-latest}\""
  # Merge the shared env (CONTAINER_ENV_JSON) with optional per-IC extras;
  # ${env_json:+...,} inserts the separating comma only when both parts exist.
  if [ -n "${CONTAINER_ENV_JSON}${IC_CONTAINER_ENV_EXTRA:-}" ]; then
    local env_json="${CONTAINER_ENV_JSON}"
    [ -n "${IC_CONTAINER_ENV_EXTRA:-}" ] && env_json="${env_json:+${env_json},}${IC_CONTAINER_ENV_EXTRA}"
    container_spec="${container_spec},\"Environment\":{${env_json}}"
  fi
  container_spec="${container_spec}}"

  # Build specification JSON — multi-spec (Specifications array) or single (Specification object)
  local spec_json
  if [ "${IC_MULTI_SPEC:-false}" = "true" ] && [ "${IC_SPEC_COUNT:-0}" -gt 0 ]; then
    # Multi-spec: build Specifications array with per-instance-type compute resources
    spec_json="{\"Specifications\":["
    local i=1
    while [ "${i}" -le "${IC_SPEC_COUNT}" ]; do
      # Indirect expansion (${!var}) reads the IC_SPEC_<i>_* variables that
      # the sourced config file exported.
      local spec_instance_type_var="IC_SPEC_${i}_INSTANCE_TYPE"
      local spec_gpu_count_var="IC_SPEC_${i}_GPU_COUNT"
      local spec_min_memory_var="IC_SPEC_${i}_MIN_MEMORY_MB"

      local spec_instance_type="${!spec_instance_type_var}"
      local spec_gpu_count="${!spec_gpu_count_var:-1}"
      local spec_min_memory="${!spec_min_memory_var:-1024}"

      # Comma-separate entries after the first.
      if [ "${i}" -gt 1 ]; then
        spec_json="${spec_json},"
      fi
      spec_json="${spec_json}{\"Container\":${container_spec},\"StartupParameters\":{\"ContainerStartupHealthCheckTimeoutInSeconds\":${IC_STARTUP_TIMEOUT:-900}},\"ComputeResourceRequirements\":{\"NumberOfAcceleratorDevicesRequired\":${spec_gpu_count},\"MinMemoryRequiredInMb\":${spec_min_memory}}}"

      i=$((i + 1))
    done
    spec_json="${spec_json}]}"
  else
    # Single spec: standard Specification object (existing behavior)
    spec_json="{\"Container\":${container_spec},\"StartupParameters\":{\"ContainerStartupHealthCheckTimeoutInSeconds\":${IC_STARTUP_TIMEOUT:-900}},\"ComputeResourceRequirements\":{\"NumberOfAcceleratorDevicesRequired\":${IC_GPU_COUNT:-1},\"MinMemoryRequiredInMb\":${IC_MIN_MEMORY_MB:-1024}}}"
  fi

  echo "📦 Creating inference component: ${ic_name}"
  if ! aws sagemaker create-inference-component \
    --inference-component-name "${ic_name}" \
    --endpoint-name "${ENDPOINT_NAME}" \
    --variant-name "AllTraffic" \
    --specification "${spec_json}" \
    --runtime-config "{\"CopyCount\": ${IC_COPY_COUNT:-1}}" \
    --region "${AWS_REGION}"; then

    echo "❌ Failed to create inference component: ${ic_name}"
    echo " Check that:"
    echo " • The endpoint is InService: ${ENDPOINT_NAME}"
    echo " • The container image exists: ${ECR_REPOSITORY}:${IC_IMAGE_TAG:-${PROJECT_NAME}-latest}"
    echo " • GPU count (${IC_GPU_COUNT:-1}) does not exceed instance capacity"
    echo " • You have sufficient permissions for sagemaker:CreateInferenceComponent"
    exit 4
  fi

  # Persist deployed name and timestamp back to IC config
  IC_DEPLOYED_NAME="${ic_name}"
  IC_DEPLOYED_AT="${ic_timestamp}"
  _update_config_var "IC_DEPLOYED_NAME" "${ic_name}" "${ic_conf}"
  _update_config_var "IC_DEPLOYED_AT" "${ic_timestamp}" "${ic_conf}"

  echo "✅ Inference component created: ${ic_name}"
  # NOTE(review): this function prints progress lines AND the IC name on
  # stdout; a caller using $(create_inference_component ...) would capture all
  # of them — confirm callers read IC_DEPLOYED_NAME rather than capturing.
  echo "${ic_name}"
}
|
|
111
|
+
|
|
112
|
+
# create_inference_component_legacy()
|
|
113
|
+
# Backward-compatible IC creation for projects without do/ic/ directory.
|
|
114
|
+
# Reads IC_GPU_COUNT from do/config (already sourced) and IMAGE_TAG from caller scope.
|
|
115
|
+
# Uses the same endpoint and container env as the multi-IC path.
|
|
116
|
+
#
|
|
117
|
+
# Sets IC_DEPLOYED_NAME in the caller's scope (for use by wait_ic).
|
|
118
|
+
# Persists INFERENCE_COMPONENT_NAME to do/config.
|
|
119
|
+
create_inference_component_legacy() {
  # Epoch-stamped name so repeated deploys never collide.
  local ic_timestamp
  ic_timestamp=$(date +%s)
  local ic_name="${PROJECT_NAME}-ic-${ic_timestamp}"

  # Build container spec JSON (uses IMAGE_TAG from caller scope)
  local container_spec="{\"Image\":\"${ECR_REPOSITORY}:${IMAGE_TAG}\""
  if [ -n "${CONTAINER_ENV_JSON}" ]; then
    container_spec="${container_spec},\"Environment\":{${CONTAINER_ENV_JSON}}"
  fi
  container_spec="${container_spec}}"

  echo "📦 Creating inference component: ${ic_name}"
  # Fixed single-spec shape: 900 s startup timeout, 1024 MB minimum memory,
  # one copy. Only IC_GPU_COUNT is configurable here (from do/config, default 1).
  if ! aws sagemaker create-inference-component \
    --inference-component-name "${ic_name}" \
    --endpoint-name "${ENDPOINT_NAME}" \
    --variant-name "AllTraffic" \
    --specification "{
      \"Container\": ${container_spec},
      \"StartupParameters\": {
        \"ContainerStartupHealthCheckTimeoutInSeconds\": 900
      },
      \"ComputeResourceRequirements\": {
        \"NumberOfAcceleratorDevicesRequired\": ${IC_GPU_COUNT:-1},
        \"MinMemoryRequiredInMb\": 1024
      }
    }" \
    --runtime-config "{\"CopyCount\": 1}" \
    --region "${AWS_REGION}"; then

    echo "❌ Failed to create inference component: ${ic_name}"
    echo " Check that:"
    echo " • The endpoint is InService: ${ENDPOINT_NAME}"
    echo " • The container image exists: ${ECR_REPOSITORY}:${IMAGE_TAG}"
    echo " • GPU count (${IC_GPU_COUNT:-1}) does not exceed instance capacity"
    echo " • You have sufficient permissions for sagemaker:CreateInferenceComponent"
    exit 4
  fi

  # Set in caller's scope for wait_ic
  IC_DEPLOYED_NAME="${ic_name}"
  IC_DEPLOYED_AT="${ic_timestamp}"

  # Persist to do/config for legacy compatibility
  # (no third argument → _update_config_var defaults to ${SCRIPT_DIR}/config).
  _update_config_var "INFERENCE_COMPONENT_NAME" "${ic_name}"
  _update_config_var "IC_DEPLOYED_AT" "${ic_timestamp}"

  echo "✅ Inference component created: ${ic_name}"
}
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
#
|
|
4
|
+
# Shared helper: resolve container secrets from Secrets Manager or direct values.
|
|
5
|
+
# Sourced by do/deploy — expects AWS_REGION to be set by the caller.
|
|
6
|
+
|
|
7
|
+
# resolve_secrets()
|
|
8
|
+
# Resolves HF_TOKEN and NGC_API_KEY from either:
|
|
9
|
+
# - AWS Secrets Manager (when *_ARN variables are set)
|
|
10
|
+
# - Direct values (when the plain variables are set)
|
|
11
|
+
# Sets the global CONTAINER_ENV_JSON variable with comma-separated "KEY":"value" pairs.
|
|
12
|
+
resolve_secrets() {
  CONTAINER_ENV_JSON=""

  # HuggingFace token: a Secrets Manager ARN takes precedence over a direct value.
  if [ -n "${HF_TOKEN_ARN:-}" ]; then
    echo "🔐 Resolving HuggingFace token from Secrets Manager..."
    RESOLVED_HF_TOKEN=$(aws secretsmanager get-secret-value --secret-id "${HF_TOKEN_ARN}" --query SecretString --output text --region "${AWS_REGION}") || {
      echo "❌ Failed to resolve HuggingFace token from Secrets Manager"
      exit 3
    }
    CONTAINER_ENV_JSON="\"HF_TOKEN\":\"${RESOLVED_HF_TOKEN}\""
  elif [ -n "${HF_TOKEN:-}" ]; then
    CONTAINER_ENV_JSON="\"HF_TOKEN\":\"${HF_TOKEN}\""
  fi

  # NGC API key: same precedence. The ${CONTAINER_ENV_JSON:+...,} expansion
  # appends a separating comma only when an HF_TOKEN entry is already present.
  if [ -n "${NGC_API_KEY_ARN:-}" ]; then
    echo "🔐 Resolving NGC API key from Secrets Manager..."
    RESOLVED_NGC_KEY=$(aws secretsmanager get-secret-value --secret-id "${NGC_API_KEY_ARN}" --query SecretString --output text --region "${AWS_REGION}") || {
      echo "❌ Failed to resolve NGC API key from Secrets Manager"
      exit 3
    }
    CONTAINER_ENV_JSON="${CONTAINER_ENV_JSON:+${CONTAINER_ENV_JSON},}\"NGC_API_KEY\":\"${RESOLVED_NGC_KEY}\""
  elif [ -n "${NGC_API_KEY:-}" ]; then
    CONTAINER_ENV_JSON="${CONTAINER_ENV_JSON:+${CONTAINER_ENV_JSON},}\"NGC_API_KEY\":\"${NGC_API_KEY}\""
  fi
}
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
#
|
|
4
|
+
# Shared helper: wait/polling functions and config persistence utilities.
|
|
5
|
+
# Sourced by do/deploy — expects AWS_REGION and SCRIPT_DIR to be set by the caller.
|
|
6
|
+
|
|
7
|
+
# _update_config_var <var_name> <var_value> [config_file]
|
|
8
|
+
# Persist a variable to a config file so other scripts can use it.
|
|
9
|
+
# If the variable already exists, update it in place; otherwise append.
|
|
10
|
+
# Defaults to ${SCRIPT_DIR}/config if no config_file is specified.
|
|
11
|
+
# _update_config_var <var_name> <var_value> [config_file]
# Persist a variable to a config file so other scripts can use it.
# If the variable already exists, update it in place; otherwise append.
# Defaults to ${SCRIPT_DIR}/config if no config_file is specified.
_update_config_var() {
  local var_name="$1" var_value="$2" config_file="${3:-${SCRIPT_DIR}/config}"
  if grep -q "^export ${var_name}=" "${config_file}" 2>/dev/null; then
    # Fix: escape characters that are special in the sed replacement text.
    # Without this, a value containing '&' (expands to the matched text),
    # '|' (the s||| delimiter used below), or '\' corrupts the config file.
    # Backslashes must be doubled first, before introducing new ones.
    local escaped_value="${var_value//\\/\\\\}"
    escaped_value="${escaped_value//&/\\&}"
    escaped_value="${escaped_value//|/\\|}"
    # '|' delimiter avoids clashing with '/' which is common in ARNs/paths;
    # -i.bak + rm keeps in-place editing portable across GNU and BSD sed.
    sed -i.bak "s|^export ${var_name}=.*|export ${var_name}=\"${escaped_value}\"|" "${config_file}"
    rm -f "${config_file}.bak"
  else
    # Append path: plain echo, no sed involved, so no escaping needed.
    echo "" >> "${config_file}"
    echo "export ${var_name}=\"${var_value}\"" >> "${config_file}"
  fi
}
|
|
21
|
+
|
|
22
|
+
# _get_endpoint_status <endpoint_name>
|
|
23
|
+
# Query a SageMaker endpoint status. Returns empty string if not found.
|
|
24
|
+
_get_endpoint_status() {
  # $1 is the endpoint name. stderr is suppressed so a missing endpoint
  # degrades to an empty string instead of surfacing an AWS CLI error.
  local ep_name="$1"
  aws sagemaker describe-endpoint \
    --endpoint-name "${ep_name}" \
    --region "${AWS_REGION}" \
    --query EndpointStatus \
    --output text 2>/dev/null || echo ""
}
|
|
31
|
+
|
|
32
|
+
# _get_ic_status <inference_component_name>
|
|
33
|
+
# Query a SageMaker inference component status. Returns empty string if not found.
|
|
34
|
+
_get_ic_status() {
  # $1 is the inference component name. Errors (e.g. component not found)
  # collapse to an empty string for the caller to branch on.
  local ic="$1"
  aws sagemaker describe-inference-component \
    --inference-component-name "${ic}" \
    --region "${AWS_REGION}" \
    --query InferenceComponentStatus \
    --output text 2>/dev/null || echo ""
}
|
|
41
|
+
|
|
42
|
+
# _find_active_ic_on_endpoint <endpoint_name>
|
|
43
|
+
# Find an InService inference component on an endpoint.
|
|
44
|
+
# Returns the first match or empty string.
|
|
45
|
+
_find_active_ic_on_endpoint() {
  # $1 is the endpoint name. Filter server-side to InService components and
  # take the first name; failures collapse to an empty string.
  local ep="$1"
  aws sagemaker list-inference-components \
    --endpoint-name "${ep}" \
    --status-equals InService \
    --region "${AWS_REGION}" \
    --query 'InferenceComponents[0].InferenceComponentName' \
    --output text 2>/dev/null || echo ""
}
|
|
53
|
+
|
|
54
|
+
# wait_endpoint <endpoint_name>
|
|
55
|
+
# Wait for a SageMaker endpoint to reach InService status.
|
|
56
|
+
# Detects credential expiry vs actual failure and exits with code 4 on error.
|
|
57
|
+
wait_endpoint() {
  local ep="$1"

  # Happy path: the built-in waiter returns as soon as the endpoint is InService.
  if aws sagemaker wait endpoint-in-service \
      --endpoint-name "${ep}" \
      --region "${AWS_REGION}"; then
    return 0
  fi

  # The waiter gave up — distinguish an interrupted wait (endpoint still
  # Creating, typically expired credentials) from a real provisioning failure.
  local current
  current=$(_get_endpoint_status "${ep}" 2>/dev/null)
  if [ "${current}" = "Creating" ]; then
    echo ""
    echo "⚠️ Wait interrupted (credentials may have expired), but endpoint is still creating."
    echo " Refresh your credentials and re-run ./do/deploy to resume."
    exit 4
  fi

  echo "❌ Endpoint failed to reach InService status"
  echo " Check CloudWatch Logs for details:"
  echo " https://console.aws.amazon.com/cloudwatch/home?region=${AWS_REGION}#logsV2:log-groups/log-group//aws/sagemaker/Endpoints/${ep}"
  exit 4
}
|
|
80
|
+
|
|
81
|
+
# wait_ic <ic_name> [timeout]
|
|
82
|
+
# Poll an inference component until it reaches InService or fails.
|
|
83
|
+
# Default timeout is 1800 seconds (30 minutes).
|
|
84
|
+
# Reports status every 30 seconds. Detects credential expiry.
|
|
85
|
+
# Exits with code 4 on failure or timeout.
|
|
86
|
+
wait_ic() {
  local ic_name="$1"
  local timeout="${2:-1800}"
  local wait_start
  wait_start=$(date +%s)

  while true; do
    local ic_status
    ic_status=$(_get_ic_status "${ic_name}" 2>/dev/null)

    case "${ic_status}" in
      InService)
        break
        ;;
      Failed)
        echo "❌ Inference component failed to reach InService status"
        echo " Check CloudWatch Logs for details:"
        echo " https://console.aws.amazon.com/cloudwatch/home?region=${AWS_REGION}#logsV2:log-groups/log-group//aws/sagemaker/Endpoints/${ENDPOINT_NAME:-unknown}"
        echo ""
        echo " Debug:"
        echo " aws sagemaker describe-inference-component --inference-component-name ${ic_name} --region ${AWS_REGION}"
        exit 4
        ;;
      "")
        # Empty status: describe call failed — most likely expired credentials.
        echo "⚠️ Could not determine inference component status (credentials may have expired)."
        echo " Re-run ./do/deploy to resume."
        exit 4
        ;;
      Creating)
        local elapsed=$(( $(date +%s) - wait_start ))
        if [ "${elapsed}" -ge "${timeout}" ]; then
          echo ""
          echo "⚠️ Inference component still creating after ${timeout}s."
          echo " Re-run ./do/deploy to resume waiting."
          exit 4
        fi
        echo " $(date +%H:%M:%S) Status: Creating (${elapsed}s elapsed)..."
        sleep 30
        ;;
      *)
        # Fix: any other transitional status (e.g. Updating) previously looped
        # forever — the timeout was only enforced in the Creating arm. Apply
        # the same deadline to every non-terminal status.
        local elapsed=$(( $(date +%s) - wait_start ))
        if [ "${elapsed}" -ge "${timeout}" ]; then
          echo ""
          echo "⚠️ Inference component still ${ic_status} after ${timeout}s."
          echo " Re-run ./do/deploy to resume waiting."
          exit 4
        fi
        echo " $(date +%H:%M:%S) Status: ${ic_status}..."
        sleep 30
        ;;
    esac
  done
}
|