@aws/ml-container-creator 0.10.0 → 0.10.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71) hide show
  1. package/LICENSE-THIRD-PARTY +9304 -0
  2. package/bin/cli.js +2 -0
  3. package/config/bootstrap-e2e-stack.json +341 -0
  4. package/config/bootstrap-stack.json +40 -3
  5. package/config/parameter-schema-v2.json +5 -21
  6. package/config/tune-catalog.json +1781 -0
  7. package/infra/ci-harness/buildspec.yml +1 -0
  8. package/infra/ci-harness/lambda/path-prover/brain.ts +306 -0
  9. package/infra/ci-harness/lambda/path-prover/write-results.ts +152 -0
  10. package/infra/ci-harness/lib/ci-harness-stack.ts +837 -7
  11. package/infra/ci-harness/state-machines/path-prover.asl.json +496 -0
  12. package/package.json +51 -66
  13. package/servers/base-image-picker/index.js +121 -121
  14. package/servers/e2e-status/index.js +297 -0
  15. package/servers/e2e-status/manifest.json +14 -0
  16. package/servers/e2e-status/package.json +15 -0
  17. package/servers/endpoint-picker/LICENSE +202 -0
  18. package/servers/endpoint-picker/index.js +536 -0
  19. package/servers/endpoint-picker/manifest.json +14 -0
  20. package/servers/endpoint-picker/package.json +18 -0
  21. package/servers/hyperpod-cluster-picker/index.js +125 -125
  22. package/servers/instance-sizer/index.js +138 -138
  23. package/servers/instance-sizer/lib/instance-ranker.js +76 -76
  24. package/servers/instance-sizer/lib/model-resolver.js +61 -61
  25. package/servers/instance-sizer/lib/quota-resolver.js +113 -113
  26. package/servers/instance-sizer/lib/vram-estimator.js +31 -31
  27. package/servers/lib/bedrock-client.js +38 -38
  28. package/servers/lib/catalogs/model-servers.json +201 -3
  29. package/servers/lib/custom-validators.js +13 -13
  30. package/servers/lib/dynamic-resolver.js +4 -4
  31. package/servers/marketplace-picker/index.js +342 -0
  32. package/servers/marketplace-picker/manifest.json +14 -0
  33. package/servers/marketplace-picker/package.json +18 -0
  34. package/servers/model-picker/index.js +382 -382
  35. package/servers/region-picker/index.js +56 -56
  36. package/servers/workload-picker/LICENSE +202 -0
  37. package/servers/workload-picker/catalogs/workload-profiles.json +67 -0
  38. package/servers/workload-picker/index.js +171 -0
  39. package/servers/workload-picker/manifest.json +16 -0
  40. package/servers/workload-picker/package.json +16 -0
  41. package/src/app.js +4 -2
  42. package/src/lib/bootstrap-command-handler.js +579 -14
  43. package/src/lib/bootstrap-config.js +36 -0
  44. package/src/lib/bootstrap-profile-manager.js +48 -41
  45. package/src/lib/ci-register-helpers.js +74 -0
  46. package/src/lib/config-loader.js +3 -0
  47. package/src/lib/config-manager.js +7 -0
  48. package/src/lib/cuda-resolver.js +17 -8
  49. package/src/lib/generated/cli-options.js +315 -315
  50. package/src/lib/generated/parameter-matrix.js +661 -661
  51. package/src/lib/generated/validation-rules.js +71 -71
  52. package/src/lib/path-prover-brain.js +607 -0
  53. package/src/lib/prompts/project-prompts.js +12 -0
  54. package/src/lib/template-variable-resolver.js +25 -1
  55. package/src/lib/tune-catalog-validator.js +37 -4
  56. package/templates/Dockerfile +9 -0
  57. package/templates/code/adapter_sidecar.py +444 -0
  58. package/templates/code/serve +6 -0
  59. package/templates/code/serve.d/vllm.ejs +1 -1
  60. package/templates/do/.benchmark_writer.py +1476 -0
  61. package/templates/do/.tune_helper.py +982 -57
  62. package/templates/do/__pycache__/.benchmark_writer.cpython-312.pyc +0 -0
  63. package/templates/do/adapter +149 -0
  64. package/templates/do/benchmark +639 -85
  65. package/templates/do/config +108 -5
  66. package/templates/do/deploy.d/managed-inference.ejs +192 -11
  67. package/templates/do/optimize +106 -37
  68. package/templates/do/register +89 -0
  69. package/templates/do/test +13 -0
  70. package/templates/do/tune +378 -59
  71. package/templates/do/validate +44 -4
@@ -1,6 +1,7 @@
1
1
  #!/bin/bash
2
2
  # do-framework configuration
3
3
  # This file is sourced by all do scripts
4
+ # Generated: <%= new Date().toISOString() %>
4
5
 
5
6
  # Project identification
6
7
  export PROJECT_NAME="<%= projectName %>"
@@ -10,16 +11,20 @@ export DEPLOYMENT_CONFIG="<%= deploymentConfig %>"
10
11
  export FRAMEWORK="<%= framework %>"
11
12
  export MODEL_SERVER="<%= modelServer %>"
12
13
 
14
+ # AWS configuration
15
+ export AWS_REGION=${AWS_REGION:-<%= awsRegion %>}
16
+ export ECR_REPOSITORY_NAME="ml-container-creator"
17
+
13
18
  <% if (typeof enableLora !== 'undefined' && enableLora) { %>
14
19
  # LoRA adapter serving
15
20
  export ENABLE_LORA=true
16
21
  export ADAPTER_S3_BUCKET="mlcc-adapters-$(aws sts get-caller-identity --query Account --output text 2>/dev/null || echo 'UNKNOWN')-${AWS_REGION}"
22
+ <% } else if (framework === 'transformers' || framework === 'diffusors') { %>
23
+ # LoRA adapter serving (uncomment to enable)
24
+ # export ENABLE_LORA=true
25
+ # export ADAPTER_S3_BUCKET="mlcc-adapters-$(aws sts get-caller-identity --query Account --output text 2>/dev/null || echo 'UNKNOWN')-${AWS_REGION}"
17
26
  <% } %>
18
27
 
19
- # AWS configuration
20
- export AWS_REGION="<%= awsRegion %>"
21
- export ECR_REPOSITORY_NAME="ml-container-creator"
22
-
23
28
  # Build configuration — WHERE the Docker image gets built
24
29
  export BUILD_TARGET="<%= buildTarget %>"
25
30
  <% if (buildTarget === 'codebuild') { %>
@@ -42,14 +47,27 @@ export INSTANCE_TYPE="<%= instanceType %>"
42
47
  # Instance pools: heterogeneous instance types with priority-based fallback
43
48
  # Priority = selection order (1 = preferred, higher = fallback)
44
49
  export INSTANCE_POOLS='<%= JSON.stringify(instancePools) %>'
50
+ <% } else { %>
51
+ # Instance pools: heterogeneous instance types with priority-based fallback (uncomment to enable)
52
+ # Format: [{"InstanceType":"ml.g6e.48xlarge","Priority":1},{"InstanceType":"ml.g5.48xlarge","Priority":2}]
53
+ # export INSTANCE_POOLS='[]'
45
54
  <% } %>
46
55
  <% if (inferenceAmiVersion) { %>
47
56
  export INFERENCE_AMI_VERSION="<%= inferenceAmiVersion %>"
57
+ <% } else { %>
58
+ # Inference AMI version — auto-resolved from CUDA version (uncomment to override)
59
+ # Valid: al2-ami-sagemaker-inference-gpu-2, al2-ami-sagemaker-inference-gpu-2-1,
60
+ # al2-ami-sagemaker-inference-gpu-3-1, al2023-ami-sagemaker-inference-gpu-4-1
61
+ # export INFERENCE_AMI_VERSION=""
48
62
  <% } %>
49
63
  <% if (typeof capacityReservationArn !== 'undefined' && capacityReservationArn) { %>
50
64
  # Note: Capacity reservations and instance pools (INSTANCE_POOLS) are mutually exclusive.
51
65
  # If both are set, the capacity reservation takes precedence and INSTANCE_POOLS is ignored.
52
66
  export CAPACITY_RESERVATION_ARN="<%= capacityReservationArn %>"
67
+ <% } else { %>
68
+ # Capacity reservation (uncomment to use reserved capacity)
69
+ # Note: Mutually exclusive with INSTANCE_POOLS — reservation takes precedence.
70
+ # export CAPACITY_RESERVATION_ARN=""
53
71
  <% } %>
54
72
  <% } %>
55
73
  <% } %>
@@ -59,6 +77,9 @@ export CAPACITY_RESERVATION_ARN="<%= capacityReservationArn %>"
59
77
  export INSTANCE_TYPE="<%= instanceType %>"
60
78
  <% if (inferenceAmiVersion) { %>
61
79
  export INFERENCE_AMI_VERSION="<%= inferenceAmiVersion %>"
80
+ <% } else { %>
81
+ # Inference AMI version — auto-resolved from CUDA version (uncomment to override)
82
+ # export INFERENCE_AMI_VERSION=""
62
83
  <% } %>
63
84
 
64
85
  # Async-specific configuration
@@ -85,6 +106,9 @@ export ASYNC_SNS_ERROR_TOPIC="arn:aws:sns:${AWS_REGION}:${ACCOUNT_ID}:ml-contain
85
106
 
86
107
  <% if (asyncMaxConcurrentInvocations) { %>
87
108
  export ASYNC_MAX_CONCURRENT_INVOCATIONS="<%= asyncMaxConcurrentInvocations %>"
109
+ <% } else { %>
110
+ # Max concurrent invocations per instance (uncomment to set)
111
+ # export ASYNC_MAX_CONCURRENT_INVOCATIONS=""
88
112
  <% } %>
89
113
  <% } %>
90
114
 
@@ -95,6 +119,9 @@ export HYPERPOD_NAMESPACE="<%= hyperPodNamespace %>"
95
119
  export HYPERPOD_REPLICAS="<%= hyperPodReplicas %>"
96
120
  <% if (fsxVolumeHandle) { %>
97
121
  export FSX_VOLUME_HANDLE="<%= fsxVolumeHandle %>"
122
+ <% } else { %>
123
+ # FSx for Lustre volume for shared model storage (uncomment to enable)
124
+ # export FSX_VOLUME_HANDLE=""
98
125
  <% } %>
99
126
  <% } %>
100
127
 
@@ -121,9 +148,15 @@ export BATCH_STRATEGY="<%= batchStrategy %>"
121
148
  export BATCH_JOIN_SOURCE="<%= batchJoinSource || 'None' %>"
122
149
  <% if (batchMaxConcurrentTransforms) { %>
123
150
  export BATCH_MAX_CONCURRENT_TRANSFORMS="<%= batchMaxConcurrentTransforms %>"
151
+ <% } else { %>
152
+ # Max concurrent transforms per instance (uncomment to set)
153
+ # export BATCH_MAX_CONCURRENT_TRANSFORMS=""
124
154
  <% } %>
125
155
  <% if (batchMaxPayloadInMB) { %>
126
156
  export BATCH_MAX_PAYLOAD_IN_MB="<%= batchMaxPayloadInMB %>"
157
+ <% } else { %>
158
+ # Max payload size in MB (uncomment to set, default: 6)
159
+ # export BATCH_MAX_PAYLOAD_IN_MB=""
127
160
  <% } %>
128
161
  <% } %>
129
162
 
@@ -140,6 +173,22 @@ export ENDPOINT_VARIANT_NAME="<%= endpointVariantName %>"
140
173
  export ENDPOINT_VOLUME_SIZE="<%= endpointVolumeSize %>"
141
174
  <% } %>
142
175
 
176
+ <% if (deploymentTarget === 'realtime-inference' || deploymentTarget === 'async-inference') { %>
177
+ # ─── Endpoint overrides (uncomment to customize) ───────────────────────────────
178
+ <% if (typeof endpointInitialInstanceCount === 'undefined' || endpointInitialInstanceCount == null) { %>
179
+ # export ENDPOINT_INITIAL_INSTANCE_COUNT="1" # Number of instances for the endpoint
180
+ <% } %>
181
+ <% if (typeof endpointDataCapturePercent === 'undefined' || endpointDataCapturePercent == null) { %>
182
+ # export ENDPOINT_DATA_CAPTURE_PERCENT="" # Percentage of requests to capture (0-100)
183
+ <% } %>
184
+ <% if (typeof endpointVariantName === 'undefined' || endpointVariantName == null) { %>
185
+ # export ENDPOINT_VARIANT_NAME="" # Custom variant name (default: AllTraffic)
186
+ <% } %>
187
+ <% if (typeof endpointVolumeSize === 'undefined' || endpointVolumeSize == null) { %>
188
+ # export ENDPOINT_VOLUME_SIZE="" # EBS volume size in GB for model download
189
+ <% } %>
190
+ <% } %>
191
+
143
192
  <% if (typeof icCpuCount !== 'undefined' && icCpuCount != null) { %>
144
193
  export IC_CPU_COUNT="<%= icCpuCount %>"
145
194
  <% } %>
@@ -158,6 +207,22 @@ export IC_COPY_COUNT="<%= icCopyCount %>"
158
207
  export IC_MODEL_WEIGHT="<%= icModelWeight %>"
159
208
  <% } %>
160
209
 
210
+ <% if (deploymentTarget === 'realtime-inference' || deploymentTarget === 'async-inference') { %>
211
+ # ─── Inference Component overrides (uncomment to customize) ────────────────────
212
+ <% if (typeof icCpuCount === 'undefined' || icCpuCount == null) { %>
213
+ # export IC_CPU_COUNT="" # CPU cores reserved for this IC
214
+ <% } %>
215
+ <% if (typeof icMemorySize === 'undefined' || icMemorySize == null) { %>
216
+ # export IC_MEMORY_SIZE="" # Memory in MB reserved for this IC
217
+ <% } %>
218
+ <% if (typeof icCopyCount === 'undefined' || icCopyCount == null) { %>
219
+ # export IC_COPY_COUNT="" # Number of model copies (multi-IC scaling)
220
+ <% } %>
221
+ <% if (typeof icModelWeight === 'undefined' || icModelWeight == null) { %>
222
+ # export IC_MODEL_WEIGHT="" # Traffic weight for this IC (0-100)
223
+ <% } %>
224
+ <% } %>
225
+
161
226
  <% if (typeof modelEnvVars !== 'undefined' && modelEnvVars && Object.keys(modelEnvVars).length > 0) { %>
162
227
  # Model environment variables
163
228
  <% Object.entries(modelEnvVars).forEach(([key, value]) => { %>
@@ -192,7 +257,22 @@ export NGC_API_KEY="<%= ngcApiKey %>"
192
257
  <% if (deploymentTarget !== 'batch-transform') { %>
193
258
  # Managed Model Customization (do/tune)
194
259
  export TUNE_SUPPORTED=<%= (typeof tuneSupported !== 'undefined' && tuneSupported) ? 'true' : 'false' %>
260
+ <% if (typeof tuneSupported !== 'undefined' && tuneSupported) { %>
261
+ <% if (typeof tuneModelId !== 'undefined' && tuneModelId) { %>
262
+ # SageMaker AI Managed Fine-Tuning — JumpStart Hub model ID
263
+ # Flow: JumpStart model (tune) → LoRA adapter (S3) → do/adapter add → vLLM
264
+ export TUNE_MODEL_ID="<%= tuneModelId %>"
265
+ <% } else { %>
266
+ # SageMaker AI Managed Fine-Tuning — JumpStart Hub model ID
267
+ # To find your model's Hub ID:
268
+ # aws sagemaker list-hub-contents --hub-name SageMakerPublicHub \
269
+ # --hub-content-type Model --query "HubContentSummaries[].HubContentName"
270
+ # export TUNE_MODEL_ID=""
271
+ <% } %>
272
+ <% } %>
195
273
  export TUNE_S3_BUCKET="mlcc-tune-$(aws sts get-caller-identity --query Account --output text 2>/dev/null || echo 'UNKNOWN')-${AWS_REGION}"
274
+ # MLflow App ARN for experiment tracking (set by bootstrap, or override manually)
275
+ # export MLFLOW_APP_ARN=""
196
276
  <% } %>
197
277
  <% } %>
198
278
 
@@ -210,10 +290,17 @@ export HF_TOKEN="<%= hfToken %>"
210
290
 
211
291
  <% if (modelFormat) { %>
212
292
  export MODEL_FORMAT="<%= modelFormat %>"
293
+ <% } else { %>
294
+ # Model format (uncomment if using quantized models)
295
+ # Valid: pkl, json, keras, safetensors, gguf, awq, gptq
296
+ # export MODEL_FORMAT=""
213
297
  <% } %>
214
298
 
215
299
  <% if (roleArn) { %>
216
300
  export ROLE_ARN="<%= roleArn %>"
301
+ <% } else { %>
302
+ # IAM execution role for SageMaker (uncomment to override bootstrap role)
303
+ # export ROLE_ARN=""
217
304
  <% } %>
218
305
 
219
306
  <% if (typeof includeBenchmark !== 'undefined' && includeBenchmark) { %>
@@ -234,6 +321,23 @@ export BENCHMARK_S3_OUTPUT_PATH="s3://mlcc-benchmark-$(aws sts get-caller-identi
234
321
  <% } %>
235
322
  export BENCHMARK_JOB_NAME=""
236
323
  export BENCHMARK_WORKLOAD_CONFIG_NAME=""
324
+
325
+ # CI Benchmark Athena persistence (set automatically from bootstrap --benchmark-infra)
326
+ <% if (typeof ciBenchmarkResultsBucket !== 'undefined' && ciBenchmarkResultsBucket) { %>
327
+ export CI_BENCHMARK_RESULTS_BUCKET="<%= ciBenchmarkResultsBucket %>"
328
+ <% } else { %>
329
+ # export CI_BENCHMARK_RESULTS_BUCKET="" # S3 bucket for Athena Parquet results (set by bootstrap --benchmark-infra)
330
+ <% } %>
331
+ <% } else if (framework === 'transformers' && deploymentTarget !== 'batch-transform') { %>
332
+ # ─── SageMaker AI Benchmarking (uncomment to enable) ──────────────────────────
333
+ # export BENCHMARK_CONCURRENCY="10" # Concurrent requests
334
+ # export BENCHMARK_INPUT_TOKENS_MEAN="550" # Mean input tokens per request
335
+ # export BENCHMARK_OUTPUT_TOKENS_MEAN="150" # Mean output tokens per request
336
+ # export BENCHMARK_STREAMING="true" # Enable streaming
337
+ # export BENCHMARK_REQUEST_COUNT="" # Total requests (empty = auto)
338
+ # export BENCHMARK_S3_OUTPUT_PATH="" # S3 path for results (empty = auto)
339
+ # export BENCHMARK_JOB_NAME="" # Resume/check existing job
340
+ # export BENCHMARK_WORKLOAD_CONFIG_NAME="" # Reuse existing workload config
237
341
  <% } %>
238
342
 
239
343
  <% if (orderedEnvVars && orderedEnvVars.length > 0) { %>
@@ -246,7 +350,6 @@ export <%= key %>=${<%= key %>:-<%= value %>}
246
350
  export BASE_IMAGE=${BASE_IMAGE:-<%= baseImage || '' %>}
247
351
 
248
352
  # Allow environment variable overrides
249
- export AWS_REGION=${AWS_REGION:-<%= awsRegion %>}
250
353
  <% if ((deploymentTarget === 'realtime-inference' && !(typeof existingEndpointName !== 'undefined' && existingEndpointName)) || deploymentTarget === 'async-inference' || deploymentTarget === 'batch-transform') { %>
251
354
  export INSTANCE_TYPE=${INSTANCE_TYPE:-<%= instanceType %>}
252
355
  <% } %>
@@ -10,9 +10,11 @@ set -o pipefail
10
10
  FORCE_NEW=false
11
11
  FORCE_IC=false
12
12
  IC_TARGET=""
13
+ CI_FLAG=false
13
14
  while [ $# -gt 0 ]; do
14
15
  case "$1" in
15
16
  --force) FORCE_NEW=true; shift ;;
17
+ --ci) CI_FLAG=true; shift ;;
16
18
  --force-ic)
17
19
  FORCE_IC=true
18
20
  shift
@@ -32,13 +34,14 @@ while [ $# -gt 0 ]; do
32
34
  shift 2
33
35
  ;;
34
36
  --help|-h)
35
- echo "Usage: ./do/deploy [--force] [--force-ic [<name>]] [--ic <name>]"
37
+ echo "Usage: ./do/deploy [--force] [--force-ic [<name>]] [--ic <name>] [--ci]"
36
38
  echo ""
37
39
  echo "Options:"
38
40
  echo " --force Create a new endpoint and IC, even if one already exists."
39
41
  echo " --force-ic Recreate ALL inference components on the existing endpoint."
40
42
  echo " --force-ic <name> Recreate only the named IC on the existing endpoint."
41
43
  echo " --ic <name> Deploy only the named IC (from do/ic/<name>.conf)."
44
+ echo " --ci Enable CI mode (structured errors, timeouts, idempotency)."
42
45
  echo ""
43
46
  echo "Without flags, deploy resumes from the last run."
44
47
  exit 0
@@ -51,6 +54,160 @@ while [ $# -gt 0 ]; do
51
54
  esac
52
55
  done
53
56
 
57
# ============================================================
# CI-Mode Detection and Configuration
# ============================================================
# CI mode is activated by CI_MODE=true env var OR --ci flag.
# Both lookups default to "false" so an unset variable never enables CI mode.
if [ "${CI_MODE:-false}" = "true" ] || [ "${CI_FLAG:-false}" = "true" ]; then
  CI_ACTIVE=true
else
  CI_ACTIVE=false
fi

# CI-mode timeout configuration (seconds).
# CI_DEPLOY_TIMEOUT_SECONDS comes from the environment; a non-numeric value
# would break the integer comparison done in _ci_check_timeout, so fall back
# to the 1200-second default instead of carrying a bad value forward.
if [ "${CI_ACTIVE}" = "true" ]; then
  CI_DEPLOY_TIMEOUT="${CI_DEPLOY_TIMEOUT_SECONDS:-1200}"
  case "${CI_DEPLOY_TIMEOUT}" in
    *[!0-9]*)
      echo "⚠️ [CI] Ignoring non-numeric CI_DEPLOY_TIMEOUT_SECONDS='${CI_DEPLOY_TIMEOUT}', using 1200" >&2
      CI_DEPLOY_TIMEOUT=1200
      ;;
  esac
  # Start of the deployment clock; elapsed time is measured against this.
  CI_DEPLOY_START=$(date +%s)
fi
72
+
73
# _ci_json_escape <string>
# Prints <string> escaped for embedding inside a JSON string literal:
# backslash, double quote, newline, carriage return and tab get their JSON
# escapes; any remaining control character is replaced by a space.
_ci_json_escape() {
  local s="$1"
  s=${s//\\/\\\\}      # backslashes first, so later escapes are not doubled
  s=${s//\"/\\\"}
  s=${s//$'\n'/\\n}
  s=${s//$'\r'/\\r}
  s=${s//$'\t'/\\t}
  s=${s//[[:cntrl:]]/ }
  printf '%s' "${s}"
}

# _ci_emit_error <error_message> <error_type> <retryable>
# Reports a fatal error and terminates the script with exit status 1.
#   CI mode (CI_ACTIVE=true): prints one line of JSON on stdout with the keys
#     error, error_type, instance_type, region, retryable, elapsed_seconds.
#     String fields are JSON-escaped, so multi-line or quoted CLI output passed
#     as <error_message> still yields valid JSON.
#   Otherwise: prints "❌ <error_message>" on stdout.
# Globals read: CI_ACTIVE, CI_DEPLOY_START, INSTANCE_TYPE, AWS_REGION.
# <retryable> is emitted as a JSON boolean: "true" stays true, anything else
# becomes false.
_ci_emit_error() {
  local error_msg="$1"
  local error_type="$2"
  local retryable="$3"

  if [ "${CI_ACTIVE:-false}" != "true" ]; then
    printf '%s\n' "❌ ${error_msg}"
    exit 1
  fi

  local now elapsed
  now=$(date +%s)
  # If the start time was never recorded, report 0 rather than failing the
  # arithmetic expansion.
  elapsed=$(( now - ${CI_DEPLOY_START:-${now}} ))

  if [ "${retryable}" != "true" ]; then
    retryable=false
  fi

  printf '{"error":"%s","error_type":"%s","instance_type":"%s","region":"%s","retryable":%s,"elapsed_seconds":%s}\n' \
    "$(_ci_json_escape "${error_msg}")" \
    "$(_ci_json_escape "${error_type}")" \
    "$(_ci_json_escape "${INSTANCE_TYPE:-unknown}")" \
    "$(_ci_json_escape "${AWS_REGION:-unknown}")" \
    "${retryable}" \
    "${elapsed}"
  exit 1
}
91
+
92
# _ci_check_timeout
# No-op unless CI mode is active (CI_ACTIVE=true). In CI mode, compares the
# seconds elapsed since CI_DEPLOY_START against CI_DEPLOY_TIMEOUT and, once the
# limit is reached, reports a retryable "timeout" error through _ci_emit_error.
# Returns 0 when no timeout has occurred.
# Globals read: CI_ACTIVE, CI_DEPLOY_START, CI_DEPLOY_TIMEOUT.
_ci_check_timeout() {
  [ "${CI_ACTIVE:-false}" = "true" ] || return 0

  # A non-numeric limit would make the integer test below error out and the
  # timeout would silently never fire; fall back to the 1200-second default.
  local limit="${CI_DEPLOY_TIMEOUT:-1200}"
  case "${limit}" in
    *[!0-9]*)
      echo "⚠️ [CI] Non-numeric deploy timeout '${limit}', using 1200" >&2
      limit=1200
      ;;
  esac

  local now started elapsed
  now=$(date +%s)
  started="${CI_DEPLOY_START:-${now}}"
  elapsed=$(( now - started ))

  if [ "${elapsed}" -ge "${limit}" ]; then
    _ci_emit_error "Deployment timed out after ${elapsed} seconds (limit: ${limit}s)" "timeout" "true"
  fi
  return 0
}
103
+
104
# _ci_create_endpoint_with_retry <endpoint_name> <endpoint_config_name>
# Calls `aws sagemaker create-endpoint` in ${AWS_REGION}, retrying on
# throttling with exponential backoff (5s, then 10s; at most 3 attempts).
# Returns 0 as soon as a call succeeds. Every failure path ends in
# _ci_emit_error:
#   - still throttled on the last attempt -> "throttled", retryable
#   - InsufficientInstanceCapacity        -> "capacity",  retryable
#   - any other CLI failure               -> "api_error", not retryable
# Globals read: AWS_REGION, INSTANCE_TYPE, CI_ACTIVE.
_ci_create_endpoint_with_retry() {
  local ep_name="$1"
  local ep_config="$2"
  local attempt=0
  local max_attempts=3
  local backoff=5
  local create_output

  while [ "${attempt}" -lt "${max_attempts}" ]; do
    attempt=$(( attempt + 1 ))

    if create_output=$(aws sagemaker create-endpoint \
      --endpoint-name "${ep_name}" \
      --endpoint-config-name "${ep_config}" \
      --region "${AWS_REGION}" 2>&1); then
      return 0
    fi

    # Classify the failure from the CLI output. A here-string is used instead
    # of `echo | grep -q`: with `set -o pipefail`, grep exiting on the first
    # match can SIGPIPE the writer and turn a match into a failed pipeline.
    if grep -qi "ThrottlingException" <<<"${create_output}"; then
      if [ "${attempt}" -lt "${max_attempts}" ]; then
        if [ "${CI_ACTIVE}" = "true" ]; then
          echo "⏳ Throttled (attempt ${attempt}/${max_attempts}), retrying in ${backoff}s..."
        else
          echo "⏳ Throttled, retrying in ${backoff}s..."
        fi
        sleep "${backoff}"
        backoff=$(( backoff * 2 ))
      else
        _ci_emit_error "CreateEndpoint throttled after ${max_attempts} attempts" "throttled" "true"
      fi
    elif grep -qi "InsufficientInstanceCapacity" <<<"${create_output}"; then
      _ci_emit_error "InsufficientInstanceCapacity: Unable to provision ${INSTANCE_TYPE} in ${AWS_REGION}" "capacity" "true"
    else
      # Other API error
      _ci_emit_error "CreateEndpoint failed: ${create_output}" "api_error" "false"
    fi
  done

  # Defensive: only reachable if _ci_emit_error returned instead of exiting.
  return 1
}
143
+
144
# _ci_handle_existing_endpoint
# CI-mode idempotent deployment logic for the endpoint named by ENDPOINT_NAME.
# Returns 0 if deployment should be skipped: the endpoint is InService and the
#   inference component named by INFERENCE_COMPONENT_NAME is InService too.
#   NOTE(review): only the two statuses are compared here; no configuration
#   contents are inspected despite the "matching config" wording of the log.
# Returns 1 if a fresh deploy should proceed (no endpoint name, endpoint not
#   InService, IC missing or not InService, or a bad-state endpoint was removed).
# Bad-state cleanup: a Failed/OutOfService endpoint is deleted, polled every
#   10s for up to 300s until its status lookup comes back empty, and then
#   ENDPOINT_NAME is cleared so the caller creates a new one.
# Relies on _get_endpoint_status, _get_ic_status, _ci_check_timeout and
# _ci_emit_error being defined by the time it is called.
_ci_handle_existing_endpoint() {
  local ep_name="${ENDPOINT_NAME:-}"
  if [ -z "${ep_name}" ]; then
    return 1 # No existing endpoint, proceed with fresh deploy
  fi

  local ep_status
  # A failed lookup is treated the same as "no such endpoint".
  ep_status=$(_get_endpoint_status "${ep_name}" 2>/dev/null || echo "")

  case "${ep_status}" in
    InService)
      # Idempotent check: skip only when the inference component is up too.
      if [ -n "${INFERENCE_COMPONENT_NAME:-}" ]; then
        local ic_status
        ic_status=$(_get_ic_status "${INFERENCE_COMPONENT_NAME}" 2>/dev/null || echo "")
        if [ "${ic_status}" = "InService" ]; then
          echo "✅ [CI] Endpoint InService with matching config — skipping deployment"
          echo " Endpoint: ${ep_name}"
          echo " Inference Component: ${INFERENCE_COMPONENT_NAME}"
          return 0
        fi
      fi
      return 1
      ;;
    Failed|OutOfService)
      echo "⚠️ [CI] Endpoint in bad state (${ep_status}): ${ep_name}"
      echo " Deleting endpoint for fresh deployment..."

      # Deletion stays best-effort (the poll below decides the outcome), but a
      # failed call is surfaced on stderr instead of being silently discarded.
      if ! aws sagemaker delete-endpoint \
        --endpoint-name "${ep_name}" \
        --region "${AWS_REGION}" 2>/dev/null; then
        echo "⚠️ [CI] delete-endpoint call failed for ${ep_name}; still polling for removal" >&2
      fi

      # Wait for deletion to complete
      local delete_start
      delete_start=$(date +%s)
      local delete_timeout=300 # 5 minutes
      local check_status
      local del_elapsed

      while true; do
        _ci_check_timeout
        check_status=$(_get_endpoint_status "${ep_name}" 2>/dev/null || echo "")
        if [ -z "${check_status}" ]; then
          echo " ✅ Endpoint deleted: ${ep_name}"
          break
        fi
        del_elapsed=$(( $(date +%s) - delete_start ))
        if [ "${del_elapsed}" -ge "${delete_timeout}" ]; then
          # Report the status seen on this poll, not the pre-delete status.
          _ci_emit_error "Endpoint deletion timed out for ${ep_name} (state: ${check_status})" "endpoint_failed" "true"
        fi
        sleep 10
      done

      # Clear endpoint name so fresh deploy proceeds
      ENDPOINT_NAME=""
      return 1
      ;;
    *)
      return 1 # Unknown/absent state, proceed with fresh deploy
      ;;
  esac
}
210
+
54
211
  # Source configuration
55
212
  SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
56
213
  source "${SCRIPT_DIR}/config"
@@ -193,6 +350,16 @@ fi
193
350
  # Resolve container secrets (HF_TOKEN, NGC_API_KEY)
194
351
  resolve_secrets
195
352
 
353
+ # ============================================================
354
+ # CI-Mode: Idempotent deployment check (runs before normal idempotency)
355
+ # ============================================================
356
+ if [ "${CI_ACTIVE}" = "true" ] && [ "${FORCE_NEW}" != "true" ]; then
357
+ if _ci_handle_existing_endpoint; then
358
+ # Endpoint already InService with matching config — exit successfully
359
+ exit 0
360
+ fi
361
+ fi
362
+
196
363
  # ============================================================
197
364
  # Idempotency: check for existing deployment from a previous run
198
365
  # ============================================================
@@ -380,16 +547,20 @@ if [ -z "${SKIP_TO}" ]; then
380
547
 
381
548
  # Step 2: Create endpoint
382
549
  echo "🚀 Creating endpoint: ${ENDPOINT_NAME}"
383
- if ! aws sagemaker create-endpoint \
384
- --endpoint-name "${ENDPOINT_NAME}" \
385
- --endpoint-config-name "${ENDPOINT_CONFIG_NAME}" \
386
- --region "${AWS_REGION}"; then
387
-
388
- echo " Failed to create endpoint"
389
- echo " Check that:"
390
- echo " • Your IAM credentials have sagemaker:CreateEndpoint permission"
391
- echo " You have sufficient service quota in region: ${AWS_REGION}"
392
- exit 4
550
+ if [ "${CI_ACTIVE}" = "true" ]; then
551
+ _ci_create_endpoint_with_retry "${ENDPOINT_NAME}" "${ENDPOINT_CONFIG_NAME}"
552
+ else
553
+ if ! aws sagemaker create-endpoint \
554
+ --endpoint-name "${ENDPOINT_NAME}" \
555
+ --endpoint-config-name "${ENDPOINT_CONFIG_NAME}" \
556
+ --region "${AWS_REGION}"; then
557
+
558
+ echo " Failed to create endpoint"
559
+ echo " Check that:"
560
+ echo " • Your IAM credentials have sagemaker:CreateEndpoint permission"
561
+ echo " • You have sufficient service quota in region: ${AWS_REGION}"
562
+ exit 4
563
+ fi
393
564
  fi
394
565
 
395
566
  echo "✅ Endpoint creation initiated: ${ENDPOINT_NAME}"
@@ -413,8 +584,18 @@ if [ -z "${SKIP_TO}" ] || [ "${SKIP_TO}" = "wait_endpoint" ]; then
413
584
  echo " This may take a few minutes..."
414
585
  echo " If this times out, re-run ./do/deploy to resume."
415
586
 
587
+ # CI-mode: check timeout once before starting the wait
588
+ if [ "${CI_ACTIVE}" = "true" ]; then
589
+ _ci_check_timeout
590
+ fi
591
+
416
592
  wait_endpoint "${ENDPOINT_NAME}"
417
593
 
594
+ # CI-mode: check timeout after wait completes
595
+ if [ "${CI_ACTIVE}" = "true" ]; then
596
+ _ci_check_timeout
597
+ fi
598
+
418
599
  echo "✅ Endpoint is InService: ${ENDPOINT_NAME}"
419
600
  fi
420
601