npm - @aws/ml-container-creator - Versions diffs - 0.10.0 → 0.10.3 - Mend

@aws/ml-container-creator 0.10.0 → 0.10.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (71) hide show

package/LICENSE-THIRD-PARTY +9304 -0
package/bin/cli.js +2 -0
package/config/bootstrap-e2e-stack.json +341 -0
package/config/bootstrap-stack.json +40 -3
package/config/parameter-schema-v2.json +5 -21
package/config/tune-catalog.json +1781 -0
package/infra/ci-harness/buildspec.yml +1 -0
package/infra/ci-harness/lambda/path-prover/brain.ts +306 -0
package/infra/ci-harness/lambda/path-prover/write-results.ts +152 -0
package/infra/ci-harness/lib/ci-harness-stack.ts +837 -7
package/infra/ci-harness/state-machines/path-prover.asl.json +496 -0
package/package.json +51 -66
package/servers/base-image-picker/index.js +121 -121
package/servers/e2e-status/index.js +297 -0
package/servers/e2e-status/manifest.json +14 -0
package/servers/e2e-status/package.json +15 -0
package/servers/endpoint-picker/LICENSE +202 -0
package/servers/endpoint-picker/index.js +536 -0
package/servers/endpoint-picker/manifest.json +14 -0
package/servers/endpoint-picker/package.json +18 -0
package/servers/hyperpod-cluster-picker/index.js +125 -125
package/servers/instance-sizer/index.js +138 -138
package/servers/instance-sizer/lib/instance-ranker.js +76 -76
package/servers/instance-sizer/lib/model-resolver.js +61 -61
package/servers/instance-sizer/lib/quota-resolver.js +113 -113
package/servers/instance-sizer/lib/vram-estimator.js +31 -31
package/servers/lib/bedrock-client.js +38 -38
package/servers/lib/catalogs/model-servers.json +201 -3
package/servers/lib/custom-validators.js +13 -13
package/servers/lib/dynamic-resolver.js +4 -4
package/servers/marketplace-picker/index.js +342 -0
package/servers/marketplace-picker/manifest.json +14 -0
package/servers/marketplace-picker/package.json +18 -0
package/servers/model-picker/index.js +382 -382
package/servers/region-picker/index.js +56 -56
package/servers/workload-picker/LICENSE +202 -0
package/servers/workload-picker/catalogs/workload-profiles.json +67 -0
package/servers/workload-picker/index.js +171 -0
package/servers/workload-picker/manifest.json +16 -0
package/servers/workload-picker/package.json +16 -0
package/src/app.js +4 -2
package/src/lib/bootstrap-command-handler.js +579 -14
package/src/lib/bootstrap-config.js +36 -0
package/src/lib/bootstrap-profile-manager.js +48 -41
package/src/lib/ci-register-helpers.js +74 -0
package/src/lib/config-loader.js +3 -0
package/src/lib/config-manager.js +7 -0
package/src/lib/cuda-resolver.js +17 -8
package/src/lib/generated/cli-options.js +315 -315
package/src/lib/generated/parameter-matrix.js +661 -661
package/src/lib/generated/validation-rules.js +71 -71
package/src/lib/path-prover-brain.js +607 -0
package/src/lib/prompts/project-prompts.js +12 -0
package/src/lib/template-variable-resolver.js +25 -1
package/src/lib/tune-catalog-validator.js +37 -4
package/templates/Dockerfile +9 -0
package/templates/code/adapter_sidecar.py +444 -0
package/templates/code/serve +6 -0
package/templates/code/serve.d/vllm.ejs +1 -1
package/templates/do/.benchmark_writer.py +1476 -0
package/templates/do/.tune_helper.py +982 -57
package/templates/do/__pycache__/.benchmark_writer.cpython-312.pyc +0 -0
package/templates/do/adapter +149 -0
package/templates/do/benchmark +639 -85
package/templates/do/config +108 -5
package/templates/do/deploy.d/managed-inference.ejs +192 -11
package/templates/do/optimize +106 -37
package/templates/do/register +89 -0
package/templates/do/test +13 -0
package/templates/do/tune +378 -59
package/templates/do/validate +44 -4

package/templates/do/config CHANGED Viewed

@@ -1,6 +1,7 @@
 #!/bin/bash
 # do-framework configuration
 # This file is sourced by all do scripts
+# Generated: <%= new Date().toISOString() %>
 # Project identification
 export PROJECT_NAME="<%= projectName %>"
@@ -10,16 +11,20 @@ export DEPLOYMENT_CONFIG="<%= deploymentConfig %>"
 export FRAMEWORK="<%= framework %>"
 export MODEL_SERVER="<%= modelServer %>"
+# AWS configuration
+export AWS_REGION=${AWS_REGION:-<%= awsRegion %>}
+export ECR_REPOSITORY_NAME="ml-container-creator"
 <% if (typeof enableLora !== 'undefined' && enableLora) { %>
 # LoRA adapter serving
 export ENABLE_LORA=true
 export ADAPTER_S3_BUCKET="mlcc-adapters-$(aws sts get-caller-identity --query Account --output text 2>/dev/null || echo 'UNKNOWN')-${AWS_REGION}"
+<% } else if (framework === 'transformers' || framework === 'diffusors') { %>
+# LoRA adapter serving (uncomment to enable)
+# export ENABLE_LORA=true
+# export ADAPTER_S3_BUCKET="mlcc-adapters-$(aws sts get-caller-identity --query Account --output text 2>/dev/null || echo 'UNKNOWN')-${AWS_REGION}"
 <% } %>
-# AWS configuration
-export AWS_REGION="<%= awsRegion %>"
-export ECR_REPOSITORY_NAME="ml-container-creator"
 # Build configuration — WHERE the Docker image gets built
 export BUILD_TARGET="<%= buildTarget %>"
 <% if (buildTarget === 'codebuild') { %>
@@ -42,14 +47,27 @@ export INSTANCE_TYPE="<%= instanceType %>"
 # Instance pools: heterogeneous instance types with priority-based fallback
 # Priority = selection order (1 = preferred, higher = fallback)
 export INSTANCE_POOLS='<%= JSON.stringify(instancePools) %>'
+<% } else { %>
+# Instance pools: heterogeneous instance types with priority-based fallback (uncomment to enable)
+# Format: [{"InstanceType":"ml.g6e.48xlarge","Priority":1},{"InstanceType":"ml.g5.48xlarge","Priority":2}]
+# export INSTANCE_POOLS='[]'
 <% } %>
 <% if (inferenceAmiVersion) { %>
 export INFERENCE_AMI_VERSION="<%= inferenceAmiVersion %>"
+<% } else { %>
+# Inference AMI version — auto-resolved from CUDA version (uncomment to override)
+# Valid: al2-ami-sagemaker-inference-gpu-2, al2-ami-sagemaker-inference-gpu-2-1,
+#        al2-ami-sagemaker-inference-gpu-3-1, al2023-ami-sagemaker-inference-gpu-4-1
+# export INFERENCE_AMI_VERSION=""
 <% } %>
 <% if (typeof capacityReservationArn !== 'undefined' && capacityReservationArn) { %>
 # Note: Capacity reservations and instance pools (INSTANCE_POOLS) are mutually exclusive.
 # If both are set, the capacity reservation takes precedence and INSTANCE_POOLS is ignored.
 export CAPACITY_RESERVATION_ARN="<%= capacityReservationArn %>"
+<% } else { %>
+# Capacity reservation (uncomment to use reserved capacity)
+# Note: Mutually exclusive with INSTANCE_POOLS — reservation takes precedence.
+# export CAPACITY_RESERVATION_ARN=""
 <% } %>
 <% } %>
 <% } %>
@@ -59,6 +77,9 @@ export CAPACITY_RESERVATION_ARN="<%= capacityReservationArn %>"
 export INSTANCE_TYPE="<%= instanceType %>"
 <% if (inferenceAmiVersion) { %>
 export INFERENCE_AMI_VERSION="<%= inferenceAmiVersion %>"
+<% } else { %>
+# Inference AMI version — auto-resolved from CUDA version (uncomment to override)
+# export INFERENCE_AMI_VERSION=""
 <% } %>
 # Async-specific configuration
@@ -85,6 +106,9 @@ export ASYNC_SNS_ERROR_TOPIC="arn:aws:sns:${AWS_REGION}:${ACCOUNT_ID}:ml-contain
 <% if (asyncMaxConcurrentInvocations) { %>
 export ASYNC_MAX_CONCURRENT_INVOCATIONS="<%= asyncMaxConcurrentInvocations %>"
+<% } else { %>
+# Max concurrent invocations per instance (uncomment to set)
+# export ASYNC_MAX_CONCURRENT_INVOCATIONS=""
 <% } %>
 <% } %>
@@ -95,6 +119,9 @@ export HYPERPOD_NAMESPACE="<%= hyperPodNamespace %>"
 export HYPERPOD_REPLICAS="<%= hyperPodReplicas %>"
 <% if (fsxVolumeHandle) { %>
 export FSX_VOLUME_HANDLE="<%= fsxVolumeHandle %>"
+<% } else { %>
+# FSx for Lustre volume for shared model storage (uncomment to enable)
+# export FSX_VOLUME_HANDLE=""
 <% } %>
 <% } %>
@@ -121,9 +148,15 @@ export BATCH_STRATEGY="<%= batchStrategy %>"
 export BATCH_JOIN_SOURCE="<%= batchJoinSource || 'None' %>"
 <% if (batchMaxConcurrentTransforms) { %>
 export BATCH_MAX_CONCURRENT_TRANSFORMS="<%= batchMaxConcurrentTransforms %>"
+<% } else { %>
+# Max concurrent transforms per instance (uncomment to set)
+# export BATCH_MAX_CONCURRENT_TRANSFORMS=""
 <% } %>
 <% if (batchMaxPayloadInMB) { %>
 export BATCH_MAX_PAYLOAD_IN_MB="<%= batchMaxPayloadInMB %>"
+<% } else { %>
+# Max payload size in MB (uncomment to set, default: 6)
+# export BATCH_MAX_PAYLOAD_IN_MB=""
 <% } %>
 <% } %>
@@ -140,6 +173,22 @@ export ENDPOINT_VARIANT_NAME="<%= endpointVariantName %>"
 export ENDPOINT_VOLUME_SIZE="<%= endpointVolumeSize %>"
 <% } %>
+<% if (deploymentTarget === 'realtime-inference' || deploymentTarget === 'async-inference') { %>
+# ─── Endpoint overrides (uncomment to customize) ───────────────────────────────
+<% if (typeof endpointInitialInstanceCount === 'undefined' || endpointInitialInstanceCount == null) { %>
+# export ENDPOINT_INITIAL_INSTANCE_COUNT="1"    # Number of instances for the endpoint
+<% } %>
+<% if (typeof endpointDataCapturePercent === 'undefined' || endpointDataCapturePercent == null) { %>
+# export ENDPOINT_DATA_CAPTURE_PERCENT=""        # Percentage of requests to capture (0-100)
+<% } %>
+<% if (typeof endpointVariantName === 'undefined' || endpointVariantName == null) { %>
+# export ENDPOINT_VARIANT_NAME=""                # Custom variant name (default: AllTraffic)
+<% } %>
+<% if (typeof endpointVolumeSize === 'undefined' || endpointVolumeSize == null) { %>
+# export ENDPOINT_VOLUME_SIZE=""                 # EBS volume size in GB for model download
+<% } %>
+<% } %>
 <% if (typeof icCpuCount !== 'undefined' && icCpuCount != null) { %>
 export IC_CPU_COUNT="<%= icCpuCount %>"
 <% } %>
@@ -158,6 +207,22 @@ export IC_COPY_COUNT="<%= icCopyCount %>"
 export IC_MODEL_WEIGHT="<%= icModelWeight %>"
 <% } %>
+<% if (deploymentTarget === 'realtime-inference' || deploymentTarget === 'async-inference') { %>
+# ─── Inference Component overrides (uncomment to customize) ────────────────────
+<% if (typeof icCpuCount === 'undefined' || icCpuCount == null) { %>
+# export IC_CPU_COUNT=""                         # CPU cores reserved for this IC
+<% } %>
+<% if (typeof icMemorySize === 'undefined' || icMemorySize == null) { %>
+# export IC_MEMORY_SIZE=""                       # Memory in MB reserved for this IC
+<% } %>
+<% if (typeof icCopyCount === 'undefined' || icCopyCount == null) { %>
+# export IC_COPY_COUNT=""                        # Number of model copies (multi-IC scaling)
+<% } %>
+<% if (typeof icModelWeight === 'undefined' || icModelWeight == null) { %>
+# export IC_MODEL_WEIGHT=""                      # Traffic weight for this IC (0-100)
+<% } %>
+<% } %>
 <% if (typeof modelEnvVars !== 'undefined' && modelEnvVars && Object.keys(modelEnvVars).length > 0) { %>
 # Model environment variables
 <% Object.entries(modelEnvVars).forEach(([key, value]) => { %>
@@ -192,7 +257,22 @@ export NGC_API_KEY="<%= ngcApiKey %>"
 <% if (deploymentTarget !== 'batch-transform') { %>
 # Managed Model Customization (do/tune)
 export TUNE_SUPPORTED=<%= (typeof tuneSupported !== 'undefined' && tuneSupported) ? 'true' : 'false' %>
+<% if (typeof tuneSupported !== 'undefined' && tuneSupported) { %>
+<% if (typeof tuneModelId !== 'undefined' && tuneModelId) { %>
+# SageMaker AI Managed Fine-Tuning — JumpStart Hub model ID
+# Flow: JumpStart model (tune) → LoRA adapter (S3) → do/adapter add → vLLM
+export TUNE_MODEL_ID="<%= tuneModelId %>"
+<% } else { %>
+# SageMaker AI Managed Fine-Tuning — JumpStart Hub model ID
+# To find your model's Hub ID:
+#   aws sagemaker list-hub-contents --hub-name SageMakerPublicHub \
+#     --hub-content-type Model --query "HubContentSummaries[].HubContentName"
+# export TUNE_MODEL_ID=""
+<% } %>
+<% } %>
 export TUNE_S3_BUCKET="mlcc-tune-$(aws sts get-caller-identity --query Account --output text 2>/dev/null || echo 'UNKNOWN')-${AWS_REGION}"
+# MLflow App ARN for experiment tracking (set by bootstrap, or override manually)
+# export MLFLOW_APP_ARN=""
 <% } %>
 <% } %>
@@ -210,10 +290,17 @@ export HF_TOKEN="<%= hfToken %>"
 <% if (modelFormat) { %>
 export MODEL_FORMAT="<%= modelFormat %>"
+<% } else { %>
+# Model format (uncomment if using quantized models)
+# Valid: pkl, json, keras, safetensors, gguf, awq, gptq
+# export MODEL_FORMAT=""
 <% } %>
 <% if (roleArn) { %>
 export ROLE_ARN="<%= roleArn %>"
+<% } else { %>
+# IAM execution role for SageMaker (uncomment to override bootstrap role)
+# export ROLE_ARN=""
 <% } %>
 <% if (typeof includeBenchmark !== 'undefined' && includeBenchmark) { %>
@@ -234,6 +321,23 @@ export BENCHMARK_S3_OUTPUT_PATH="s3://mlcc-benchmark-$(aws sts get-caller-identi
 <% } %>
 export BENCHMARK_JOB_NAME=""
 export BENCHMARK_WORKLOAD_CONFIG_NAME=""
+# CI Benchmark Athena persistence (set automatically from bootstrap --benchmark-infra)
+<% if (typeof ciBenchmarkResultsBucket !== 'undefined' && ciBenchmarkResultsBucket) { %>
+export CI_BENCHMARK_RESULTS_BUCKET="<%= ciBenchmarkResultsBucket %>"
+<% } else { %>
+# export CI_BENCHMARK_RESULTS_BUCKET=""            # S3 bucket for Athena Parquet results (set by bootstrap --benchmark-infra)
+<% } %>
+<% } else if (framework === 'transformers' && deploymentTarget !== 'batch-transform') { %>
+# ─── SageMaker AI Benchmarking (uncomment to enable) ──────────────────────────
+# export BENCHMARK_CONCURRENCY="10"              # Concurrent requests
+# export BENCHMARK_INPUT_TOKENS_MEAN="550"       # Mean input tokens per request
+# export BENCHMARK_OUTPUT_TOKENS_MEAN="150"      # Mean output tokens per request
+# export BENCHMARK_STREAMING="true"              # Enable streaming
+# export BENCHMARK_REQUEST_COUNT=""              # Total requests (empty = auto)
+# export BENCHMARK_S3_OUTPUT_PATH=""             # S3 path for results (empty = auto)
+# export BENCHMARK_JOB_NAME=""                   # Resume/check existing job
+# export BENCHMARK_WORKLOAD_CONFIG_NAME=""       # Reuse existing workload config
 <% } %>
 <% if (orderedEnvVars && orderedEnvVars.length > 0) { %>
@@ -246,7 +350,6 @@ export <%= key %>=${<%= key %>:-<%= value %>}
 export BASE_IMAGE=${BASE_IMAGE:-<%= baseImage || '' %>}
 # Allow environment variable overrides
-export AWS_REGION=${AWS_REGION:-<%= awsRegion %>}
 <% if ((deploymentTarget === 'realtime-inference' && !(typeof existingEndpointName !== 'undefined' && existingEndpointName)) || deploymentTarget === 'async-inference' || deploymentTarget === 'batch-transform') { %>
 export INSTANCE_TYPE=${INSTANCE_TYPE:-<%= instanceType %>}
 <% } %>

package/templates/do/deploy.d/managed-inference.ejs CHANGED Viewed

@@ -10,9 +10,11 @@ set -o pipefail
 FORCE_NEW=false
 FORCE_IC=false
 IC_TARGET=""
+CI_FLAG=false
 while [ $# -gt 0 ]; do
     case "$1" in
         --force) FORCE_NEW=true; shift ;;
+        --ci) CI_FLAG=true; shift ;;
         --force-ic)
             FORCE_IC=true
             shift
@@ -32,13 +34,14 @@ while [ $# -gt 0 ]; do
             shift 2
             ;;
         --help|-h)
-            echo "Usage: ./do/deploy [--force] [--force-ic [<name>]] [--ic <name>]"
+            echo "Usage: ./do/deploy [--force] [--force-ic [<name>]] [--ic <name>] [--ci]"
             echo ""
             echo "Options:"
             echo "  --force            Create a new endpoint and IC, even if one already exists."
             echo "  --force-ic         Recreate ALL inference components on the existing endpoint."
             echo "  --force-ic <name>  Recreate only the named IC on the existing endpoint."
             echo "  --ic <name>        Deploy only the named IC (from do/ic/<name>.conf)."
+            echo "  --ci               Enable CI mode (structured errors, timeouts, idempotency)."
             echo ""
             echo "Without flags, deploy resumes from the last run."
             exit 0
@@ -51,6 +54,160 @@ while [ $# -gt 0 ]; do
     esac
 done
+# ============================================================
+# CI-Mode Detection and Configuration
+# ============================================================
+# CI mode is on when the CI_MODE environment variable is "true" or the
+# --ci flag was parsed into CI_FLAG; otherwise CI_ACTIVE stays "false".
+CI_ACTIVE=false
+if [ "${CI_MODE:-false}" = "true" ] || [ "${CI_FLAG}" = "true" ]; then
+    CI_ACTIVE=true
+    # Deadline bookkeeping, in seconds. The limit defaults to 1200 unless
+    # CI_DEPLOY_TIMEOUT_SECONDS is set; the start time is epoch seconds.
+    CI_DEPLOY_START=$(date +%s)
+    CI_DEPLOY_TIMEOUT="${CI_DEPLOY_TIMEOUT_SECONDS:-1200}"
+fi
+# _ci_emit_error <error_message> <error_type> <retryable>
+#   Reports a fatal deployment error and terminates the script with status 1.
+#   CI mode (CI_ACTIVE=true): prints a single JSON object on stdout with the
+#   keys error, error_type, instance_type, region, retryable, elapsed_seconds.
+#   Non-CI mode: prints "❌ <error_message>".
+#   Arguments:
+#     $1 - human-readable message; may hold raw AWS CLI output (quotes, newlines)
+#     $2 - short error category string, emitted as a JSON string
+#     $3 - JSON boolean literal (true or false), emitted unquoted
+#   Globals read: CI_ACTIVE, CI_DEPLOY_START, INSTANCE_TYPE, AWS_REGION
+_ci_emit_error() {
+    local error_msg="$1"
+    local error_type="$2"
+    local retryable="$3"
+    local elapsed=0
+    if [ "${CI_ACTIVE}" = "true" ]; then
+        elapsed=$(( $(date +%s) - CI_DEPLOY_START ))
+        # Escape the message before embedding it: callers pass captured CLI
+        # output, and an unescaped quote, backslash or line break would make
+        # the emitted object invalid JSON. Backslashes must be doubled first
+        # so the escapes added afterwards are not themselves re-escaped.
+        local json_msg
+        json_msg="${error_msg//\\/\\\\}"
+        json_msg="${json_msg//\"/\\\"}"
+        json_msg="${json_msg//$'\n'/\\n}"
+        json_msg="${json_msg//$'\r'/\\r}"
+        json_msg="${json_msg//$'\t'/\\t}"
+        printf '%s\n' "{\"error\":\"${json_msg}\",\"error_type\":\"${error_type}\",\"instance_type\":\"${INSTANCE_TYPE:-unknown}\",\"region\":\"${AWS_REGION:-unknown}\",\"retryable\":${retryable},\"elapsed_seconds\":${elapsed}}"
+        exit 1
+    else
+        echo "❌ ${error_msg}"
+        exit 1
+    fi
+}
+# _ci_check_timeout
+#   Enforces the CI deployment deadline. Compares the seconds elapsed since
+#   CI_DEPLOY_START with CI_DEPLOY_TIMEOUT and, once the limit is reached,
+#   reports a retryable "timeout" error through _ci_emit_error.
+#   Does nothing when CI_ACTIVE is not "true".
+_ci_check_timeout() {
+    # No deadline applies outside CI mode
+    [ "${CI_ACTIVE}" = "true" ] || return 0
+    local now
+    now=$(date +%s)
+    local elapsed=$(( now - CI_DEPLOY_START ))
+    if [ "${elapsed}" -ge "${CI_DEPLOY_TIMEOUT}" ]; then
+        _ci_emit_error "Deployment timed out after ${elapsed} seconds (limit: ${CI_DEPLOY_TIMEOUT}s)" "timeout" "true"
+    fi
+}
+# _ci_create_endpoint_with_retry <endpoint_name> <endpoint_config_name>
+#   Calls `aws sagemaker create-endpoint` at most 3 times and returns 0 as
+#   soon as one call succeeds. A throttled call is retried after a pause that
+#   starts at 5 seconds and doubles each time. Throttling on the last attempt,
+#   a capacity error, or any other failure is reported via _ci_emit_error.
+#   Globals read: AWS_REGION, INSTANCE_TYPE, CI_ACTIVE
+_ci_create_endpoint_with_retry() {
+    local ep_name="$1"
+    local ep_config="$2"
+    local max_attempts=3
+    local backoff=5
+    local attempt
+    local create_output
+    local failure
+    for (( attempt = 1; attempt <= max_attempts; attempt++ )); do
+        if create_output=$(aws sagemaker create-endpoint \
+            --endpoint-name "${ep_name}" \
+            --endpoint-config-name "${ep_config}" \
+            --region "${AWS_REGION}" 2>&1); then
+            return 0
+        fi
+        # Classify the failure from the captured CLI output (stdout + stderr)
+        failure="api_error"
+        if echo "${create_output}" | grep -qi "ThrottlingException"; then
+            failure="throttled"
+        elif echo "${create_output}" | grep -qi "InsufficientInstanceCapacity"; then
+            failure="capacity"
+        fi
+        case "${failure}" in
+            throttled)
+                if [ "${attempt}" -ge "${max_attempts}" ]; then
+                    _ci_emit_error "CreateEndpoint throttled after ${max_attempts} attempts" "throttled" "true"
+                else
+                    if [ "${CI_ACTIVE}" = "true" ]; then
+                        echo "⏳ Throttled (attempt ${attempt}/${max_attempts}), retrying in ${backoff}s..."
+                    else
+                        echo "⏳ Throttled, retrying in ${backoff}s..."
+                    fi
+                    sleep "${backoff}"
+                    backoff=$(( backoff * 2 ))
+                fi
+                ;;
+            capacity)
+                _ci_emit_error "InsufficientInstanceCapacity: Unable to provision ${INSTANCE_TYPE} in ${AWS_REGION}" "capacity" "true"
+                ;;
+            *)
+                # Other API error
+                _ci_emit_error "CreateEndpoint failed: ${create_output}" "api_error" "false"
+                ;;
+        esac
+    done
+}
+# _ci_handle_existing_endpoint
+#   CI-mode idempotent deployment logic.
+#   Returns 0 if deployment should be skipped (already InService with matching config).
+#   Returns 1 if a fresh deploy should proceed.
+#   Handles bad-state cleanup (Failed/OutOfService → delete + recreate).
+#   Globals read: ENDPOINT_NAME, INFERENCE_COMPONENT_NAME, AWS_REGION
+#   Globals written: ENDPOINT_NAME (cleared after a bad-state endpoint is deleted)
+#   NOTE(review): "matching config" is decided only by the endpoint and the
+#   inference component both reporting InService; no configuration fields are
+#   compared in this function.
+#   NOTE(review): _get_endpoint_status and _get_ic_status are defined outside
+#   this hunk; presumably they print the resource status and print nothing (or
+#   fail) when the resource does not exist — confirm.
+_ci_handle_existing_endpoint() {
+    local ep_name="${ENDPOINT_NAME:-}"
+    if [ -z "${ep_name}" ]; then
+        return 1  # No existing endpoint, proceed with fresh deploy
+    fi
+    local ep_status
+    # A failed lookup is folded into an empty status, which lands in the
+    # catch-all branch below (proceed with fresh deploy).
+    ep_status=$(_get_endpoint_status "${ep_name}" 2>/dev/null || echo "")
+    case "${ep_status}" in
+        InService)
+            # Check if config matches (idempotent check)
+            # Skipping requires a recorded inference component that is itself
+            # InService; with no INFERENCE_COMPONENT_NAME the deploy proceeds.
+            if [ -n "${INFERENCE_COMPONENT_NAME:-}" ]; then
+                local ic_status
+                ic_status=$(_get_ic_status "${INFERENCE_COMPONENT_NAME}" 2>/dev/null || echo "")
+                if [ "${ic_status}" = "InService" ]; then
+                    echo "✅ [CI] Endpoint InService with matching config — skipping deployment"
+                    echo "   Endpoint: ${ep_name}"
+                    echo "   Inference Component: ${INFERENCE_COMPONENT_NAME}"
+                    return 0
+                fi
+            fi
+            return 1
+            ;;
+        Failed|OutOfService)
+            echo "⚠️  [CI] Endpoint in bad state (${ep_status}): ${ep_name}"
+            echo "   Deleting endpoint for fresh deployment..."
+            # Best effort: a failing delete call is ignored and the polling
+            # loop below decides whether the endpoint is actually gone.
+            aws sagemaker delete-endpoint \
+                --endpoint-name "${ep_name}" \
+                --region "${AWS_REGION}" 2>/dev/null || true
+            # Wait for deletion to complete
+            local delete_start
+            delete_start=$(date +%s)
+            local delete_timeout=300  # 5 minutes
+            while true; do
+                # Also enforce the overall CI deployment deadline on every poll
+                _ci_check_timeout
+                local check_status
+                # An empty status is taken to mean the endpoint no longer
+                # exists; a failed status lookup also produces an empty string
+                # because of the `|| echo ""` fallback.
+                check_status=$(_get_endpoint_status "${ep_name}" 2>/dev/null || echo "")
+                if [ -z "${check_status}" ]; then
+                    echo "   ✅ Endpoint deleted: ${ep_name}"
+                    break
+                fi
+                local del_elapsed=$(( $(date +%s) - delete_start ))
+                if [ "${del_elapsed}" -ge "${delete_timeout}" ]; then
+                    # NOTE(review): the message reports ep_status, the state seen
+                    # before deletion started, not the current check_status.
+                    _ci_emit_error "Endpoint deletion timed out for ${ep_name} (state: ${ep_status})" "endpoint_failed" "true"
+                fi
+                sleep 10
+            done
+            # Clear endpoint name so fresh deploy proceeds
+            ENDPOINT_NAME=""
+            return 1
+            ;;
+        *)
+            return 1  # Unknown/absent state, proceed with fresh deploy
+            ;;
+    esac
+}
 # Source configuration
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 source "${SCRIPT_DIR}/config"
@@ -193,6 +350,16 @@ fi
 # Resolve container secrets (HF_TOKEN, NGC_API_KEY)
 resolve_secrets
+# ============================================================
+# CI-Mode: Idempotent deployment check (runs before normal idempotency)
+# ============================================================
+if [ "${CI_ACTIVE}" = "true" ] && [ "${FORCE_NEW}" != "true" ]; then
+    if _ci_handle_existing_endpoint; then
+        # Endpoint already InService with matching config — exit successfully
+        exit 0
+    fi
+fi
 # ============================================================
 # Idempotency: check for existing deployment from a previous run
 # ============================================================
@@ -380,16 +547,20 @@ if [ -z "${SKIP_TO}" ]; then
         # Step 2: Create endpoint
         echo "🚀 Creating endpoint: ${ENDPOINT_NAME}"
-        if ! aws sagemaker create-endpoint \
-            --endpoint-name "${ENDPOINT_NAME}" \
-            --endpoint-config-name "${ENDPOINT_CONFIG_NAME}" \
-            --region "${AWS_REGION}"; then
-            echo "❌ Failed to create endpoint"
-            echo "   Check that:"
-            echo "   • Your IAM credentials have sagemaker:CreateEndpoint permission"
-            echo "   • You have sufficient service quota in region: ${AWS_REGION}"
-            exit 4
+        if [ "${CI_ACTIVE}" = "true" ]; then
+            _ci_create_endpoint_with_retry "${ENDPOINT_NAME}" "${ENDPOINT_CONFIG_NAME}"
+        else
+            if ! aws sagemaker create-endpoint \
+                --endpoint-name "${ENDPOINT_NAME}" \
+                --endpoint-config-name "${ENDPOINT_CONFIG_NAME}" \
+                --region "${AWS_REGION}"; then
+                echo "❌ Failed to create endpoint"
+                echo "   Check that:"
+                echo "   • Your IAM credentials have sagemaker:CreateEndpoint permission"
+                echo "   • You have sufficient service quota in region: ${AWS_REGION}"
+                exit 4
+            fi
         fi
         echo "✅ Endpoint creation initiated: ${ENDPOINT_NAME}"
@@ -413,8 +584,18 @@ if [ -z "${SKIP_TO}" ] || [ "${SKIP_TO}" = "wait_endpoint" ]; then
     echo "   This may take a few minutes..."
     echo "   If this times out, re-run ./do/deploy to resume."
+    # CI-mode: check timeout during wait
+    if [ "${CI_ACTIVE}" = "true" ]; then
+        _ci_check_timeout
+    fi
     wait_endpoint "${ENDPOINT_NAME}"
+    # CI-mode: check timeout after wait completes
+    if [ "${CI_ACTIVE}" = "true" ]; then
+        _ci_check_timeout
+    fi
     echo "✅ Endpoint is InService: ${ENDPOINT_NAME}"
 fi