npm - @aws/ml-container-creator - Versions diffs - 0.10.0 → 0.12.1 - Mend

@aws/ml-container-creator 0.10.0 → 0.12.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (90) hide show

package/LICENSE-THIRD-PARTY +9304 -0
package/bin/cli.js +2 -0
package/config/bootstrap-e2e-stack.json +341 -0
package/config/bootstrap-stack.json +40 -3
package/config/parameter-schema-v2.json +33 -22
package/config/tune-catalog.json +1781 -0
package/infra/ci-harness/buildspec.yml +1 -0
package/infra/ci-harness/lambda/path-prover/brain.ts +306 -0
package/infra/ci-harness/lambda/path-prover/write-results.ts +152 -0
package/infra/ci-harness/lib/ci-harness-stack.ts +851 -7
package/infra/ci-harness/state-machines/path-prover.asl.json +496 -0
package/package.json +53 -67
package/servers/base-image-picker/index.js +121 -121
package/servers/e2e-status/index.js +297 -0
package/servers/e2e-status/manifest.json +14 -0
package/servers/e2e-status/package.json +15 -0
package/servers/endpoint-picker/LICENSE +202 -0
package/servers/endpoint-picker/index.js +536 -0
package/servers/endpoint-picker/manifest.json +14 -0
package/servers/endpoint-picker/package.json +18 -0
package/servers/hyperpod-cluster-picker/index.js +125 -125
package/servers/instance-sizer/index.js +166 -153
package/servers/instance-sizer/lib/instance-ranker.js +120 -76
package/servers/instance-sizer/lib/model-resolver.js +61 -61
package/servers/instance-sizer/lib/quota-resolver.js +113 -113
package/servers/instance-sizer/lib/vram-estimator.js +31 -31
package/servers/lib/bedrock-client.js +38 -38
package/servers/lib/catalogs/instances.json +27 -0
package/servers/lib/catalogs/model-servers.json +201 -3
package/servers/lib/custom-validators.js +13 -13
package/servers/lib/dynamic-resolver.js +4 -4
package/servers/marketplace-picker/index.js +342 -0
package/servers/marketplace-picker/manifest.json +14 -0
package/servers/marketplace-picker/package.json +18 -0
package/servers/model-picker/index.js +382 -382
package/servers/region-picker/index.js +56 -56
package/servers/workload-picker/LICENSE +202 -0
package/servers/workload-picker/catalogs/workload-profiles.json +67 -0
package/servers/workload-picker/index.js +171 -0
package/servers/workload-picker/manifest.json +16 -0
package/servers/workload-picker/package.json +16 -0
package/src/app.js +12 -3
package/src/lib/bootstrap-command-handler.js +609 -15
package/src/lib/bootstrap-config.js +36 -0
package/src/lib/bootstrap-profile-manager.js +48 -41
package/src/lib/ci-register-helpers.js +74 -0
package/src/lib/config-loader.js +3 -0
package/src/lib/config-manager.js +7 -0
package/src/lib/config-validator.js +1 -1
package/src/lib/cuda-resolver.js +17 -8
package/src/lib/generated/cli-options.js +319 -314
package/src/lib/generated/parameter-matrix.js +672 -661
package/src/lib/generated/validation-rules.js +76 -72
package/src/lib/path-prover-brain.js +664 -0
package/src/lib/prompts/infrastructure-prompts.js +2 -2
package/src/lib/prompts/model-prompts.js +6 -0
package/src/lib/prompts/project-prompts.js +12 -0
package/src/lib/secrets-prompt-runner.js +4 -0
package/src/lib/template-manager.js +1 -1
package/src/lib/template-variable-resolver.js +87 -1
package/src/lib/tune-catalog-validator.js +37 -4
package/templates/Dockerfile +9 -0
package/templates/code/adapter_sidecar.py +444 -0
package/templates/code/serve +6 -0
package/templates/code/serve.d/vllm.ejs +1 -1
package/templates/do/.benchmark_writer.py +1476 -0
package/templates/do/.tune_helper.py +982 -57
package/templates/do/__pycache__/.benchmark_writer.cpython-312.pyc +0 -0
package/templates/do/adapter +154 -0
package/templates/do/benchmark +639 -85
package/templates/do/build +5 -0
package/templates/do/clean.d/async-inference.ejs +5 -0
package/templates/do/clean.d/batch-transform.ejs +5 -0
package/templates/do/clean.d/hyperpod-eks.ejs +5 -0
package/templates/do/clean.d/managed-inference.ejs +5 -0
package/templates/do/config +115 -45
package/templates/do/deploy.d/async-inference.ejs +30 -3
package/templates/do/deploy.d/batch-transform.ejs +29 -3
package/templates/do/deploy.d/hyperpod-eks.ejs +4 -0
package/templates/do/deploy.d/managed-inference.ejs +216 -14
package/templates/do/lib/endpoint-config.sh +1 -1
package/templates/do/lib/profile.sh +44 -0
package/templates/do/optimize +106 -37
package/templates/do/push +5 -0
package/templates/do/register +94 -0
package/templates/do/stage +567 -0
package/templates/do/submit +7 -0
package/templates/do/test +14 -0
package/templates/do/tune +382 -59
package/templates/do/validate +44 -4

package/src/lib/prompts/project-prompts.js CHANGED Viewed

@@ -48,6 +48,18 @@ const projectPrompts = [
             // Derive framework from deploymentConfig if not already set
             const framework = answers.framework || answers.deploymentConfig?.split('-')[0];
             return generateProjectName(framework);
+        },
+        // Accepts names of 2–63 characters made of lowercase letters, digits and
+        // hyphens, starting and ending with a letter or digit. Returns true when the
+        // name is acceptable, otherwise the message to show to the user.
+        validate: (input) => {
+            const nameLength = input ? input.length : 0;
+            const namePattern = /^[a-z0-9][a-z0-9-]*[a-z0-9]$/;
+            if (nameLength < 2) {
+                return 'Project name must be at least 2 characters.';
+            }
+            if (nameLength > 63) {
+                return 'Project name must be 63 characters or fewer.';
+            }
+            return namePattern.test(input)
+                ? true
+                : 'Project name must be lowercase alphanumeric with hyphens (e.g. "qwen3-0-6b-v1-test"). No uppercase, dots, or underscores.';
         }
     }
 ];

package/src/lib/secrets-prompt-runner.js CHANGED Viewed

@@ -70,6 +70,10 @@ export default class SecretsPromptRunner {
             const modelSource = answers.modelSource;
             if (modelSource && modelSource !== 'huggingface') return false;
+            // Skip HF token when model name is an S3 URI (no HF download needed)
+            const modelName = answers.customModelName || answers.modelName;
+            if (modelName && modelName.startsWith('s3://')) return false;
             return true;
         }

package/src/lib/template-manager.js CHANGED Viewed

@@ -146,7 +146,7 @@ export default class TemplateManager {
         // Validate instance type format (ml.*.*) - only for realtime-inference
         if (this.answers.instanceType && this.answers.instanceType !== 'custom') {
-            const instancePattern = /^ml\.[a-z0-9]+\.(nano|micro|small|medium|large|xlarge|[0-9]+xlarge)$/;
+            const instancePattern = /^ml\.[a-z0-9-]+\.(nano|micro|small|medium|large|xlarge|[0-9]+xlarge)$/;
             if (!instancePattern.test(this.answers.instanceType)) {
                 throw new Error(`⚠️  Invalid instance type format: ${this.answers.instanceType}. Expected format: ml.{family}.{size} (e.g., ml.m5.large, ml.g5.xlarge)`);
             }

package/src/lib/template-variable-resolver.js CHANGED Viewed

@@ -4,7 +4,7 @@
 import fs from 'fs';
 import path from 'path';
 import { fileURLToPath } from 'url';
-import { isTuneSupported } from './tune-catalog-validator.js';
+import { isTuneSupported, lookupModel } from './tune-catalog-validator.js';
 const __filename = fileURLToPath(import.meta.url);
 const __dirname = path.dirname(__filename);
@@ -383,6 +383,68 @@ export async function _ensureTemplateVariables(answers, registryConfigManager =
         }
     }
+    // Auto-resolve tensor parallel degree from instance catalog GPU count.
+    // Only applies when:
+    //   1. The engine supports tensor parallelism (vLLM, SGLang, TensorRT-LLM, LMI)
+    //   2. The instance has multiple GPUs (gpus > 1)
+    //   3. The user has NOT explicitly set the TP env var via --server-env or --model-env
+    // This ensures multi-GPU instances default to full TP utilization without requiring
+    // the user to manually specify TENSOR_PARALLEL_SIZE.
+    // Requirements: FTP-1 (extension) — task 6.2
+    const _TP_ENGINE_MAP = {
+        'vllm': 'VLLM_TENSOR_PARALLEL_SIZE',
+        'vllm-omni': 'VLLM_OMNI_TENSOR_PARALLEL_SIZE',
+        'sglang': 'SGLANG_TENSOR_PARALLEL_SIZE',
+        'tensorrt-llm': 'TRTLLM_TENSOR_PARALLEL_SIZE',
+        'lmi': 'OPTION_TENSOR_PARALLEL_DEGREE'
+    };
+    const tpEngine = answers.backend || answers.modelServer;
+    const tpEnvKey = tpEngine ? _TP_ENGINE_MAP[tpEngine] : null;
+    if (tpEnvKey && answers.instanceType) {
+        // Check if user explicitly set the TP value via --server-env (un-prefixed key)
+        const userServerEnvVars = answers.serverEnvVars || {};
+        const userExplicitlySetTP = (
+            userServerEnvVars['TENSOR_PARALLEL_SIZE'] !== undefined ||
+            userServerEnvVars['TENSOR_PARALLEL_DEGREE'] !== undefined ||
+            userServerEnvVars[tpEnvKey] !== undefined
+        );
+        if (!userExplicitlySetTP) {
+            // Look up GPU count from instance catalog
+            let instanceGpuCount = null;
+            if (answers.gpuCount) {
+                instanceGpuCount = answers.gpuCount;
+            } else if (answers.icGpuCount) {
+                instanceGpuCount = answers.icGpuCount;
+            } else {
+                try {
+                    const catalogPath = path.resolve(__dirname, '..', '..', 'servers', 'lib', 'catalogs', 'instances.json');
+                    const catalogData = JSON.parse(fs.readFileSync(catalogPath, 'utf-8'));
+                    const instanceInfo = catalogData?.catalog?.[answers.instanceType];
+                    if (instanceInfo?.gpus && instanceInfo.gpus > 0) {
+                        instanceGpuCount = instanceInfo.gpus;
+                    }
+                } catch {
+                    // Silently continue
+                }
+            }
+            // Auto-set TP to GPU count when instance has multiple GPUs
+            if (instanceGpuCount && instanceGpuCount > 1) {
+                if (!answers.envVars) {
+                    answers.envVars = {};
+                }
+                answers.envVars[tpEnvKey] = String(instanceGpuCount);
+                answers.tensorParallelSize = instanceGpuCount;
+                answers._tpAutoResolved = true;
+                answers._tpAutoResolvedFrom = answers.instanceType;
+                console.log(`    ℹ️  TP degree: ${instanceGpuCount} (auto-detected from ${answers.instanceType})`);
+            }
+        }
+    }
     // Determine tune support based on model presence in the tune catalog.
     // Used by the do/config template to write TUNE_SUPPORTED=true|false.
     if (answers.tuneSupported === undefined) {
@@ -395,4 +457,28 @@ export async function _ensureTemplateVariables(answers, registryConfigManager =
             answers.tuneSupported = false;
         }
     }
+    // Resolve tuneModelId from the catalog — static lookup, no network calls.
+    // Maps the HuggingFace model ID to the Hub content name (catalog key).
+    if (answers.tuneModelId === undefined) {
+        if (answers.tuneSupported && answers.modelName) {
+            try {
+                const tuneCatalogPath = path.resolve(__dirname, '..', '..', 'config', 'tune-catalog.json');
+                const tuneCatalog = JSON.parse(fs.readFileSync(tuneCatalogPath, 'utf-8'));
+                const entry = lookupModel(answers.modelName, tuneCatalog);
+                if (entry) {
+                    const hubContentName = Object.entries(tuneCatalog.models)
+                        .find(([, v]) => v === entry)?.[0];
+                    if (hubContentName) {
+                        answers.tuneModelId = hubContentName;
+                    }
+                }
+            } catch {
+                // Silently continue — tuneModelId will be set to null below
+            }
+        }
+        if (!answers.tuneModelId) {
+            answers.tuneModelId = null;
+        }
+    }
 }

package/src/lib/tune-catalog-validator.js CHANGED Viewed

@@ -13,7 +13,8 @@
 /**
  * Look up a model entry in the catalog by model ID.
- * @param {string} modelId - The model ID to look up
+ * Tries: direct key match, huggingFaceId field match, then normalized/suffix matching.
+ * @param {string} modelId - The model ID to look up (Hub content name or HuggingFace ID)
  * @param {Object} catalog - The tune catalog object with a `models` map
  * @returns {Object|null} The catalog entry for the model, or null if not found
  */
@@ -21,10 +22,42 @@ export function lookupModel(modelId, catalog) {
     if (!catalog || !catalog.models) {
         return null;
     }
-    if (!Object.hasOwn(catalog.models, modelId)) {
-        return null;
+    // Direct key match (Hub content name)
+    if (Object.hasOwn(catalog.models, modelId)) {
+        return catalog.models[modelId] || null;
+    }
+    // Every remaining strategy compares or transforms strings. Without this guard a
+    // missing modelId would equal an absent huggingFaceId (undefined === undefined)
+    // and return an unrelated entry, and any other non-string would throw on split().
+    if (typeof modelId !== 'string') {
+        return null;
+    }
+    // Match by huggingFaceId field (e.g., "Qwen/Qwen3-0.6B")
+    for (const entry of Object.values(catalog.models)) {
+        // Optional chaining: catalog values may be null (hence the `|| null` fallbacks)
+        if (entry?.huggingFaceId === modelId) {
+            return entry;
+        }
+    }
+    // Normalized match: strip org prefix, lowercase, replace dots/spaces with hyphens
+    const normalized = modelId.split('/').pop().toLowerCase().replace(/[.\s]+/g, '-');
+    if (normalized && Object.hasOwn(catalog.models, normalized)) {
+        return catalog.models[normalized] || null;
     }
-    return catalog.models[modelId] || null;
+    // Try without trailing suffixes like -instruct, -chat, -hf, -base
+    const base = normalized ? normalized.replace(/-(instruct|chat|hf|base)$/i, '') : '';
+    if (base && base !== normalized && Object.hasOwn(catalog.models, base)) {
+        return catalog.models[base] || null;
+    }
+    // Suffix match: catalog keys may have prefixes (e.g., "huggingface-reasoning-")
+    // Match if a catalog key ends with the normalized name (must be non-trivial match).
+    // The first key in catalog order that matches wins.
+    if (normalized && normalized.length >= 4) {
+        for (const [key, entry] of Object.entries(catalog.models)) {
+            if (key.endsWith(normalized) || (base && base.length >= 4 && key.endsWith(base))) {
+                return entry || null;
+            }
+        }
+    }
+    return null;
 }
 /**

package/templates/Dockerfile CHANGED Viewed

@@ -243,6 +243,7 @@ ENV <%= key %>=<%= value %>
 ENV VLLM_ENABLE_LORA=true
 ENV VLLM_MAX_LORAS=<%= maxLoras %>
 ENV VLLM_MAX_LORA_RANK=<%= maxLoraRank %>
+ENV VLLM_ALLOW_RUNTIME_LORA_UPDATING=true
 <% } %>
 <% if (enableLora && modelServer === 'sglang') { %>
 # LoRA adapter serving configuration
@@ -307,9 +308,17 @@ COPY code/serving.properties /opt/ml/model/serving.properties
 # LMI/DJL containers use their own entrypoint
 # The container will automatically start DJL Serving with the configuration
 <% } else { %>
+<% if (enableLora && (modelServer === 'vllm' || modelServer === 'sglang')) { %>
+# Install aiohttp for the adapter sidecar
+RUN pip install --no-cache-dir aiohttp
+<% } %>
 COPY code/cuda_compat.sh /usr/bin/cuda_compat.sh
 COPY code/cw_log_forwarder.py /usr/bin/cw_log_forwarder.py
 COPY code/serve /usr/bin/serve
+<% if (enableLora && (modelServer === 'vllm' || modelServer === 'sglang')) { %>
+COPY code/adapter_sidecar.py /usr/bin/adapter_sidecar.py
+<% } %>
 RUN chmod 777 /usr/bin/serve /usr/bin/cuda_compat.sh
 <% if (comments && comments.troubleshooting) { %>

package/templates/code/adapter_sidecar.py ADDED Viewed

@@ -0,0 +1,444 @@
+#!/usr/bin/env python3
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""
+Adapter Sidecar — SageMaker AI adapter contract implementation.
+Lightweight aiohttp HTTP server that sits between SageMaker (port 8080) and the
+model server (port 8081). Implements POST /adapters and DELETE /adapters by
+translating them into the model server's native LoRA API, while proxying all
+other traffic transparently.
+Configuration (environment variables):
+    MODEL_SERVER_PORT  - Internal model server port (default: 8081)
+    MODEL_SERVER_TYPE  - Model server type: vllm or sglang (default: vllm)
+    SIDECAR_PORT       - Port sidecar listens on (default: 8080)
+    MAX_LORAS          - Maximum concurrent adapters (default: 64)
+    HEALTH_POLL_INTERVAL - Seconds between health polls (default: 2)
+    HEALTH_TIMEOUT     - Seconds to wait for model server readiness (default: 600)
+"""
+import asyncio
+import os
+import tarfile
+import time
+from datetime import datetime, timezone
+from aiohttp import web, ClientSession, ClientTimeout
+# ── Configuration ─────────────────────────────────────────────────────────────
+# Settings are read from the environment once, at import time. Each value falls
+# back to the default shown here when its variable is unset.
+MODEL_SERVER_PORT = int(os.getenv('MODEL_SERVER_PORT', '8081'))
+MODEL_SERVER_TYPE = os.getenv('MODEL_SERVER_TYPE', 'vllm')
+SIDECAR_PORT = int(os.getenv('SIDECAR_PORT', '8080'))
+MAX_LORAS = int(os.getenv('MAX_LORAS', '64'))
+HEALTH_POLL_INTERVAL = int(os.getenv('HEALTH_POLL_INTERVAL', '2'))
+HEALTH_TIMEOUT = int(os.getenv('HEALTH_TIMEOUT', '600'))
+# Base URL used for every call to the model server (always on localhost).
+MODEL_SERVER_BASE = f'http://localhost:{MODEL_SERVER_PORT}'
+# ── Logging ───────────────────────────────────────────────────────────────────
+def log(message, stream='stdout'):
+    """Print ``message`` prefixed with a UTC ISO 8601 timestamp and ``[adapter-sidecar]``.
+    Args:
+        message: Text to emit.
+        stream: ``'stderr'`` sends the line to standard error; any other value
+            (default ``'stdout'``) sends it to standard output.
+    """
+    import sys
+    stamp = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
+    target = sys.stderr if stream == 'stderr' else sys.stdout
+    print(f'{stamp} [adapter-sidecar] {message}', file=target)
+# ── Artifact Resolution ───────────────────────────────────────────────────────
+class ArtifactResolver:
+    """Resolves adapter artifacts from a source path.
+    Handles three cases:
+    1. Path is a tar.gz file, or a directory holding exactly one tar.gz — extract
+       in place and return the directory once adapter_config.json is present
+    2. Path is a directory containing adapter_config.json — use it directly
+    3. Path does not exist, is empty, or yields no adapter_config.json — raise
+       FileNotFoundError
+    """
+    @staticmethod
+    def resolve(src):
+        """Resolve the adapter artifact path.
+        Args:
+            src: Filesystem path where SageMaker placed adapter artifacts.
+        Returns:
+            Directory path that contains adapter_config.json.
+        Raises:
+            FileNotFoundError: If the path does not exist, is empty, or does not
+                contain adapter_config.json (directly or after extraction).
+            RuntimeError: If tar.gz extraction fails.
+        """
+        # Check if path exists
+        if not os.path.exists(src):
+            raise FileNotFoundError(f'Adapter artifact path does not exist: {src}')
+        # If src is a file (direct tar.gz path), extract it next to itself.
+        # A bare filename has an empty dirname, so fall back to the current directory.
+        if os.path.isfile(src) and src.endswith('.tar.gz'):
+            extract_dir = os.path.dirname(src) or '.'
+            ArtifactResolver._extract_tar_gz(src, extract_dir)
+            return ArtifactResolver._require_adapter_config(extract_dir, src)
+        # Anything else must be a directory
+        if not os.path.isdir(src):
+            raise FileNotFoundError(f'Adapter artifact path is not a directory: {src}')
+        # Check if directory is empty
+        contents = os.listdir(src)
+        if not contents:
+            raise FileNotFoundError(f'Adapter artifact path is empty: {src}')
+        # Check if directory already contains adapter_config.json (extracted files)
+        if 'adapter_config.json' in contents:
+            return src
+        # Check if directory contains a single tar.gz file
+        tar_files = [f for f in contents if f.endswith('.tar.gz')]
+        if len(tar_files) == 1:
+            tar_path = os.path.join(src, tar_files[0])
+            ArtifactResolver._extract_tar_gz(tar_path, src)
+            return ArtifactResolver._require_adapter_config(src, tar_path)
+        # The path exists but has no recognizable adapter artifacts
+        raise FileNotFoundError(
+            f'Adapter artifact path does not contain adapter_config.json or a tar.gz archive: {src}'
+        )
+    @staticmethod
+    def _require_adapter_config(directory, tar_path):
+        """Return ``directory`` if extraction left adapter_config.json in it.
+        Args:
+            directory: Directory the archive was extracted into.
+            tar_path: Archive that was extracted (used in the error message).
+        Raises:
+            FileNotFoundError: If adapter_config.json is not at the top level.
+        """
+        if 'adapter_config.json' in os.listdir(directory):
+            return directory
+        raise FileNotFoundError(
+            f'Archive {tar_path} did not produce adapter_config.json in {directory}'
+        )
+    @staticmethod
+    def _extract_tar_gz(tar_path, extract_dir):
+        """Extract a tar.gz archive to the specified directory.
+        Args:
+            tar_path: Path to the tar.gz file.
+            extract_dir: Directory to extract files into.
+        Raises:
+            RuntimeError: If extraction fails due to corruption or permission issues,
+                or if the archive holds members that are unsafe to extract.
+        """
+        try:
+            with tarfile.open(tar_path, 'r:gz') as tar:
+                # Use filter='data' where the interpreter provides extraction filters
+                if hasattr(tarfile, 'data_filter'):
+                    tar.extractall(path=extract_dir, filter='data')
+                else:
+                    # No extraction filters here: refuse links and any member whose
+                    # path would land outside extract_dir before unpacking.
+                    root = os.path.realpath(extract_dir)
+                    for member in tar.getmembers():
+                        target = os.path.realpath(os.path.join(root, member.name))
+                        escapes = os.path.commonpath([root, target]) != root
+                        if escapes or member.issym() or member.islnk():
+                            raise RuntimeError(
+                                f'Refusing to extract tar.gz archive {tar_path}: unsafe member {member.name!r}'
+                            )
+                    tar.extractall(path=extract_dir)
+        except (tarfile.TarError, OSError) as e:
+            # PermissionError is an OSError subclass, so it is covered here as well
+            raise RuntimeError(f'Failed to extract tar.gz archive {tar_path}: {e}') from e
+# ── Model Server Client (Strategy Pattern) ────────────────────────────────────
+class ModelServerClient:
+    """Base class describing how adapters are loaded into and unloaded from a model server.
+    Each supported model server type provides a subclass that performs the
+    actual HTTP calls; this base only stores the shared connection details.
+    """
+    def __init__(self, session, base_url):
+        # Keep the HTTP session and base URL handed in by the caller for subclasses to use.
+        self.session, self.base_url = session, base_url
+    async def load_adapter(self, name, path):
+        """Ask the model server to load a LoRA adapter.
+        Args:
+            name: Adapter identifier.
+            path: Resolved filesystem path to adapter artifacts.
+        Returns:
+            dict with response data from the model server.
+        Raises:
+            RuntimeError: If the model server returns an error or is unreachable.
+            NotImplementedError: Always, in this base class.
+        """
+        raise NotImplementedError
+    async def unload_adapter(self, name):
+        """Ask the model server to unload a LoRA adapter.
+        Args:
+            name: Adapter identifier.
+        Returns:
+            dict with response data from the model server.
+        Raises:
+            RuntimeError: If the model server returns an error or is unreachable.
+            NotImplementedError: Always, in this base class.
+        """
+        raise NotImplementedError
+class VLLMClient(ModelServerClient):
+    """vLLM-specific adapter API translation.
+    Load: POST /v1/load_lora_adapter {"lora_name": name, "lora_path": path}
+    Unload: POST /v1/unload_lora_adapter {"lora_name": name}
+    """
+    async def _post(self, endpoint, payload):
+        """POST ``payload`` as JSON to ``/v1/<endpoint>`` and translate the outcome.
+        Args:
+            endpoint: vLLM endpoint name, e.g. ``load_lora_adapter``.
+            payload: JSON-serializable request body.
+        Returns:
+            ``{'status': 'success', 'response': <body text>}`` on HTTP 200.
+        Raises:
+            RuntimeError: If the request cannot be completed (the original exception
+                is attached as the cause) or vLLM answers with a non-200 status.
+        """
+        url = f'{self.base_url}/v1/{endpoint}'
+        try:
+            async with self.session.post(url, json=payload) as resp:
+                status = resp.status
+                body = await resp.text()
+        except Exception as e:
+            raise RuntimeError(f'Failed to connect to vLLM: {e}') from e
+        # Judge the status outside the try block so the HTTP-level failure below
+        # is never mistaken for a connection error.
+        if status == 200:
+            return {'status': 'success', 'response': body}
+        raise RuntimeError(f'vLLM {endpoint} failed (HTTP {status}): {body}')
+    async def load_adapter(self, name, path):
+        """Load a LoRA adapter via vLLM's native API."""
+        return await self._post('load_lora_adapter', {'lora_name': name, 'lora_path': path})
+    async def unload_adapter(self, name):
+        """Unload a LoRA adapter via vLLM's native API."""
+        return await self._post('unload_lora_adapter', {'lora_name': name})
+class SGLangClient(ModelServerClient):
+    """Placeholder for SGLang adapter API translation. (Deferred)
+    SGLang support has not been written yet, so both operations raise
+    NotImplementedError instead of contacting the model server.
+    """
+    async def load_adapter(self, name, path):
+        """Always raises NotImplementedError: SGLang loading is not yet implemented."""
+        raise NotImplementedError('SGLang adapter loading is not yet implemented')
+    async def unload_adapter(self, name):
+        """Always raises NotImplementedError: SGLang unloading is not yet implemented."""
+        raise NotImplementedError('SGLang adapter unloading is not yet implemented')
+def create_model_server_client(session, base_url, server_type):
+    """Build the ModelServerClient subclass that matches ``server_type``.
+    Args:
+        session: aiohttp.ClientSession for HTTP calls.
+        base_url: Model server base URL (e.g., http://localhost:8081).
+        server_type: Model server type ('vllm' or 'sglang').
+    Returns:
+        ModelServerClient instance.
+    Raises:
+        ValueError: If ``server_type`` is neither 'vllm' nor 'sglang'.
+    """
+    known_clients = (('vllm', VLLMClient), ('sglang', SGLangClient))
+    for kind, client_cls in known_clients:
+        if server_type == kind:
+            return client_cls(session, base_url)
+    raise ValueError(f'Unsupported model server type: {server_type}')
+# ── State ─────────────────────────────────────────────────────────────────────
+# Maps adapter name -> resolved artifact path for adapters loaded via POST /adapters;
+# entries are removed again by DELETE /adapters.
+adapter_registry = {}
+# False until the health poller sees HTTP 200 from the model server or gives up
+# after HEALTH_TIMEOUT seconds; /ping answers 503 while it is False.
+model_server_ready = False
+# ── Health Polling (Readiness Gating) ─────────────────────────────────────────
+async def poll_model_server_health(app):
+    """Poll the model server's /health endpoint until it is ready or time runs out.
+    Flips the module-level ``model_server_ready`` flag to True as soon as /health
+    answers 200. If HEALTH_TIMEOUT seconds pass first, a warning is logged to
+    stderr and the flag is set to True anyway so requests are not blocked forever.
+    Args:
+        app: The aiohttp application; its 'session' entry is used for the polls.
+    """
+    global model_server_ready
+    http = app['session']
+    began = time.monotonic()
+    log(f'Starting health polling — interval={HEALTH_POLL_INTERVAL}s, timeout={HEALTH_TIMEOUT}s')
+    while time.monotonic() - began < HEALTH_TIMEOUT:
+        try:
+            async with http.get(f'{MODEL_SERVER_BASE}/health', timeout=ClientTimeout(total=5)) as resp:
+                healthy = resp.status == 200
+        except Exception:
+            # The server is usually still starting up; treat any failure as "not yet".
+            healthy = False
+        if healthy:
+            model_server_ready = True
+            log('Model server is ready (health endpoint returned 200)')
+            return
+        await asyncio.sleep(HEALTH_POLL_INTERVAL)
+    # Timeout: log warning and begin accepting requests
+    log(f'Health timeout reached ({HEALTH_TIMEOUT}s) — model server did not become ready. Accepting requests anyway.', stream='stderr')
+    model_server_ready = True
+# ── Handlers ──────────────────────────────────────────────────────────────────
+async def handle_ping(request):
+    """GET /ping — readiness gating.
+    Answers 503 while ``model_server_ready`` is False. Afterwards the model
+    server's /health response (status, body, Content-Type) is relayed, and a 503
+    is returned if that call fails.
+    """
+    if model_server_ready:
+        http = request.app['session']
+        try:
+            async with http.get(f'{MODEL_SERVER_BASE}/health') as upstream:
+                payload = await upstream.read()
+                content_type = upstream.headers.get('Content-Type', 'text/plain')
+                return web.Response(status=upstream.status, body=payload,
+                                    headers={'Content-Type': content_type})
+        except Exception as exc:
+            return web.Response(status=503, text=f'Model server unreachable: {exc}')
+    return web.Response(status=503, text='Service Unavailable')
+async def handle_adapters_post(request):
+    """POST /adapters — load a LoRA adapter.
+    Query parameters:
+        name: Adapter identifier (required).
+        src: Filesystem path to the adapter artifacts (required).
+    Responses (JSON):
+        200 — adapter loaded; body carries the adapter name and resolved path.
+        400 — ``name`` or ``src`` is missing.
+        404 — no adapter artifacts could be found at ``src``.
+        500 — archive extraction or the model server call failed.
+        507 — loading a new adapter would exceed MAX_LORAS.
+    """
+    name = request.query.get('name')
+    src = request.query.get('src')
+    if not name:
+        return web.json_response({'status': 'error', 'error': 'Missing required query parameter: name'}, status=400)
+    if not src:
+        return web.json_response({'status': 'error', 'error': 'Missing required query parameter: src'}, status=400)
+    # Check MAX_LORAS limit. Re-loading a name that is already registered does not
+    # add a registry entry, so it is allowed even when the registry is full.
+    if name not in adapter_registry and len(adapter_registry) >= MAX_LORAS:
+        return web.json_response(
+            {'status': 'error', 'adapter': name, 'error': f'Maximum concurrent adapters ({MAX_LORAS}) reached'},
+            status=507
+        )
+    # Resolve adapter artifacts. Resolution may unpack a tar.gz, which is blocking
+    # file I/O, so it runs in the default executor to keep the event loop (and with
+    # it /ping and proxied traffic) responsive.
+    try:
+        resolved_path = await asyncio.get_running_loop().run_in_executor(
+            None, ArtifactResolver.resolve, src
+        )
+    except FileNotFoundError as e:
+        return web.json_response({'status': 'error', 'adapter': name, 'error': str(e)}, status=404)
+    except RuntimeError as e:
+        return web.json_response({'status': 'error', 'adapter': name, 'error': str(e)}, status=500)
+    # Call model server native LoRA API
+    client = request.app['model_server_client']
+    try:
+        await client.load_adapter(name, resolved_path)
+    except RuntimeError as e:
+        log(f'Adapter load failed — name={name}, src={src}, error={e}', stream='stderr')
+        return web.json_response({'status': 'error', 'adapter': name, 'error': str(e)}, status=500)
+    # Register adapter and respond
+    adapter_registry[name] = resolved_path
+    log(f'Adapter loaded — name={name}, src={src}, resolved_path={resolved_path}')
+    return web.json_response({'status': 'loaded', 'adapter': name, 'path': resolved_path})
+async def handle_adapters_delete(request):
+    """DELETE /adapters — unload a LoRA adapter.
+    Requires the ``name`` query parameter (400 if absent). The unload is sent to
+    the model server first; only when that succeeds is the adapter dropped from
+    the registry. A failed unload is logged to stderr and reported as 500.
+    """
+    adapter = request.query.get('name')
+    if not adapter:
+        return web.json_response({'status': 'error', 'error': 'Missing required query parameter: name'}, status=400)
+    backend = request.app['model_server_client']
+    try:
+        await backend.unload_adapter(adapter)
+    except RuntimeError as exc:
+        log(f'Adapter unload failed — name={adapter}, error={exc}', stream='stderr')
+        failure = {'status': 'error', 'adapter': adapter, 'error': str(exc)}
+        return web.json_response(failure, status=500)
+    # Forget the adapter; a name that was never registered is simply ignored.
+    adapter_registry.pop(adapter, None)
+    log(f'Adapter unloaded — name={adapter}')
+    return web.json_response({'status': 'unloaded', 'adapter': adapter})
+async def handle_proxy(request):
+    """Proxy all non-/adapters requests to the model server transparently.
+    The request body is read in full, sent to the same path and query string on
+    the model server, and the complete upstream response is relayed back.
+    Connection-specific (hop-by-hop) headers are not forwarded in either
+    direction, because they describe one connection only and the body has
+    already been re-framed by buffering.
+    Returns:
+        The upstream status, body and end-to-end headers, or a 500 JSON error if
+        the request to the model server fails.
+    """
+    # Header fields that apply to a single connection (RFC 9110 section 7.6.1)
+    hop_by_hop = frozenset((
+        'connection', 'keep-alive', 'proxy-authenticate', 'proxy-authorization',
+        'te', 'trailer', 'transfer-encoding', 'upgrade',
+    ))
+    session = request.app['session']
+    target_url = f'{MODEL_SERVER_BASE}{request.path_qs}'
+    try:
+        body = await request.read()
+        # Host is dropped so the client library sets it for the upstream target
+        forward_headers = {k: v for k, v in request.headers.items()
+                           if k.lower() != 'host' and k.lower() not in hop_by_hop}
+        async with session.request(
+            method=request.method,
+            url=target_url,
+            headers=forward_headers,
+            data=body if body else None
+        ) as resp:
+            resp_body = await resp.read()
+            # Content-Encoding and Content-Length are left for the response object to
+            # set; presumably the client has already decoded the body — confirm.
+            response_headers = {k: v for k, v in resp.headers.items()
+                                if k.lower() not in hop_by_hop
+                                and k.lower() not in ('content-encoding', 'content-length')}
+            return web.Response(status=resp.status, body=resp_body, headers=response_headers)
+    except Exception as e:
+        return web.json_response({'status': 'error', 'error': f'Model server unreachable: {e}'}, status=500)
+# ── Application Setup ─────────────────────────────────────────────────────────
+async def on_startup(app):
+    """Startup hook: open the shared HTTP session, build the model server client
+    for MODEL_SERVER_TYPE, and launch the health polling task in the background.
+    All three are stored on ``app`` under 'session', 'model_server_client' and
+    'health_task'.
+    """
+    http = ClientSession()
+    app['session'] = http
+    app['model_server_client'] = create_model_server_client(http, MODEL_SERVER_BASE, MODEL_SERVER_TYPE)
+    app['health_task'] = asyncio.create_task(poll_model_server_health(app))
+    summary = (f'Sidecar started — port={SIDECAR_PORT}, model_server_port={MODEL_SERVER_PORT}, '
+               f'model_server_type={MODEL_SERVER_TYPE}, max_loras={MAX_LORAS}')
+    log(summary)
+async def on_cleanup(app):
+    """Cleanup hook: stop the health polling task, then close the HTTP session."""
+    poller = app['health_task']
+    poller.cancel()
+    try:
+        # Wait for the task to finish so the cancellation is fully processed.
+        await poller
+    except asyncio.CancelledError:
+        pass
+    http = app['session']
+    await http.close()
+def create_app():
+    """Build the aiohttp application with its routes and lifecycle hooks.
+    Returns:
+        A web.Application serving /ping and /adapters itself and proxying every
+        other path to the model server.
+    """
+    application = web.Application()
+    routes = application.router
+    # Specific routes are registered before the catch-all proxy route
+    routes.add_get('/ping', handle_ping)
+    routes.add_post('/adapters', handle_adapters_post)
+    routes.add_delete('/adapters', handle_adapters_delete)
+    routes.add_route('*', '/{path:.*}', handle_proxy)
+    # Lifecycle hooks
+    for hooks, callback in ((application.on_startup, on_startup),
+                            (application.on_cleanup, on_cleanup)):
+        hooks.append(callback)
+    return application
+# Script entry point: build the application and serve it on all interfaces
+# (0.0.0.0) at SIDECAR_PORT.
+# NOTE(review): print=None presumably silences aiohttp's startup banner — confirm
+# against the aiohttp run_app documentation.
+if __name__ == '__main__':
+    app = create_app()
+    web.run_app(app, host='0.0.0.0', port=SIDECAR_PORT, print=None)