npm - @aws/ml-container-creator - Versions diffs - 0.4.0 → 0.5.0 - Mend

@aws/ml-container-creator 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30)

package/bin/cli.js +5 -2
package/infra/ci-harness/buildspec.yml +60 -0
package/package.json +1 -1
package/servers/README.md +41 -1
package/servers/instance-sizer/index.js +6 -0
package/src/app.js +33 -2
package/src/lib/config-manager.js +40 -1
package/src/lib/deployment-entry-schema.js +16 -0
package/src/lib/prompt-runner.js +174 -3
package/src/lib/prompts.js +222 -2
package/src/lib/registry-command-handler.js +12 -0
package/templates/Dockerfile +12 -0
package/templates/code/serving.properties +14 -0
package/templates/do/adapter +1214 -0
package/templates/do/adapters/.gitkeep +2 -0
package/templates/do/add-ic +130 -0
package/templates/do/benchmark +81 -9
package/templates/do/clean +507 -17
package/templates/do/config +23 -1
package/templates/do/deploy +513 -367
package/templates/do/ic/default.conf +32 -0
package/templates/do/lib/endpoint-config.sh +216 -0
package/templates/do/lib/inference-component.sh +167 -0
package/templates/do/lib/secrets.sh +44 -0
package/templates/do/lib/wait.sh +131 -0
package/templates/do/logs +107 -27
package/templates/do/optimize +528 -0
package/templates/do/register +111 -1
package/templates/do/status +337 -0
package/templates/do/test +80 -28

package/bin/cli.js CHANGED Viewed

@@ -98,6 +98,9 @@ program
     .addOption(new Option('--include-sample', 'Include sample model code'))
     .addOption(new Option('--include-testing', 'Include test suite'))
     .addOption(new Option('--test-types <types>', 'Comma-separated test types'))
+    .addOption(new Option('--enable-lora', 'Enable LoRA adapter serving (transformers with vllm/sglang/djl-lmi only)'))
+    .addOption(new Option('--max-loras <n>', 'Maximum concurrent LoRA adapters in GPU memory (default: 30)'))
+    .addOption(new Option('--max-lora-rank <n>', 'Maximum LoRA rank (default: 64)'))
     // --- MCP & Discovery ---
     .addOption(new Option('--smart', 'Enable Bedrock-powered smart mode on MCP servers'))
@@ -190,7 +193,7 @@ program.configureHelp({
                 groups.env.push(opt);
             } else if (['--hf-token', '--hf-token-arn', '--ngc-token', '--ngc-token-arn'].includes(long)) {
                 groups.auth.push(opt);
-            } else if (['--include-sample', '--include-testing', '--test-types'].includes(long)) {
+            } else if (['--include-sample', '--include-testing', '--test-types', '--enable-lora', '--max-loras', '--max-lora-rank'].includes(long)) {
                 groups.features.push(opt);
             } else if (['--smart', '--discover'].includes(long)) {
                 groups.mcp.push(opt);
@@ -307,7 +310,6 @@ program
 program
     .command('registry')
     .description('Registry operations (list, get, remove, replay, export, import, search) — experimental, may be reconciled with do/register')
-    .passThroughOptions()
     .argument('<action>', 'Registry action (log, list, get, remove, replay, export, import, search)')
     .argument('[args...]', 'Additional arguments')
     .option('--backend <backend>', 'Filter by backend')
@@ -328,6 +330,7 @@ program
     .option('--notes <text>', 'Deployment notes')
     .option('--project', 'Use project-level registry')
     .option('--parameters <json>', 'Parameters JSON string')
+    .option('--ic-list <json>', 'IC list JSON string')
     .option('--generator-version <version>', 'Generator version')
     // Options used by `registry list-architectures`
     .option('--server <name>', 'Filter by server name (for list-architectures)')

package/infra/ci-harness/buildspec.yml CHANGED Viewed

@@ -40,6 +40,10 @@ phases:
       - REGISTER_DURATION=0
       - REGISTER_LOG_POINTER=""
       - REGISTER_ERROR_SUMMARY=""
+      - ADAPTER_TEST_STATUS="skip"
+      - ADAPTER_TEST_DURATION=0
+      - ADAPTER_TEST_LOG_POINTER=""
+      - ADAPTER_TEST_ERROR_SUMMARY=""
       - TEARDOWN_STATUS="skip"
       - TEARDOWN_DURATION=0
       - TEARDOWN_LOG_POINTER=""
@@ -182,6 +186,54 @@ phases:
         fi
       - rm -f "$STAGE_STDERR_FILE"
+      # --- Stage: Adapter_Test (only if do/adapters/ has .conf files) ---
+      - echo "=== Stage: Adapter_Test ==="
+      - STAGE_START=$(date +%s)
+      - ADAPTER_TEST_LOG_POINTER="$LOG_POINTER_PREFIX"
+      - STAGE_STDERR_FILE=$(mktemp)
+      - |
+        if [ -n "$FIRST_FAILURE" ]; then
+          echo "Skipping Adapter_Test stage due to prior failure in $FIRST_FAILURE"
+          ADAPTER_TEST_STATUS="skip"
+          ADAPTER_TEST_DURATION=0
+        else
+          cd /tmp/ci-project
+          ADAPTER_CONFS=$(find do/adapters -name '*.conf' 2>/dev/null | grep -v '.gitkeep' || true)
+          if [ -z "$ADAPTER_CONFS" ]; then
+            echo "No adapter configs found in do/adapters/ — skipping"
+            ADAPTER_TEST_STATUS="skip"
+            ADAPTER_TEST_DURATION=0
+          else
+            (
+              set -e
+              cd /tmp/ci-project
+              for conf in do/adapters/*.conf; do
+                [ -f "$conf" ] || continue
+                [[ "$(basename "$conf")" == ".gitkeep" ]] && continue
+                ADAPTER_NAME=$(basename "$conf" .conf)
+                echo "Testing adapter: ${ADAPTER_NAME}"
+                # Source to get weights URI
+                source "$conf"
+                ./do/adapter add "${ADAPTER_NAME}" --weights "${ADAPTER_WEIGHTS_URI}"
+                ./do/test --ic "${ADAPTER_NAME}"
+                ./do/adapter remove "${ADAPTER_NAME}"
+              done
+            ) 2>"$STAGE_STDERR_FILE"; STAGE_EXIT=$?
+            STAGE_END=$(date +%s)
+            ADAPTER_TEST_DURATION=$((STAGE_END - STAGE_START))
+            if [ "$STAGE_EXIT" -eq 0 ]; then
+              ADAPTER_TEST_STATUS="pass"
+              echo "Adapter_Test stage passed in ${ADAPTER_TEST_DURATION}s"
+            else
+              ADAPTER_TEST_STATUS="fail"
+              ADAPTER_TEST_ERROR_SUMMARY=$(tail -c 500 "$STAGE_STDERR_FILE" | tr -d '\000' | tr '"' "'" | tr '\n' ' ')
+              FIRST_FAILURE="adapter_test"
+              echo "Adapter_Test stage FAILED (exit code $STAGE_EXIT) in ${ADAPTER_TEST_DURATION}s"
+            fi
+          fi
+        fi
+      - rm -f "$STAGE_STDERR_FILE"
       # --- Stage: Register (placeholder) ---
       - echo "=== Stage: Register ==="
       - STAGE_START=$(date +%s)
@@ -260,6 +312,7 @@ phases:
             validate)    FINAL_ERROR_MESSAGE="$VALIDATE_ERROR_SUMMARY" ;;
             build)       FINAL_ERROR_MESSAGE="$BUILD_ERROR_SUMMARY" ;;
             deploy_test) FINAL_ERROR_MESSAGE="$DEPLOY_TEST_ERROR_SUMMARY" ;;
+            adapter_test) FINAL_ERROR_MESSAGE="$ADAPTER_TEST_ERROR_SUMMARY" ;;
             register)    FINAL_ERROR_MESSAGE="$REGISTER_ERROR_SUMMARY" ;;
             *)           FINAL_ERROR_MESSAGE="Unknown failure stage" ;;
           esac
@@ -272,6 +325,7 @@ phases:
         ESCAPED_VALIDATE_ERROR=$(printf '%s' "$VALIDATE_ERROR_SUMMARY" | sed 's/\\/\\\\/g; s/"/\\"/g')
         ESCAPED_BUILD_ERROR=$(printf '%s' "$BUILD_ERROR_SUMMARY" | sed 's/\\/\\\\/g; s/"/\\"/g')
         ESCAPED_DEPLOY_TEST_ERROR=$(printf '%s' "$DEPLOY_TEST_ERROR_SUMMARY" | sed 's/\\/\\\\/g; s/"/\\"/g')
+        ESCAPED_ADAPTER_TEST_ERROR=$(printf '%s' "$ADAPTER_TEST_ERROR_SUMMARY" | sed 's/\\/\\\\/g; s/"/\\"/g')
         ESCAPED_REGISTER_ERROR=$(printf '%s' "$REGISTER_ERROR_SUMMARY" | sed 's/\\/\\\\/g; s/"/\\"/g')
         ESCAPED_TEARDOWN_ERROR=$(printf '%s' "$TEARDOWN_ERROR_SUMMARY" | sed 's/\\/\\\\/g; s/"/\\"/g')
         ESCAPED_FINAL_ERROR=$(printf '%s' "$FINAL_ERROR_MESSAGE" | sed 's/\\/\\\\/g; s/"/\\"/g')
@@ -314,6 +368,12 @@ phases:
                   \"logPointer\": {\"S\": \"$DEPLOY_TEST_LOG_POINTER\"},
                   \"errorSummary\": {\"S\": \"$ESCAPED_DEPLOY_TEST_ERROR\"}
                 }},
+                \"adapter_test\": {\"M\": {
+                  \"status\": {\"S\": \"$ADAPTER_TEST_STATUS\"},
+                  \"durationSeconds\": {\"N\": \"$ADAPTER_TEST_DURATION\"},
+                  \"logPointer\": {\"S\": \"$ADAPTER_TEST_LOG_POINTER\"},
+                  \"errorSummary\": {\"S\": \"$ESCAPED_ADAPTER_TEST_ERROR\"}
+                }},
                 \"register\": {\"M\": {
                   \"status\": {\"S\": \"$REGISTER_STATUS\"},
                   \"durationSeconds\": {\"N\": \"$REGISTER_DURATION\"},

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@aws/ml-container-creator",
-  "version": "0.4.0",
+  "version": "0.5.0",
   "description": "Generator for SageMaker AI BYOC paradigm for predictive inference use-cases.",
   "type": "module",
   "main": "src/app.js",

package/servers/README.md CHANGED Viewed

@@ -15,7 +15,12 @@ servers/
 │   ├── test.js                 # Standalone tests (node test.js)
 │   ├── package.json
 │   └── LICENSE
-└── region-picker/              # AWS region suggestion server
+├── region-picker/              # AWS region suggestion server
+│   ├── index.js                # MCP server entry point
+│   ├── test.js                 # Standalone tests (node test.js)
+│   ├── package.json
+│   └── LICENSE
+└── endpoint-picker/            # SageMaker endpoint discovery server
     ├── index.js                # MCP server entry point
     ├── test.js                 # Standalone tests (node test.js)
     ├── package.json
@@ -74,6 +79,39 @@ Suggests AWS regions for SageMaker deployments based on a search term. Filters t
 }
 ```
+### endpoint-picker
+Discovers InService SageMaker real-time endpoints with available GPU capacity for attaching new inference components. Uses `ListEndpoints`, `DescribeEndpoint`, and `ListInferenceComponents` to calculate available capacity.
+**Discover mode:** Queries the SageMaker API using a 3-strategy credential fallback (explicit profile → default chain → detect profiles). No static mode — always requires AWS credentials.
+**Tool:** `get_inference_endpoints`
+| Input Field | Type | Description |
+|-------------|------|-------------|
+| `parameters` | `string[]` | Must include `"endpointName"` to get results |
+| `limit` | `number` | Max endpoints to return (default: 10) |
+| `context` | `object` | `awsRegion`, `awsProfile`, `deploymentTarget` (must be `realtime-inference`) |
+**Example response:**
+```json
+{
+  "values": { "endpointName": "my-endpoint-1234567890" },
+  "choices": { "endpointName": ["my-endpoint-1234567890", "prod-llm-endpoint"] },
+  "metadata": {
+    "my-endpoint-1234567890": {
+      "variantName": "AllTraffic",
+      "instanceType": "ml.g6e.48xlarge",
+      "instanceCount": 1,
+      "icCount": 2,
+      "availableGpus": 4,
+      "hasInstancePools": false
+    }
+  }
+}
+```
 ## Usage
 ### Adding a Bundled Server
@@ -297,6 +335,7 @@ The Bedrock API didn't respond within 10 seconds. This usually means network con
 ```bash
 node servers/region-picker/test.js
 node servers/instance-recommender/test.js
+node servers/endpoint-picker/test.js
 ```
 ### Smart Mode Not Activating
@@ -313,6 +352,7 @@ Each server has standalone tests that run without AWS credentials or network acc
 # Run individual server tests
 node servers/region-picker/test.js
 node servers/instance-recommender/test.js
+node servers/endpoint-picker/test.js
 # Run all server tests from the project root
 npm run test:servers

package/servers/instance-sizer/index.js CHANGED Viewed

@@ -383,6 +383,7 @@ async function handleGetInstanceRecommendation(params) {
     // Step 3a: Quota & availability filtering (discover mode only)
     let preQuotaFilterCount = 0
     let allFilteredByQuota = false
+    let preQuotaRecommendations = []
     if (DISCOVER_MODE && recommendations.length > 0) {
         try {
             const region = process.env.AWS_REGION || process.env.AWS_DEFAULT_REGION || BEDROCK_REGION
@@ -396,6 +397,7 @@ async function handleGetInstanceRecommendation(params) {
             ])
             preQuotaFilterCount = recommendations.length
+            preQuotaRecommendations = [...recommendations]
             recommendations = applyAvailabilityRanking(
                 recommendations,
                 quotas.status === 'fulfilled' ? quotas.value : null,
@@ -404,6 +406,10 @@ async function handleGetInstanceRecommendation(params) {
             )
             if (recommendations.length === 0 && preQuotaFilterCount > 0) {
                 allFilteredByQuota = true
+                // Restore pre-filter recommendations so user can see compatible instances
+                // and request quota increases for the ones they want
+                recommendations = preQuotaRecommendations
+                log(`All ${preQuotaFilterCount} instances filtered by zero-quota — restoring unfiltered list`)
             }
         } catch (err) {
             // Graceful degradation: if credentials are missing or any unexpected

package/src/app.js CHANGED Viewed

@@ -302,6 +302,22 @@ export async function writeProject(templateDir, destDir, answers, registryConfig
         ignorePatterns.push('**/hyperpod/**');
     }
+    // HyperPod is kubectl-based — no shared bash helpers or IC configs
+    if (answers.deploymentTarget === 'hyperpod-eks') {
+        ignorePatterns.push('**/do/lib/**');
+        ignorePatterns.push('**/do/ic/**');
+        ignorePatterns.push('**/do/add-ic');
+        ignorePatterns.push('**/do/status');
+        ignorePatterns.push('**/do/optimize');
+    }
+    // Async and batch don't use inference components (IC is real-time only)
+    if (answers.deploymentTarget === 'async-inference' || answers.deploymentTarget === 'batch-transform') {
+        ignorePatterns.push('**/do/ic/**');
+        ignorePatterns.push('**/do/add-ic');
+        ignorePatterns.push('**/do/status');
+    }
     // Resolve architecture
     const resolver = new DeploymentConfigResolver();
     let architecture = answers.architecture;
@@ -325,6 +341,13 @@ export async function writeProject(templateDir, destDir, answers, registryConfig
     // Exclude do/benchmark when benchmarking is not selected
     if (!answers.includeBenchmark) {
         ignorePatterns.push('**/do/benchmark');
+        ignorePatterns.push('**/do/optimize');
+    }
+    // Exclude do/adapter and do/adapters/ when LoRA is not enabled
+    if (!answers.enableLora) {
+        ignorePatterns.push('**/do/adapter');
+        ignorePatterns.push('**/do/adapters/**');
     }
     // Exclude do/test when hosted-model-endpoint is not selected
@@ -567,7 +590,11 @@ async function _ensureTemplateVariables(answers, registryConfigManager = null) {
         baseImage: null,
         modelSource: 'huggingface',
         artifactUri: '',
-        modelLoadStrategy: 'runtime'
+        modelLoadStrategy: 'runtime',
+        existingEndpointName: null,
+        enableLora: false,
+        maxLoras: 30,
+        maxLoraRank: 64
     };
     Object.entries(defaults).forEach(([key, value]) => {
@@ -1052,7 +1079,11 @@ function _setExecutablePermissions(destDir) {
         'do/register',
         'do/ci',
         'do/manifest',
-        'do/benchmark'
+        'do/benchmark',
+        'do/optimize',
+        'do/status',
+        'do/add-ic',
+        'do/adapter'
     ];
     shellScripts.forEach(script => {

package/src/lib/config-manager.js CHANGED Viewed

@@ -1056,6 +1056,39 @@ export default class ConfigManager {
                 required: false,
                 default: null,
                 valueSpace: 'bounded'
+            },
+            enableLora: {
+                cliOption: 'enable-lora',
+                envVar: null,
+                configFile: true,
+                packageJson: false,
+                mcp: false,
+                promptable: true,
+                required: false,
+                default: false,
+                valueSpace: 'bounded'
+            },
+            maxLoras: {
+                cliOption: 'max-loras',
+                envVar: null,
+                configFile: true,
+                packageJson: false,
+                mcp: false,
+                promptable: true,
+                required: false,
+                default: 30,
+                valueSpace: 'bounded'
+            },
+            maxLoraRank: {
+                cliOption: 'max-lora-rank',
+                envVar: null,
+                configFile: true,
+                packageJson: false,
+                mcp: false,
+                promptable: true,
+                required: false,
+                default: 64,
+                valueSpace: 'bounded'
             }
         };
     }
@@ -1088,7 +1121,7 @@ export default class ConfigManager {
      */
     _parseValue(parameter, value) {
         // Handle boolean parameters
-        if (parameter === 'includeSampleModel' || parameter === 'includeTesting' || parameter === 'skipPrompts' || parameter === 'includeBenchmark' || parameter === 'benchmarkStreaming') {
+        if (parameter === 'includeSampleModel' || parameter === 'includeTesting' || parameter === 'skipPrompts' || parameter === 'includeBenchmark' || parameter === 'benchmarkStreaming' || parameter === 'enableLora') {
             return value === true || value === 'true';
         }
@@ -1924,6 +1957,12 @@ export default class ConfigManager {
                 if (param === 'instanceType' && finalConfig.deploymentTarget === 'hyperpod-eks' && !finalConfig.instanceType) {
                     return; // Skip validation only if truly missing for backward compat
                 }
+                // Special case: instanceType is not required when attaching to an existing endpoint
+                // The instance type is inherited from the existing endpoint configuration
+                if (param === 'instanceType' && finalConfig.existingEndpointName) {
+                    return; // Skip validation — instance is inherited from existing endpoint
+                }
                 if (isEmpty) {
                     if (config.promptable) {

package/src/lib/deployment-entry-schema.js CHANGED Viewed

@@ -57,6 +57,22 @@ export default {
                 },
                 buildTarget: {
                     type: ['string', 'null']
+                },
+                icList: {
+                    type: 'array',
+                    items: {
+                        type: 'object',
+                        required: ['name'],
+                        properties: {
+                            name: { type: 'string', minLength: 1 },
+                            image: { type: 'string' },
+                            gpuCount: { type: 'integer', minimum: 0 },
+                            copyCount: { type: 'integer', minimum: 1 },
+                            isAdapter: { type: 'boolean' },
+                            baseIcName: { type: 'string' },
+                            artifactUrl: { type: 'string' }
+                        }
+                    }
                 }
             }
         },

package/src/lib/prompt-runner.js CHANGED Viewed

@@ -18,8 +18,10 @@ import {
     modelLoadStrategyPrompts,
     modelProfilePrompts,
     modulePrompts,
+    loraPrompts,
     benchmarkPrompts,
     infraRegionAndTargetPrompts,
+    infraExistingEndpointPrompts,
     infraInstancePrompts,
     infraAsyncPrompts,
     infraBatchTransformPrompts,
@@ -29,7 +31,9 @@ import {
     destinationPrompts,
     baseImageSearchPrompts,
     baseImagePrompts,
-    formatImageChoices
+    formatImageChoices,
+    filterByCudaGeneration,
+    instanceCatalogRaw
 } from './prompts.js';
 import fs from 'fs';
@@ -187,12 +191,40 @@ export default class PromptRunner {
         // 3a. Region query
         await this._queryMcpForRegion(frameworkAnswers, explicitConfig);
+        // 3a2. Existing endpoint prompt (only for realtime-inference)
+        // Requirements: 3.3, 4.3, 4.4 — endpoint-picker MCP query
+        let existingEndpointAnswers = {};
+        if (regionAndTargetAnswers.deploymentTarget === 'realtime-inference') {
+            // Query endpoint-picker MCP server for available endpoints
+            const resolvedRegion = regionAndTargetAnswers.customAwsRegion || regionAndTargetAnswers.awsRegion;
+            await this._queryMcpForEndpoints({ ...regionAndTargetAnswers, awsRegion: resolvedRegion }, explicitConfig);
+            const endpointPreviousAnswers = {
+                ...regionAndTargetAnswers,
+                ...(this._mcpEndpointChoices ? { _mcpEndpointChoices: this._mcpEndpointChoices } : {})
+            };
+            existingEndpointAnswers = await this._runPhase(
+                infraExistingEndpointPrompts,
+                endpointPreviousAnswers,
+                explicitConfig,
+                existingConfig
+            );
+            // Resolve custom endpoint name
+            if (existingEndpointAnswers.customExistingEndpointName) {
+                existingEndpointAnswers.existingEndpointName = existingEndpointAnswers.customExistingEndpointName;
+                delete existingEndpointAnswers.customExistingEndpointName;
+            }
+        }
         // 3b. Instance type — query instance-sizer with full context (model + profile + CUDA)
         let instanceAnswers = {};
-        const needsInstance = regionAndTargetAnswers.deploymentTarget === 'realtime-inference' ||
+        // Skip instance prompts when attaching to an existing endpoint (instance is inherited)
+        const useExistingEndpoint = !!(existingEndpointAnswers.existingEndpointName);
+        const needsInstance = !useExistingEndpoint && (regionAndTargetAnswers.deploymentTarget === 'realtime-inference' ||
             regionAndTargetAnswers.deploymentTarget === 'async-inference' ||
             regionAndTargetAnswers.deploymentTarget === 'batch-transform' ||
-            regionAndTargetAnswers.deploymentTarget === 'hyperpod-eks';
+            regionAndTargetAnswers.deploymentTarget === 'hyperpod-eks');
         if (needsInstance) {
             // Determine architecture type for heuristic fallback
@@ -230,6 +262,74 @@ export default class PromptRunner {
             if (!instanceAnswers.instanceType && !explicitConfig.instanceType && this._architectureHeuristicDefault) {
                 instanceAnswers.instanceType = this._architectureHeuristicDefault;
             }
+            // Process multi-select instance type results (Requirements: 6.4)
+            // When user selects multiple instances via checkbox, derive instanceType and instancePools
+            if (instanceAnswers.instanceTypeSelections && instanceAnswers.instanceTypeSelections.length > 0) {
+                let selections = instanceAnswers.instanceTypeSelections.slice(0, 5); // Cap at 5 (API limit)
+                // Resolve custom input: replace __custom_input__ sentinel with parsed instances
+                if (selections.includes('__custom_input__') && instanceAnswers.customInstanceTypeSelections) {
+                    const customInstances = instanceAnswers.customInstanceTypeSelections
+                        .split(',').map(s => s.trim()).filter(s => s.length > 0);
+                    // Remove the sentinel and any other MCP selections, replace with custom entries
+                    selections = selections.filter(s => s !== '__custom_input__');
+                    selections = [...selections, ...customInstances];
+                    delete instanceAnswers.customInstanceTypeSelections;
+                } else if (selections.includes('__custom_input__')) {
+                    // Sentinel selected but no custom input provided — remove it
+                    selections = selections.filter(s => s !== '__custom_input__');
+                }
+                // Cap at 5 after custom expansion
+                if (selections.length > 5) {
+                    console.log('   ⚠️  Maximum 5 instance types allowed. Using first 5 selections.');
+                    selections = selections.slice(0, 5);
+                }
+                // Filter to same CUDA generation and warn about incompatible removals
+                const { filtered, generation, removed } = filterByCudaGeneration(selections);
+                if (removed.length > 0) {
+                    console.log(`   ⚠️  Removed incompatible instances (different CUDA generation): ${removed.join(', ')}`);
+                    console.log(`   Keeping ${generation} generation: ${filtered.join(', ')}`);
+                }
+                const finalSelections = filtered.length > 0 ? filtered : selections;
+                if (finalSelections.length === 1) {
+                    // Single selection → standard single instance type (no pools)
+                    instanceAnswers.instanceType = finalSelections[0];
+                    console.log(`   ✓ Single instance selected: ${finalSelections[0]}`);
+                } else {
+                    // Multiple selections → instance pools with priority = selection order
+                    instanceAnswers.instanceType = finalSelections[0]; // backward compat: first is primary
+                    instanceAnswers.instancePools = finalSelections.map((it, idx) => ({
+                        InstanceType: it,
+                        Priority: idx + 1
+                    }));
+                    // Auto-generate multi-spec IC config from catalog
+                    instanceAnswers.instancePoolSpecs = finalSelections.map(it => {
+                        const entry = instanceCatalogRaw[it];
+                        return {
+                            instanceType: it,
+                            gpuCount: entry?.gpus || 1,
+                            minMemoryMb: entry?.gpuMemoryGb ? entry.gpuMemoryGb * 1024 : 1024
+                        };
+                    });
+                    console.log(`   ✓ Instance pools configured (${finalSelections.length} types):`);
+                    finalSelections.forEach((it, idx) => {
+                        const entry = instanceCatalogRaw[it];
+                        const gpus = entry?.gpus || '?';
+                        const mem = entry?.gpuMemoryGb || '?';
+                        console.log(`     Priority ${idx + 1}: ${it} (${gpus} GPUs, ${mem}GB GPU memory)`);
+                    });
+                }
+                // Clean up the raw selections from answers (not needed downstream)
+                delete instanceAnswers.instanceTypeSelections;
+            }
         }
         // In auto-prompt mode, use instance-sizer's top recommendation as the instance type
@@ -318,6 +418,7 @@ export default class PromptRunner {
         // Combine all infrastructure answers
         const infraAnswers = {
             ...regionAndTargetAnswers,
+            ...existingEndpointAnswers,
             ...instanceAnswers,
             ...asyncAnswers,
             ...batchTransformAnswers,
@@ -414,6 +515,14 @@ export default class PromptRunner {
             }
         }
+        // LoRA adapter prompts — only for transformers with vllm/sglang/djl-lmi
+        // Requirements: 1.1, 1.2, 1.4
+        let loraAnswers = {};
+        const loraSubAnswers = await this._runPhase(loraPrompts, { ...frameworkAnswers, ...engineAnswers }, explicitConfig, existingConfig);
+        if (loraSubAnswers.enableLora !== undefined) {
+            loraAnswers = loraSubAnswers;
+        }
         // Validate instance type against framework requirements (now that framework version is known)
         const finalInstanceType = infraAnswers.customInstanceType || infraAnswers.instanceType;
         if (finalInstanceType && frameworkVersionAnswers.frameworkVersion) {
@@ -456,6 +565,7 @@ export default class PromptRunner {
             ...ngcApiKeyAnswers,
             ...moduleAnswers,
             ...benchmarkAnswers,
+            ...loraAnswers,
             ...projectAnswers,
             ...destinationAnswers,
             buildTimestamp
@@ -1083,6 +1193,11 @@ export default class PromptRunner {
                     console.log(`   ✓ ${choices.length} compatible instance(s) found${vramInfo}`);
+                    // Warn if all instances had zero quota but were restored for visibility
+                    if (parsed.metadata?.allFilteredByQuota) {
+                        console.log('   ⚠️  All instances have zero quota — request a quota increase for your preferred type');
+                    }
                     // Check if availability data is present (recommendations have capacityType)
                     const hasAvailabilityData = recommendations.some(r => r.capacityType);
@@ -1187,6 +1302,62 @@ export default class PromptRunner {
         }
     }
+    /**
+     * Query the endpoint-picker MCP server for available InService real-time endpoints.
+     * Populates this._mcpEndpointChoices for the existing endpoint selection prompt.
+     * Graceful fallback: if MCP server fails (no credentials, timeout), skip and create new endpoint.
+     *
+     * Does nothing (returns before querying) when any of the following holds:
+     *   - this.configManager is not set;
+     *   - 'endpoint-picker' is not among the names returned by getMcpServerNames();
+     *   - explicitConfig.existingEndpointName is already provided.
+     *
+     * On a response with a non-empty choices.endpointName list, each endpoint name is
+     * mapped to a `{ name, value }` choice. When the response metadata has an entry for
+     * that endpoint, the display name is annotated with its instanceType, the free GPU
+     * count (or "GPUs: ?" when availableGpus is the string '?'), and the IC count;
+     * otherwise the bare endpoint name is used as the label. When no choices come back,
+     * this._mcpEndpointChoices is left untouched and result.message (if any) is logged.
+     *
+     * Errors thrown by the query are caught and logged to the console; they are never
+     * rethrown, so callers do not need their own try/catch around this call.
+     *
+     * Requirements: 3.3, 4.3, 4.4
+     * @param {object} infraAnswers - Infrastructure answers; only `awsRegion` is read here
+     *   and forwarded to the MCP query.
+     * @param {object} explicitConfig - Explicitly supplied configuration; only
+     *   `existingEndpointName` is read here.
+     * @returns {Promise<void>} Resolves once the query has completed or been skipped.
+     * @private
+     */
+    async _queryMcpForEndpoints(infraAnswers, explicitConfig) {
+        const cm = this.configManager;
+        if (!cm) return;
+        const mcpServers = cm.getMcpServerNames();
+        if (!mcpServers.includes('endpoint-picker')) return;
+        // Skip if existing endpoint already provided via CLI/config
+        if (explicitConfig.existingEndpointName) return;
+        console.log('   🔍 Querying endpoint-picker...');
+        try {
+            const result = await cm.queryMcpServer('endpoint-picker', {
+                awsRegion: infraAnswers.awsRegion,
+                deploymentTarget: 'realtime-inference'
+            });
+            if (result && result.choices?.endpointName?.length > 0) {
+                const endpointNames = result.choices.endpointName;
+                const metadata = result.metadata || {};
+                // Build choices with metadata annotations
+                this._mcpEndpointChoices = endpointNames.map(name => {
+                    const meta = metadata[name];
+                    if (meta) {
+                        const gpuInfo = meta.availableGpus === '?' ? 'GPUs: ?' : `${meta.availableGpus} GPUs free`;
+                        return {
+                            name: `${name} (${meta.instanceType}, ${gpuInfo}, ${meta.icCount} IC${meta.icCount !== 1 ? 's' : ''})`,
+                            value: name
+                        };
+                    }
+                    return { name, value: name };
+                });
+                console.log(`   ✓ ${endpointNames.length} endpoint(s) with available capacity`);
+            } else {
+                if (result?.message) {
+                    console.log(`   ↳ ${result.message}`);
+                } else {
+                    console.log('   ↳ No endpoints with available capacity found');
+                }
+            }
+        } catch (err) {
+            // Graceful fallback: if MCP server fails, skip and create new endpoint
+            console.log(`   ⚠️  endpoint-picker: ${err.message || 'query failed'} — will create new endpoint`);
+        }
+    }
     /**
      * Query MCP base-image-picker server after deployment config is selected.
      * Populates _mcpBaseImageChoices for the base image selection prompt.