npm - @aws/ml-container-creator - Versions diffs - 0.13.5 → 0.15.1 - Mend

@aws/ml-container-creator 0.13.5 → 0.15.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37)

package/config/parameter-schema-v2.json +33 -5
package/infra/ci-harness/lib/ci-harness-stack.ts +13 -5
package/infra/ci-harness/package-lock.json +121 -111
package/infra/ci-harness/package.json +1 -1
package/package.json +2 -2
package/servers/endpoint-picker/index.js +23 -14
package/servers/instance-sizer/index.js +72 -4
package/servers/instance-sizer/lib/model-resolver.js +28 -2
package/src/app.js +15 -0
package/src/lib/config-loader.js +18 -0
package/src/lib/config-manager.js +6 -1
package/src/lib/dataset-slug.js +152 -0
package/src/lib/generated/cli-options.js +9 -3
package/src/lib/generated/parameter-matrix.js +15 -4
package/src/lib/generated/validation-rules.js +1 -1
package/src/lib/mcp-client.js +15 -1
package/src/lib/mcp-query-runner.js +11 -1
package/src/lib/prompt-runner.js +40 -20
package/src/lib/prompts/feature-prompts.js +1 -1
package/src/lib/template-manager.js +0 -7
package/src/lib/template-variable-resolver.js +51 -1
package/src/lib/tune-config-state.js +14 -1
package/templates/do/.benchmark_writer.py +43 -0
package/templates/do/.register_helper.py +1185 -0
package/templates/do/.tune_helper.py +168 -2
package/templates/do/__pycache__/.adapter_helper.cpython-312.pyc +0 -0
package/templates/do/__pycache__/.benchmark_writer.cpython-312.pyc +0 -0
package/templates/do/__pycache__/.register_helper.cpython-312.pyc +0 -0
package/templates/do/__pycache__/.tune_helper.cpython-312.pyc +0 -0
package/templates/do/adapter +319 -27
package/templates/do/add-ic +85 -3
package/templates/do/benchmark +28 -8
package/templates/do/config +20 -0
package/templates/do/lib/inference-component.sh +56 -3
package/templates/do/register +557 -6
package/templates/do/test +12 -2
package/templates/do/tune +219 -6

package/servers/instance-sizer/index.js CHANGED

@@ -25,8 +25,8 @@ import { readFileSync } from 'node:fs';
 import { fileURLToPath } from 'node:url';
 import { resolve, dirname } from 'node:path';
 import { resolveModelMetadata } from './lib/model-resolver.js';
-import { estimateVram } from './lib/vram-estimator.js';
-import { filterAndRankInstances, applyAvailabilityRanking } from './lib/instance-ranker.js';
+import { estimateVram, computeMaxModelLen } from './lib/vram-estimator.js';
+import { filterAndRankInstances, applyAvailabilityRanking, getPerGpuMemoryGb } from './lib/instance-ranker.js';
 import { QuotaResolver } from './lib/quota-resolver.js';
 import { queryBedrock } from '../lib/bedrock-client.js';
@@ -393,6 +393,66 @@ async function handleGetInstanceRecommendation(params) {
         { limit }
     );
+    // Step 3-max_model_len: When no instance fits at full context, try capping context length
+    // NFR-1 guard: skip this logic for models with recommendedInstances in catalog
+    let suggestedMaxModelLen = null;
+    let contextLengthCapped = false;
+    let originalMaxPositionEmbeddings = null;
+    if (recommendations.length === 0 && !modelMetadata.recommendedInstances && modelMetadata.maxPositionEmbeddings) {
+        // Find the largest available GPU instance
+        const gpuInstances = Object.entries(effectiveCatalog)
+            .filter(([, meta]) => meta.category === 'gpu' && meta.gpus > 0)
+            .map(([name, meta]) => {
+                const perGpu = getPerGpuMemoryGb(meta);
+                return { name, meta, totalVramGb: perGpu ? perGpu * meta.gpus : 0 };
+            })
+            .filter(i => i.totalVramGb > 0)
+            .sort((a, b) => b.totalVramGb - a.totalVramGb);
+        if (gpuInstances.length > 0) {
+            const bestInstance = gpuInstances[0];
+            // Compute model weight memory for computeMaxModelLen
+            const weightsGb = vramEstimate.breakdown.weightsGb;
+            const safeLen = computeMaxModelLen({
+                modelWeightGb: weightsGb,
+                totalGpuMemoryGb: bestInstance.meta.gpuMemoryGb || (bestInstance.totalVramGb / bestInstance.meta.gpus),
+                gpuCount: bestInstance.meta.gpus,
+                numLayers: modelMetadata.numLayers,
+                numKvHeads: modelMetadata.numKvHeads,
+                headDim: modelMetadata.headDim
+            });
+            if (safeLen && safeLen.maxModelLen >= 2048) {
+                // Re-estimate VRAM with capped sequence length
+                const cappedEstimate = estimateVram({
+                    parameterCount: modelMetadata.parameterCount,
+                    dtype: modelMetadata.dtype,
+                    quantization: quantization || undefined,
+                    maxSequenceLength: safeLen.maxModelLen,
+                    batchSize: effectiveBatchSize || undefined
+                });
+                // Re-filter instances with the reduced VRAM requirement
+                recommendations = filterAndRankInstances(
+                    cappedEstimate.vramGb,
+                    effectiveCatalog,
+                    { limit }
+                );
+                suggestedMaxModelLen = safeLen.maxModelLen;
+                contextLengthCapped = true;
+                originalMaxPositionEmbeddings = modelMetadata.maxPositionEmbeddings;
+                log(`Context capped: ${modelMetadata.maxPositionEmbeddings} → ${safeLen.maxModelLen} for ${modelName}`);
+            } else {
+                // AC-1.6: safeLen < 2048 or null — recommend larger instance instead
+                log(`Model ${modelName} cannot fit 2048 context on ${bestInstance.name}, recommending larger instance`);
+            }
+        }
+    }
     // Step 3a: Quota & availability filtering (discover mode only)
     let preQuotaFilterCount = 0;
     let allFilteredByQuota = false;
@@ -521,7 +581,10 @@ async function handleGetInstanceRecommendation(params) {
         content: [{
             type: 'text',
             text: JSON.stringify({
-                values: { instanceType: topRecommendation },
+                values: {
+                    instanceType: topRecommendation,
+                    ...(suggestedMaxModelLen ? { maxModelLen: suggestedMaxModelLen } : {})
+                },
                 choices: { instanceType: rankedList },
                 metadata: {
                     modelName,
@@ -533,7 +596,12 @@ async function handleGetInstanceRecommendation(params) {
                     recommendations: finalRecommendations,
                     source: modelMetadata.source,
                     smartModeUsed,
-                    allFilteredByQuota
+                    allFilteredByQuota,
+                    ...(contextLengthCapped ? {
+                        suggestedMaxModelLen,
+                        contextLengthCapped: true,
+                        originalMaxPositionEmbeddings
+                    } : {})
                 }
             })
         }]

package/servers/instance-sizer/lib/model-resolver.js CHANGED

@@ -142,13 +142,27 @@ export function extractFromHuggingFaceConfig(config) {
     const architecture = (config.architectures && config.architectures[0]) || 'unknown';
     const maxPositionEmbeddings = config.max_position_embeddings || 4096;
-    return {
+    // Extract architecture params for KV cache computation (computeMaxModelLen)
+    const numLayers = config.num_hidden_layers || null;
+    const numKvHeads = config.num_key_value_heads || config.num_attention_heads || null;
+    const headDim = config.head_dim || (config.hidden_size && config.num_attention_heads
+        ? Math.floor(config.hidden_size / config.num_attention_heads)
+        : null);
+    const result = {
         parameterCount,
         dtype,
         architecture,
         maxPositionEmbeddings,
         source: 'huggingface_api'
     };
+    // Only include architecture params if available (graceful degradation)
+    if (numLayers) result.numLayers = numLayers;
+    if (numKvHeads) result.numKvHeads = numKvHeads;
+    if (headDim) result.headDim = headDim;
+    return result;
 }
 /**
@@ -175,13 +189,25 @@ export async function resolveModelMetadata(modelName, options = {}) {
     const catalogEntry = catalogLookup(modelName, catalog);
     if (catalogEntry) {
-        return {
+        const result = {
             parameterCount: catalogEntry.parameterCount,
             dtype: catalogEntry.defaultDtype || 'float16',
             architecture: catalogEntry.architecture || 'unknown',
             maxPositionEmbeddings: catalogEntry.maxPositionEmbeddings || 4096,
             source: 'catalog'
         };
+        // Pass through recommendedInstances for NFR-1 guard
+        if (catalogEntry.recommendedInstances) {
+            result.recommendedInstances = catalogEntry.recommendedInstances;
+        }
+        // Pass through architecture params if available in catalog
+        if (catalogEntry.numLayers) result.numLayers = catalogEntry.numLayers;
+        if (catalogEntry.numKvHeads) result.numKvHeads = catalogEntry.numKvHeads;
+        if (catalogEntry.headDim) result.headDim = catalogEntry.headDim;
+        return result;
     }
     // Step 2: If discover mode, try HuggingFace Hub

package/src/app.js CHANGED

@@ -402,6 +402,7 @@ export async function writeProject(templateDir, destDir, answers, registryConfig
         ignorePatterns.push('**/do/.tune_helper.py');
         ignorePatterns.push('**/do/.stage_helper.py');
         ignorePatterns.push('**/do/.adapter_helper.py');
+        ignorePatterns.push('**/do/.register_helper.py');
         ignorePatterns.push('**/do/train');
         ignorePatterns.push('**/do/.train_build_request.py');
         ignorePatterns.push('**/do/.train_status_parser.py');
@@ -578,6 +579,20 @@ export async function writeProject(templateDir, destDir, answers, registryConfig
             fs.writeFileSync(gitignorePath, mlccIgnore);
         }
     }
+    // Add __pycache__/ and *.pyc to .gitignore (Python helpers leave bytecode behind)
+    {
+        const gitignorePath = path.join(destDir, '.gitignore');
+        const pycacheIgnore = '# Python bytecode (generated by do/ helper scripts)\n__pycache__/\n*.pyc\n';
+        if (fs.existsSync(gitignorePath)) {
+            const existing = fs.readFileSync(gitignorePath, 'utf8');
+            if (!existing.includes('__pycache__')) {
+                fs.appendFileSync(gitignorePath, `\n${pycacheIgnore}`);
+            }
+        } else {
+            fs.writeFileSync(gitignorePath, pycacheIgnore);
+        }
+    }
 }
 /**

package/src/lib/config-loader.js CHANGED

@@ -265,6 +265,21 @@ export default class ConfigLoader {
                 return;
             }
+            // Handle icEnvVars object (deploy-time IC environment variables)
+            if (key === 'icEnvVars' && typeof value === 'object' && value !== null) {
+                if (!this.manager.config.icEnvVars) {
+                    this.manager.config.icEnvVars = {};
+                }
+                const cliIcEnvVars = (this.manager.explicitConfig && this.manager.explicitConfig.icEnvVars) || {};
+                Object.entries(value).forEach(([envKey, envValue]) => {
+                    if (!(envKey in cliIcEnvVars)) {
+                        this.manager.config.icEnvVars[envKey] = envValue;
+                        this.manager._recordSource(`icEnvVars.${envKey}`, envValue, 'config-file');
+                    }
+                });
+                return;
+            }
             if (this.manager._isSourceSupported(key, 'configFile')) {
                 filteredConfig[key] = this.manager._parseValue(key, value);
                 this.manager._recordSource(key, this.manager._parseValue(key, value), 'config-file');
@@ -342,6 +357,9 @@ export default class ConfigLoader {
         // Parse --server-env KEY=VALUE pairs
         this._parseEnvVarOptions('server-env', 'serverEnvVars');
+        // Parse --ic-env KEY=VALUE pairs (deploy-time IC environment variables)
+        this._parseEnvVarOptions('ic-env', 'icEnvVars');
     }
     /**

package/src/lib/config-manager.js CHANGED

@@ -183,6 +183,9 @@ export default class ConfigManager {
         if (this.config.serverEnvVars && typeof this.config.serverEnvVars === 'object') {
             finalConfig.serverEnvVars = { ...this.config.serverEnvVars };
         }
+        if (this.config.icEnvVars && typeof this.config.icEnvVars === 'object') {
+            finalConfig.icEnvVars = { ...this.config.icEnvVars };
+        }
         // Ensure all parameters from the matrix are included in final config
         // This is important for optional parameters that might be null
@@ -411,7 +414,8 @@ export default class ConfigManager {
             ...endpointParams,
             ...icParams,
             'modelEnvVars',
-            'serverEnvVars'
+            'serverEnvVars',
+            'icEnvVars'
         ]);
         const core = {};
         for (const [key, value] of Object.entries(this.config)) {
@@ -426,6 +430,7 @@ export default class ConfigManager {
             icConfig,
             modelEnvVars: { ...(this.config.modelEnvVars || {}) },
             serverEnvVars: { ...(this.config.serverEnvVars || {}) },
+            icEnvVars: { ...(this.config.icEnvVars || {}) },
             manifest: [...this._sourceManifest]
         };
     }

package/src/lib/dataset-slug.js ADDED

@@ -0,0 +1,152 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+/**
+ * Dataset Slug Derivation
+ *
+ * Derives a deterministic, short slug from a dataset URI for use in
+ * tuning-job-aware adapter naming conventions.
+ *
+ * Slugification rules:
+ * - Lowercase
+ * - Strip non-alphanumeric characters (keep hyphens)
+ * - Truncate to 20 characters
+ * - Replace consecutive hyphens with single hyphen
+ * - Strip leading/trailing hyphens
+ *
+ * Examples:
+ *   hf://org/name           -> "name"
+ *   hf://tatsu-lab/alpaca   -> "alpaca"
+ *   hf://Open-Orca/OpenOrca -> "openorca"
+ *   s3://bucket/path/file.jsonl -> "file"
+ *
+ * Requirements: US-4 (AC-4.2)
+ */
+/**
+ * Derive a dataset slug from a dataset URI.
+ *
+ * @param {string} datasetUri - Dataset URI (s3://... or hf://...)
+ * @returns {string} The derived slug, or empty string if extraction fails
+ */
+export function deriveDatasetSlug(datasetUri) {
+    if (!datasetUri || typeof datasetUri !== 'string') {
+        return '';
+    }
+    let rawName = '';
+    if (datasetUri.startsWith('hf://')) {
+        // hf://org/name[/split][?file=pattern]
+        // Extract the dataset name (second path component)
+        const hfPath = datasetUri.slice(5); // remove "hf://"
+        const withoutQuery = hfPath.split('?')[0]; // remove ?file=...
+        const parts = withoutQuery.split('/');
+        // parts[0] = org, parts[1] = name, parts[2+] = split
+        rawName = parts[1] || parts[0] || '';
+    } else if (datasetUri.startsWith('s3://')) {
+        // s3://bucket/path/file.jsonl -> slug from filename (without extension)
+        const s3Path = datasetUri.slice(5); // remove "s3://"
+        const parts = s3Path.split('/');
+        const filename = parts[parts.length - 1] || '';
+        // Remove file extension
+        const dotIndex = filename.lastIndexOf('.');
+        rawName = dotIndex > 0 ? filename.substring(0, dotIndex) : filename;
+    } else {
+        // Unknown format — try to extract last path component
+        const parts = datasetUri.split('/');
+        rawName = parts[parts.length - 1] || '';
+    }
+    return slugify(rawName);
+}
+/**
+ * Apply slugification rules to a raw name.
+ *
+ * @param {string} raw - Raw name to slugify
+ * @returns {string} Slugified string
+ */
+export function slugify(raw) {
+    if (!raw) return '';
+    let slug = raw
+        .toLowerCase()                      // lowercase
+        .replace(/[^a-z0-9-]/g, '')         // strip non-alphanumeric (keep hyphens)
+        .replace(/-{2,}/g, '-')             // replace consecutive hyphens
+        .replace(/^-+/, '')                 // strip leading hyphens
+        .replace(/-+$/, '');                // strip trailing hyphens
+    // Truncate to 20 chars
+    if (slug.length > 20) {
+        slug = slug.substring(0, 20);
+        // Don't end on a hyphen after truncation
+        slug = slug.replace(/-+$/, '');
+    }
+    return slug;
+}
+/**
+ * Resolve a --from-tune argument to the appropriate config variable name.
+ *
+ * Resolution rules:
+ * - No arg (empty/null) -> TUNE_OUTPUT_PATH_LATEST
+ * - technique only (e.g., "sft") -> TUNE_ADAPTER_PATH_SFT
+ * - technique-dataset compound (e.g., "sft-alpaca") -> TUNE_ADAPTER_PATH_SFT_ALPACA
+ *
+ * @param {string} fromTuneArg - The --from-tune argument value
+ * @param {function} configVarExists - Function that checks if a config var exists
+ * @returns {{ varName: string, technique: string, slug: string, isCompound: boolean, fallback: string|null }}
+ */
+export function resolveFromTuneVar(fromTuneArg, configVarExists) {
+    if (!fromTuneArg) {
+        return {
+            varName: 'TUNE_OUTPUT_PATH_LATEST',
+            technique: '',
+            slug: '',
+            isCompound: false,
+            fallback: null
+        };
+    }
+    const upper = fromTuneArg.toUpperCase();
+    // Check if argument contains a hyphen — potential compound key
+    const hyphenIndex = fromTuneArg.indexOf('-');
+    if (hyphenIndex > 0) {
+        const technique = fromTuneArg.substring(0, hyphenIndex);
+        const slug = fromTuneArg.substring(hyphenIndex + 1);
+        const techniqueUpper = technique.toUpperCase();
+        const slugUpper = slug.toUpperCase().replace(/-/g, '_');
+        const compoundVar = `TUNE_ADAPTER_PATH_${techniqueUpper}_${slugUpper}`;
+        if (configVarExists(compoundVar)) {
+            return {
+                varName: compoundVar,
+                technique,
+                slug,
+                isCompound: true,
+                fallback: null
+            };
+        }
+        // Compound key doesn't exist — fallback to technique-only
+        return {
+            varName: `TUNE_ADAPTER_PATH_${techniqueUpper}`,
+            technique,
+            slug,
+            isCompound: false,
+            fallback: compoundVar // the compound var that was tried but didn't exist
+        };
+    }
+    // No hyphen — technique-only
+    return {
+        varName: `TUNE_ADAPTER_PATH_${upper}`,
+        technique: fromTuneArg,
+        slug: '',
+        isCompound: false,
+        fallback: null
+    };
+}

package/src/lib/generated/cli-options.js CHANGED

@@ -1,6 +1,6 @@
 // AUTO-GENERATED by scripts/codegen-cli.js — DO NOT EDIT
 // Source: config/parameter-schema-v2.json
-// Generated: 2026-06-15T20:16:03.840Z
+// Generated: 2026-06-23T20:55:23.381Z
 /**
  * CLI option definitions derived from parameter-schema-v2.json.
@@ -70,7 +70,7 @@ export const cliOptions = [
     {
         'flag': '--enable-lora',
         'description': 'Enable LoRA adapter serving',
-        'defaultValue': false
+        'defaultValue': true
     },
     {
         'flag': '--max-loras <n>',
@@ -85,7 +85,7 @@ export const cliOptions = [
     {
         'flag': '--include-benchmark',
         'description': 'Include SageMaker AI Benchmarking scripts (do/benchmark, do/optimize). Workload configuration is specified at runtime via --workload flag.',
-        'defaultValue': false
+        'defaultValue': true
     },
     {
         'flag': '--benchmark-concurrency <n>',
@@ -353,6 +353,11 @@ export const cliOptions = [
         'description': 'Server env var, repeatable (e.g. SGLANG_MEM_FRACTION=0.9)',
         'repeatable': true
     },
+    {
+        'flag': '--ic-env <KEY=VALUE>',
+        'description': 'Deploy-time environment variable for inference components (IC_ENV_* prefix), repeatable (e.g. VLLM_MAX_MODEL_LEN=8192)',
+        'repeatable': true
+    },
     {
         'flag': '--include-sample',
         'description': 'Include sample model code',
@@ -464,6 +469,7 @@ export const helpGroups = {
     '--fsx-volume-handle': 'hyperpod',
     '--model-env': 'env',
     '--server-env': 'env',
+    '--ic-env': 'ic',
     '--include-sample': 'features',
     '--include-testing': 'features',
     '--test-types': 'features',

package/src/lib/generated/parameter-matrix.js CHANGED

@@ -1,6 +1,6 @@
 // AUTO-GENERATED by scripts/codegen-parameter-matrix.js — DO NOT EDIT
 // Source: config/parameter-schema-v2.json
-// Generated: 2026-06-15T20:16:03.952Z
+// Generated: 2026-06-23T20:55:23.482Z
 /**
  * Parameter matrix defining how each parameter is loaded from various sources.
@@ -106,7 +106,7 @@ export const parameterMatrix = {
         'mcp': false,
         'promptable': true,
         'required': false,
-        'default': false,
+        'default': true,
         'valueSpace': 'bounded'
     },
     'maxLoras': {
@@ -139,7 +139,7 @@ export const parameterMatrix = {
         'mcp': false,
         'promptable': true,
         'required': false,
-        'default': false,
+        'default': true,
         'valueSpace': 'bounded'
     },
     'benchmarkConcurrency': {
@@ -225,7 +225,7 @@ export const parameterMatrix = {
         'configFile': true,
         'packageJson': false,
         'mcp': true,
-        'promptable': false,
+        'promptable': true,
         'required': false,
         'default': null,
         'valueSpace': 'unbounded'
@@ -569,6 +569,17 @@ export const parameterMatrix = {
         'default': null,
         'valueSpace': 'unbounded'
     },
+    'icEnv': {
+        'cliOption': 'ic-env',
+        'envVar': null,
+        'configFile': true,
+        'packageJson': false,
+        'mcp': false,
+        'promptable': false,
+        'required': false,
+        'default': [],
+        'valueSpace': 'unbounded'
+    },
     'includeSampleModel': {
         'cliOption': 'include-sample',
         'envVar': 'ML_INCLUDE_SAMPLE',

package/src/lib/generated/validation-rules.js CHANGED

@@ -1,6 +1,6 @@
 // AUTO-GENERATED by scripts/codegen-validator.js — DO NOT EDIT
 // Source: config/parameter-schema-v2.json
-// Generated: 2026-06-15T20:16:03.877Z
+// Generated: 2026-06-23T20:55:23.412Z
 /**
  * Validation rules derived from parameter-schema-v2.json.

package/src/lib/mcp-client.js CHANGED

@@ -143,9 +143,23 @@ class McpClient {
         // Build context from bounded parameters that have defaults
         const context = this._buildContext();
+        // Auto-discover tool name if using the default (get_ml_config)
+        // Each server registers its own tool name (e.g. get_base_images, get_inference_endpoints)
+        let toolName = this.toolName;
+        if (toolName === DEFAULT_TOOL_NAME) {
+            try {
+                const toolList = await this._client.listTools();
+                if (toolList && toolList.tools && toolList.tools.length > 0) {
+                    toolName = toolList.tools[0].name;
+                }
+            } catch (_listErr) {
+                // Fall through to use default tool name
+            }
+        }
         // Call the configured tool
         const result = await this._client.callTool({
-            name: this.toolName,
+            name: toolName,
             arguments: {
                 parameters: unboundedParams,
                 limit: this.limit,

package/src/lib/mcp-query-runner.js CHANGED

@@ -216,6 +216,12 @@ export default class McpQueryRunner {
                 if (parsed.choices?.instanceType?.length > 0) {
                     this.runner._instanceSizerMetadata = parsed.metadata || null;
+                    // Store maxModelLen from sizer if context was capped (AC-1.7)
+                    if (parsed.values?.maxModelLen) {
+                        this.runner._sizerMaxModelLen = parsed.values.maxModelLen;
+                        console.log(`   ✓ Context length capped: max_model_len=${parsed.values.maxModelLen}`);
+                    }
                     // Build display labels with VRAM estimate and utilization percentage
                     const recommendations = parsed.metadata?.recommendations || [];
                     const estimatedVramGb = parsed.metadata?.estimatedVramGb;
@@ -365,9 +371,13 @@ export default class McpQueryRunner {
         console.log('   🔍 Querying endpoint-picker...');
         try {
+            // Pass awsProfile from bootstrap config for credential resolution
+            const awsProfile = this.runner.configManager?.config?.awsProfile
+                || this.runner.options?.profile || process.env.AWS_PROFILE || null;
             const result = await cm.queryMcpServer('endpoint-picker', {
                 awsRegion: infraAnswers.awsRegion,
-                deploymentTarget: 'realtime-inference'
+                deploymentTarget: 'realtime-inference',
+                ...(awsProfile ? { awsProfile } : {})
             });
             if (result && result.choices?.endpointName?.length > 0) {

package/src/lib/prompt-runner.js CHANGED

@@ -224,25 +224,39 @@ export default class PromptRunner {
         // Requirements: 3.3, 4.3, 4.4 — endpoint-picker MCP query
         let existingEndpointAnswers = {};
         if (regionAndTargetAnswers.deploymentTarget === 'realtime-inference') {
-            // Query endpoint-picker MCP server for available endpoints
-            const resolvedRegion = regionAndTargetAnswers.customAwsRegion || regionAndTargetAnswers.awsRegion;
-            await this.mcpQueryRunner._queryMcpForEndpoints({ ...regionAndTargetAnswers, awsRegion: resolvedRegion }, explicitConfig);
-            const endpointPreviousAnswers = {
-                ...regionAndTargetAnswers,
-                ...(this._mcpEndpointChoices ? { _mcpEndpointChoices: this._mcpEndpointChoices } : {})
-            };
-            existingEndpointAnswers = await this._runPhase(
-                infraExistingEndpointPrompts,
-                endpointPreviousAnswers,
+            // First ask if user wants to attach to existing endpoint (no MCP call yet)
+            const attachAnswer = await this._runPhase(
+                [infraExistingEndpointPrompts[0]],
+                { ...regionAndTargetAnswers },
                 explicitConfig,
                 existingConfig
             );
-            // Resolve custom endpoint name
-            if (existingEndpointAnswers.customExistingEndpointName) {
-                existingEndpointAnswers.existingEndpointName = existingEndpointAnswers.customExistingEndpointName;
-                delete existingEndpointAnswers.customExistingEndpointName;
+            if (attachAnswer.useExistingEndpoint === 'yes') {
+                // Only now query endpoint-picker MCP server
+                const resolvedRegion = regionAndTargetAnswers.customAwsRegion || regionAndTargetAnswers.awsRegion;
+                await this.mcpQueryRunner._queryMcpForEndpoints({ ...regionAndTargetAnswers, awsRegion: resolvedRegion }, explicitConfig);
+                const endpointPreviousAnswers = {
+                    ...regionAndTargetAnswers,
+                    ...attachAnswer,
+                    ...(this._mcpEndpointChoices ? { _mcpEndpointChoices: this._mcpEndpointChoices } : {})
+                };
+                existingEndpointAnswers = await this._runPhase(
+                    infraExistingEndpointPrompts.slice(1),
+                    endpointPreviousAnswers,
+                    explicitConfig,
+                    existingConfig
+                );
+                existingEndpointAnswers.useExistingEndpoint = 'yes';
+                // Resolve custom endpoint name
+                if (existingEndpointAnswers.customExistingEndpointName) {
+                    existingEndpointAnswers.existingEndpointName = existingEndpointAnswers.customExistingEndpointName;
+                    delete existingEndpointAnswers.customExistingEndpointName;
+                }
+            } else {
+                existingEndpointAnswers = attachAnswer;
             }
         }
@@ -376,11 +390,12 @@ export default class PromptRunner {
             const sizerRecs = this._instanceSizerMetadata.recommendations || [];
             const finalInstanceType = instanceAnswers.customInstanceType || instanceAnswers.instanceType;
             const matchingRec = sizerRecs.find(r => r.instanceType === finalInstanceType);
-            const tpRec = matchingRec || sizerRecs[0];
-            if (tpRec && tpRec.tensorParallelism > 1) {
-                this._autoTensorParallelism = tpRec.tensorParallelism;
-                this._autoGpuCount = tpRec.gpuCount;
-                console.log(`   ✓ Auto-set tensor parallelism: TP=${tpRec.tensorParallelism} (${tpRec.gpuCount} GPUs)`);
+            // Only use sizer TP recommendation if user selected a recommended instance
+            // Custom instances resolve TP from the instance catalog in template-variable-resolver
+            if (matchingRec && matchingRec.tensorParallelism > 1) {
+                this._autoTensorParallelism = matchingRec.tensorParallelism;
+                this._autoGpuCount = matchingRec.gpuCount;
+                console.log(`   ✓ Auto-set tensor parallelism: TP=${matchingRec.tensorParallelism} (${matchingRec.gpuCount} GPUs)`);
             }
             // Display capacity type confirmation for selected instance
@@ -710,6 +725,11 @@ export default class PromptRunner {
             delete combinedAnswers.customHyperPodCluster;
         }
+        // Propagate max_model_len from instance-sizer context capping (AC-1.7)
+        if (this._sizerMaxModelLen) {
+            combinedAnswers.sizerMaxModelLen = this._sizerMaxModelLen;
+        }
         // Apply CUDA version selection → inference AMI override
         if (combinedAnswers._resolvedInferenceAmiVersion) {
             combinedAnswers.inferenceAmiVersion = combinedAnswers._resolvedInferenceAmiVersion;

package/src/lib/prompts/feature-prompts.js CHANGED

@@ -90,7 +90,7 @@ const loraPrompts = [
         type: 'confirm',
         name: 'enableLora',
         message: 'Enable LoRA adapter serving?',
-        default: false,
+        default: true,
         when: (answers) => {
             const architecture = answers.architecture || answers.deploymentConfig?.split('-')[0];
             const backend = answers.backend || answers.deploymentConfig?.split('-').slice(1).join('-');