@aws/ml-container-creator 0.3.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. package/bin/cli.js +5 -2
  2. package/config/bootstrap-stack.json +86 -7
  3. package/config/defaults.json +1 -1
  4. package/infra/ci-harness/buildspec.yml +60 -0
  5. package/package.json +3 -1
  6. package/servers/README.md +41 -1
  7. package/servers/instance-sizer/index.js +42 -2
  8. package/servers/instance-sizer/lib/instance-ranker.js +114 -10
  9. package/servers/instance-sizer/lib/quota-resolver.js +368 -0
  10. package/servers/instance-sizer/package.json +2 -0
  11. package/servers/lib/catalogs/instances.json +527 -12
  12. package/servers/lib/catalogs/model-servers.json +15 -15
  13. package/servers/lib/catalogs/model-sizes.json +27 -0
  14. package/servers/lib/catalogs/models.json +71 -0
  15. package/servers/lib/schemas/image-catalog.schema.json +9 -1
  16. package/src/app.js +109 -3
  17. package/src/lib/bootstrap-command-handler.js +96 -3
  18. package/src/lib/cli-handler.js +2 -2
  19. package/src/lib/config-manager.js +117 -1
  20. package/src/lib/deployment-entry-schema.js +16 -0
  21. package/src/lib/prompt-runner.js +270 -12
  22. package/src/lib/prompts.js +288 -6
  23. package/src/lib/registry-command-handler.js +12 -0
  24. package/src/lib/schema-sync.js +31 -0
  25. package/src/lib/template-manager.js +49 -1
  26. package/src/lib/validate-runner.js +125 -2
  27. package/templates/Dockerfile +22 -2
  28. package/templates/code/cuda_compat.sh +22 -0
  29. package/templates/code/serve +3 -0
  30. package/templates/code/serving.properties +14 -0
  31. package/templates/code/start_server.sh +3 -0
  32. package/templates/diffusors/Dockerfile +2 -1
  33. package/templates/diffusors/serve +3 -0
  34. package/templates/do/README.md +33 -0
  35. package/templates/do/adapter +1214 -0
  36. package/templates/do/adapters/.gitkeep +2 -0
  37. package/templates/do/add-ic +130 -0
  38. package/templates/do/benchmark +718 -0
  39. package/templates/do/clean +593 -17
  40. package/templates/do/config +49 -4
  41. package/templates/do/deploy +513 -362
  42. package/templates/do/ic/default.conf +32 -0
  43. package/templates/do/lib/endpoint-config.sh +216 -0
  44. package/templates/do/lib/inference-component.sh +167 -0
  45. package/templates/do/lib/secrets.sh +44 -0
  46. package/templates/do/lib/wait.sh +131 -0
  47. package/templates/do/logs +107 -27
  48. package/templates/do/optimize +528 -0
  49. package/templates/do/register +119 -2
  50. package/templates/do/status +337 -0
  51. package/templates/do/test +80 -28
  52. package/templates/triton/Dockerfile +5 -0
@@ -18,7 +18,10 @@ import {
18
18
  modelLoadStrategyPrompts,
19
19
  modelProfilePrompts,
20
20
  modulePrompts,
21
+ loraPrompts,
22
+ benchmarkPrompts,
21
23
  infraRegionAndTargetPrompts,
24
+ infraExistingEndpointPrompts,
22
25
  infraInstancePrompts,
23
26
  infraAsyncPrompts,
24
27
  infraBatchTransformPrompts,
@@ -28,7 +31,9 @@ import {
28
31
  destinationPrompts,
29
32
  baseImageSearchPrompts,
30
33
  baseImagePrompts,
31
- formatImageChoices
34
+ formatImageChoices,
35
+ filterByCudaGeneration,
36
+ instanceCatalogRaw
32
37
  } from './prompts.js';
33
38
 
34
39
  import fs from 'fs';
@@ -186,12 +191,40 @@ export default class PromptRunner {
186
191
  // 3a. Region query
187
192
  await this._queryMcpForRegion(frameworkAnswers, explicitConfig);
188
193
 
194
+ // 3a2. Existing endpoint prompt (only for realtime-inference)
195
+ // Requirements: 3.3, 4.3, 4.4 — endpoint-picker MCP query
196
+ let existingEndpointAnswers = {};
197
+ if (regionAndTargetAnswers.deploymentTarget === 'realtime-inference') {
198
+ // Query endpoint-picker MCP server for available endpoints
199
+ const resolvedRegion = regionAndTargetAnswers.customAwsRegion || regionAndTargetAnswers.awsRegion;
200
+ await this._queryMcpForEndpoints({ ...regionAndTargetAnswers, awsRegion: resolvedRegion }, explicitConfig);
201
+
202
+ const endpointPreviousAnswers = {
203
+ ...regionAndTargetAnswers,
204
+ ...(this._mcpEndpointChoices ? { _mcpEndpointChoices: this._mcpEndpointChoices } : {})
205
+ };
206
+ existingEndpointAnswers = await this._runPhase(
207
+ infraExistingEndpointPrompts,
208
+ endpointPreviousAnswers,
209
+ explicitConfig,
210
+ existingConfig
211
+ );
212
+
213
+ // Resolve custom endpoint name
214
+ if (existingEndpointAnswers.customExistingEndpointName) {
215
+ existingEndpointAnswers.existingEndpointName = existingEndpointAnswers.customExistingEndpointName;
216
+ delete existingEndpointAnswers.customExistingEndpointName;
217
+ }
218
+ }
219
+
189
220
  // 3b. Instance type — query instance-sizer with full context (model + profile + CUDA)
190
221
  let instanceAnswers = {};
191
- const needsInstance = regionAndTargetAnswers.deploymentTarget === 'realtime-inference' ||
222
+ // Skip instance prompts when attaching to an existing endpoint (instance is inherited)
223
+ const useExistingEndpoint = !!(existingEndpointAnswers.existingEndpointName);
224
+ const needsInstance = !useExistingEndpoint && (regionAndTargetAnswers.deploymentTarget === 'realtime-inference' ||
192
225
  regionAndTargetAnswers.deploymentTarget === 'async-inference' ||
193
226
  regionAndTargetAnswers.deploymentTarget === 'batch-transform' ||
194
- regionAndTargetAnswers.deploymentTarget === 'hyperpod-eks';
227
+ regionAndTargetAnswers.deploymentTarget === 'hyperpod-eks');
195
228
 
196
229
  if (needsInstance) {
197
230
  // Determine architecture type for heuristic fallback
@@ -229,6 +262,74 @@ export default class PromptRunner {
229
262
  if (!instanceAnswers.instanceType && !explicitConfig.instanceType && this._architectureHeuristicDefault) {
230
263
  instanceAnswers.instanceType = this._architectureHeuristicDefault;
231
264
  }
265
+
266
+ // Process multi-select instance type results (Requirements: 6.4)
267
+ // When user selects multiple instances via checkbox, derive instanceType and instancePools
268
+ if (instanceAnswers.instanceTypeSelections && instanceAnswers.instanceTypeSelections.length > 0) {
269
+ let selections = instanceAnswers.instanceTypeSelections.slice(0, 5); // Cap at 5 (API limit)
270
+
271
+ // Resolve custom input: replace __custom_input__ sentinel with parsed instances
272
+ if (selections.includes('__custom_input__') && instanceAnswers.customInstanceTypeSelections) {
273
+ const customInstances = instanceAnswers.customInstanceTypeSelections
274
+ .split(',').map(s => s.trim()).filter(s => s.length > 0);
275
+ // Remove the sentinel and any other MCP selections, replace with custom entries
276
+ selections = selections.filter(s => s !== '__custom_input__');
277
+ selections = [...selections, ...customInstances];
278
+ delete instanceAnswers.customInstanceTypeSelections;
279
+ } else if (selections.includes('__custom_input__')) {
280
+ // Sentinel selected but no custom input provided — remove it
281
+ selections = selections.filter(s => s !== '__custom_input__');
282
+ }
283
+
284
+ // Cap at 5 after custom expansion
285
+ if (selections.length > 5) {
286
+ console.log(' ⚠️ Maximum 5 instance types allowed. Using first 5 selections.');
287
+ selections = selections.slice(0, 5);
288
+ }
289
+
290
+ // Filter to same CUDA generation and warn about incompatible removals
291
+ const { filtered, generation, removed } = filterByCudaGeneration(selections);
292
+ if (removed.length > 0) {
293
+ console.log(` ⚠️ Removed incompatible instances (different CUDA generation): ${removed.join(', ')}`);
294
+ console.log(` Keeping ${generation} generation: ${filtered.join(', ')}`);
295
+ }
296
+
297
+ const finalSelections = filtered.length > 0 ? filtered : selections;
298
+
299
+ if (finalSelections.length === 1) {
300
+ // Single selection → standard single instance type (no pools)
301
+ instanceAnswers.instanceType = finalSelections[0];
302
+ console.log(` ✓ Single instance selected: ${finalSelections[0]}`);
303
+ } else {
304
+ // Multiple selections → instance pools with priority = selection order
305
+ instanceAnswers.instanceType = finalSelections[0]; // backward compat: first is primary
306
+ instanceAnswers.instancePools = finalSelections.map((it, idx) => ({
307
+ InstanceType: it,
308
+ Priority: idx + 1
309
+ }));
310
+
311
+ // Auto-generate multi-spec IC config from catalog
312
+ instanceAnswers.instancePoolSpecs = finalSelections.map(it => {
313
+ const entry = instanceCatalogRaw[it];
314
+ return {
315
+ instanceType: it,
316
+ gpuCount: entry?.gpus || 1,
317
+ minMemoryMb: entry?.gpuMemoryGb ? entry.gpuMemoryGb * 1024 : 1024
318
+ };
319
+ });
320
+
321
+ console.log(` ✓ Instance pools configured (${finalSelections.length} types):`);
322
+ finalSelections.forEach((it, idx) => {
323
+ const entry = instanceCatalogRaw[it];
324
+ const gpus = entry?.gpus || '?';
325
+ const mem = entry?.gpuMemoryGb || '?';
326
+ console.log(` Priority ${idx + 1}: ${it} (${gpus} GPUs, ${mem}GB GPU memory)`);
327
+ });
328
+ }
329
+
330
+ // Clean up the raw selections from answers (not needed downstream)
331
+ delete instanceAnswers.instanceTypeSelections;
332
+ }
232
333
  }
233
334
 
234
335
  // In auto-prompt mode, use instance-sizer's top recommendation as the instance type
@@ -252,6 +353,29 @@ export default class PromptRunner {
252
353
  this._autoGpuCount = tpRec.gpuCount;
253
354
  console.log(` ✓ Auto-set tensor parallelism: TP=${tpRec.tensorParallelism} (${tpRec.gpuCount} GPUs)`);
254
355
  }
356
+
357
+ // Display capacity type confirmation for selected instance
358
+ // Requirements: 5.4
359
+ if (matchingRec && matchingRec.capacityType) {
360
+ if (matchingRec.capacityType === 'reserved') {
361
+ const resType = matchingRec.reservationType === 'capacity-block' ? 'Capacity Block' : 'ODCR';
362
+ const endInfo = matchingRec.reservationType === 'capacity-block' && matchingRec.reservationInfo?.endDate
363
+ ? `, ends ${new Date(matchingRec.reservationInfo.endDate).toLocaleDateString()}`
364
+ : '';
365
+ console.log(` ✓ Using reserved capacity — ${resType} (reservation ${matchingRec.reservationInfo?.reservationId || 'unknown'}${endInfo})`);
366
+ } else if (matchingRec.capacityType === 'ftp') {
367
+ console.log(` ✓ Using reserved capacity (plan ${matchingRec.ftpInfo?.planName || 'unknown'})`);
368
+ } else {
369
+ const headroom = matchingRec.quotaHeadroom;
370
+ console.log(` ✓ Using on-demand capacity (quota headroom: ${headroom ?? 'unknown'})`);
371
+ }
372
+ }
373
+
374
+ // Extract reservation ARN from selected instance for deployment config
375
+ // Requirements: 2.3
376
+ if (matchingRec && matchingRec.capacityType === 'reserved' && matchingRec.reservationInfo?.reservationArn) {
377
+ this._selectedCapacityReservationArn = matchingRec.reservationInfo.reservationArn;
378
+ }
255
379
  }
256
380
 
257
381
  // 3c. Async-specific prompts (only when deploymentTarget === 'async-inference')
@@ -294,6 +418,7 @@ export default class PromptRunner {
294
418
  // Combine all infrastructure answers
295
419
  const infraAnswers = {
296
420
  ...regionAndTargetAnswers,
421
+ ...existingEndpointAnswers,
297
422
  ...instanceAnswers,
298
423
  ...asyncAnswers,
299
424
  ...batchTransformAnswers,
@@ -375,6 +500,29 @@ export default class PromptRunner {
375
500
  moduleAnswers.includeSampleModel = false;
376
501
  }
377
502
 
503
+ // Benchmark prompts — derive includeBenchmark from testTypes selection or CLI flag
504
+ // Requirements: 1.1, 1.2
505
+ let benchmarkAnswers = {};
506
+ if (frameworkAnswers.architecture === 'transformers' || frameworkAnswers.architecture === 'diffusors') {
507
+ const testTypes = moduleAnswers.testTypes || [];
508
+ const includeBenchmark = testTypes.includes('sagemaker-ai-automated-benchmarking') ||
509
+ explicitConfig.includeBenchmark === true ||
510
+ explicitConfig.includeBenchmark === 'true';
511
+ benchmarkAnswers.includeBenchmark = includeBenchmark;
512
+ if (includeBenchmark) {
513
+ const subAnswers = await this._runPhase(benchmarkPrompts, { ...frameworkAnswers, ...moduleAnswers, includeBenchmark }, explicitConfig, existingConfig);
514
+ benchmarkAnswers = { ...benchmarkAnswers, ...subAnswers };
515
+ }
516
+ }
517
+
518
+ // LoRA adapter prompts — only for transformers with vllm/sglang/djl-lmi
519
+ // Requirements: 1.1, 1.2, 1.4
520
+ let loraAnswers = {};
521
+ const loraSubAnswers = await this._runPhase(loraPrompts, { ...frameworkAnswers, ...engineAnswers }, explicitConfig, existingConfig);
522
+ if (loraSubAnswers.enableLora !== undefined) {
523
+ loraAnswers = loraSubAnswers;
524
+ }
525
+
378
526
  // Validate instance type against framework requirements (now that framework version is known)
379
527
  const finalInstanceType = infraAnswers.customInstanceType || infraAnswers.instanceType;
380
528
  if (finalInstanceType && frameworkVersionAnswers.frameworkVersion) {
@@ -416,6 +564,8 @@ export default class PromptRunner {
416
564
  ...hfTokenAnswers,
417
565
  ...ngcApiKeyAnswers,
418
566
  ...moduleAnswers,
567
+ ...benchmarkAnswers,
568
+ ...loraAnswers,
419
569
  ...projectAnswers,
420
570
  ...destinationAnswers,
421
571
  buildTimestamp
@@ -435,6 +585,12 @@ export default class PromptRunner {
435
585
  combinedAnswers.artifactUri = this._mcpArtifactUri;
436
586
  }
437
587
 
588
+ // Flow capacity reservation ARN from instance-sizer selection
589
+ // Requirements: 2.3
590
+ if (this._selectedCapacityReservationArn) {
591
+ combinedAnswers.capacityReservationArn = this._selectedCapacityReservationArn;
592
+ }
593
+
438
594
  // Validate: non-HF model sources require an artifact URI
439
595
  // Without it, the serve script can't download the model at runtime
440
596
  // Infer modelSource from model name prefix if not set by MCP
@@ -1036,13 +1192,58 @@ export default class PromptRunner {
1036
1192
  : '';
1037
1193
 
1038
1194
  console.log(` ✓ ${choices.length} compatible instance(s) found${vramInfo}`);
1039
- // Display compact recommendation table
1040
- for (const rec of recommendations) {
1041
- const tp = rec.tensorParallelism > 1 ? ` TP=${rec.tensorParallelism}` : '';
1042
- const vram = rec.totalVramGb ? `${rec.totalVramGb}GB` : '?';
1043
- const util = rec.utilizationPercent ? `${rec.utilizationPercent}%` : '?';
1044
- console.log(` ${rec === topRec ? '→' : ' '} ${rec.instanceType.padEnd(20)} ${vram.padStart(5)} VRAM ${util.padStart(4)} util${tp}`);
1195
+
1196
+ // Warn if all instances had zero quota but were restored for visibility
1197
+ if (parsed.metadata?.allFilteredByQuota) {
1198
+ console.log(' ⚠️ All instances have zero quota request a quota increase for your preferred type');
1045
1199
  }
1200
+
1201
+ // Check if availability data is present (recommendations have capacityType)
1202
+ const hasAvailabilityData = recommendations.some(r => r.capacityType);
1203
+
1204
+ if (hasAvailabilityData) {
1205
+ // Group by capacityType for display
1206
+ const reserved = recommendations.filter(r => r.capacityType === 'reserved' || r.capacityType === 'ftp');
1207
+ const onDemand = recommendations.filter(r => r.capacityType === 'on-demand');
1208
+
1209
+ if (reserved.length > 0) {
1210
+ console.log(' ── Reserved Capacity ──');
1211
+ for (const rec of reserved) {
1212
+ const tp = rec.tensorParallelism > 1 ? ` TP=${rec.tensorParallelism}` : '';
1213
+ const vram = rec.totalVramGb ? `${rec.totalVramGb}GB` : '?';
1214
+ const util = rec.utilizationPercent ? `${rec.utilizationPercent}%` : '?';
1215
+ const tag = rec.capacityType === 'reserved'
1216
+ ? ` [CR] ${rec.reservationInfo?.planName || rec.reservationInfo?.reservationId || ''}`
1217
+ : ` [FTP] ${rec.ftpInfo?.planName || ''}`;
1218
+ console.log(` ${rec === topRec ? '→' : ' '} ${rec.instanceType.padEnd(20)} ${vram.padStart(5)} VRAM ${util.padStart(4)} util${tp}${tag}`);
1219
+ }
1220
+ }
1221
+
1222
+ if (onDemand.length > 0) {
1223
+ console.log(' ── On-Demand ──');
1224
+ for (const rec of onDemand) {
1225
+ const tp = rec.tensorParallelism > 1 ? ` TP=${rec.tensorParallelism}` : '';
1226
+ const vram = rec.totalVramGb ? `${rec.totalVramGb}GB` : '?';
1227
+ const util = rec.utilizationPercent ? `${rec.utilizationPercent}%` : '?';
1228
+ const deployed = rec.quotaDeployed;
1229
+ const quota = rec.quotaLimit;
1230
+ const tag = quota !== null && quota !== undefined ? ` [Q:${deployed ?? 0}/${quota}]` : '';
1231
+ console.log(` ${rec === topRec ? '→' : ' '} ${rec.instanceType.padEnd(20)} ${vram.padStart(5)} VRAM ${util.padStart(4)} util${tp}${tag}`);
1232
+ }
1233
+ }
1234
+ } else {
1235
+ // Fallback: display compact recommendation table (no availability data)
1236
+ for (const rec of recommendations) {
1237
+ const tp = rec.tensorParallelism > 1 ? ` TP=${rec.tensorParallelism}` : '';
1238
+ const vram = rec.totalVramGb ? `${rec.totalVramGb}GB` : '?';
1239
+ const util = rec.utilizationPercent ? `${rec.utilizationPercent}%` : '?';
1240
+ console.log(` ${rec === topRec ? '→' : ' '} ${rec.instanceType.padEnd(20)} ${vram.padStart(5)} VRAM ${util.padStart(4)} util${tp}`);
1241
+ }
1242
+ }
1243
+ } else if (parsed.metadata?.allFilteredByQuota) {
1244
+ // All VRAM-compatible instances had zero quota
1245
+ console.log(' ⚠️ No quota available for compatible instances. Request a quota increase.');
1246
+ this._instanceSizerMetadata = parsed.metadata || null;
1046
1247
  } else if (parsed.metadata?.warning) {
1047
1248
  console.log(` ⚠️ ${parsed.metadata.warning}`);
1048
1249
  } else {
@@ -1101,6 +1302,62 @@ export default class PromptRunner {
1101
1302
  }
1102
1303
  }
1103
1304
 
1305
+ /**
1306
+ * Query the endpoint-picker MCP server for available InService real-time endpoints.
1307
+ * Populates this._mcpEndpointChoices for the existing endpoint selection prompt.
1308
+ * Graceful fallback: if MCP server fails (no credentials, timeout), skip and create new endpoint.
1309
+ * Requirements: 3.3, 4.3, 4.4
1310
+ * @private
1311
+ */
1312
+ async _queryMcpForEndpoints(infraAnswers, explicitConfig) {
1313
+ const cm = this.configManager;
1314
+ if (!cm) return;
1315
+
1316
+ const mcpServers = cm.getMcpServerNames();
1317
+ if (!mcpServers.includes('endpoint-picker')) return;
1318
+
1319
+ // Skip if existing endpoint already provided via CLI/config
1320
+ if (explicitConfig.existingEndpointName) return;
1321
+
1322
+ console.log(' 🔍 Querying endpoint-picker...');
1323
+
1324
+ try {
1325
+ const result = await cm.queryMcpServer('endpoint-picker', {
1326
+ awsRegion: infraAnswers.awsRegion,
1327
+ deploymentTarget: 'realtime-inference'
1328
+ });
1329
+
1330
+ if (result && result.choices?.endpointName?.length > 0) {
1331
+ const endpointNames = result.choices.endpointName;
1332
+ const metadata = result.metadata || {};
1333
+
1334
+ // Build choices with metadata annotations
1335
+ this._mcpEndpointChoices = endpointNames.map(name => {
1336
+ const meta = metadata[name];
1337
+ if (meta) {
1338
+ const gpuInfo = meta.availableGpus === '?' ? 'GPUs: ?' : `${meta.availableGpus} GPUs free`;
1339
+ return {
1340
+ name: `${name} (${meta.instanceType}, ${gpuInfo}, ${meta.icCount} IC${meta.icCount !== 1 ? 's' : ''})`,
1341
+ value: name
1342
+ };
1343
+ }
1344
+ return { name, value: name };
1345
+ });
1346
+
1347
+ console.log(` ✓ ${endpointNames.length} endpoint(s) with available capacity`);
1348
+ } else {
1349
+ if (result?.message) {
1350
+ console.log(` ↳ ${result.message}`);
1351
+ } else {
1352
+ console.log(' ↳ No endpoints with available capacity found');
1353
+ }
1354
+ }
1355
+ } catch (err) {
1356
+ // Graceful fallback: if MCP server fails, skip and create new endpoint
1357
+ console.log(` ⚠️ endpoint-picker: ${err.message || 'query failed'} — will create new endpoint`);
1358
+ }
1359
+ }
1360
+
1104
1361
  /**
1105
1362
  * Query MCP base-image-picker server after deployment config is selected.
1106
1363
  * Populates _mcpBaseImageChoices for the base image selection prompt.
@@ -1972,9 +2229,10 @@ export default class PromptRunner {
1972
2229
  '11.4': 'al2-ami-sagemaker-inference-gpu-2-1',
1973
2230
  '11.8': 'al2-ami-sagemaker-inference-gpu-2-1',
1974
2231
  '12.1': 'al2-ami-sagemaker-inference-gpu-3-1',
1975
- '12.2': 'al2023-ami-sagemaker-inference-gpu-4-1',
1976
- '12.4': 'al2023-ami-sagemaker-inference-gpu-4-1',
1977
- '12.6': 'al2023-ami-sagemaker-inference-gpu-4-1'
2232
+ '12.2': 'al2-ami-sagemaker-inference-gpu-3-1',
2233
+ '12.4': 'al2-ami-sagemaker-inference-gpu-3-1',
2234
+ '12.6': 'al2-ami-sagemaker-inference-gpu-3-1',
2235
+ '13.0': 'al2023-ami-sagemaker-inference-gpu-4-1'
1978
2236
  };
1979
2237
 
1980
2238
  /**