npm - @aws/ml-container-creator - Versions diffs - 0.2.6 → 0.4.0 - Mend

@aws/ml-container-creator 0.2.6 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (50) hide show

package/bin/cli.js +38 -2
package/config/bootstrap-stack.json +94 -1
package/config/defaults.json +1 -1
package/infra/ci-harness/package-lock.json +22 -9
package/package.json +3 -1
package/servers/instance-sizer/index.js +45 -8
package/servers/instance-sizer/lib/instance-ranker.js +140 -11
package/servers/instance-sizer/lib/model-resolver.js +10 -6
package/servers/instance-sizer/lib/quota-resolver.js +368 -0
package/servers/instance-sizer/package.json +2 -0
package/servers/lib/catalogs/instances.json +527 -12
package/servers/lib/catalogs/model-servers.json +298 -20
package/servers/lib/catalogs/model-sizes.json +27 -0
package/servers/lib/catalogs/models.json +101 -0
package/servers/lib/schemas/image-catalog.schema.json +15 -1
package/servers/model-picker/index.js +2 -1
package/src/app.js +96 -2
package/src/lib/architecture-sync.js +171 -0
package/src/lib/arn-detection.js +22 -0
package/src/lib/bootstrap-command-handler.js +178 -3
package/src/lib/cli-handler.js +2 -2
package/src/lib/config-manager.js +121 -1
package/src/lib/cross-cutting-checker.js +119 -0
package/src/lib/deployment-entry-schema.js +1 -2
package/src/lib/prompt-runner.js +514 -20
package/src/lib/prompts.js +67 -5
package/src/lib/registry-command-handler.js +236 -0
package/src/lib/schema-sync.js +31 -0
package/src/lib/secret-classification.js +56 -0
package/src/lib/secrets-command-handler.js +550 -0
package/src/lib/template-manager.js +49 -1
package/src/lib/validate-runner.js +174 -2
package/src/lib/validation-report.js +8 -1
package/src/prompt-adapter.js +3 -2
package/templates/Dockerfile +10 -2
package/templates/code/cuda_compat.sh +22 -0
package/templates/code/serve +3 -0
package/templates/code/start_server.sh +3 -0
package/templates/diffusors/Dockerfile +2 -1
package/templates/diffusors/serve +3 -0
package/templates/do/README.md +33 -0
package/templates/do/benchmark +646 -0
package/templates/do/build +22 -0
package/templates/do/clean +86 -0
package/templates/do/config +41 -6
package/templates/do/deploy +66 -6
package/templates/do/logs +18 -3
package/templates/do/register +8 -1
package/templates/do/run +10 -0
package/templates/triton/Dockerfile +5 -0

package/src/lib/config-manager.js CHANGED Viewed

@@ -300,6 +300,15 @@ export default class ConfigManager {
             finalConfig.hfToken = this._resolveHfToken(finalConfig.hfToken);
         }
+        // Mutual exclusion: ARN takes precedence over plaintext when both are set
+        // (CLI validation should prevent this, but enforce at config level too)
+        if (finalConfig.hfTokenArn) {
+            finalConfig.hfToken = null;
+        }
+        if (finalConfig.ngcTokenArn) {
+            finalConfig.ngcApiKey = null;
+        }
         // Map awsRoleArn to roleArn for templates
         if (finalConfig.awsRoleArn) {
             finalConfig.roleArn = finalConfig.awsRoleArn;
@@ -643,6 +652,28 @@ export default class ConfigManager {
                 default: null,
                 valueSpace: 'bounded'
             },
+            hfTokenArn: {
+                cliOption: 'hf-token-arn',
+                envVar: null,
+                configFile: true,
+                packageJson: false,
+                mcp: false,
+                promptable: false,
+                required: false,
+                default: null,
+                valueSpace: 'bounded'
+            },
+            ngcTokenArn: {
+                cliOption: 'ngc-token-arn',
+                envVar: null,
+                configFile: true,
+                packageJson: false,
+                mcp: false,
+                promptable: false,
+                required: false,
+                default: null,
+                valueSpace: 'bounded'
+            },
             deploymentTarget: {
                 cliOption: 'deployment-target',
                 envVar: 'ML_DEPLOYMENT_TARGET',
@@ -948,6 +979,83 @@ export default class ConfigManager {
                 default: 1.0,
                 valueSpace: 'bounded',
                 schemaValidated: true
+            },
+            includeBenchmark: {
+                cliOption: 'include-benchmark',
+                envVar: 'ML_INCLUDE_BENCHMARK',
+                configFile: true,
+                packageJson: false,
+                mcp: false,
+                promptable: true,
+                required: false,
+                default: false,
+                valueSpace: 'bounded'
+            },
+            benchmarkConcurrency: {
+                cliOption: 'benchmark-concurrency',
+                envVar: null,
+                configFile: true,
+                packageJson: false,
+                mcp: false,
+                promptable: true,
+                required: false,
+                default: 10,
+                valueSpace: 'bounded'
+            },
+            benchmarkInputTokensMean: {
+                cliOption: 'benchmark-input-tokens',
+                envVar: null,
+                configFile: true,
+                packageJson: false,
+                mcp: false,
+                promptable: true,
+                required: false,
+                default: 550,
+                valueSpace: 'bounded'
+            },
+            benchmarkOutputTokensMean: {
+                cliOption: 'benchmark-output-tokens',
+                envVar: null,
+                configFile: true,
+                packageJson: false,
+                mcp: false,
+                promptable: true,
+                required: false,
+                default: 150,
+                valueSpace: 'bounded'
+            },
+            benchmarkStreaming: {
+                cliOption: 'benchmark-streaming',
+                envVar: null,
+                configFile: true,
+                packageJson: false,
+                mcp: false,
+                promptable: true,
+                required: false,
+                default: true,
+                valueSpace: 'bounded'
+            },
+            benchmarkRequestCount: {
+                cliOption: 'benchmark-request-count',
+                envVar: null,
+                configFile: true,
+                packageJson: false,
+                mcp: false,
+                promptable: true,
+                required: false,
+                default: null,
+                valueSpace: 'bounded'
+            },
+            benchmarkS3OutputPath: {
+                cliOption: 'benchmark-s3-output-path',
+                envVar: 'ML_BENCHMARK_S3_OUTPUT_PATH',
+                configFile: true,
+                packageJson: false,
+                mcp: false,
+                promptable: true,
+                required: false,
+                default: null,
+                valueSpace: 'bounded'
             }
         };
     }
@@ -980,7 +1088,7 @@ export default class ConfigManager {
      */
     _parseValue(parameter, value) {
         // Handle boolean parameters
-        if (parameter === 'includeSampleModel' || parameter === 'includeTesting' || parameter === 'skipPrompts') {
+        if (parameter === 'includeSampleModel' || parameter === 'includeTesting' || parameter === 'skipPrompts' || parameter === 'includeBenchmark' || parameter === 'benchmarkStreaming') {
             return value === true || value === 'true';
         }
@@ -1675,6 +1783,18 @@ export default class ConfigManager {
             }
         }
+        // Validate mutual exclusion: plaintext token and ARN cannot both be set
+        if (this.config.hfToken && this.config.hfTokenArn) {
+            errors.push('Cannot specify both --hf-token and --hf-token-arn. Use one or the other.');
+        }
+        if (this.config.ngcTokenArn) {
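+            // Check the raw CLI options for a plaintext NGC token. NOTE(review): Commander exposes --ngc-token as camelCase `ngcToken`, but the lookup below uses the kebab-case key 'ngc-token' - confirm which key this.options actually carries.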
+            const ngcTokenFromCli = this.options['ngc-token'];
+            if (ngcTokenFromCli) {
+                errors.push('Cannot specify both --ngc-token and --ngc-token-arn. Use one or the other.');
+            }
+        }
         // Validate AWS Role ARN format if provided
         if (this.config.awsRoleArn) {
             try {

package/src/lib/cross-cutting-checker.js CHANGED Viewed

@@ -22,6 +22,7 @@ export default class CrossCuttingChecker {
         findings.push(...this.checkRoleArnFormat(context));
         findings.push(...this.checkCudaCompatibility(context, instanceCatalog));
         findings.push(...this.checkModelTypeInstanceAlignment(context, instanceCatalog));
+        findings.push(...this.checkKvCacheMemoryFit(context, instanceCatalog));
         return findings;
     }
@@ -298,6 +299,45 @@ export default class CrossCuttingChecker {
         return findings;
     }
+    /**
+     * Verify model architecture compatibility with the selected server version,
+     * emitting a warning finding when the architecture is not listed as supported.
+     *
+     * Reads `modelType`, `modelServer` and `baseImageVersion` from `context.config`.
+     * The catalog entry is the element of `modelServersCatalog[modelServer]` whose
+     * `labels.framework_version` strictly equals `baseImageVersion`; `modelType` is
+     * lower-cased before being looked up in that entry's `supportedModelTypes`.
+     *
+     * Skips silently (returns no findings) when any of the three config values is
+     * missing, when no catalog entry matches the version, or when the entry's
+     * `supportedModelTypes` is missing or empty (sync not run).
+     *
+     * NOTE(review): only `modelType` is lower-cased, so this presumably relies on
+     * the catalog storing model types in lower case - confirm against the catalog.
+     *
+     * @param {Object} context - ValidationContext; only `context.config` is read
+     * @param {Object} modelServersCatalog - Model servers catalog (from servers/lib/catalogs/model-servers.json),
+     *   indexed by server name; each value is treated as an array of entries
+     * @returns {Array} Findings - empty, or a single 'warning' finding with constraint
+     *   type 'architecture-compatibility' reported against fieldPath 'MODEL_NAME'
+     */
+    checkModelArchitectureCompatibility(context, modelServersCatalog) {
+        const findings = [];
+        const config = context.config || {};
+        const modelType = config.modelType;
+        const serverVersion = config.baseImageVersion;
+        const server = config.modelServer;
+        if (!modelType || !server || !serverVersion) return findings; // nothing to compare without all three values
+        const entries = modelServersCatalog[server] || []; // unknown server name -> no entries -> no findings
+        const entry = entries.find(e => e.labels?.framework_version === serverVersion);
+        if (!entry?.supportedModelTypes?.length) return findings; // no matching entry, or no supported-types list to check against
+        if (!entry.supportedModelTypes.includes(modelType.toLowerCase())) {
+            findings.push({
+                service: 'cross-cutting',
+                operation: 'configuration',
+                fieldPath: 'MODEL_NAME',
+                invalidValue: modelType,
+                constraint: { type: 'architecture-compatibility', server, version: serverVersion },
+                severity: 'warning',
+                confidence: 'medium',
+                source: 'cross-cutting',
+                remediationHint: `Model architecture "${modelType}" may not be supported by ${server} ${serverVersion}. Consider a newer server version.`
+            });
+        }
+        return findings;
+    }
     /**
      * Verify predictor models are not assigned GPU instances.
      * @param {Object} context - ValidationContext
@@ -338,4 +378,83 @@ export default class CrossCuttingChecker {
         return findings;
     }
+    /**
+     * Verify that the model's estimated VRAM (weights + KV cache at configured max_model_len)
+     * fits in the instance's available GPU memory, emitting a warning finding when it does not.
+     *
+     * Uses the same estimation formula as the instance-sizer's vram-estimator:
+     * total = weights + KV cache + 10% overhead
+     * (NOTE(review): vram-estimator.js is not visible from here - confirm the two stay in sync.)
+     *
+     * As implemented in this method, each term divided by 1024^3:
+     *   weights  = parameterCount * bytesPerParam (float32: 4, int8: 1, any other dtype: 2)
+     *   KV cache = parameterCount * (seqLen / 4096) * 0.05
+     *   overhead = 10% of the weights term
+     *
+     * Instance capacity is per-GPU memory multiplied by the instance's `gpus` count.
+     * Per-GPU memory comes from `gpuMemoryGb` when present, otherwise it is parsed
+     * from the first "<digits>GB" in the `accelerator` string.
+     *
+     * The check is skipped (no findings) when `config.INSTANCE_TYPE` is unset, the
+     * instance is not in the catalog or reports no GPUs, the parameter count is
+     * unknown, no non-zero sequence length can be resolved, or per-GPU memory
+     * cannot be determined.
+     *
+     * @param {Object} context - ValidationContext; only `context.config` is read
+     * @param {Object} instanceCatalog - Instance catalog; either an object with a
+     *   `catalog` property or the instance-type map itself
+     * @returns {Array} Findings - empty, or a single 'warning' finding with constraint
+     *   type 'kv-cache-memory-fit' reported against fieldPath 'INSTANCE_TYPE'
+     */
+    checkKvCacheMemoryFit(context, instanceCatalog) {
+        const findings = [];
+        const config = context.config || {};
+        const catalog = instanceCatalog?.catalog || instanceCatalog || {}; // accept a { catalog } wrapper or the bare map
+        const instanceType = config.INSTANCE_TYPE;
+        if (!instanceType) return findings;
+        const instanceInfo = catalog[instanceType];
+        if (!instanceInfo || !instanceInfo.gpus || instanceInfo.gpus <= 0) return findings; // unknown or GPU-less instance: nothing to check
+        // Need parameter count to estimate weights; `_parameterCount` wins over `parameterCount` when both are set
+        const parameterCount = config._parameterCount || config.parameterCount;
+        if (!parameterCount) return findings;
+        // Resolve max sequence length: explicit env var (vLLM first, then SGLang) > model's max_position_embeddings > skip
+        const maxModelLen = parseInt(config.VLLM_MAX_MODEL_LEN || config.SGLANG_MAX_MODEL_LEN || '0', 10);
+        const maxPosEmbed = parseInt(config._maxPositionEmbeddings || '0', 10);
+        const seqLen = maxModelLen || maxPosEmbed; // 0 or NaN from the env vars falls through to the model value
+        if (!seqLen) return findings;
+        // Estimate per-GPU VRAM from instance catalog: explicit gpuMemoryGb, else parsed from the accelerator string
+        let perGpuVramGb = instanceInfo.gpuMemoryGb;
+        if (!perGpuVramGb && instanceInfo.accelerator) {
+            const match = instanceInfo.accelerator.match(/(\d+)GB/);
+            if (match) {
+                const totalGb = parseInt(match[1], 10);
+                const hasMultiplier = instanceInfo.accelerator.match(/^(\d+)x\s/); // leading "<n>x " => the GB figure is treated as the total across all GPUs
+                perGpuVramGb = hasMultiplier ? totalGb / instanceInfo.gpus : totalGb;
+            }
+        }
+        if (!perGpuVramGb) return findings;
+        const totalVramGb = perGpuVramGb * instanceInfo.gpus;
+        // Estimate VRAM needed (same formula as vram-estimator.js, per the original author)
+        const dtype = config._dtype || 'float16';
+        const bytesPerParam = dtype === 'float32' ? 4.0 : dtype === 'int8' ? 1.0 : 2.0; // any other dtype is costed at 2 bytes per parameter
+        const weightsGb = (parameterCount * bytesPerParam) / (1024 ** 3);
+        const kvCacheGb = (parameterCount * (seqLen / 4096) * 0.05) / (1024 ** 3); // heuristic: 0.05 bytes per parameter per 4096 tokens of context
+        const overheadGb = weightsGb * 0.1; // overhead is 10% of the weights, not of the total
+        const estimatedTotalGb = weightsGb + kvCacheGb + overheadGb;
+        if (estimatedTotalGb > totalVramGb) {
+            findings.push({
+                service: 'cross-cutting',
+                operation: 'configuration',
+                fieldPath: 'INSTANCE_TYPE',
+                invalidValue: instanceType,
+                constraint: {
+                    type: 'kv-cache-memory-fit',
+                    estimatedVramGb: Math.round(estimatedTotalGb * 10) / 10, // rounded to one decimal place
+                    weightsGb: Math.round(weightsGb * 10) / 10,
+                    kvCacheGb: Math.round(kvCacheGb * 10) / 10,
+                    totalVramGb,
+                    maxModelLen: seqLen,
+                    instanceType
+                },
+                severity: 'warning',
+                confidence: 'medium',
+                source: 'cross-cutting',
+                remediationHint: `Estimated VRAM needed: ${estimatedTotalGb.toFixed(1)}GB (weights: ${weightsGb.toFixed(1)}GB + KV cache: ${kvCacheGb.toFixed(1)}GB at seq_len=${seqLen}) exceeds instance capacity (${totalVramGb}GB). Reduce VLLM_MAX_MODEL_LEN, use quantization, or select a larger instance.`
+            });
+        }
+        return findings;
+    }
 }

package/src/lib/deployment-entry-schema.js CHANGED Viewed

@@ -65,8 +65,7 @@ export default {
             required: ['modelName'],
             properties: {
                 modelName: {
-                    type: 'string',
-                    minLength: 1
+                    type: ['string', 'null']
                 },
                 modelFormat: {
                     type: ['string', 'null']