npm - @aws/ml-container-creator - Versions diffs - 0.2.6 → 0.3.0 - Mend

@aws/ml-container-creator 0.2.6 → 0.3.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (31) [hide / show]

package/bin/cli.js +38 -2
package/config/bootstrap-stack.json +14 -0
package/infra/ci-harness/package-lock.json +22 -9
package/package.json +1 -1
package/servers/instance-sizer/index.js +9 -6
package/servers/instance-sizer/lib/instance-ranker.js +35 -10
package/servers/instance-sizer/lib/model-resolver.js +10 -6
package/servers/lib/catalogs/model-servers.json +283 -5
package/servers/lib/catalogs/models.json +30 -0
package/servers/lib/schemas/image-catalog.schema.json +6 -0
package/servers/model-picker/index.js +2 -1
package/src/app.js +19 -0
package/src/lib/architecture-sync.js +171 -0
package/src/lib/arn-detection.js +22 -0
package/src/lib/bootstrap-command-handler.js +82 -0
package/src/lib/config-manager.js +43 -0
package/src/lib/cross-cutting-checker.js +119 -0
package/src/lib/deployment-entry-schema.js +1 -2
package/src/lib/prompt-runner.js +427 -20
package/src/lib/prompts.js +1 -1
package/src/lib/registry-command-handler.js +236 -0
package/src/lib/secret-classification.js +56 -0
package/src/lib/secrets-command-handler.js +550 -0
package/src/lib/validate-runner.js +49 -0
package/src/lib/validation-report.js +8 -1
package/src/prompt-adapter.js +3 -2
package/templates/do/build +22 -0
package/templates/do/config +15 -3
package/templates/do/deploy +60 -5
package/templates/do/logs +18 -3
package/templates/do/run +10 -0

package/bin/cli.js CHANGED Viewed

@@ -90,6 +90,9 @@ program
     // --- Authentication ---
     .addOption(new Option('--hf-token <token>', 'HuggingFace token (or "$HF_TOKEN" for env var reference)'))
+    .addOption(new Option('--hf-token-arn <arn>', 'HuggingFace token ARN from Secrets Manager'))
+    .addOption(new Option('--ngc-token <token>', 'NVIDIA NGC token (or "$NGC_API_KEY" for env var reference)'))
+    .addOption(new Option('--ngc-token-arn <arn>', 'NVIDIA NGC token ARN from Secrets Manager'))
     // --- Optional Features ---
     .addOption(new Option('--include-sample', 'Include sample model code'))
@@ -106,7 +109,18 @@ program
     .addOption(new Option('--validate-with-docker', 'Enable Docker introspection validation (opt-in)'))
     .addOption(new Option('--offline', 'Disable HuggingFace API lookups'))
-    .action((projectNameArgs, options) => run(projectNameArgs?.[0] || null, options));
+    .action((projectNameArgs, options) => {
+        // Mutual exclusion validation: plaintext token and ARN flags cannot both be provided
+        if (options.hfToken && options.hfTokenArn) {
+            console.error('❌ Cannot specify both --hf-token and --hf-token-arn. Use one or the other.');
+            process.exit(1);
+        }
+        if (options.ngcToken && options.ngcTokenArn) {
+            console.error('❌ Cannot specify both --ngc-token and --ngc-token-arn. Use one or the other.');
+            process.exit(1);
+        }
+        return run(projectNameArgs?.[0] || null, options);
+    });
 // Custom help formatting — group options into logical sections (root command only)
 program.configureHelp({
@@ -174,7 +188,7 @@ program.configureHelp({
                 groups.hyperpod.push(opt);
             } else if (['--model-env', '--server-env'].includes(long)) {
                 groups.env.push(opt);
-            } else if (['--hf-token'].includes(long)) {
+            } else if (['--hf-token', '--hf-token-arn', '--ngc-token', '--ngc-token-arn'].includes(long)) {
                 groups.auth.push(opt);
             } else if (['--include-sample', '--include-testing', '--test-types'].includes(long)) {
                 groups.features.push(opt);
@@ -255,6 +269,7 @@ program
     .option('--ci', 'Provision CI integration infrastructure')
     .option('--skip-ci', 'Skip CI infrastructure provisioning')
     .option('--skip-s3', 'Skip S3 bucket creation')
+    .option('--skip-post-setup', 'Skip post-setup chain (mcp init, sync-architectures, sync-schemas)')
     .action(async (action, args, options) => {
         const { default: BootstrapCommandHandler } = await import('../src/lib/bootstrap-command-handler.js');
         const handler = new BootstrapCommandHandler();
@@ -314,12 +329,33 @@ program
     .option('--project', 'Use project-level registry')
     .option('--parameters <json>', 'Parameters JSON string')
     .option('--generator-version <version>', 'Generator version')
+    // Options used by `registry list-architectures`
+    .option('--server <name>', 'Filter by server name (for list-architectures)')
+    .option('--verbose', 'Show full list of supported model types (for list-architectures)')
     .action(async (action, args, options) => {
         const { default: RegistryCommandHandler } = await import('../src/lib/registry-command-handler.js');
         const handler = new RegistryCommandHandler();
         await handler.handle([action, ...args], options);
     });
+program
+    .command('secrets')
+    .description('Manage secrets in AWS Secrets Manager (create, list, describe)')
+    .argument('[action]', 'Secrets action (create, list, describe)')
+    .argument('[args...]', 'Additional arguments')
+    .option('--type <type>', 'Secret type (e.g., hf-token, ngc-token)')
+    .option('--name <label>', 'Secret label (used in naming convention)')
+    .option('--secret-value <value>', 'Secret value (masked in terminal)')
+    .option('--description <text>', 'Secret description')
+    .option('--kms-key-id <key>', 'KMS key for encryption')
+    .option('--json <json-or-path>', 'JSON input (inline or file://path)')
+    .action(async (action, args, options) => {
+        const { default: SecretsCommandHandler } = await import('../src/lib/secrets-command-handler.js');
+        const handler = new SecretsCommandHandler();
+        const allArgs = action ? [action, ...args] : [];
+        await handler.handle(allArgs, options);
+    });
 program
     .command('configure')
     .description('Interactive configuration setup (experimental)')

package/config/bootstrap-stack.json CHANGED Viewed

@@ -105,6 +105,20 @@
                     "arn:aws:s3:::ml-container-creator-*",
                     "arn:aws:s3:::ml-container-creator-*/*"
                   ]
+                },
+                {
+                  "Sid": "SecretsManagerRead",
+                  "Effect": "Allow",
+                  "Action": [
+                    "secretsmanager:GetSecretValue",
+                    "secretsmanager:DescribeSecret"
+                  ],
+                  "Resource": "arn:aws:secretsmanager:*:*:secret:mlcc/*",
+                  "Condition": {
+                    "StringEquals": {
+                      "aws:ResourceTag/mlcc:managed-by": "ml-container-creator"
+                    }
+                  }
                 }
               ]
             }

package/infra/ci-harness/package-lock.json CHANGED Viewed

@@ -48,7 +48,6 @@
         "semver"
       ],
       "license": "Apache-2.0",
-      "peer": true,
       "dependencies": {
         "jsonschema": "~1.4.1",
         "semver": "^7.7.4"
@@ -2151,7 +2150,6 @@
       "integrity": "sha512-wGdMcf+vPYM6jikpS/qhg6WiqSV/OhG+jeeHT/KlVqxYfD40iYJf9/AE1uQxVWFvU7MipKRkRv8NSHiCGgPr8Q==",
       "dev": true,
       "license": "MIT",
-      "peer": true,
       "dependencies": {
         "undici-types": "~6.21.0"
       }
@@ -2791,8 +2789,7 @@
       "version": "10.6.0",
       "resolved": "https://registry.npmjs.org/constructs/-/constructs-10.6.0.tgz",
       "integrity": "sha512-TxHOnBO5zMo/G76ykzGF/wMpEHu257TbWiIxP9K0Yv/+t70UzgBQiTqjkAsWOPC6jW91DzJI0+ehQV6xDRNBuQ==",
-      "license": "Apache-2.0",
-      "peer": true
+      "license": "Apache-2.0"
     },
     "node_modules/create-require": {
       "version": "1.1.1",
@@ -2937,9 +2934,9 @@
       }
     },
     "node_modules/fast-xml-builder": {
-      "version": "1.1.5",
-      "resolved": "https://registry.npmjs.org/fast-xml-builder/-/fast-xml-builder-1.1.5.tgz",
-      "integrity": "sha512-4TJn/8FKLeslLAH3dnohXqE3QSoxkhvaMzepOIZytwJXZO69Bfz0HBdDHzOTOon6G59Zrk6VQ2bEiv1t61rfkA==",
+      "version": "1.2.0",
+      "resolved": "https://registry.npmjs.org/fast-xml-builder/-/fast-xml-builder-1.2.0.tgz",
+      "integrity": "sha512-00aAWieqff+ZJhsXA4g1g7M8k+7AYoMUUHF+/zFb5U6Uv/P0Vl4QZo84/IcufzYalLuEj9928bXN9PbbFzMF0Q==",
       "dev": true,
       "funding": [
         {
@@ -2949,7 +2946,8 @@
       ],
       "license": "MIT",
       "dependencies": {
-        "path-expression-matcher": "^1.1.3"
+        "path-expression-matcher": "^1.5.0",
+        "xml-naming": "^0.1.0"
       }
     },
     "node_modules/fast-xml-parser": {
@@ -3696,7 +3694,6 @@
       "integrity": "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==",
       "dev": true,
       "license": "Apache-2.0",
-      "peer": true,
       "bin": {
         "tsc": "bin/tsc",
         "tsserver": "bin/tsserver"
@@ -3837,6 +3834,22 @@
         "url": "https://github.com/chalk/ansi-styles?sponsor=1"
       }
     },
+    "node_modules/xml-naming": {
+      "version": "0.1.0",
+      "resolved": "https://registry.npmjs.org/xml-naming/-/xml-naming-0.1.0.tgz",
+      "integrity": "sha512-k8KO9hrMyNk6tUWqUfkTEZbezRRpONVOzUTnc97VnCvyj6Tf9lyUR9EDAIeiVLv56jsMcoXEwjW8Kv5yPY52lw==",
+      "dev": true,
+      "funding": [
+        {
+          "type": "github",
+          "url": "https://github.com/sponsors/NaturalIntelligence"
+        }
+      ],
+      "license": "MIT",
+      "engines": {
+        "node": ">=16.0.0"
+      }
+    },
     "node_modules/y18n": {
       "version": "5.0.8",
       "resolved": "https://registry.npmjs.org/y18n/-/y18n-5.0.8.tgz",

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@aws/ml-container-creator",
-  "version": "0.2.6",
+  "version": "0.3.0",
   "description": "Generator for SageMaker AI BYOC paradigm for predictive inference use-cases.",
   "type": "module",
   "main": "src/app.js",

package/servers/instance-sizer/index.js CHANGED Viewed

@@ -114,11 +114,11 @@ function log(message) {
  * @param {string} search - Search query string
  * @param {object} instanceCatalog - Instance catalog object
  * @param {object} [options={}]
- * @param {number} [options.limit=8] - Max results
+ * @param {number} [options.limit=10] - Max results
  * @returns {string[]} Matching instance type names, sorted by relevance
  */
 function searchInstancesByTag(search, instanceCatalog, options = {}) {
-    const { limit = 8 } = options
+    const { limit = 10 } = options
     const candidates = Object.entries(instanceCatalog)
     // Tokenize search into lowercase keywords
@@ -236,7 +236,7 @@ async function handleGetInstanceRecommendation(params) {
         maxSequenceLength,
         batchSize,
         cudaVersion,
-        limit = 8,
+        limit = 10,
         context
     } = params
@@ -361,11 +361,14 @@ async function handleGetInstanceRecommendation(params) {
     }
     // Step 2: Estimate VRAM
+    // Use model's max_position_embeddings as the sequence length when no explicit value is provided.
+    // This ensures KV cache is sized for the model's actual context window, not the 4096 default.
+    const resolvedMaxSeqLen = effectiveMaxSeqLen || modelMetadata.maxPositionEmbeddings || undefined
     const vramEstimate = estimateVram({
         parameterCount: modelMetadata.parameterCount,
         dtype: modelMetadata.dtype,
         quantization: quantization || undefined,
-        maxSequenceLength: effectiveMaxSeqLen || undefined,
+        maxSequenceLength: resolvedMaxSeqLen,
         batchSize: effectiveBatchSize || undefined
     })
@@ -502,7 +505,7 @@ server.tool(
         maxSequenceLength: z.number().optional().describe('Max context/sequence length (affects KV cache estimate)'),
         batchSize: z.number().optional().describe('Expected concurrent batch size'),
         cudaVersion: z.string().optional().describe('Required CUDA version from base image (filters incompatible instances)'),
-        limit: z.number().optional().default(8).describe('Maximum number of instance recommendations to return'),
+        limit: z.number().optional().default(10).describe('Maximum number of instance recommendations to return'),
         context: z.object({
             architecture: z.string().optional(),
             backend: z.string().optional(),
@@ -526,7 +529,7 @@ server.tool(
         maxSequenceLength: z.number().optional().describe('Max context/sequence length (affects KV cache estimate)'),
         batchSize: z.number().optional().describe('Expected concurrent batch size'),
         cudaVersion: z.string().optional().describe('Required CUDA version from base image (filters incompatible instances)'),
-        limit: z.number().optional().default(8).describe('Maximum number of instance recommendations to return'),
+        limit: z.number().optional().default(10).describe('Maximum number of instance recommendations to return'),
         context: z.object({
             architecture: z.string().optional(),
             backend: z.string().optional(),

package/servers/instance-sizer/lib/instance-ranker.js CHANGED Viewed

@@ -51,6 +51,22 @@ const COST_TIER_WEIGHT = {
     'high': 3
 }
+/**
+ * Generation weight by instance family.
+ * Lower is newer (sorted first). Newer generations offer better perf/$.
+ */
+const GENERATION_WEIGHT = {
+    'g6': 1,
+    'p5': 1,
+    'trn1': 2,
+    'inf2': 2,
+    'g5': 3,
+    'p4de': 4,
+    'p4d': 4,
+    'p3': 5,
+    'g4dn': 6
+}
 /**
  * TP overhead penalty: 10% per additional GPU beyond the first.
  * Effective VRAM = totalVram × (1 - 0.10 × (gpuCount - 1))
@@ -144,12 +160,12 @@ const effectiveVram = (totalVramGb, gpuCount) => {
  * @param {number} vramRequired - Required VRAM in GB
  * @param {object} instanceCatalog - Object keyed by instance type, values are metadata
  * @param {object} [options={}]
- * @param {number} [options.limit=8] - Max results to return
+ * @param {number} [options.limit=10] - Max results to return
  * @param {boolean} [options.allowTensorParallelism=true] - Consider multi-GPU splits
  * @returns {object[]} Ranked list of compatible instances
  */
 const filterAndRankInstances = (vramRequired, instanceCatalog, options = {}) => {
-    const { limit = 8, allowTensorParallelism = true } = options
+    const { limit = 10, allowTensorParallelism = true } = options
     if (!vramRequired || vramRequired <= 0) {
         return []
@@ -182,7 +198,8 @@ const filterAndRankInstances = (vramRequired, instanceCatalog, options = {}) =>
                     totalVramGb,
                     utilizationPercent,
                     tensorParallelism: 1,
-                    costTier: getCostTier(meta)
+                    costTier: getCostTier(meta),
+                    family: meta.family || ''
                 })
             }
         } else if (allowTensorParallelism) {
@@ -196,7 +213,8 @@ const filterAndRankInstances = (vramRequired, instanceCatalog, options = {}) =>
                     totalVramGb,
                     utilizationPercent,
                     tensorParallelism: gpuCount,
-                    costTier: getCostTier(meta)
+                    costTier: getCostTier(meta),
+                    family: meta.family || ''
                 })
             }
         }
@@ -204,24 +222,30 @@ const filterAndRankInstances = (vramRequired, instanceCatalog, options = {}) =>
     // Sort candidates by ranking criteria:
     // 1. Single-GPU first (TP=1), then multi-GPU by lowest TP degree
-    // 2. Within each TP tier, sort by cost-efficiency (lowest cost tier first,
-    //    then by lowest utilization — more headroom is better for the same cost)
+    // 2. Within each TP tier, newest generation first (g6 > g5 > g4dn)
+    // 3. Within same generation, sort by cost tier (lower is better)
+    // 4. Within same cost tier, prefer lower total VRAM (right-sized)
     candidates.sort((a, b) => {
         // Primary: TP degree (lower is better)
         if (a.tensorParallelism !== b.tensorParallelism) {
             return a.tensorParallelism - b.tensorParallelism
         }
-        // Secondary: cost tier (lower is better)
+        // Secondary: generation (newer is better — lower weight)
+        const genA = GENERATION_WEIGHT[a.family] || 4
+        const genB = GENERATION_WEIGHT[b.family] || 4
+        if (genA !== genB) {
+            return genA - genB
+        }
+        // Tertiary: cost tier (lower is better)
         const costA = COST_TIER_WEIGHT[a.costTier] || 2
         const costB = COST_TIER_WEIGHT[b.costTier] || 2
         if (costA !== costB) {
             return costA - costB
         }
-        // Tertiary: cost-efficiency — lower $/GB approximated by
-        // lower cost tier with higher total VRAM (more GB per dollar)
-        // Since cost tier is equal here, prefer higher total VRAM (better value)
+        // Quaternary: prefer lower total VRAM (right-sized, less waste)
         if (a.totalVramGb !== b.totalVramGb) {
             return a.totalVramGb - b.totalVramGb
         }
@@ -241,5 +265,6 @@ export {
     GPU_MEMORY_MAP,
     COST_TIER_MAP,
     COST_TIER_WEIGHT,
+    GENERATION_WEIGHT,
     TP_OVERHEAD_PER_GPU
 }

package/servers/instance-sizer/lib/model-resolver.js CHANGED Viewed

@@ -214,12 +214,16 @@ const resolveModelMetadata = async (modelName, options = {}) => {
     const catalogEntry = catalogLookup(modelName, catalog)
     if (catalogEntry) {
-        return {
-            parameterCount: catalogEntry.parameterCount,
-            dtype: catalogEntry.defaultDtype,
-            architecture: catalogEntry.architecture,
-            maxPositionEmbeddings: catalogEntry.maxPositionEmbeddings,
-            source: 'catalog'
+        // Only use catalog entry if it has a usable parameterCount for VRAM estimation.
+        // If parameterCount is missing, fall through to HuggingFace API (tier 2).
+        if (catalogEntry.parameterCount) {
+            return {
+                parameterCount: catalogEntry.parameterCount,
+                dtype: catalogEntry.defaultDtype,
+                architecture: catalogEntry.architecture,
+                maxPositionEmbeddings: catalogEntry.maxPositionEmbeddings,
+                source: 'catalog'
+            }
         }
     }