npm - @aws/ml-container-creator - Versions diffs - 0.2.6 → 0.4.0 - Mend

@aws/ml-container-creator 0.2.6 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (50)

package/bin/cli.js +38 -2
package/config/bootstrap-stack.json +94 -1
package/config/defaults.json +1 -1
package/infra/ci-harness/package-lock.json +22 -9
package/package.json +3 -1
package/servers/instance-sizer/index.js +45 -8
package/servers/instance-sizer/lib/instance-ranker.js +140 -11
package/servers/instance-sizer/lib/model-resolver.js +10 -6
package/servers/instance-sizer/lib/quota-resolver.js +368 -0
package/servers/instance-sizer/package.json +2 -0
package/servers/lib/catalogs/instances.json +527 -12
package/servers/lib/catalogs/model-servers.json +298 -20
package/servers/lib/catalogs/model-sizes.json +27 -0
package/servers/lib/catalogs/models.json +101 -0
package/servers/lib/schemas/image-catalog.schema.json +15 -1
package/servers/model-picker/index.js +2 -1
package/src/app.js +96 -2
package/src/lib/architecture-sync.js +171 -0
package/src/lib/arn-detection.js +22 -0
package/src/lib/bootstrap-command-handler.js +178 -3
package/src/lib/cli-handler.js +2 -2
package/src/lib/config-manager.js +121 -1
package/src/lib/cross-cutting-checker.js +119 -0
package/src/lib/deployment-entry-schema.js +1 -2
package/src/lib/prompt-runner.js +514 -20
package/src/lib/prompts.js +67 -5
package/src/lib/registry-command-handler.js +236 -0
package/src/lib/schema-sync.js +31 -0
package/src/lib/secret-classification.js +56 -0
package/src/lib/secrets-command-handler.js +550 -0
package/src/lib/template-manager.js +49 -1
package/src/lib/validate-runner.js +174 -2
package/src/lib/validation-report.js +8 -1
package/src/prompt-adapter.js +3 -2
package/templates/Dockerfile +10 -2
package/templates/code/cuda_compat.sh +22 -0
package/templates/code/serve +3 -0
package/templates/code/start_server.sh +3 -0
package/templates/diffusors/Dockerfile +2 -1
package/templates/diffusors/serve +3 -0
package/templates/do/README.md +33 -0
package/templates/do/benchmark +646 -0
package/templates/do/build +22 -0
package/templates/do/clean +86 -0
package/templates/do/config +41 -6
package/templates/do/deploy +66 -6
package/templates/do/logs +18 -3
package/templates/do/register +8 -1
package/templates/do/run +10 -0
package/templates/triton/Dockerfile +5 -0

package/bin/cli.js CHANGED Viewed

@@ -90,6 +90,9 @@ program
     // --- Authentication ---
     .addOption(new Option('--hf-token <token>', 'HuggingFace token (or "$HF_TOKEN" for env var reference)'))
+    .addOption(new Option('--hf-token-arn <arn>', 'HuggingFace token ARN from Secrets Manager'))
+    .addOption(new Option('--ngc-token <token>', 'NVIDIA NGC token (or "$NGC_API_KEY" for env var reference)'))
+    .addOption(new Option('--ngc-token-arn <arn>', 'NVIDIA NGC token ARN from Secrets Manager'))
     // --- Optional Features ---
     .addOption(new Option('--include-sample', 'Include sample model code'))
@@ -106,7 +109,18 @@ program
     .addOption(new Option('--validate-with-docker', 'Enable Docker introspection validation (opt-in)'))
     .addOption(new Option('--offline', 'Disable HuggingFace API lookups'))
-    .action((projectNameArgs, options) => run(projectNameArgs?.[0] || null, options));
+    .action((projectNameArgs, options) => {
+        // Mutual exclusion validation: plaintext token and ARN flags cannot both be provided
+        if (options.hfToken && options.hfTokenArn) {
+            console.error('❌ Cannot specify both --hf-token and --hf-token-arn. Use one or the other.');
+            process.exit(1);
+        }
+        if (options.ngcToken && options.ngcTokenArn) {
+            console.error('❌ Cannot specify both --ngc-token and --ngc-token-arn. Use one or the other.');
+            process.exit(1);
+        }
+        return run(projectNameArgs?.[0] || null, options);
+    });
 // Custom help formatting — group options into logical sections (root command only)
 program.configureHelp({
@@ -174,7 +188,7 @@ program.configureHelp({
                 groups.hyperpod.push(opt);
             } else if (['--model-env', '--server-env'].includes(long)) {
                 groups.env.push(opt);
-            } else if (['--hf-token'].includes(long)) {
+            } else if (['--hf-token', '--hf-token-arn', '--ngc-token', '--ngc-token-arn'].includes(long)) {
                 groups.auth.push(opt);
             } else if (['--include-sample', '--include-testing', '--test-types'].includes(long)) {
                 groups.features.push(opt);
@@ -255,6 +269,7 @@ program
     .option('--ci', 'Provision CI integration infrastructure')
     .option('--skip-ci', 'Skip CI infrastructure provisioning')
     .option('--skip-s3', 'Skip S3 bucket creation')
+    .option('--skip-post-setup', 'Skip post-setup chain (mcp init, sync-architectures, sync-schemas)')
     .action(async (action, args, options) => {
         const { default: BootstrapCommandHandler } = await import('../src/lib/bootstrap-command-handler.js');
         const handler = new BootstrapCommandHandler();
@@ -314,12 +329,33 @@ program
     .option('--project', 'Use project-level registry')
     .option('--parameters <json>', 'Parameters JSON string')
     .option('--generator-version <version>', 'Generator version')
+    // Options used by `registry list-architectures`
+    .option('--server <name>', 'Filter by server name (for list-architectures)')
+    .option('--verbose', 'Show full list of supported model types (for list-architectures)')
     .action(async (action, args, options) => {
         const { default: RegistryCommandHandler } = await import('../src/lib/registry-command-handler.js');
         const handler = new RegistryCommandHandler();
         await handler.handle([action, ...args], options);
     });
+program
+    .command('secrets')
+    .description('Manage secrets in AWS Secrets Manager (create, list, describe)')
+    .argument('[action]', 'Secrets action (create, list, describe)')
+    .argument('[args...]', 'Additional arguments')
+    .option('--type <type>', 'Secret type (e.g., hf-token, ngc-token)')
+    .option('--name <label>', 'Secret label (used in naming convention)')
+    .option('--secret-value <value>', 'Secret value (masked in terminal)')
+    .option('--description <text>', 'Secret description')
+    .option('--kms-key-id <key>', 'KMS key for encryption')
+    .option('--json <json-or-path>', 'JSON input (inline or file://path)')
+    .action(async (action, args, options) => {
+        const { default: SecretsCommandHandler } = await import('../src/lib/secrets-command-handler.js');
+        const handler = new SecretsCommandHandler();
+        const allArgs = action ? [action, ...args] : [];
+        await handler.handle(allArgs, options);
+    });
 program
     .command('configure')
     .description('Interactive configuration setup (experimental)')

package/config/bootstrap-stack.json CHANGED Viewed

@@ -67,6 +67,37 @@
                   ],
                   "Resource": "*"
                 },
+                {
+                  "Sid": "SageMakerBenchmarking",
+                  "Effect": "Allow",
+                  "Action": [
+                    "sagemaker:CreateAIBenchmarkJob",
+                    "sagemaker:DescribeAIBenchmarkJob",
+                    "sagemaker:ListAIBenchmarkJobs",
+                    "sagemaker:StopAIBenchmarkJob",
+                    "sagemaker:DeleteAIBenchmarkJob",
+                    "sagemaker:CreateAIWorkloadConfig",
+                    "sagemaker:DescribeAIWorkloadConfig",
+                    "sagemaker:ListAIWorkloadConfigs",
+                    "sagemaker:DeleteAIWorkloadConfig",
+                    "sagemaker:CreateTrainingJob",
+                    "sagemaker:DescribeTrainingJob",
+                    "sagemaker:StopTrainingJob",
+                    "sagemaker:AddTags"
+                  ],
+                  "Resource": "*"
+                },
+                {
+                  "Sid": "PassRoleToSageMaker",
+                  "Effect": "Allow",
+                  "Action": "iam:PassRole",
+                  "Resource": { "Fn::Sub": "arn:aws:iam::${AWS::AccountId}:role/mlcc-sagemaker-execution-role" },
+                  "Condition": {
+                    "StringEquals": {
+                      "iam:PassedToService": "sagemaker.amazonaws.com"
+                    }
+                  }
+                },
                 {
                   "Sid": "ECRPull",
                   "Effect": "Allow",
@@ -76,7 +107,7 @@
                     "ecr:GetDownloadUrlForLayer",
                     "ecr:BatchGetImage"
                   ],
-                  "Resource": { "Fn::Sub": "arn:aws:ecr:*:${AWS::AccountId}:repository/ml-container-creator" }
+                  "Resource": "*"
                 },
                 {
                   "Sid": "ECRAuth",
@@ -99,12 +130,51 @@
                   "Effect": "Allow",
                   "Action": [
                     "s3:GetObject",
+                    "s3:PutObject",
                     "s3:ListBucket"
                   ],
                   "Resource": [
                     "arn:aws:s3:::ml-container-creator-*",
                     "arn:aws:s3:::ml-container-creator-*/*"
                   ]
+                },
+                {
+                  "Sid": "SecretsManagerRead",
+                  "Effect": "Allow",
+                  "Action": [
+                    "secretsmanager:GetSecretValue",
+                    "secretsmanager:DescribeSecret"
+                  ],
+                  "Resource": [
+                    "arn:aws:secretsmanager:*:*:secret:mlcc/*",
+                    "arn:aws:secretsmanager:*:*:secret:ml-container-creator/*"
+                  ]
+                },
+                {
+                  "Sid": "SecretsManagerWrite",
+                  "Effect": "Allow",
+                  "Action": [
+                    "secretsmanager:CreateSecret",
+                    "secretsmanager:PutSecretValue",
+                    "secretsmanager:TagResource"
+                  ],
+                  "Resource": [
+                    "arn:aws:secretsmanager:*:*:secret:mlcc/*",
+                    "arn:aws:secretsmanager:*:*:secret:ml-container-creator/*"
+                  ]
+                },
+                {
+                  "Sid": "QuotaAndAvailability",
+                  "Effect": "Allow",
+                  "Action": [
+                    "service-quotas:GetServiceQuota",
+                    "service-quotas:ListServiceQuotas",
+                    "ec2:DescribeCapacityReservations",
+                    "sagemaker:ListTrainingPlans",
+                    "sagemaker:DescribeTrainingPlan",
+                    "sagemaker:ListEndpoints"
+                  ],
+                  "Resource": "*"
                 }
               ]
             }
@@ -171,6 +241,25 @@
           { "Key": "mlcc:created-by", "Value": "bootstrap" }
         ]
       }
+    },
+    "BenchmarkS3Bucket": {
+      "Type": "AWS::S3::Bucket",
+      "DeletionPolicy": "Retain",
+      "UpdateReplacePolicy": "Retain",
+      "Properties": {
+        "BucketName": { "Fn::Sub": "ml-container-creator-benchmark-${AWS::Region}-${AWS::AccountId}" },
+        "VersioningConfiguration": { "Status": "Enabled" },
+        "BucketEncryption": {
+          "ServerSideEncryptionConfiguration": [
+            { "ServerSideEncryptionByDefault": { "SSEAlgorithm": "AES256" } }
+          ]
+        },
+        "Tags": [
+          { "Key": "mlcc:managed-by", "Value": "ml-container-creator" },
+          { "Key": "mlcc:created-by", "Value": "bootstrap" }
+        ]
+      }
     }
   },
@@ -203,6 +292,10 @@
       "Description": "S3 bucket for batch transform I/O",
       "Value": { "Ref": "BatchS3Bucket" }
     },
+    "BenchmarkS3BucketName": {
+      "Description": "S3 bucket for benchmark results output",
+      "Value": { "Ref": "BenchmarkS3Bucket" }
+    },
     "StackVersion": {
       "Description": "Bootstrap stack template version for forward compatibility tracking",
       "Value": "2026-05-04"

package/config/defaults.json CHANGED Viewed

@@ -12,7 +12,7 @@
     "awsRegion": "us-east-1",
     "includeTesting": true,
     "testTypes": ["local-model-cli", "local-model-server", "hosted-model-endpoint"],
-    "includeSampleModel": false,
+    "includeSampleModel": true,
     "skipPrompts": false
   },
   "validation": {

package/infra/ci-harness/package-lock.json CHANGED Viewed

@@ -48,7 +48,6 @@
         "semver"
       ],
       "license": "Apache-2.0",
-      "peer": true,
       "dependencies": {
         "jsonschema": "~1.4.1",
         "semver": "^7.7.4"
@@ -2151,7 +2150,6 @@
       "integrity": "sha512-wGdMcf+vPYM6jikpS/qhg6WiqSV/OhG+jeeHT/KlVqxYfD40iYJf9/AE1uQxVWFvU7MipKRkRv8NSHiCGgPr8Q==",
       "dev": true,
       "license": "MIT",
-      "peer": true,
       "dependencies": {
         "undici-types": "~6.21.0"
       }
@@ -2791,8 +2789,7 @@
       "version": "10.6.0",
       "resolved": "https://registry.npmjs.org/constructs/-/constructs-10.6.0.tgz",
       "integrity": "sha512-TxHOnBO5zMo/G76ykzGF/wMpEHu257TbWiIxP9K0Yv/+t70UzgBQiTqjkAsWOPC6jW91DzJI0+ehQV6xDRNBuQ==",
-      "license": "Apache-2.0",
-      "peer": true
+      "license": "Apache-2.0"
     },
     "node_modules/create-require": {
       "version": "1.1.1",
@@ -2937,9 +2934,9 @@
       }
     },
     "node_modules/fast-xml-builder": {
-      "version": "1.1.5",
-      "resolved": "https://registry.npmjs.org/fast-xml-builder/-/fast-xml-builder-1.1.5.tgz",
-      "integrity": "sha512-4TJn/8FKLeslLAH3dnohXqE3QSoxkhvaMzepOIZytwJXZO69Bfz0HBdDHzOTOon6G59Zrk6VQ2bEiv1t61rfkA==",
+      "version": "1.2.0",
+      "resolved": "https://registry.npmjs.org/fast-xml-builder/-/fast-xml-builder-1.2.0.tgz",
+      "integrity": "sha512-00aAWieqff+ZJhsXA4g1g7M8k+7AYoMUUHF+/zFb5U6Uv/P0Vl4QZo84/IcufzYalLuEj9928bXN9PbbFzMF0Q==",
       "dev": true,
       "funding": [
         {
@@ -2949,7 +2946,8 @@
       ],
       "license": "MIT",
       "dependencies": {
-        "path-expression-matcher": "^1.1.3"
+        "path-expression-matcher": "^1.5.0",
+        "xml-naming": "^0.1.0"
       }
     },
     "node_modules/fast-xml-parser": {
@@ -3696,7 +3694,6 @@
       "integrity": "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==",
       "dev": true,
       "license": "Apache-2.0",
-      "peer": true,
       "bin": {
         "tsc": "bin/tsc",
         "tsserver": "bin/tsserver"
@@ -3837,6 +3834,22 @@
         "url": "https://github.com/chalk/ansi-styles?sponsor=1"
       }
     },
+    "node_modules/xml-naming": {
+      "version": "0.1.0",
+      "resolved": "https://registry.npmjs.org/xml-naming/-/xml-naming-0.1.0.tgz",
+      "integrity": "sha512-k8KO9hrMyNk6tUWqUfkTEZbezRRpONVOzUTnc97VnCvyj6Tf9lyUR9EDAIeiVLv56jsMcoXEwjW8Kv5yPY52lw==",
+      "dev": true,
+      "funding": [
+        {
+          "type": "github",
+          "url": "https://github.com/sponsors/NaturalIntelligence"
+        }
+      ],
+      "license": "MIT",
+      "engines": {
+        "node": ">=16.0.0"
+      }
+    },
     "node_modules/y18n": {
       "version": "5.0.8",
       "resolved": "https://registry.npmjs.org/y18n/-/y18n-5.0.8.tgz",

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@aws/ml-container-creator",
-  "version": "0.2.6",
+  "version": "0.4.0",
   "description": "Generator for SageMaker AI BYOC paradigm for predictive inference use-cases.",
   "type": "module",
   "main": "src/app.js",
@@ -111,6 +111,8 @@
     "tinyglobby": "^0.2.16"
   },
   "devDependencies": {
+    "@aws-sdk/client-sagemaker": "^3.700.0",
+    "@aws-sdk/client-service-quotas": "^3.700.0",
     "@microsoft/eslint-formatter-sarif": "^3.1.0",
     "eslint": "^8.57.0",
     "fast-check": "^4.5.2",

package/servers/instance-sizer/index.js CHANGED Viewed

@@ -26,7 +26,8 @@ import { fileURLToPath } from 'node:url'
 import { resolve, dirname } from 'node:path'
 import { resolveModelMetadata } from './lib/model-resolver.js'
 import { estimateVram } from './lib/vram-estimator.js'
-import { filterAndRankInstances } from './lib/instance-ranker.js'
+import { filterAndRankInstances, applyAvailabilityRanking } from './lib/instance-ranker.js'
+import { QuotaResolver } from './lib/quota-resolver.js'
 import { queryBedrock } from '../lib/bedrock-client.js'
 // ── Path setup ───────────────────────────────────────────────────────────────
@@ -114,11 +115,11 @@ function log(message) {
  * @param {string} search - Search query string
  * @param {object} instanceCatalog - Instance catalog object
  * @param {object} [options={}]
- * @param {number} [options.limit=8] - Max results
+ * @param {number} [options.limit=10] - Max results
  * @returns {string[]} Matching instance type names, sorted by relevance
  */
 function searchInstancesByTag(search, instanceCatalog, options = {}) {
-    const { limit = 8 } = options
+    const { limit = 10 } = options
     const candidates = Object.entries(instanceCatalog)
     // Tokenize search into lowercase keywords
@@ -236,7 +237,7 @@ async function handleGetInstanceRecommendation(params) {
         maxSequenceLength,
         batchSize,
         cudaVersion,
-        limit = 8,
+        limit = 10,
         context
     } = params
@@ -361,11 +362,14 @@ async function handleGetInstanceRecommendation(params) {
     }
     // Step 2: Estimate VRAM
+    // Use model's max_position_embeddings as the sequence length when no explicit value is provided.
+    // This ensures KV cache is sized for the model's actual context window, not the 4096 default.
+    const resolvedMaxSeqLen = effectiveMaxSeqLen || modelMetadata.maxPositionEmbeddings || undefined
     const vramEstimate = estimateVram({
         parameterCount: modelMetadata.parameterCount,
         dtype: modelMetadata.dtype,
         quantization: quantization || undefined,
-        maxSequenceLength: effectiveMaxSeqLen || undefined,
+        maxSequenceLength: resolvedMaxSeqLen,
         batchSize: effectiveBatchSize || undefined
     })
@@ -376,6 +380,38 @@ async function handleGetInstanceRecommendation(params) {
         { limit }
     )
+    // Step 3a: Quota & availability filtering (discover mode only)
+    let preQuotaFilterCount = 0
+    let allFilteredByQuota = false
+    if (DISCOVER_MODE && recommendations.length > 0) {
+        try {
+            const region = process.env.AWS_REGION || process.env.AWS_DEFAULT_REGION || BEDROCK_REGION
+            const quotaResolver = new QuotaResolver(region)
+            const instanceTypes = recommendations.map(r => r.instanceType)
+            const [quotas, reservations, ftps] = await Promise.allSettled([
+                quotaResolver.getQuotaHeadroom(instanceTypes),
+                quotaResolver.getCapacityReservations(),
+                quotaResolver.getTrainingPlans()
+            ])
+            preQuotaFilterCount = recommendations.length
+            recommendations = applyAvailabilityRanking(
+                recommendations,
+                quotas.status === 'fulfilled' ? quotas.value : null,
+                reservations.status === 'fulfilled' ? reservations.value : null,
+                ftps.status === 'fulfilled' ? ftps.value : null
+            )
+            if (recommendations.length === 0 && preQuotaFilterCount > 0) {
+                allFilteredByQuota = true
+            }
+        } catch (err) {
+            // Graceful degradation: if credentials are missing or any unexpected
+            // error occurs, skip quota filtering and continue with unfiltered results
+            log(`Quota resolution skipped: ${err.message}`)
+        }
+    }
     // Step 3b: If instanceSearch is also provided, further filter by tags
     if (instanceSearch && recommendations.length > 0) {
         const searchMatches = new Set(searchInstancesByTag(instanceSearch, effectiveCatalog, { limit: 100 }))
@@ -477,7 +513,8 @@ async function handleGetInstanceRecommendation(params) {
                     vramBreakdown: vramEstimate.breakdown,
                     recommendations: finalRecommendations,
                     source: modelMetadata.source,
-                    smartModeUsed
+                    smartModeUsed,
+                    allFilteredByQuota
                 }
             })
         }]
@@ -502,7 +539,7 @@ server.tool(
         maxSequenceLength: z.number().optional().describe('Max context/sequence length (affects KV cache estimate)'),
         batchSize: z.number().optional().describe('Expected concurrent batch size'),
         cudaVersion: z.string().optional().describe('Required CUDA version from base image (filters incompatible instances)'),
-        limit: z.number().optional().default(8).describe('Maximum number of instance recommendations to return'),
+        limit: z.number().optional().default(10).describe('Maximum number of instance recommendations to return'),
         context: z.object({
             architecture: z.string().optional(),
             backend: z.string().optional(),
@@ -526,7 +563,7 @@ server.tool(
         maxSequenceLength: z.number().optional().describe('Max context/sequence length (affects KV cache estimate)'),
         batchSize: z.number().optional().describe('Expected concurrent batch size'),
         cudaVersion: z.string().optional().describe('Required CUDA version from base image (filters incompatible instances)'),
-        limit: z.number().optional().default(8).describe('Maximum number of instance recommendations to return'),
+        limit: z.number().optional().default(10).describe('Maximum number of instance recommendations to return'),
         context: z.object({
             architecture: z.string().optional(),
             backend: z.string().optional(),

package/servers/instance-sizer/lib/instance-ranker.js CHANGED Viewed

@@ -31,14 +31,20 @@ const GPU_MEMORY_MAP = {
  */
 const COST_TIER_MAP = {
     'g4dn': 'low',
+    'g4ad': 'low',
     'inf2': 'low',
     'g5': 'medium',
     'g6': 'medium',
+    'g6e': 'medium',
+    'g7e': 'medium',
     'trn1': 'medium',
     'p3': 'high',
     'p4d': 'high',
     'p4de': 'high',
-    'p5': 'high'
+    'p5': 'high',
+    'p5e': 'high',
+    'p5en': 'high',
+    'p6': 'high'
 }
 /**
@@ -51,6 +57,28 @@ const COST_TIER_WEIGHT = {
     'high': 3
 }
+/**
+ * Generation weight by instance family.
+ * Lower is newer (sorted first). Newer generations offer better perf/$.
+ */
+const GENERATION_WEIGHT = {
+    'g7e': 1,
+    'p6': 1,
+    'g6e': 2,
+    'p5e': 2,
+    'p5en': 2,
+    'g6': 3,
+    'p5': 3,
+    'trn1': 3,
+    'inf2': 3,
+    'g5': 4,
+    'p4de': 5,
+    'p4d': 5,
+    'p3': 6,
+    'g4dn': 7,
+    'g4ad': 7
+}
 /**
  * TP overhead penalty: 10% per additional GPU beyond the first.
  * Effective VRAM = totalVram × (1 - 0.10 × (gpuCount - 1))
@@ -144,12 +172,12 @@ const effectiveVram = (totalVramGb, gpuCount) => {
  * @param {number} vramRequired - Required VRAM in GB
  * @param {object} instanceCatalog - Object keyed by instance type, values are metadata
  * @param {object} [options={}]
- * @param {number} [options.limit=8] - Max results to return
+ * @param {number} [options.limit=10] - Max results to return
  * @param {boolean} [options.allowTensorParallelism=true] - Consider multi-GPU splits
  * @returns {object[]} Ranked list of compatible instances
  */
 const filterAndRankInstances = (vramRequired, instanceCatalog, options = {}) => {
-    const { limit = 8, allowTensorParallelism = true } = options
+    const { limit = 10, allowTensorParallelism = true } = options
     if (!vramRequired || vramRequired <= 0) {
         return []
@@ -182,7 +210,8 @@ const filterAndRankInstances = (vramRequired, instanceCatalog, options = {}) =>
                     totalVramGb,
                     utilizationPercent,
                     tensorParallelism: 1,
-                    costTier: getCostTier(meta)
+                    costTier: getCostTier(meta),
+                    family: meta.family || ''
                 })
             }
         } else if (allowTensorParallelism) {
@@ -196,7 +225,8 @@ const filterAndRankInstances = (vramRequired, instanceCatalog, options = {}) =>
                     totalVramGb,
                     utilizationPercent,
                     tensorParallelism: gpuCount,
-                    costTier: getCostTier(meta)
+                    costTier: getCostTier(meta),
+                    family: meta.family || ''
                 })
             }
         }
@@ -204,24 +234,30 @@ const filterAndRankInstances = (vramRequired, instanceCatalog, options = {}) =>
     // Sort candidates by ranking criteria:
     // 1. Single-GPU first (TP=1), then multi-GPU by lowest TP degree
-    // 2. Within each TP tier, sort by cost-efficiency (lowest cost tier first,
-    //    then by lowest utilization — more headroom is better for the same cost)
+    // 2. Within each TP tier, newest generation first (g6 > g5 > g4dn)
+    // 3. Within same generation, sort by cost tier (lower is better)
+    // 4. Within same cost tier, prefer lower total VRAM (right-sized)
     candidates.sort((a, b) => {
         // Primary: TP degree (lower is better)
         if (a.tensorParallelism !== b.tensorParallelism) {
             return a.tensorParallelism - b.tensorParallelism
         }
-        // Secondary: cost tier (lower is better)
+        // Secondary: generation (newer is better — lower weight)
+        const genA = GENERATION_WEIGHT[a.family] || 4
+        const genB = GENERATION_WEIGHT[b.family] || 4
+        if (genA !== genB) {
+            return genA - genB
+        }
+        // Tertiary: cost tier (lower is better)
         const costA = COST_TIER_WEIGHT[a.costTier] || 2
         const costB = COST_TIER_WEIGHT[b.costTier] || 2
         if (costA !== costB) {
             return costA - costB
         }
-        // Tertiary: cost-efficiency — lower $/GB approximated by
-        // lower cost tier with higher total VRAM (more GB per dollar)
-        // Since cost tier is equal here, prefer higher total VRAM (better value)
+        // Quaternary: prefer lower total VRAM (right-sized, less waste)
         if (a.totalVramGb !== b.totalVramGb) {
             return a.totalVramGb - b.totalVramGb
         }
@@ -233,13 +269,106 @@ const filterAndRankInstances = (vramRequired, instanceCatalog, options = {}) =>
     return candidates.slice(0, limit)
 }
+// ── Availability Ranking ─────────────────────────────────────────────────────
+/**
+ * Priority weights for capacity types used in availability ranking.
+ * Lower value = higher priority (sorted first).
+ */
+const CAPACITY_TYPE_PRIORITY = {
+    reserved: 0,
+    ftp: 1,
+    'on-demand': 2
+}
+/**
+ * Annotate, filter, and re-rank instance recommendations based on
+ * quota headroom, capacity reservations, and Flexible Training Plans.
+ *
+ * Each recommendation is annotated with:
+ * - capacityType: 'reserved' | 'ftp' | 'on-demand'
+ * - quotaStatus: 'available' | 'limited' | 'zero-quota'
+ * - reservationInfo: object (when capacityType is 'reserved')
+ * - ftpInfo: object (when capacityType is 'ftp')
+ *
+ * Instances with quotaStatus === 'zero-quota' are filtered out.
+ * Sort order: reserved → FTP → on-demand, preserving existing order within tiers.
+ *
+ * When any input signal is null (API failure), that signal is skipped
+ * and the function degrades gracefully.
+ *
+ * @param {object[]} recommendations - Ranked instance recommendations from filterAndRankInstances
+ * @param {Map|null} quotas - Map: instanceType → { quota, deployed, headroom }, or null
+ * @param {Map|null} reservations - Map: instanceType → { reservationId, count, expiresAt }, or null
+ * @param {Map|null} ftps - Map: instanceType → { planName, remainingCapacity, expiresAt }, or null
+ * @returns {object[]} Filtered and re-ranked recommendations
+ */
+const applyAvailabilityRanking = (recommendations, quotas, reservations, ftps) => {
+    if (!recommendations || recommendations.length === 0) {
+        return []
+    }
+    // If all signals are null (all API calls failed), return unmodified
+    if (!quotas && !reservations && !ftps) {
+        return recommendations
+    }
+    // Annotate each recommendation with capacityType and quotaStatus
+    for (const rec of recommendations) {
+        rec.capacityType = 'on-demand'
+        rec.quotaStatus = 'available'
+        if (reservations?.has(rec.instanceType)) {
+            rec.capacityType = 'reserved'
+            rec.reservationInfo = reservations.get(rec.instanceType)
+            rec.reservationType = 'training-plan'
+        } else if (ftps?.has(rec.instanceType)) {
+            rec.capacityType = 'ftp'
+            rec.ftpInfo = ftps.get(rec.instanceType)
+        }
+        // quotaStatus applies to all instances regardless of capacityType
+        if (quotas) {
+            const q = quotas.get(rec.instanceType)
+            if (q && q.headroom === 0) {
+                rec.quotaStatus = 'zero-quota'
+            } else if (q && q.headroom < 2) {
+                rec.quotaStatus = 'limited'
+            }
+            if (q) {
+                rec.quotaHeadroom = q.headroom
+                rec.quotaDeployed = q.deployed
+                rec.quotaLimit = q.quota
+            }
+        }
+    }
+    // Filter out zero-quota instances (but never filter reserved/FTP — you have the capacity)
+    const filtered = recommendations.filter(r =>
+        r.quotaStatus !== 'zero-quota' || r.capacityType === 'reserved' || r.capacityType === 'ftp'
+    )
+    // Sort: reserved first, then FTP, then on-demand (preserve existing order within tier)
+    filtered.sort((a, b) => {
+        const pa = CAPACITY_TYPE_PRIORITY[a.capacityType] ?? 2
+        const pb = CAPACITY_TYPE_PRIORITY[b.capacityType] ?? 2
+        if (pa !== pb) return pa - pb
+        return 0
+    })
+    return filtered
+}
 export {
     filterAndRankInstances,
+    applyAvailabilityRanking,
     getPerGpuMemoryGb,
     getCostTier,
     effectiveVram,
     GPU_MEMORY_MAP,
     COST_TIER_MAP,
     COST_TIER_WEIGHT,
+    GENERATION_WEIGHT,
+    CAPACITY_TYPE_PRIORITY,
     TP_OVERHEAD_PER_GPU
 }