@aws/ml-container-creator 0.3.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. package/bin/cli.js +5 -2
  2. package/config/bootstrap-stack.json +86 -7
  3. package/config/defaults.json +1 -1
  4. package/infra/ci-harness/buildspec.yml +60 -0
  5. package/package.json +3 -1
  6. package/servers/README.md +41 -1
  7. package/servers/instance-sizer/index.js +42 -2
  8. package/servers/instance-sizer/lib/instance-ranker.js +114 -10
  9. package/servers/instance-sizer/lib/quota-resolver.js +368 -0
  10. package/servers/instance-sizer/package.json +2 -0
  11. package/servers/lib/catalogs/instances.json +527 -12
  12. package/servers/lib/catalogs/model-servers.json +15 -15
  13. package/servers/lib/catalogs/model-sizes.json +27 -0
  14. package/servers/lib/catalogs/models.json +71 -0
  15. package/servers/lib/schemas/image-catalog.schema.json +9 -1
  16. package/src/app.js +109 -3
  17. package/src/lib/bootstrap-command-handler.js +96 -3
  18. package/src/lib/cli-handler.js +2 -2
  19. package/src/lib/config-manager.js +117 -1
  20. package/src/lib/deployment-entry-schema.js +16 -0
  21. package/src/lib/prompt-runner.js +270 -12
  22. package/src/lib/prompts.js +288 -6
  23. package/src/lib/registry-command-handler.js +12 -0
  24. package/src/lib/schema-sync.js +31 -0
  25. package/src/lib/template-manager.js +49 -1
  26. package/src/lib/validate-runner.js +125 -2
  27. package/templates/Dockerfile +22 -2
  28. package/templates/code/cuda_compat.sh +22 -0
  29. package/templates/code/serve +3 -0
  30. package/templates/code/serving.properties +14 -0
  31. package/templates/code/start_server.sh +3 -0
  32. package/templates/diffusors/Dockerfile +2 -1
  33. package/templates/diffusors/serve +3 -0
  34. package/templates/do/README.md +33 -0
  35. package/templates/do/adapter +1214 -0
  36. package/templates/do/adapters/.gitkeep +2 -0
  37. package/templates/do/add-ic +130 -0
  38. package/templates/do/benchmark +718 -0
  39. package/templates/do/clean +593 -17
  40. package/templates/do/config +49 -4
  41. package/templates/do/deploy +513 -362
  42. package/templates/do/ic/default.conf +32 -0
  43. package/templates/do/lib/endpoint-config.sh +216 -0
  44. package/templates/do/lib/inference-component.sh +167 -0
  45. package/templates/do/lib/secrets.sh +44 -0
  46. package/templates/do/lib/wait.sh +131 -0
  47. package/templates/do/logs +107 -27
  48. package/templates/do/optimize +528 -0
  49. package/templates/do/register +119 -2
  50. package/templates/do/status +337 -0
  51. package/templates/do/test +80 -28
  52. package/templates/triton/Dockerfile +5 -0
package/bin/cli.js CHANGED
@@ -98,6 +98,9 @@ program
98
98
  .addOption(new Option('--include-sample', 'Include sample model code'))
99
99
  .addOption(new Option('--include-testing', 'Include test suite'))
100
100
  .addOption(new Option('--test-types <types>', 'Comma-separated test types'))
101
+ .addOption(new Option('--enable-lora', 'Enable LoRA adapter serving (transformers with vllm/sglang/djl-lmi only)'))
102
+ .addOption(new Option('--max-loras <n>', 'Maximum concurrent LoRA adapters in GPU memory (default: 30)'))
103
+ .addOption(new Option('--max-lora-rank <n>', 'Maximum LoRA rank (default: 64)'))
101
104
 
102
105
  // --- MCP & Discovery ---
103
106
  .addOption(new Option('--smart', 'Enable Bedrock-powered smart mode on MCP servers'))
@@ -190,7 +193,7 @@ program.configureHelp({
190
193
  groups.env.push(opt);
191
194
  } else if (['--hf-token', '--hf-token-arn', '--ngc-token', '--ngc-token-arn'].includes(long)) {
192
195
  groups.auth.push(opt);
193
- } else if (['--include-sample', '--include-testing', '--test-types'].includes(long)) {
196
+ } else if (['--include-sample', '--include-testing', '--test-types', '--enable-lora', '--max-loras', '--max-lora-rank'].includes(long)) {
194
197
  groups.features.push(opt);
195
198
  } else if (['--smart', '--discover'].includes(long)) {
196
199
  groups.mcp.push(opt);
@@ -307,7 +310,6 @@ program
307
310
  program
308
311
  .command('registry')
309
312
  .description('Registry operations (list, get, remove, replay, export, import, search) — experimental, may be reconciled with do/register')
310
- .passThroughOptions()
311
313
  .argument('<action>', 'Registry action (log, list, get, remove, replay, export, import, search)')
312
314
  .argument('[args...]', 'Additional arguments')
313
315
  .option('--backend <backend>', 'Filter by backend')
@@ -328,6 +330,7 @@ program
328
330
  .option('--notes <text>', 'Deployment notes')
329
331
  .option('--project', 'Use project-level registry')
330
332
  .option('--parameters <json>', 'Parameters JSON string')
333
+ .option('--ic-list <json>', 'IC list JSON string')
331
334
  .option('--generator-version <version>', 'Generator version')
332
335
  // Options used by `registry list-architectures`
333
336
  .option('--server <name>', 'Filter by server name (for list-architectures)')
@@ -67,6 +67,37 @@
67
67
  ],
68
68
  "Resource": "*"
69
69
  },
70
+ {
71
+ "Sid": "SageMakerBenchmarking",
72
+ "Effect": "Allow",
73
+ "Action": [
74
+ "sagemaker:CreateAIBenchmarkJob",
75
+ "sagemaker:DescribeAIBenchmarkJob",
76
+ "sagemaker:ListAIBenchmarkJobs",
77
+ "sagemaker:StopAIBenchmarkJob",
78
+ "sagemaker:DeleteAIBenchmarkJob",
79
+ "sagemaker:CreateAIWorkloadConfig",
80
+ "sagemaker:DescribeAIWorkloadConfig",
81
+ "sagemaker:ListAIWorkloadConfigs",
82
+ "sagemaker:DeleteAIWorkloadConfig",
83
+ "sagemaker:CreateTrainingJob",
84
+ "sagemaker:DescribeTrainingJob",
85
+ "sagemaker:StopTrainingJob",
86
+ "sagemaker:AddTags"
87
+ ],
88
+ "Resource": "*"
89
+ },
90
+ {
91
+ "Sid": "PassRoleToSageMaker",
92
+ "Effect": "Allow",
93
+ "Action": "iam:PassRole",
94
+ "Resource": { "Fn::Sub": "arn:aws:iam::${AWS::AccountId}:role/mlcc-sagemaker-execution-role" },
95
+ "Condition": {
96
+ "StringEquals": {
97
+ "iam:PassedToService": "sagemaker.amazonaws.com"
98
+ }
99
+ }
100
+ },
70
101
  {
71
102
  "Sid": "ECRPull",
72
103
  "Effect": "Allow",
@@ -76,7 +107,7 @@
76
107
  "ecr:GetDownloadUrlForLayer",
77
108
  "ecr:BatchGetImage"
78
109
  ],
79
- "Resource": { "Fn::Sub": "arn:aws:ecr:*:${AWS::AccountId}:repository/ml-container-creator" }
110
+ "Resource": "*"
80
111
  },
81
112
  {
82
113
  "Sid": "ECRAuth",
@@ -99,6 +130,7 @@
99
130
  "Effect": "Allow",
100
131
  "Action": [
101
132
  "s3:GetObject",
133
+ "s3:PutObject",
102
134
  "s3:ListBucket"
103
135
  ],
104
136
  "Resource": [
@@ -113,12 +145,36 @@
113
145
  "secretsmanager:GetSecretValue",
114
146
  "secretsmanager:DescribeSecret"
115
147
  ],
116
- "Resource": "arn:aws:secretsmanager:*:*:secret:mlcc/*",
117
- "Condition": {
118
- "StringEquals": {
119
- "aws:ResourceTag/mlcc:managed-by": "ml-container-creator"
120
- }
121
- }
148
+ "Resource": [
149
+ "arn:aws:secretsmanager:*:*:secret:mlcc/*",
150
+ "arn:aws:secretsmanager:*:*:secret:ml-container-creator/*"
151
+ ]
152
+ },
153
+ {
154
+ "Sid": "SecretsManagerWrite",
155
+ "Effect": "Allow",
156
+ "Action": [
157
+ "secretsmanager:CreateSecret",
158
+ "secretsmanager:PutSecretValue",
159
+ "secretsmanager:TagResource"
160
+ ],
161
+ "Resource": [
162
+ "arn:aws:secretsmanager:*:*:secret:mlcc/*",
163
+ "arn:aws:secretsmanager:*:*:secret:ml-container-creator/*"
164
+ ]
165
+ },
166
+ {
167
+ "Sid": "QuotaAndAvailability",
168
+ "Effect": "Allow",
169
+ "Action": [
170
+ "service-quotas:GetServiceQuota",
171
+ "service-quotas:ListServiceQuotas",
172
+ "ec2:DescribeCapacityReservations",
173
+ "sagemaker:ListTrainingPlans",
174
+ "sagemaker:DescribeTrainingPlan",
175
+ "sagemaker:ListEndpoints"
176
+ ],
177
+ "Resource": "*"
122
178
  }
123
179
  ]
124
180
  }
@@ -185,6 +241,25 @@
185
241
  { "Key": "mlcc:created-by", "Value": "bootstrap" }
186
242
  ]
187
243
  }
244
+ },
245
+
246
+ "BenchmarkS3Bucket": {
247
+ "Type": "AWS::S3::Bucket",
248
+ "DeletionPolicy": "Retain",
249
+ "UpdateReplacePolicy": "Retain",
250
+ "Properties": {
251
+ "BucketName": { "Fn::Sub": "ml-container-creator-benchmark-${AWS::Region}-${AWS::AccountId}" },
252
+ "VersioningConfiguration": { "Status": "Enabled" },
253
+ "BucketEncryption": {
254
+ "ServerSideEncryptionConfiguration": [
255
+ { "ServerSideEncryptionByDefault": { "SSEAlgorithm": "AES256" } }
256
+ ]
257
+ },
258
+ "Tags": [
259
+ { "Key": "mlcc:managed-by", "Value": "ml-container-creator" },
260
+ { "Key": "mlcc:created-by", "Value": "bootstrap" }
261
+ ]
262
+ }
188
263
  }
189
264
  },
190
265
 
@@ -217,6 +292,10 @@
217
292
  "Description": "S3 bucket for batch transform I/O",
218
293
  "Value": { "Ref": "BatchS3Bucket" }
219
294
  },
295
+ "BenchmarkS3BucketName": {
296
+ "Description": "S3 bucket for benchmark results output",
297
+ "Value": { "Ref": "BenchmarkS3Bucket" }
298
+ },
220
299
  "StackVersion": {
221
300
  "Description": "Bootstrap stack template version for forward compatibility tracking",
222
301
  "Value": "2026-05-04"
@@ -12,7 +12,7 @@
12
12
  "awsRegion": "us-east-1",
13
13
  "includeTesting": true,
14
14
  "testTypes": ["local-model-cli", "local-model-server", "hosted-model-endpoint"],
15
- "includeSampleModel": false,
15
+ "includeSampleModel": true,
16
16
  "skipPrompts": false
17
17
  },
18
18
  "validation": {
@@ -40,6 +40,10 @@ phases:
40
40
  - REGISTER_DURATION=0
41
41
  - REGISTER_LOG_POINTER=""
42
42
  - REGISTER_ERROR_SUMMARY=""
43
+ - ADAPTER_TEST_STATUS="skip"
44
+ - ADAPTER_TEST_DURATION=0
45
+ - ADAPTER_TEST_LOG_POINTER=""
46
+ - ADAPTER_TEST_ERROR_SUMMARY=""
43
47
  - TEARDOWN_STATUS="skip"
44
48
  - TEARDOWN_DURATION=0
45
49
  - TEARDOWN_LOG_POINTER=""
@@ -182,6 +186,54 @@ phases:
182
186
  fi
183
187
  - rm -f "$STAGE_STDERR_FILE"
184
188
 
189
+ # --- Stage: Adapter_Test (only if do/adapters/ has .conf files) ---
190
+ - echo "=== Stage: Adapter_Test ==="
191
+ - STAGE_START=$(date +%s)
192
+ - ADAPTER_TEST_LOG_POINTER="$LOG_POINTER_PREFIX"
193
+ - STAGE_STDERR_FILE=$(mktemp)
194
+ - |
195
+ if [ -n "$FIRST_FAILURE" ]; then
196
+ echo "Skipping Adapter_Test stage due to prior failure in $FIRST_FAILURE"
197
+ ADAPTER_TEST_STATUS="skip"
198
+ ADAPTER_TEST_DURATION=0
199
+ else
200
+ cd /tmp/ci-project
201
+ ADAPTER_CONFS=$(find do/adapters -name '*.conf' 2>/dev/null | grep -v '.gitkeep' || true)
202
+ if [ -z "$ADAPTER_CONFS" ]; then
203
+ echo "No adapter configs found in do/adapters/ — skipping"
204
+ ADAPTER_TEST_STATUS="skip"
205
+ ADAPTER_TEST_DURATION=0
206
+ else
207
+ (
208
+ set -e
209
+ cd /tmp/ci-project
210
+ for conf in do/adapters/*.conf; do
211
+ [ -f "$conf" ] || continue
212
+ [[ "$(basename "$conf")" == ".gitkeep" ]] && continue
213
+ ADAPTER_NAME=$(basename "$conf" .conf)
214
+ echo "Testing adapter: ${ADAPTER_NAME}"
215
+ # Source to get weights URI
216
+ source "$conf"
217
+ ./do/adapter add "${ADAPTER_NAME}" --weights "${ADAPTER_WEIGHTS_URI}"
218
+ ./do/test --ic "${ADAPTER_NAME}"
219
+ ./do/adapter remove "${ADAPTER_NAME}"
220
+ done
221
+ ) 2>"$STAGE_STDERR_FILE"; STAGE_EXIT=$?
222
+ STAGE_END=$(date +%s)
223
+ ADAPTER_TEST_DURATION=$((STAGE_END - STAGE_START))
224
+ if [ "$STAGE_EXIT" -eq 0 ]; then
225
+ ADAPTER_TEST_STATUS="pass"
226
+ echo "Adapter_Test stage passed in ${ADAPTER_TEST_DURATION}s"
227
+ else
228
+ ADAPTER_TEST_STATUS="fail"
229
+ ADAPTER_TEST_ERROR_SUMMARY=$(tail -c 500 "$STAGE_STDERR_FILE" | tr -d '\000' | tr '"' "'" | tr '\n' ' ')
230
+ FIRST_FAILURE="adapter_test"
231
+ echo "Adapter_Test stage FAILED (exit code $STAGE_EXIT) in ${ADAPTER_TEST_DURATION}s"
232
+ fi
233
+ fi
234
+ fi
235
+ - rm -f "$STAGE_STDERR_FILE"
236
+
185
237
  # --- Stage: Register (placeholder) ---
186
238
  - echo "=== Stage: Register ==="
187
239
  - STAGE_START=$(date +%s)
@@ -260,6 +312,7 @@ phases:
260
312
  validate) FINAL_ERROR_MESSAGE="$VALIDATE_ERROR_SUMMARY" ;;
261
313
  build) FINAL_ERROR_MESSAGE="$BUILD_ERROR_SUMMARY" ;;
262
314
  deploy_test) FINAL_ERROR_MESSAGE="$DEPLOY_TEST_ERROR_SUMMARY" ;;
315
+ adapter_test) FINAL_ERROR_MESSAGE="$ADAPTER_TEST_ERROR_SUMMARY" ;;
263
316
  register) FINAL_ERROR_MESSAGE="$REGISTER_ERROR_SUMMARY" ;;
264
317
  *) FINAL_ERROR_MESSAGE="Unknown failure stage" ;;
265
318
  esac
@@ -272,6 +325,7 @@ phases:
272
325
  ESCAPED_VALIDATE_ERROR=$(printf '%s' "$VALIDATE_ERROR_SUMMARY" | sed 's/\\/\\\\/g; s/"/\\"/g')
273
326
  ESCAPED_BUILD_ERROR=$(printf '%s' "$BUILD_ERROR_SUMMARY" | sed 's/\\/\\\\/g; s/"/\\"/g')
274
327
  ESCAPED_DEPLOY_TEST_ERROR=$(printf '%s' "$DEPLOY_TEST_ERROR_SUMMARY" | sed 's/\\/\\\\/g; s/"/\\"/g')
328
+ ESCAPED_ADAPTER_TEST_ERROR=$(printf '%s' "$ADAPTER_TEST_ERROR_SUMMARY" | sed 's/\\/\\\\/g; s/"/\\"/g')
275
329
  ESCAPED_REGISTER_ERROR=$(printf '%s' "$REGISTER_ERROR_SUMMARY" | sed 's/\\/\\\\/g; s/"/\\"/g')
276
330
  ESCAPED_TEARDOWN_ERROR=$(printf '%s' "$TEARDOWN_ERROR_SUMMARY" | sed 's/\\/\\\\/g; s/"/\\"/g')
277
331
  ESCAPED_FINAL_ERROR=$(printf '%s' "$FINAL_ERROR_MESSAGE" | sed 's/\\/\\\\/g; s/"/\\"/g')
@@ -314,6 +368,12 @@ phases:
314
368
  \"logPointer\": {\"S\": \"$DEPLOY_TEST_LOG_POINTER\"},
315
369
  \"errorSummary\": {\"S\": \"$ESCAPED_DEPLOY_TEST_ERROR\"}
316
370
  }},
371
+ \"adapter_test\": {\"M\": {
372
+ \"status\": {\"S\": \"$ADAPTER_TEST_STATUS\"},
373
+ \"durationSeconds\": {\"N\": \"$ADAPTER_TEST_DURATION\"},
374
+ \"logPointer\": {\"S\": \"$ADAPTER_TEST_LOG_POINTER\"},
375
+ \"errorSummary\": {\"S\": \"$ESCAPED_ADAPTER_TEST_ERROR\"}
376
+ }},
317
377
  \"register\": {\"M\": {
318
378
  \"status\": {\"S\": \"$REGISTER_STATUS\"},
319
379
  \"durationSeconds\": {\"N\": \"$REGISTER_DURATION\"},
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@aws/ml-container-creator",
3
- "version": "0.3.0",
3
+ "version": "0.5.0",
4
4
  "description": "Generator for SageMaker AI BYOC paradigm for predictive inference use-cases.",
5
5
  "type": "module",
6
6
  "main": "src/app.js",
@@ -111,6 +111,8 @@
111
111
  "tinyglobby": "^0.2.16"
112
112
  },
113
113
  "devDependencies": {
114
+ "@aws-sdk/client-sagemaker": "^3.700.0",
115
+ "@aws-sdk/client-service-quotas": "^3.700.0",
114
116
  "@microsoft/eslint-formatter-sarif": "^3.1.0",
115
117
  "eslint": "^8.57.0",
116
118
  "fast-check": "^4.5.2",
package/servers/README.md CHANGED
@@ -15,7 +15,12 @@ servers/
15
15
  │ ├── test.js # Standalone tests (node test.js)
16
16
  │ ├── package.json
17
17
  │ └── LICENSE
18
- └── region-picker/ # AWS region suggestion server
18
+ ├── region-picker/ # AWS region suggestion server
19
+ │ ├── index.js # MCP server entry point
20
+ │ ├── test.js # Standalone tests (node test.js)
21
+ │ ├── package.json
22
+ │ └── LICENSE
23
+ └── endpoint-picker/ # SageMaker endpoint discovery server
19
24
  ├── index.js # MCP server entry point
20
25
  ├── test.js # Standalone tests (node test.js)
21
26
  ├── package.json
@@ -74,6 +79,39 @@ Suggests AWS regions for SageMaker deployments based on a search term. Filters t
74
79
  }
75
80
  ```
76
81
 
82
+ ### endpoint-picker
83
+
84
+ Discovers InService SageMaker real-time endpoints with available GPU capacity for attaching new inference components. Uses `ListEndpoints`, `DescribeEndpoint`, and `ListInferenceComponents` to calculate available capacity.
85
+
86
+ **Discover mode:** Queries the SageMaker API using a 3-strategy credential fallback (explicit profile → default chain → detect profiles). No static mode — always requires AWS credentials.
87
+
88
+ **Tool:** `get_inference_endpoints`
89
+
90
+ | Input Field | Type | Description |
91
+ |-------------|------|-------------|
92
+ | `parameters` | `string[]` | Must include `"endpointName"` to get results |
93
+ | `limit` | `number` | Max endpoints to return (default: 10) |
94
+ | `context` | `object` | `awsRegion`, `awsProfile`, `deploymentTarget` (must be `realtime-inference`) |
95
+
96
+ **Example response:**
97
+
98
+ ```json
99
+ {
100
+ "values": { "endpointName": "my-endpoint-1234567890" },
101
+ "choices": { "endpointName": ["my-endpoint-1234567890", "prod-llm-endpoint"] },
102
+ "metadata": {
103
+ "my-endpoint-1234567890": {
104
+ "variantName": "AllTraffic",
105
+ "instanceType": "ml.g6e.48xlarge",
106
+ "instanceCount": 1,
107
+ "icCount": 2,
108
+ "availableGpus": 4,
109
+ "hasInstancePools": false
110
+ }
111
+ }
112
+ }
113
+ ```
114
+
77
115
  ## Usage
78
116
 
79
117
  ### Adding a Bundled Server
@@ -297,6 +335,7 @@ The Bedrock API didn't respond within 10 seconds. This usually means network con
297
335
  ```bash
298
336
  node servers/region-picker/test.js
299
337
  node servers/instance-recommender/test.js
338
+ node servers/endpoint-picker/test.js
300
339
  ```
301
340
 
302
341
  ### Smart Mode Not Activating
@@ -313,6 +352,7 @@ Each server has standalone tests that run without AWS credentials or network acc
313
352
  # Run individual server tests
314
353
  node servers/region-picker/test.js
315
354
  node servers/instance-recommender/test.js
355
+ node servers/endpoint-picker/test.js
316
356
 
317
357
  # Run all server tests from the project root
318
358
  npm run test:servers
@@ -26,7 +26,8 @@ import { fileURLToPath } from 'node:url'
26
26
  import { resolve, dirname } from 'node:path'
27
27
  import { resolveModelMetadata } from './lib/model-resolver.js'
28
28
  import { estimateVram } from './lib/vram-estimator.js'
29
- import { filterAndRankInstances } from './lib/instance-ranker.js'
29
+ import { filterAndRankInstances, applyAvailabilityRanking } from './lib/instance-ranker.js'
30
+ import { QuotaResolver } from './lib/quota-resolver.js'
30
31
  import { queryBedrock } from '../lib/bedrock-client.js'
31
32
 
32
33
  // ── Path setup ───────────────────────────────────────────────────────────────
@@ -379,6 +380,44 @@ async function handleGetInstanceRecommendation(params) {
379
380
  { limit }
380
381
  )
381
382
 
383
+ // Step 3a: Quota & availability filtering (discover mode only)
384
+ let preQuotaFilterCount = 0
385
+ let allFilteredByQuota = false
386
+ let preQuotaRecommendations = []
387
+ if (DISCOVER_MODE && recommendations.length > 0) {
388
+ try {
389
+ const region = process.env.AWS_REGION || process.env.AWS_DEFAULT_REGION || BEDROCK_REGION
390
+ const quotaResolver = new QuotaResolver(region)
391
+
392
+ const instanceTypes = recommendations.map(r => r.instanceType)
393
+ const [quotas, reservations, ftps] = await Promise.allSettled([
394
+ quotaResolver.getQuotaHeadroom(instanceTypes),
395
+ quotaResolver.getCapacityReservations(),
396
+ quotaResolver.getTrainingPlans()
397
+ ])
398
+
399
+ preQuotaFilterCount = recommendations.length
400
+ preQuotaRecommendations = [...recommendations]
401
+ recommendations = applyAvailabilityRanking(
402
+ recommendations,
403
+ quotas.status === 'fulfilled' ? quotas.value : null,
404
+ reservations.status === 'fulfilled' ? reservations.value : null,
405
+ ftps.status === 'fulfilled' ? ftps.value : null
406
+ )
407
+ if (recommendations.length === 0 && preQuotaFilterCount > 0) {
408
+ allFilteredByQuota = true
409
+ // Restore pre-filter recommendations so user can see compatible instances
410
+ // and request quota increases for the ones they want
411
+ recommendations = preQuotaRecommendations
412
+ log(`All ${preQuotaFilterCount} instances filtered by zero-quota — restoring unfiltered list`)
413
+ }
414
+ } catch (err) {
415
+ // Graceful degradation: if credentials are missing or any unexpected
416
+ // error occurs, skip quota filtering and continue with unfiltered results
417
+ log(`Quota resolution skipped: ${err.message}`)
418
+ }
419
+ }
420
+
382
421
  // Step 3b: If instanceSearch is also provided, further filter by tags
383
422
  if (instanceSearch && recommendations.length > 0) {
384
423
  const searchMatches = new Set(searchInstancesByTag(instanceSearch, effectiveCatalog, { limit: 100 }))
@@ -480,7 +519,8 @@ async function handleGetInstanceRecommendation(params) {
480
519
  vramBreakdown: vramEstimate.breakdown,
481
520
  recommendations: finalRecommendations,
482
521
  source: modelMetadata.source,
483
- smartModeUsed
522
+ smartModeUsed,
523
+ allFilteredByQuota
484
524
  }
485
525
  })
486
526
  }]
@@ -31,14 +31,20 @@ const GPU_MEMORY_MAP = {
31
31
  */
32
32
  const COST_TIER_MAP = {
33
33
  'g4dn': 'low',
34
+ 'g4ad': 'low',
34
35
  'inf2': 'low',
35
36
  'g5': 'medium',
36
37
  'g6': 'medium',
38
+ 'g6e': 'medium',
39
+ 'g7e': 'medium',
37
40
  'trn1': 'medium',
38
41
  'p3': 'high',
39
42
  'p4d': 'high',
40
43
  'p4de': 'high',
41
- 'p5': 'high'
44
+ 'p5': 'high',
45
+ 'p5e': 'high',
46
+ 'p5en': 'high',
47
+ 'p6': 'high'
42
48
  }
43
49
 
44
50
  /**
@@ -56,15 +62,21 @@ const COST_TIER_WEIGHT = {
56
62
  * Lower is newer (sorted first). Newer generations offer better perf/$.
57
63
  */
58
64
  const GENERATION_WEIGHT = {
59
- 'g6': 1,
60
- 'p5': 1,
61
- 'trn1': 2,
62
- 'inf2': 2,
63
- 'g5': 3,
64
- 'p4de': 4,
65
- 'p4d': 4,
66
- 'p3': 5,
67
- 'g4dn': 6
65
+ 'g7e': 1,
66
+ 'p6': 1,
67
+ 'g6e': 2,
68
+ 'p5e': 2,
69
+ 'p5en': 2,
70
+ 'g6': 3,
71
+ 'p5': 3,
72
+ 'trn1': 3,
73
+ 'inf2': 3,
74
+ 'g5': 4,
75
+ 'p4de': 5,
76
+ 'p4d': 5,
77
+ 'p3': 6,
78
+ 'g4dn': 7,
79
+ 'g4ad': 7
68
80
  }
69
81
 
70
82
  /**
@@ -257,8 +269,99 @@ const filterAndRankInstances = (vramRequired, instanceCatalog, options = {}) =>
257
269
  return candidates.slice(0, limit)
258
270
  }
259
271
 
272
+ // ── Availability Ranking ─────────────────────────────────────────────────────
273
+
274
+ /**
275
+ * Priority weights for capacity types used in availability ranking.
276
+ * Lower value = higher priority (sorted first).
277
+ */
278
+ const CAPACITY_TYPE_PRIORITY = {
279
+ reserved: 0,
280
+ ftp: 1,
281
+ 'on-demand': 2
282
+ }
283
+
284
+ /**
285
+ * Annotate, filter, and re-rank instance recommendations based on
286
+ * quota headroom, capacity reservations, and Flexible Training Plans.
287
+ *
288
+ * Each recommendation is annotated in place (the input objects are mutated) with:
289
+ * - capacityType: 'reserved' | 'ftp' | 'on-demand'
290
+ * - quotaStatus: 'available' | 'limited' | 'zero-quota'
291
+ * - reservationInfo: object (when capacityType is 'reserved')
292
+ * - ftpInfo: object (when capacityType is 'ftp')
293
+ *
294
+ * Instances with quotaStatus === 'zero-quota' are filtered out unless their capacityType is 'reserved' or 'ftp'.
295
+ * Sort order: reserved → FTP → on-demand, preserving existing order within tiers.
296
+ *
297
+ * When any input signal is null (API failure), that signal is skipped
298
+ * and the function degrades gracefully.
299
+ *
300
+ * @param {object[]} recommendations - Ranked instance recommendations from filterAndRankInstances
301
+ * @param {Map|null} quotas - Map: instanceType → { quota, deployed, headroom }, or null
302
+ * @param {Map|null} reservations - Map: instanceType → { reservationId, count, expiresAt }, or null
303
+ * @param {Map|null} ftps - Map: instanceType → { planName, remainingCapacity, expiresAt }, or null
304
+ * @returns {object[]} Filtered and re-ranked recommendations
305
+ */
306
+ const applyAvailabilityRanking = (recommendations, quotas, reservations, ftps) => {
307
+ if (!recommendations || recommendations.length === 0) {
308
+ return []
309
+ }
310
+
311
+ // If all signals are null (all API calls failed), return unmodified
312
+ if (!quotas && !reservations && !ftps) {
313
+ return recommendations
314
+ }
315
+
316
+ // Annotate each recommendation with capacityType and quotaStatus
317
+ for (const rec of recommendations) {
318
+ rec.capacityType = 'on-demand'
319
+ rec.quotaStatus = 'available'
320
+
321
+ if (reservations?.has(rec.instanceType)) {
322
+ rec.capacityType = 'reserved'
323
+ rec.reservationInfo = reservations.get(rec.instanceType)
324
+ rec.reservationType = 'training-plan'
325
+ } else if (ftps?.has(rec.instanceType)) {
326
+ rec.capacityType = 'ftp'
327
+ rec.ftpInfo = ftps.get(rec.instanceType)
328
+ }
329
+
330
+ // quotaStatus applies to all instances regardless of capacityType
331
+ if (quotas) {
332
+ const q = quotas.get(rec.instanceType)
333
+ if (q && q.headroom === 0) {
334
+ rec.quotaStatus = 'zero-quota'
335
+ } else if (q && q.headroom < 2) {
336
+ rec.quotaStatus = 'limited'
337
+ }
338
+ if (q) {
339
+ rec.quotaHeadroom = q.headroom
340
+ rec.quotaDeployed = q.deployed
341
+ rec.quotaLimit = q.quota
342
+ }
343
+ }
344
+ }
345
+
346
+ // Filter out zero-quota instances (but never filter reserved/FTP — you have the capacity)
347
+ const filtered = recommendations.filter(r =>
348
+ r.quotaStatus !== 'zero-quota' || r.capacityType === 'reserved' || r.capacityType === 'ftp'
349
+ )
350
+
351
+ // Sort: reserved first, then FTP, then on-demand (preserve existing order within tier)
352
+ filtered.sort((a, b) => {
353
+ const pa = CAPACITY_TYPE_PRIORITY[a.capacityType] ?? 2
354
+ const pb = CAPACITY_TYPE_PRIORITY[b.capacityType] ?? 2
355
+ if (pa !== pb) return pa - pb
356
+ return 0
357
+ })
358
+
359
+ return filtered
360
+ }
361
+
260
362
  export {
261
363
  filterAndRankInstances,
364
+ applyAvailabilityRanking,
262
365
  getPerGpuMemoryGb,
263
366
  getCostTier,
264
367
  effectiveVram,
@@ -266,5 +369,6 @@ export {
266
369
  COST_TIER_MAP,
267
370
  COST_TIER_WEIGHT,
268
371
  GENERATION_WEIGHT,
372
+ CAPACITY_TYPE_PRIORITY,
269
373
  TP_OVERHEAD_PER_GPU
270
374
  }