@aws/ml-container-creator 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/bin/cli.js CHANGED
@@ -98,6 +98,9 @@ program
98
98
  .addOption(new Option('--include-sample', 'Include sample model code'))
99
99
  .addOption(new Option('--include-testing', 'Include test suite'))
100
100
  .addOption(new Option('--test-types <types>', 'Comma-separated test types'))
101
+ .addOption(new Option('--enable-lora', 'Enable LoRA adapter serving (transformers with vllm/sglang/djl-lmi only)'))
102
+ .addOption(new Option('--max-loras <n>', 'Maximum concurrent LoRA adapters in GPU memory (default: 30)'))
103
+ .addOption(new Option('--max-lora-rank <n>', 'Maximum LoRA rank (default: 64)'))
101
104
 
102
105
  // --- MCP & Discovery ---
103
106
  .addOption(new Option('--smart', 'Enable Bedrock-powered smart mode on MCP servers'))
@@ -190,7 +193,7 @@ program.configureHelp({
190
193
  groups.env.push(opt);
191
194
  } else if (['--hf-token', '--hf-token-arn', '--ngc-token', '--ngc-token-arn'].includes(long)) {
192
195
  groups.auth.push(opt);
193
- } else if (['--include-sample', '--include-testing', '--test-types'].includes(long)) {
196
+ } else if (['--include-sample', '--include-testing', '--test-types', '--enable-lora', '--max-loras', '--max-lora-rank'].includes(long)) {
194
197
  groups.features.push(opt);
195
198
  } else if (['--smart', '--discover'].includes(long)) {
196
199
  groups.mcp.push(opt);
@@ -307,7 +310,6 @@ program
307
310
  program
308
311
  .command('registry')
309
312
  .description('Registry operations (list, get, remove, replay, export, import, search) — experimental, may be reconciled with do/register')
310
- .passThroughOptions()
311
313
  .argument('<action>', 'Registry action (log, list, get, remove, replay, export, import, search)')
312
314
  .argument('[args...]', 'Additional arguments')
313
315
  .option('--backend <backend>', 'Filter by backend')
@@ -328,6 +330,7 @@ program
328
330
  .option('--notes <text>', 'Deployment notes')
329
331
  .option('--project', 'Use project-level registry')
330
332
  .option('--parameters <json>', 'Parameters JSON string')
333
+ .option('--ic-list <json>', 'IC list JSON string')
331
334
  .option('--generator-version <version>', 'Generator version')
332
335
  // Options used by `registry list-architectures`
333
336
  .option('--server <name>', 'Filter by server name (for list-architectures)')
@@ -40,6 +40,10 @@ phases:
40
40
  - REGISTER_DURATION=0
41
41
  - REGISTER_LOG_POINTER=""
42
42
  - REGISTER_ERROR_SUMMARY=""
43
+ - ADAPTER_TEST_STATUS="skip"
44
+ - ADAPTER_TEST_DURATION=0
45
+ - ADAPTER_TEST_LOG_POINTER=""
46
+ - ADAPTER_TEST_ERROR_SUMMARY=""
43
47
  - TEARDOWN_STATUS="skip"
44
48
  - TEARDOWN_DURATION=0
45
49
  - TEARDOWN_LOG_POINTER=""
@@ -182,6 +186,54 @@ phases:
182
186
  fi
183
187
  - rm -f "$STAGE_STDERR_FILE"
184
188
 
189
+ # --- Stage: Adapter_Test (only if do/adapters/ has .conf files) ---
190
+ - echo "=== Stage: Adapter_Test ==="
191
+ - STAGE_START=$(date +%s)
192
+ - ADAPTER_TEST_LOG_POINTER="$LOG_POINTER_PREFIX"
193
+ - STAGE_STDERR_FILE=$(mktemp)
194
+ - |
195
+ if [ -n "$FIRST_FAILURE" ]; then
196
+ echo "Skipping Adapter_Test stage due to prior failure in $FIRST_FAILURE"
197
+ ADAPTER_TEST_STATUS="skip"
198
+ ADAPTER_TEST_DURATION=0
199
+ else
200
+ cd /tmp/ci-project
201
+ ADAPTER_CONFS=$(find do/adapters -name '*.conf' 2>/dev/null | grep -v '.gitkeep' || true)
202
+ if [ -z "$ADAPTER_CONFS" ]; then
203
+ echo "No adapter configs found in do/adapters/ — skipping"
204
+ ADAPTER_TEST_STATUS="skip"
205
+ ADAPTER_TEST_DURATION=0
206
+ else
207
+ (
208
+ set -e
209
+ cd /tmp/ci-project
210
+ for conf in do/adapters/*.conf; do
211
+ [ -f "$conf" ] || continue
212
+ [[ "$(basename "$conf")" == ".gitkeep" ]] && continue
213
+ ADAPTER_NAME=$(basename "$conf" .conf)
214
+ echo "Testing adapter: ${ADAPTER_NAME}"
215
+ # Source to get weights URI
216
+ source "$conf"
217
+ ./do/adapter add "${ADAPTER_NAME}" --weights "${ADAPTER_WEIGHTS_URI}"
218
+ ./do/test --ic "${ADAPTER_NAME}"
219
+ ./do/adapter remove "${ADAPTER_NAME}"
220
+ done
221
+ ) 2>"$STAGE_STDERR_FILE"; STAGE_EXIT=$?
222
+ STAGE_END=$(date +%s)
223
+ ADAPTER_TEST_DURATION=$((STAGE_END - STAGE_START))
224
+ if [ "$STAGE_EXIT" -eq 0 ]; then
225
+ ADAPTER_TEST_STATUS="pass"
226
+ echo "Adapter_Test stage passed in ${ADAPTER_TEST_DURATION}s"
227
+ else
228
+ ADAPTER_TEST_STATUS="fail"
229
+ ADAPTER_TEST_ERROR_SUMMARY=$(tail -c 500 "$STAGE_STDERR_FILE" | tr -d '\000' | tr '"' "'" | tr '\n' ' ')
230
+ FIRST_FAILURE="adapter_test"
231
+ echo "Adapter_Test stage FAILED (exit code $STAGE_EXIT) in ${ADAPTER_TEST_DURATION}s"
232
+ fi
233
+ fi
234
+ fi
235
+ - rm -f "$STAGE_STDERR_FILE"
236
+
185
237
  # --- Stage: Register (placeholder) ---
186
238
  - echo "=== Stage: Register ==="
187
239
  - STAGE_START=$(date +%s)
@@ -260,6 +312,7 @@ phases:
260
312
  validate) FINAL_ERROR_MESSAGE="$VALIDATE_ERROR_SUMMARY" ;;
261
313
  build) FINAL_ERROR_MESSAGE="$BUILD_ERROR_SUMMARY" ;;
262
314
  deploy_test) FINAL_ERROR_MESSAGE="$DEPLOY_TEST_ERROR_SUMMARY" ;;
315
+ adapter_test) FINAL_ERROR_MESSAGE="$ADAPTER_TEST_ERROR_SUMMARY" ;;
263
316
  register) FINAL_ERROR_MESSAGE="$REGISTER_ERROR_SUMMARY" ;;
264
317
  *) FINAL_ERROR_MESSAGE="Unknown failure stage" ;;
265
318
  esac
@@ -272,6 +325,7 @@ phases:
272
325
  ESCAPED_VALIDATE_ERROR=$(printf '%s' "$VALIDATE_ERROR_SUMMARY" | sed 's/\\/\\\\/g; s/"/\\"/g')
273
326
  ESCAPED_BUILD_ERROR=$(printf '%s' "$BUILD_ERROR_SUMMARY" | sed 's/\\/\\\\/g; s/"/\\"/g')
274
327
  ESCAPED_DEPLOY_TEST_ERROR=$(printf '%s' "$DEPLOY_TEST_ERROR_SUMMARY" | sed 's/\\/\\\\/g; s/"/\\"/g')
328
+ ESCAPED_ADAPTER_TEST_ERROR=$(printf '%s' "$ADAPTER_TEST_ERROR_SUMMARY" | sed 's/\\/\\\\/g; s/"/\\"/g')
275
329
  ESCAPED_REGISTER_ERROR=$(printf '%s' "$REGISTER_ERROR_SUMMARY" | sed 's/\\/\\\\/g; s/"/\\"/g')
276
330
  ESCAPED_TEARDOWN_ERROR=$(printf '%s' "$TEARDOWN_ERROR_SUMMARY" | sed 's/\\/\\\\/g; s/"/\\"/g')
277
331
  ESCAPED_FINAL_ERROR=$(printf '%s' "$FINAL_ERROR_MESSAGE" | sed 's/\\/\\\\/g; s/"/\\"/g')
@@ -314,6 +368,12 @@ phases:
314
368
  \"logPointer\": {\"S\": \"$DEPLOY_TEST_LOG_POINTER\"},
315
369
  \"errorSummary\": {\"S\": \"$ESCAPED_DEPLOY_TEST_ERROR\"}
316
370
  }},
371
+ \"adapter_test\": {\"M\": {
372
+ \"status\": {\"S\": \"$ADAPTER_TEST_STATUS\"},
373
+ \"durationSeconds\": {\"N\": \"$ADAPTER_TEST_DURATION\"},
374
+ \"logPointer\": {\"S\": \"$ADAPTER_TEST_LOG_POINTER\"},
375
+ \"errorSummary\": {\"S\": \"$ESCAPED_ADAPTER_TEST_ERROR\"}
376
+ }},
317
377
  \"register\": {\"M\": {
318
378
  \"status\": {\"S\": \"$REGISTER_STATUS\"},
319
379
  \"durationSeconds\": {\"N\": \"$REGISTER_DURATION\"},
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@aws/ml-container-creator",
3
- "version": "0.4.0",
3
+ "version": "0.5.0",
4
4
  "description": "Generator for SageMaker AI BYOC paradigm for predictive inference use-cases.",
5
5
  "type": "module",
6
6
  "main": "src/app.js",
package/servers/README.md CHANGED
@@ -15,7 +15,12 @@ servers/
15
15
  │ ├── test.js # Standalone tests (node test.js)
16
16
  │ ├── package.json
17
17
  │ └── LICENSE
18
- └── region-picker/ # AWS region suggestion server
18
+ ├── region-picker/ # AWS region suggestion server
19
+ │ ├── index.js # MCP server entry point
20
+ │ ├── test.js # Standalone tests (node test.js)
21
+ │ ├── package.json
22
+ │ └── LICENSE
23
+ └── endpoint-picker/ # SageMaker endpoint discovery server
19
24
  ├── index.js # MCP server entry point
20
25
  ├── test.js # Standalone tests (node test.js)
21
26
  ├── package.json
@@ -74,6 +79,39 @@ Suggests AWS regions for SageMaker deployments based on a search term. Filters t
74
79
  }
75
80
  ```
76
81
 
82
+ ### endpoint-picker
83
+
84
+ Discovers InService SageMaker real-time endpoints with available GPU capacity for attaching new inference components. Uses `ListEndpoints`, `DescribeEndpoint`, and `ListInferenceComponents` to calculate available capacity.
85
+
86
+ **Discover mode:** Queries the SageMaker API using a 3-strategy credential fallback (explicit profile → default chain → detect profiles). No static mode — always requires AWS credentials.
87
+
88
+ **Tool:** `get_inference_endpoints`
89
+
90
+ | Input Field | Type | Description |
91
+ |-------------|------|-------------|
92
+ | `parameters` | `string[]` | Must include `"endpointName"` to get results |
93
+ | `limit` | `number` | Max endpoints to return (default: 10) |
94
+ | `context` | `object` | `awsRegion`, `awsProfile`, `deploymentTarget` (must be `realtime-inference`) |
95
+
96
+ **Example response:**
97
+
98
+ ```json
99
+ {
100
+ "values": { "endpointName": "my-endpoint-1234567890" },
101
+ "choices": { "endpointName": ["my-endpoint-1234567890", "prod-llm-endpoint"] },
102
+ "metadata": {
103
+ "my-endpoint-1234567890": {
104
+ "variantName": "AllTraffic",
105
+ "instanceType": "ml.g6e.48xlarge",
106
+ "instanceCount": 1,
107
+ "icCount": 2,
108
+ "availableGpus": 4,
109
+ "hasInstancePools": false
110
+ }
111
+ }
112
+ }
113
+ ```
114
+
77
115
  ## Usage
78
116
 
79
117
  ### Adding a Bundled Server
@@ -297,6 +335,7 @@ The Bedrock API didn't respond within 10 seconds. This usually means network con
297
335
  ```bash
298
336
  node servers/region-picker/test.js
299
337
  node servers/instance-recommender/test.js
338
+ node servers/endpoint-picker/test.js
300
339
  ```
301
340
 
302
341
  ### Smart Mode Not Activating
@@ -313,6 +352,7 @@ Each server has standalone tests that run without AWS credentials or network acc
313
352
  # Run individual server tests
314
353
  node servers/region-picker/test.js
315
354
  node servers/instance-recommender/test.js
355
+ node servers/endpoint-picker/test.js
316
356
 
317
357
  # Run all server tests from the project root
318
358
  npm run test:servers
@@ -383,6 +383,7 @@ async function handleGetInstanceRecommendation(params) {
383
383
  // Step 3a: Quota & availability filtering (discover mode only)
384
384
  let preQuotaFilterCount = 0
385
385
  let allFilteredByQuota = false
386
+ let preQuotaRecommendations = []
386
387
  if (DISCOVER_MODE && recommendations.length > 0) {
387
388
  try {
388
389
  const region = process.env.AWS_REGION || process.env.AWS_DEFAULT_REGION || BEDROCK_REGION
@@ -396,6 +397,7 @@ async function handleGetInstanceRecommendation(params) {
396
397
  ])
397
398
 
398
399
  preQuotaFilterCount = recommendations.length
400
+ preQuotaRecommendations = [...recommendations]
399
401
  recommendations = applyAvailabilityRanking(
400
402
  recommendations,
401
403
  quotas.status === 'fulfilled' ? quotas.value : null,
@@ -404,6 +406,10 @@ async function handleGetInstanceRecommendation(params) {
404
406
  )
405
407
  if (recommendations.length === 0 && preQuotaFilterCount > 0) {
406
408
  allFilteredByQuota = true
409
+ // Restore pre-filter recommendations so user can see compatible instances
410
+ // and request quota increases for the ones they want
411
+ recommendations = preQuotaRecommendations
412
+ log(`All ${preQuotaFilterCount} instances filtered by zero-quota — restoring unfiltered list`)
407
413
  }
408
414
  } catch (err) {
409
415
  // Graceful degradation: if credentials are missing or any unexpected
package/src/app.js CHANGED
@@ -302,6 +302,22 @@ export async function writeProject(templateDir, destDir, answers, registryConfig
302
302
  ignorePatterns.push('**/hyperpod/**');
303
303
  }
304
304
 
305
+ // HyperPod is kubectl-based — no shared bash helpers or IC configs
306
+ if (answers.deploymentTarget === 'hyperpod-eks') {
307
+ ignorePatterns.push('**/do/lib/**');
308
+ ignorePatterns.push('**/do/ic/**');
309
+ ignorePatterns.push('**/do/add-ic');
310
+ ignorePatterns.push('**/do/status');
311
+ ignorePatterns.push('**/do/optimize');
312
+ }
313
+
314
+ // Async and batch don't use inference components (IC is real-time only)
315
+ if (answers.deploymentTarget === 'async-inference' || answers.deploymentTarget === 'batch-transform') {
316
+ ignorePatterns.push('**/do/ic/**');
317
+ ignorePatterns.push('**/do/add-ic');
318
+ ignorePatterns.push('**/do/status');
319
+ }
320
+
305
321
  // Resolve architecture
306
322
  const resolver = new DeploymentConfigResolver();
307
323
  let architecture = answers.architecture;
@@ -325,6 +341,13 @@ export async function writeProject(templateDir, destDir, answers, registryConfig
325
341
  // Exclude do/benchmark and do/optimize when benchmarking is not selected
326
342
  if (!answers.includeBenchmark) {
327
343
  ignorePatterns.push('**/do/benchmark');
344
+ ignorePatterns.push('**/do/optimize');
345
+ }
346
+
347
+ // Exclude do/adapter and do/adapters/ when LoRA is not enabled
348
+ if (!answers.enableLora) {
349
+ ignorePatterns.push('**/do/adapter');
350
+ ignorePatterns.push('**/do/adapters/**');
328
351
  }
329
352
 
330
353
  // Exclude do/test when hosted-model-endpoint is not selected
@@ -567,7 +590,11 @@ async function _ensureTemplateVariables(answers, registryConfigManager = null) {
567
590
  baseImage: null,
568
591
  modelSource: 'huggingface',
569
592
  artifactUri: '',
570
- modelLoadStrategy: 'runtime'
593
+ modelLoadStrategy: 'runtime',
594
+ existingEndpointName: null,
595
+ enableLora: false,
596
+ maxLoras: 30,
597
+ maxLoraRank: 64
571
598
  };
572
599
 
573
600
  Object.entries(defaults).forEach(([key, value]) => {
@@ -1052,7 +1079,11 @@ function _setExecutablePermissions(destDir) {
1052
1079
  'do/register',
1053
1080
  'do/ci',
1054
1081
  'do/manifest',
1055
- 'do/benchmark'
1082
+ 'do/benchmark',
1083
+ 'do/optimize',
1084
+ 'do/status',
1085
+ 'do/add-ic',
1086
+ 'do/adapter'
1056
1087
  ];
1057
1088
 
1058
1089
  shellScripts.forEach(script => {
@@ -1056,6 +1056,39 @@ export default class ConfigManager {
1056
1056
  required: false,
1057
1057
  default: null,
1058
1058
  valueSpace: 'bounded'
1059
+ },
1060
+ enableLora: {
1061
+ cliOption: 'enable-lora',
1062
+ envVar: null,
1063
+ configFile: true,
1064
+ packageJson: false,
1065
+ mcp: false,
1066
+ promptable: true,
1067
+ required: false,
1068
+ default: false,
1069
+ valueSpace: 'bounded'
1070
+ },
1071
+ maxLoras: {
1072
+ cliOption: 'max-loras',
1073
+ envVar: null,
1074
+ configFile: true,
1075
+ packageJson: false,
1076
+ mcp: false,
1077
+ promptable: true,
1078
+ required: false,
1079
+ default: 30,
1080
+ valueSpace: 'bounded'
1081
+ },
1082
+ maxLoraRank: {
1083
+ cliOption: 'max-lora-rank',
1084
+ envVar: null,
1085
+ configFile: true,
1086
+ packageJson: false,
1087
+ mcp: false,
1088
+ promptable: true,
1089
+ required: false,
1090
+ default: 64,
1091
+ valueSpace: 'bounded'
1059
1092
  }
1060
1093
  };
1061
1094
  }
@@ -1088,7 +1121,7 @@ export default class ConfigManager {
1088
1121
  */
1089
1122
  _parseValue(parameter, value) {
1090
1123
  // Handle boolean parameters
1091
- if (parameter === 'includeSampleModel' || parameter === 'includeTesting' || parameter === 'skipPrompts' || parameter === 'includeBenchmark' || parameter === 'benchmarkStreaming') {
1124
+ if (parameter === 'includeSampleModel' || parameter === 'includeTesting' || parameter === 'skipPrompts' || parameter === 'includeBenchmark' || parameter === 'benchmarkStreaming' || parameter === 'enableLora') {
1092
1125
  return value === true || value === 'true';
1093
1126
  }
1094
1127
 
@@ -1924,6 +1957,12 @@ export default class ConfigManager {
1924
1957
  if (param === 'instanceType' && finalConfig.deploymentTarget === 'hyperpod-eks' && !finalConfig.instanceType) {
1925
1958
  return; // Skip validation only if truly missing for backward compat
1926
1959
  }
1960
+
1961
+ // Special case: instanceType is not required when attaching to an existing endpoint
1962
+ // The instance type is inherited from the existing endpoint configuration
1963
+ if (param === 'instanceType' && finalConfig.existingEndpointName) {
1964
+ return; // Skip validation — instance is inherited from existing endpoint
1965
+ }
1927
1966
 
1928
1967
  if (isEmpty) {
1929
1968
  if (config.promptable) {
@@ -57,6 +57,22 @@ export default {
57
57
  },
58
58
  buildTarget: {
59
59
  type: ['string', 'null']
60
+ },
61
+ icList: {
62
+ type: 'array',
63
+ items: {
64
+ type: 'object',
65
+ required: ['name'],
66
+ properties: {
67
+ name: { type: 'string', minLength: 1 },
68
+ image: { type: 'string' },
69
+ gpuCount: { type: 'integer', minimum: 0 },
70
+ copyCount: { type: 'integer', minimum: 1 },
71
+ isAdapter: { type: 'boolean' },
72
+ baseIcName: { type: 'string' },
73
+ artifactUrl: { type: 'string' }
74
+ }
75
+ }
60
76
  }
61
77
  }
62
78
  },
@@ -18,8 +18,10 @@ import {
18
18
  modelLoadStrategyPrompts,
19
19
  modelProfilePrompts,
20
20
  modulePrompts,
21
+ loraPrompts,
21
22
  benchmarkPrompts,
22
23
  infraRegionAndTargetPrompts,
24
+ infraExistingEndpointPrompts,
23
25
  infraInstancePrompts,
24
26
  infraAsyncPrompts,
25
27
  infraBatchTransformPrompts,
@@ -29,7 +31,9 @@ import {
29
31
  destinationPrompts,
30
32
  baseImageSearchPrompts,
31
33
  baseImagePrompts,
32
- formatImageChoices
34
+ formatImageChoices,
35
+ filterByCudaGeneration,
36
+ instanceCatalogRaw
33
37
  } from './prompts.js';
34
38
 
35
39
  import fs from 'fs';
@@ -187,12 +191,40 @@ export default class PromptRunner {
187
191
  // 3a. Region query
188
192
  await this._queryMcpForRegion(frameworkAnswers, explicitConfig);
189
193
 
194
+ // 3a2. Existing endpoint prompt (only for realtime-inference)
195
+ // Requirements: 3.3, 4.3, 4.4 — endpoint-picker MCP query
196
+ let existingEndpointAnswers = {};
197
+ if (regionAndTargetAnswers.deploymentTarget === 'realtime-inference') {
198
+ // Query endpoint-picker MCP server for available endpoints
199
+ const resolvedRegion = regionAndTargetAnswers.customAwsRegion || regionAndTargetAnswers.awsRegion;
200
+ await this._queryMcpForEndpoints({ ...regionAndTargetAnswers, awsRegion: resolvedRegion }, explicitConfig);
201
+
202
+ const endpointPreviousAnswers = {
203
+ ...regionAndTargetAnswers,
204
+ ...(this._mcpEndpointChoices ? { _mcpEndpointChoices: this._mcpEndpointChoices } : {})
205
+ };
206
+ existingEndpointAnswers = await this._runPhase(
207
+ infraExistingEndpointPrompts,
208
+ endpointPreviousAnswers,
209
+ explicitConfig,
210
+ existingConfig
211
+ );
212
+
213
+ // Resolve custom endpoint name
214
+ if (existingEndpointAnswers.customExistingEndpointName) {
215
+ existingEndpointAnswers.existingEndpointName = existingEndpointAnswers.customExistingEndpointName;
216
+ delete existingEndpointAnswers.customExistingEndpointName;
217
+ }
218
+ }
219
+
190
220
  // 3b. Instance type — query instance-sizer with full context (model + profile + CUDA)
191
221
  let instanceAnswers = {};
192
- const needsInstance = regionAndTargetAnswers.deploymentTarget === 'realtime-inference' ||
222
+ // Skip instance prompts when attaching to an existing endpoint (instance is inherited)
223
+ const useExistingEndpoint = !!(existingEndpointAnswers.existingEndpointName);
224
+ const needsInstance = !useExistingEndpoint && (regionAndTargetAnswers.deploymentTarget === 'realtime-inference' ||
193
225
  regionAndTargetAnswers.deploymentTarget === 'async-inference' ||
194
226
  regionAndTargetAnswers.deploymentTarget === 'batch-transform' ||
195
- regionAndTargetAnswers.deploymentTarget === 'hyperpod-eks';
227
+ regionAndTargetAnswers.deploymentTarget === 'hyperpod-eks');
196
228
 
197
229
  if (needsInstance) {
198
230
  // Determine architecture type for heuristic fallback
@@ -230,6 +262,74 @@ export default class PromptRunner {
230
262
  if (!instanceAnswers.instanceType && !explicitConfig.instanceType && this._architectureHeuristicDefault) {
231
263
  instanceAnswers.instanceType = this._architectureHeuristicDefault;
232
264
  }
265
+
266
+ // Process multi-select instance type results (Requirements: 6.4)
267
+ // When user selects multiple instances via checkbox, derive instanceType and instancePools
268
+ if (instanceAnswers.instanceTypeSelections && instanceAnswers.instanceTypeSelections.length > 0) {
269
+ let selections = instanceAnswers.instanceTypeSelections.slice(0, 5); // Cap at 5 (API limit)
270
+
271
+ // Resolve custom input: replace __custom_input__ sentinel with parsed instances
272
+ if (selections.includes('__custom_input__') && instanceAnswers.customInstanceTypeSelections) {
273
+ const customInstances = instanceAnswers.customInstanceTypeSelections
274
+ .split(',').map(s => s.trim()).filter(s => s.length > 0);
275
+ // Remove the sentinel, keep the other selections, and append the custom entries
276
+ selections = selections.filter(s => s !== '__custom_input__');
277
+ selections = [...selections, ...customInstances];
278
+ delete instanceAnswers.customInstanceTypeSelections;
279
+ } else if (selections.includes('__custom_input__')) {
280
+ // Sentinel selected but no custom input provided — remove it
281
+ selections = selections.filter(s => s !== '__custom_input__');
282
+ }
283
+
284
+ // Cap at 5 after custom expansion
285
+ if (selections.length > 5) {
286
+ console.log(' ⚠️ Maximum 5 instance types allowed. Using first 5 selections.');
287
+ selections = selections.slice(0, 5);
288
+ }
289
+
290
+ // Filter to same CUDA generation and warn about incompatible removals
291
+ const { filtered, generation, removed } = filterByCudaGeneration(selections);
292
+ if (removed.length > 0) {
293
+ console.log(` ⚠️ Removed incompatible instances (different CUDA generation): ${removed.join(', ')}`);
294
+ console.log(` Keeping ${generation} generation: ${filtered.join(', ')}`);
295
+ }
296
+
297
+ const finalSelections = filtered.length > 0 ? filtered : selections;
298
+
299
+ if (finalSelections.length === 1) {
300
+ // Single selection → standard single instance type (no pools)
301
+ instanceAnswers.instanceType = finalSelections[0];
302
+ console.log(` ✓ Single instance selected: ${finalSelections[0]}`);
303
+ } else {
304
+ // Multiple selections → instance pools with priority = selection order
305
+ instanceAnswers.instanceType = finalSelections[0]; // backward compat: first is primary
306
+ instanceAnswers.instancePools = finalSelections.map((it, idx) => ({
307
+ InstanceType: it,
308
+ Priority: idx + 1
309
+ }));
310
+
311
+ // Auto-generate multi-spec IC config from catalog
312
+ instanceAnswers.instancePoolSpecs = finalSelections.map(it => {
313
+ const entry = instanceCatalogRaw[it];
314
+ return {
315
+ instanceType: it,
316
+ gpuCount: entry?.gpus || 1,
317
+ minMemoryMb: entry?.gpuMemoryGb ? entry.gpuMemoryGb * 1024 : 1024
318
+ };
319
+ });
320
+
321
+ console.log(` ✓ Instance pools configured (${finalSelections.length} types):`);
322
+ finalSelections.forEach((it, idx) => {
323
+ const entry = instanceCatalogRaw[it];
324
+ const gpus = entry?.gpus || '?';
325
+ const mem = entry?.gpuMemoryGb || '?';
326
+ console.log(` Priority ${idx + 1}: ${it} (${gpus} GPUs, ${mem}GB GPU memory)`);
327
+ });
328
+ }
329
+
330
+ // Clean up the raw selections from answers (not needed downstream)
331
+ delete instanceAnswers.instanceTypeSelections;
332
+ }
233
333
  }
234
334
 
235
335
  // In auto-prompt mode, use instance-sizer's top recommendation as the instance type
@@ -318,6 +418,7 @@ export default class PromptRunner {
318
418
  // Combine all infrastructure answers
319
419
  const infraAnswers = {
320
420
  ...regionAndTargetAnswers,
421
+ ...existingEndpointAnswers,
321
422
  ...instanceAnswers,
322
423
  ...asyncAnswers,
323
424
  ...batchTransformAnswers,
@@ -414,6 +515,14 @@ export default class PromptRunner {
414
515
  }
415
516
  }
416
517
 
518
+ // LoRA adapter prompts — only for transformers with vllm/sglang/djl-lmi
519
+ // Requirements: 1.1, 1.2, 1.4
520
+ let loraAnswers = {};
521
+ const loraSubAnswers = await this._runPhase(loraPrompts, { ...frameworkAnswers, ...engineAnswers }, explicitConfig, existingConfig);
522
+ if (loraSubAnswers.enableLora !== undefined) {
523
+ loraAnswers = loraSubAnswers;
524
+ }
525
+
417
526
  // Validate instance type against framework requirements (now that framework version is known)
418
527
  const finalInstanceType = infraAnswers.customInstanceType || infraAnswers.instanceType;
419
528
  if (finalInstanceType && frameworkVersionAnswers.frameworkVersion) {
@@ -456,6 +565,7 @@ export default class PromptRunner {
456
565
  ...ngcApiKeyAnswers,
457
566
  ...moduleAnswers,
458
567
  ...benchmarkAnswers,
568
+ ...loraAnswers,
459
569
  ...projectAnswers,
460
570
  ...destinationAnswers,
461
571
  buildTimestamp
@@ -1083,6 +1193,11 @@ export default class PromptRunner {
1083
1193
 
1084
1194
  console.log(` ✓ ${choices.length} compatible instance(s) found${vramInfo}`);
1085
1195
 
1196
+ // Warn if all instances had zero quota but were restored for visibility
1197
+ if (parsed.metadata?.allFilteredByQuota) {
1198
+ console.log(' ⚠️ All instances have zero quota — request a quota increase for your preferred type');
1199
+ }
1200
+
1086
1201
  // Check if availability data is present (recommendations have capacityType)
1087
1202
  const hasAvailabilityData = recommendations.some(r => r.capacityType);
1088
1203
 
@@ -1187,6 +1302,62 @@ export default class PromptRunner {
1187
1302
  }
1188
1303
  }
1189
1304
 
1305
+ /**
1306
+ * Query the endpoint-picker MCP server for available InService real-time endpoints.
1307
+ * Populates this._mcpEndpointChoices for the existing endpoint selection prompt.
1308
+ * Graceful fallback: if MCP server fails (no credentials, timeout), skip and create new endpoint.
1309
+ * Requirements: 3.3, 4.3, 4.4
1310
+ * @private
1311
+ */
1312
+ async _queryMcpForEndpoints(infraAnswers, explicitConfig) {
1313
+ const cm = this.configManager;
1314
+ if (!cm) return;
1315
+
1316
+ const mcpServers = cm.getMcpServerNames();
1317
+ if (!mcpServers.includes('endpoint-picker')) return;
1318
+
1319
+ // Skip if existing endpoint already provided via CLI/config
1320
+ if (explicitConfig.existingEndpointName) return;
1321
+
1322
+ console.log(' 🔍 Querying endpoint-picker...');
1323
+
1324
+ try {
1325
+ const result = await cm.queryMcpServer('endpoint-picker', {
1326
+ awsRegion: infraAnswers.awsRegion,
1327
+ deploymentTarget: 'realtime-inference'
1328
+ });
1329
+
1330
+ if (result && result.choices?.endpointName?.length > 0) {
1331
+ const endpointNames = result.choices.endpointName;
1332
+ const metadata = result.metadata || {};
1333
+
1334
+ // Build choices with metadata annotations
1335
+ this._mcpEndpointChoices = endpointNames.map(name => {
1336
+ const meta = metadata[name];
1337
+ if (meta) {
1338
+ const gpuInfo = meta.availableGpus === '?' ? 'GPUs: ?' : `${meta.availableGpus} GPUs free`;
1339
+ return {
1340
+ name: `${name} (${meta.instanceType}, ${gpuInfo}, ${meta.icCount} IC${meta.icCount !== 1 ? 's' : ''})`,
1341
+ value: name
1342
+ };
1343
+ }
1344
+ return { name, value: name };
1345
+ });
1346
+
1347
+ console.log(` ✓ ${endpointNames.length} endpoint(s) with available capacity`);
1348
+ } else {
1349
+ if (result?.message) {
1350
+ console.log(` ↳ ${result.message}`);
1351
+ } else {
1352
+ console.log(' ↳ No endpoints with available capacity found');
1353
+ }
1354
+ }
1355
+ } catch (err) {
1356
+ // Graceful fallback: if MCP server fails, skip and create new endpoint
1357
+ console.log(` ⚠️ endpoint-picker: ${err.message || 'query failed'} — will create new endpoint`);
1358
+ }
1359
+ }
1360
+
1190
1361
  /**
1191
1362
  * Query MCP base-image-picker server after deployment config is selected.
1192
1363
  * Populates _mcpBaseImageChoices for the base image selection prompt.