@aws/ml-container-creator 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/cli.js +5 -2
- package/infra/ci-harness/buildspec.yml +60 -0
- package/package.json +1 -1
- package/servers/README.md +41 -1
- package/servers/instance-sizer/index.js +6 -0
- package/src/app.js +33 -2
- package/src/lib/config-manager.js +40 -1
- package/src/lib/deployment-entry-schema.js +16 -0
- package/src/lib/prompt-runner.js +174 -3
- package/src/lib/prompts.js +222 -2
- package/src/lib/registry-command-handler.js +12 -0
- package/templates/Dockerfile +12 -0
- package/templates/code/serving.properties +14 -0
- package/templates/do/adapter +1214 -0
- package/templates/do/adapters/.gitkeep +2 -0
- package/templates/do/add-ic +130 -0
- package/templates/do/benchmark +81 -9
- package/templates/do/clean +507 -17
- package/templates/do/config +23 -1
- package/templates/do/deploy +513 -367
- package/templates/do/ic/default.conf +32 -0
- package/templates/do/lib/endpoint-config.sh +216 -0
- package/templates/do/lib/inference-component.sh +167 -0
- package/templates/do/lib/secrets.sh +44 -0
- package/templates/do/lib/wait.sh +131 -0
- package/templates/do/logs +107 -27
- package/templates/do/optimize +528 -0
- package/templates/do/register +111 -1
- package/templates/do/status +337 -0
- package/templates/do/test +80 -28
package/bin/cli.js
CHANGED
|
@@ -98,6 +98,9 @@ program
|
|
|
98
98
|
.addOption(new Option('--include-sample', 'Include sample model code'))
|
|
99
99
|
.addOption(new Option('--include-testing', 'Include test suite'))
|
|
100
100
|
.addOption(new Option('--test-types <types>', 'Comma-separated test types'))
|
|
101
|
+
.addOption(new Option('--enable-lora', 'Enable LoRA adapter serving (transformers with vllm/sglang/djl-lmi only)'))
|
|
102
|
+
.addOption(new Option('--max-loras <n>', 'Maximum concurrent LoRA adapters in GPU memory (default: 30)'))
|
|
103
|
+
.addOption(new Option('--max-lora-rank <n>', 'Maximum LoRA rank (default: 64)'))
|
|
101
104
|
|
|
102
105
|
// --- MCP & Discovery ---
|
|
103
106
|
.addOption(new Option('--smart', 'Enable Bedrock-powered smart mode on MCP servers'))
|
|
@@ -190,7 +193,7 @@ program.configureHelp({
|
|
|
190
193
|
groups.env.push(opt);
|
|
191
194
|
} else if (['--hf-token', '--hf-token-arn', '--ngc-token', '--ngc-token-arn'].includes(long)) {
|
|
192
195
|
groups.auth.push(opt);
|
|
193
|
-
} else if (['--include-sample', '--include-testing', '--test-types'].includes(long)) {
|
|
196
|
+
} else if (['--include-sample', '--include-testing', '--test-types', '--enable-lora', '--max-loras', '--max-lora-rank'].includes(long)) {
|
|
194
197
|
groups.features.push(opt);
|
|
195
198
|
} else if (['--smart', '--discover'].includes(long)) {
|
|
196
199
|
groups.mcp.push(opt);
|
|
@@ -307,7 +310,6 @@ program
|
|
|
307
310
|
program
|
|
308
311
|
.command('registry')
|
|
309
312
|
.description('Registry operations (list, get, remove, replay, export, import, search) — experimental, may be reconciled with do/register')
|
|
310
|
-
.passThroughOptions()
|
|
311
313
|
.argument('<action>', 'Registry action (log, list, get, remove, replay, export, import, search)')
|
|
312
314
|
.argument('[args...]', 'Additional arguments')
|
|
313
315
|
.option('--backend <backend>', 'Filter by backend')
|
|
@@ -328,6 +330,7 @@ program
|
|
|
328
330
|
.option('--notes <text>', 'Deployment notes')
|
|
329
331
|
.option('--project', 'Use project-level registry')
|
|
330
332
|
.option('--parameters <json>', 'Parameters JSON string')
|
|
333
|
+
.option('--ic-list <json>', 'IC list JSON string')
|
|
331
334
|
.option('--generator-version <version>', 'Generator version')
|
|
332
335
|
// Options used by `registry list-architectures`
|
|
333
336
|
.option('--server <name>', 'Filter by server name (for list-architectures)')
|
|
@@ -40,6 +40,10 @@ phases:
|
|
|
40
40
|
- REGISTER_DURATION=0
|
|
41
41
|
- REGISTER_LOG_POINTER=""
|
|
42
42
|
- REGISTER_ERROR_SUMMARY=""
|
|
43
|
+
- ADAPTER_TEST_STATUS="skip"
|
|
44
|
+
- ADAPTER_TEST_DURATION=0
|
|
45
|
+
- ADAPTER_TEST_LOG_POINTER=""
|
|
46
|
+
- ADAPTER_TEST_ERROR_SUMMARY=""
|
|
43
47
|
- TEARDOWN_STATUS="skip"
|
|
44
48
|
- TEARDOWN_DURATION=0
|
|
45
49
|
- TEARDOWN_LOG_POINTER=""
|
|
@@ -182,6 +186,54 @@ phases:
|
|
|
182
186
|
fi
|
|
183
187
|
- rm -f "$STAGE_STDERR_FILE"
|
|
184
188
|
|
|
189
|
+
# --- Stage: Adapter_Test (only if do/adapters/ has .conf files) ---
|
|
190
|
+
- echo "=== Stage: Adapter_Test ==="
|
|
191
|
+
- STAGE_START=$(date +%s)
|
|
192
|
+
- ADAPTER_TEST_LOG_POINTER="$LOG_POINTER_PREFIX"
|
|
193
|
+
- STAGE_STDERR_FILE=$(mktemp)
|
|
194
|
+
- |
|
|
195
|
+
if [ -n "$FIRST_FAILURE" ]; then
|
|
196
|
+
echo "Skipping Adapter_Test stage due to prior failure in $FIRST_FAILURE"
|
|
197
|
+
ADAPTER_TEST_STATUS="skip"
|
|
198
|
+
ADAPTER_TEST_DURATION=0
|
|
199
|
+
else
|
|
200
|
+
cd /tmp/ci-project
|
|
201
|
+
ADAPTER_CONFS=$(find do/adapters -name '*.conf' 2>/dev/null | grep -v '.gitkeep' || true)
|
|
202
|
+
if [ -z "$ADAPTER_CONFS" ]; then
|
|
203
|
+
echo "No adapter configs found in do/adapters/ — skipping"
|
|
204
|
+
ADAPTER_TEST_STATUS="skip"
|
|
205
|
+
ADAPTER_TEST_DURATION=0
|
|
206
|
+
else
|
|
207
|
+
(
|
|
208
|
+
set -e
|
|
209
|
+
cd /tmp/ci-project
|
|
210
|
+
for conf in do/adapters/*.conf; do
|
|
211
|
+
[ -f "$conf" ] || continue
|
|
212
|
+
[[ "$(basename "$conf")" == ".gitkeep" ]] && continue
|
|
213
|
+
ADAPTER_NAME=$(basename "$conf" .conf)
|
|
214
|
+
echo "Testing adapter: ${ADAPTER_NAME}"
|
|
215
|
+
# Source to get weights URI
|
|
216
|
+
source "$conf"
|
|
217
|
+
./do/adapter add "${ADAPTER_NAME}" --weights "${ADAPTER_WEIGHTS_URI}"
|
|
218
|
+
./do/test --ic "${ADAPTER_NAME}"
|
|
219
|
+
./do/adapter remove "${ADAPTER_NAME}"
|
|
220
|
+
done
|
|
221
|
+
) 2>"$STAGE_STDERR_FILE"; STAGE_EXIT=$?
|
|
222
|
+
STAGE_END=$(date +%s)
|
|
223
|
+
ADAPTER_TEST_DURATION=$((STAGE_END - STAGE_START))
|
|
224
|
+
if [ "$STAGE_EXIT" -eq 0 ]; then
|
|
225
|
+
ADAPTER_TEST_STATUS="pass"
|
|
226
|
+
echo "Adapter_Test stage passed in ${ADAPTER_TEST_DURATION}s"
|
|
227
|
+
else
|
|
228
|
+
ADAPTER_TEST_STATUS="fail"
|
|
229
|
+
ADAPTER_TEST_ERROR_SUMMARY=$(tail -c 500 "$STAGE_STDERR_FILE" | tr -d '\000' | tr '"' "'" | tr '\n' ' ')
|
|
230
|
+
FIRST_FAILURE="adapter_test"
|
|
231
|
+
echo "Adapter_Test stage FAILED (exit code $STAGE_EXIT) in ${ADAPTER_TEST_DURATION}s"
|
|
232
|
+
fi
|
|
233
|
+
fi
|
|
234
|
+
fi
|
|
235
|
+
- rm -f "$STAGE_STDERR_FILE"
|
|
236
|
+
|
|
185
237
|
# --- Stage: Register (placeholder) ---
|
|
186
238
|
- echo "=== Stage: Register ==="
|
|
187
239
|
- STAGE_START=$(date +%s)
|
|
@@ -260,6 +312,7 @@ phases:
|
|
|
260
312
|
validate) FINAL_ERROR_MESSAGE="$VALIDATE_ERROR_SUMMARY" ;;
|
|
261
313
|
build) FINAL_ERROR_MESSAGE="$BUILD_ERROR_SUMMARY" ;;
|
|
262
314
|
deploy_test) FINAL_ERROR_MESSAGE="$DEPLOY_TEST_ERROR_SUMMARY" ;;
|
|
315
|
+
adapter_test) FINAL_ERROR_MESSAGE="$ADAPTER_TEST_ERROR_SUMMARY" ;;
|
|
263
316
|
register) FINAL_ERROR_MESSAGE="$REGISTER_ERROR_SUMMARY" ;;
|
|
264
317
|
*) FINAL_ERROR_MESSAGE="Unknown failure stage" ;;
|
|
265
318
|
esac
|
|
@@ -272,6 +325,7 @@ phases:
|
|
|
272
325
|
ESCAPED_VALIDATE_ERROR=$(printf '%s' "$VALIDATE_ERROR_SUMMARY" | sed 's/\\/\\\\/g; s/"/\\"/g')
|
|
273
326
|
ESCAPED_BUILD_ERROR=$(printf '%s' "$BUILD_ERROR_SUMMARY" | sed 's/\\/\\\\/g; s/"/\\"/g')
|
|
274
327
|
ESCAPED_DEPLOY_TEST_ERROR=$(printf '%s' "$DEPLOY_TEST_ERROR_SUMMARY" | sed 's/\\/\\\\/g; s/"/\\"/g')
|
|
328
|
+
ESCAPED_ADAPTER_TEST_ERROR=$(printf '%s' "$ADAPTER_TEST_ERROR_SUMMARY" | sed 's/\\/\\\\/g; s/"/\\"/g')
|
|
275
329
|
ESCAPED_REGISTER_ERROR=$(printf '%s' "$REGISTER_ERROR_SUMMARY" | sed 's/\\/\\\\/g; s/"/\\"/g')
|
|
276
330
|
ESCAPED_TEARDOWN_ERROR=$(printf '%s' "$TEARDOWN_ERROR_SUMMARY" | sed 's/\\/\\\\/g; s/"/\\"/g')
|
|
277
331
|
ESCAPED_FINAL_ERROR=$(printf '%s' "$FINAL_ERROR_MESSAGE" | sed 's/\\/\\\\/g; s/"/\\"/g')
|
|
@@ -314,6 +368,12 @@ phases:
|
|
|
314
368
|
\"logPointer\": {\"S\": \"$DEPLOY_TEST_LOG_POINTER\"},
|
|
315
369
|
\"errorSummary\": {\"S\": \"$ESCAPED_DEPLOY_TEST_ERROR\"}
|
|
316
370
|
}},
|
|
371
|
+
\"adapter_test\": {\"M\": {
|
|
372
|
+
\"status\": {\"S\": \"$ADAPTER_TEST_STATUS\"},
|
|
373
|
+
\"durationSeconds\": {\"N\": \"$ADAPTER_TEST_DURATION\"},
|
|
374
|
+
\"logPointer\": {\"S\": \"$ADAPTER_TEST_LOG_POINTER\"},
|
|
375
|
+
\"errorSummary\": {\"S\": \"$ESCAPED_ADAPTER_TEST_ERROR\"}
|
|
376
|
+
}},
|
|
317
377
|
\"register\": {\"M\": {
|
|
318
378
|
\"status\": {\"S\": \"$REGISTER_STATUS\"},
|
|
319
379
|
\"durationSeconds\": {\"N\": \"$REGISTER_DURATION\"},
|
package/package.json
CHANGED
package/servers/README.md
CHANGED
|
@@ -15,7 +15,12 @@ servers/
|
|
|
15
15
|
│ ├── test.js # Standalone tests (node test.js)
|
|
16
16
|
│ ├── package.json
|
|
17
17
|
│ └── LICENSE
|
|
18
|
-
|
|
18
|
+
├── region-picker/ # AWS region suggestion server
|
|
19
|
+
│ ├── index.js # MCP server entry point
|
|
20
|
+
│ ├── test.js # Standalone tests (node test.js)
|
|
21
|
+
│ ├── package.json
|
|
22
|
+
│ └── LICENSE
|
|
23
|
+
└── endpoint-picker/ # SageMaker endpoint discovery server
|
|
19
24
|
├── index.js # MCP server entry point
|
|
20
25
|
├── test.js # Standalone tests (node test.js)
|
|
21
26
|
├── package.json
|
|
@@ -74,6 +79,39 @@ Suggests AWS regions for SageMaker deployments based on a search term. Filters t
|
|
|
74
79
|
}
|
|
75
80
|
```
|
|
76
81
|
|
|
82
|
+
### endpoint-picker
|
|
83
|
+
|
|
84
|
+
Discovers InService SageMaker real-time endpoints with available GPU capacity for attaching new inference components. Uses `ListEndpoints`, `DescribeEndpoint`, and `ListInferenceComponents` to calculate available capacity.
|
|
85
|
+
|
|
86
|
+
**Discover mode:** Queries the SageMaker API using a 3-strategy credential fallback (explicit profile → default chain → detect profiles). No static mode — always requires AWS credentials.
|
|
87
|
+
|
|
88
|
+
**Tool:** `get_inference_endpoints`
|
|
89
|
+
|
|
90
|
+
| Input Field | Type | Description |
|
|
91
|
+
|-------------|------|-------------|
|
|
92
|
+
| `parameters` | `string[]` | Must include `"endpointName"` to get results |
|
|
93
|
+
| `limit` | `number` | Max endpoints to return (default: 10) |
|
|
94
|
+
| `context` | `object` | `awsRegion`, `awsProfile`, `deploymentTarget` (must be `realtime-inference`) |
|
|
95
|
+
|
|
96
|
+
**Example response:**
|
|
97
|
+
|
|
98
|
+
```json
|
|
99
|
+
{
|
|
100
|
+
"values": { "endpointName": "my-endpoint-1234567890" },
|
|
101
|
+
"choices": { "endpointName": ["my-endpoint-1234567890", "prod-llm-endpoint"] },
|
|
102
|
+
"metadata": {
|
|
103
|
+
"my-endpoint-1234567890": {
|
|
104
|
+
"variantName": "AllTraffic",
|
|
105
|
+
"instanceType": "ml.g6e.48xlarge",
|
|
106
|
+
"instanceCount": 1,
|
|
107
|
+
"icCount": 2,
|
|
108
|
+
"availableGpus": 4,
|
|
109
|
+
"hasInstancePools": false
|
|
110
|
+
}
|
|
111
|
+
}
|
|
112
|
+
}
|
|
113
|
+
```
|
|
114
|
+
|
|
77
115
|
## Usage
|
|
78
116
|
|
|
79
117
|
### Adding a Bundled Server
|
|
@@ -297,6 +335,7 @@ The Bedrock API didn't respond within 10 seconds. This usually means network con
|
|
|
297
335
|
```bash
|
|
298
336
|
node servers/region-picker/test.js
|
|
299
337
|
node servers/instance-recommender/test.js
|
|
338
|
+
node servers/endpoint-picker/test.js
|
|
300
339
|
```
|
|
301
340
|
|
|
302
341
|
### Smart Mode Not Activating
|
|
@@ -313,6 +352,7 @@ Each server has standalone tests that run without AWS credentials or network acc
|
|
|
313
352
|
# Run individual server tests
|
|
314
353
|
node servers/region-picker/test.js
|
|
315
354
|
node servers/instance-recommender/test.js
|
|
355
|
+
node servers/endpoint-picker/test.js
|
|
316
356
|
|
|
317
357
|
# Run all server tests from the project root
|
|
318
358
|
npm run test:servers
|
|
@@ -383,6 +383,7 @@ async function handleGetInstanceRecommendation(params) {
|
|
|
383
383
|
// Step 3a: Quota & availability filtering (discover mode only)
|
|
384
384
|
let preQuotaFilterCount = 0
|
|
385
385
|
let allFilteredByQuota = false
|
|
386
|
+
let preQuotaRecommendations = []
|
|
386
387
|
if (DISCOVER_MODE && recommendations.length > 0) {
|
|
387
388
|
try {
|
|
388
389
|
const region = process.env.AWS_REGION || process.env.AWS_DEFAULT_REGION || BEDROCK_REGION
|
|
@@ -396,6 +397,7 @@ async function handleGetInstanceRecommendation(params) {
|
|
|
396
397
|
])
|
|
397
398
|
|
|
398
399
|
preQuotaFilterCount = recommendations.length
|
|
400
|
+
preQuotaRecommendations = [...recommendations]
|
|
399
401
|
recommendations = applyAvailabilityRanking(
|
|
400
402
|
recommendations,
|
|
401
403
|
quotas.status === 'fulfilled' ? quotas.value : null,
|
|
@@ -404,6 +406,10 @@ async function handleGetInstanceRecommendation(params) {
|
|
|
404
406
|
)
|
|
405
407
|
if (recommendations.length === 0 && preQuotaFilterCount > 0) {
|
|
406
408
|
allFilteredByQuota = true
|
|
409
|
+
// Restore pre-filter recommendations so user can see compatible instances
|
|
410
|
+
// and request quota increases for the ones they want
|
|
411
|
+
recommendations = preQuotaRecommendations
|
|
412
|
+
log(`All ${preQuotaFilterCount} instances filtered by zero-quota — restoring unfiltered list`)
|
|
407
413
|
}
|
|
408
414
|
} catch (err) {
|
|
409
415
|
// Graceful degradation: if credentials are missing or any unexpected
|
package/src/app.js
CHANGED
|
@@ -302,6 +302,22 @@ export async function writeProject(templateDir, destDir, answers, registryConfig
|
|
|
302
302
|
ignorePatterns.push('**/hyperpod/**');
|
|
303
303
|
}
|
|
304
304
|
|
|
305
|
+
// HyperPod is kubectl-based — no shared bash helpers or IC configs
|
|
306
|
+
if (answers.deploymentTarget === 'hyperpod-eks') {
|
|
307
|
+
ignorePatterns.push('**/do/lib/**');
|
|
308
|
+
ignorePatterns.push('**/do/ic/**');
|
|
309
|
+
ignorePatterns.push('**/do/add-ic');
|
|
310
|
+
ignorePatterns.push('**/do/status');
|
|
311
|
+
ignorePatterns.push('**/do/optimize');
|
|
312
|
+
}
|
|
313
|
+
|
|
314
|
+
// Async and batch don't use inference components (IC is real-time only)
|
|
315
|
+
if (answers.deploymentTarget === 'async-inference' || answers.deploymentTarget === 'batch-transform') {
|
|
316
|
+
ignorePatterns.push('**/do/ic/**');
|
|
317
|
+
ignorePatterns.push('**/do/add-ic');
|
|
318
|
+
ignorePatterns.push('**/do/status');
|
|
319
|
+
}
|
|
320
|
+
|
|
305
321
|
// Resolve architecture
|
|
306
322
|
const resolver = new DeploymentConfigResolver();
|
|
307
323
|
let architecture = answers.architecture;
|
|
@@ -325,6 +341,13 @@ export async function writeProject(templateDir, destDir, answers, registryConfig
|
|
|
325
341
|
// Exclude do/benchmark and do/optimize when benchmarking is not selected
|
|
326
342
|
if (!answers.includeBenchmark) {
|
|
327
343
|
ignorePatterns.push('**/do/benchmark');
|
|
344
|
+
ignorePatterns.push('**/do/optimize');
|
|
345
|
+
}
|
|
346
|
+
|
|
347
|
+
// Exclude do/adapter and do/adapters/ when LoRA is not enabled
|
|
348
|
+
if (!answers.enableLora) {
|
|
349
|
+
ignorePatterns.push('**/do/adapter');
|
|
350
|
+
ignorePatterns.push('**/do/adapters/**');
|
|
328
351
|
}
|
|
329
352
|
|
|
330
353
|
// Exclude do/test when hosted-model-endpoint is not selected
|
|
@@ -567,7 +590,11 @@ async function _ensureTemplateVariables(answers, registryConfigManager = null) {
|
|
|
567
590
|
baseImage: null,
|
|
568
591
|
modelSource: 'huggingface',
|
|
569
592
|
artifactUri: '',
|
|
570
|
-
modelLoadStrategy: 'runtime'
|
|
593
|
+
modelLoadStrategy: 'runtime',
|
|
594
|
+
existingEndpointName: null,
|
|
595
|
+
enableLora: false,
|
|
596
|
+
maxLoras: 30,
|
|
597
|
+
maxLoraRank: 64
|
|
571
598
|
};
|
|
572
599
|
|
|
573
600
|
Object.entries(defaults).forEach(([key, value]) => {
|
|
@@ -1052,7 +1079,11 @@ function _setExecutablePermissions(destDir) {
|
|
|
1052
1079
|
'do/register',
|
|
1053
1080
|
'do/ci',
|
|
1054
1081
|
'do/manifest',
|
|
1055
|
-
'do/benchmark'
|
|
1082
|
+
'do/benchmark',
|
|
1083
|
+
'do/optimize',
|
|
1084
|
+
'do/status',
|
|
1085
|
+
'do/add-ic',
|
|
1086
|
+
'do/adapter'
|
|
1056
1087
|
];
|
|
1057
1088
|
|
|
1058
1089
|
shellScripts.forEach(script => {
|
|
@@ -1056,6 +1056,39 @@ export default class ConfigManager {
|
|
|
1056
1056
|
required: false,
|
|
1057
1057
|
default: null,
|
|
1058
1058
|
valueSpace: 'bounded'
|
|
1059
|
+
},
|
|
1060
|
+
enableLora: {
|
|
1061
|
+
cliOption: 'enable-lora',
|
|
1062
|
+
envVar: null,
|
|
1063
|
+
configFile: true,
|
|
1064
|
+
packageJson: false,
|
|
1065
|
+
mcp: false,
|
|
1066
|
+
promptable: true,
|
|
1067
|
+
required: false,
|
|
1068
|
+
default: false,
|
|
1069
|
+
valueSpace: 'bounded'
|
|
1070
|
+
},
|
|
1071
|
+
maxLoras: {
|
|
1072
|
+
cliOption: 'max-loras',
|
|
1073
|
+
envVar: null,
|
|
1074
|
+
configFile: true,
|
|
1075
|
+
packageJson: false,
|
|
1076
|
+
mcp: false,
|
|
1077
|
+
promptable: true,
|
|
1078
|
+
required: false,
|
|
1079
|
+
default: 30,
|
|
1080
|
+
valueSpace: 'bounded'
|
|
1081
|
+
},
|
|
1082
|
+
maxLoraRank: {
|
|
1083
|
+
cliOption: 'max-lora-rank',
|
|
1084
|
+
envVar: null,
|
|
1085
|
+
configFile: true,
|
|
1086
|
+
packageJson: false,
|
|
1087
|
+
mcp: false,
|
|
1088
|
+
promptable: true,
|
|
1089
|
+
required: false,
|
|
1090
|
+
default: 64,
|
|
1091
|
+
valueSpace: 'bounded'
|
|
1059
1092
|
}
|
|
1060
1093
|
};
|
|
1061
1094
|
}
|
|
@@ -1088,7 +1121,7 @@ export default class ConfigManager {
|
|
|
1088
1121
|
*/
|
|
1089
1122
|
_parseValue(parameter, value) {
|
|
1090
1123
|
// Handle boolean parameters
|
|
1091
|
-
if (parameter === 'includeSampleModel' || parameter === 'includeTesting' || parameter === 'skipPrompts' || parameter === 'includeBenchmark' || parameter === 'benchmarkStreaming') {
|
|
1124
|
+
if (parameter === 'includeSampleModel' || parameter === 'includeTesting' || parameter === 'skipPrompts' || parameter === 'includeBenchmark' || parameter === 'benchmarkStreaming' || parameter === 'enableLora') {
|
|
1092
1125
|
return value === true || value === 'true';
|
|
1093
1126
|
}
|
|
1094
1127
|
|
|
@@ -1924,6 +1957,12 @@ export default class ConfigManager {
|
|
|
1924
1957
|
if (param === 'instanceType' && finalConfig.deploymentTarget === 'hyperpod-eks' && !finalConfig.instanceType) {
|
|
1925
1958
|
return; // Skip validation only if truly missing for backward compat
|
|
1926
1959
|
}
|
|
1960
|
+
|
|
1961
|
+
// Special case: instanceType is not required when attaching to an existing endpoint
|
|
1962
|
+
// The instance type is inherited from the existing endpoint configuration
|
|
1963
|
+
if (param === 'instanceType' && finalConfig.existingEndpointName) {
|
|
1964
|
+
return; // Skip validation — instance is inherited from existing endpoint
|
|
1965
|
+
}
|
|
1927
1966
|
|
|
1928
1967
|
if (isEmpty) {
|
|
1929
1968
|
if (config.promptable) {
|
|
@@ -57,6 +57,22 @@ export default {
|
|
|
57
57
|
},
|
|
58
58
|
buildTarget: {
|
|
59
59
|
type: ['string', 'null']
|
|
60
|
+
},
|
|
61
|
+
icList: {
|
|
62
|
+
type: 'array',
|
|
63
|
+
items: {
|
|
64
|
+
type: 'object',
|
|
65
|
+
required: ['name'],
|
|
66
|
+
properties: {
|
|
67
|
+
name: { type: 'string', minLength: 1 },
|
|
68
|
+
image: { type: 'string' },
|
|
69
|
+
gpuCount: { type: 'integer', minimum: 0 },
|
|
70
|
+
copyCount: { type: 'integer', minimum: 1 },
|
|
71
|
+
isAdapter: { type: 'boolean' },
|
|
72
|
+
baseIcName: { type: 'string' },
|
|
73
|
+
artifactUrl: { type: 'string' }
|
|
74
|
+
}
|
|
75
|
+
}
|
|
60
76
|
}
|
|
61
77
|
}
|
|
62
78
|
},
|
package/src/lib/prompt-runner.js
CHANGED
|
@@ -18,8 +18,10 @@ import {
|
|
|
18
18
|
modelLoadStrategyPrompts,
|
|
19
19
|
modelProfilePrompts,
|
|
20
20
|
modulePrompts,
|
|
21
|
+
loraPrompts,
|
|
21
22
|
benchmarkPrompts,
|
|
22
23
|
infraRegionAndTargetPrompts,
|
|
24
|
+
infraExistingEndpointPrompts,
|
|
23
25
|
infraInstancePrompts,
|
|
24
26
|
infraAsyncPrompts,
|
|
25
27
|
infraBatchTransformPrompts,
|
|
@@ -29,7 +31,9 @@ import {
|
|
|
29
31
|
destinationPrompts,
|
|
30
32
|
baseImageSearchPrompts,
|
|
31
33
|
baseImagePrompts,
|
|
32
|
-
formatImageChoices
|
|
34
|
+
formatImageChoices,
|
|
35
|
+
filterByCudaGeneration,
|
|
36
|
+
instanceCatalogRaw
|
|
33
37
|
} from './prompts.js';
|
|
34
38
|
|
|
35
39
|
import fs from 'fs';
|
|
@@ -187,12 +191,40 @@ export default class PromptRunner {
|
|
|
187
191
|
// 3a. Region query
|
|
188
192
|
await this._queryMcpForRegion(frameworkAnswers, explicitConfig);
|
|
189
193
|
|
|
194
|
+
// 3a2. Existing endpoint prompt (only for realtime-inference)
|
|
195
|
+
// Requirements: 3.3, 4.3, 4.4 — endpoint-picker MCP query
|
|
196
|
+
let existingEndpointAnswers = {};
|
|
197
|
+
if (regionAndTargetAnswers.deploymentTarget === 'realtime-inference') {
|
|
198
|
+
// Query endpoint-picker MCP server for available endpoints
|
|
199
|
+
const resolvedRegion = regionAndTargetAnswers.customAwsRegion || regionAndTargetAnswers.awsRegion;
|
|
200
|
+
await this._queryMcpForEndpoints({ ...regionAndTargetAnswers, awsRegion: resolvedRegion }, explicitConfig);
|
|
201
|
+
|
|
202
|
+
const endpointPreviousAnswers = {
|
|
203
|
+
...regionAndTargetAnswers,
|
|
204
|
+
...(this._mcpEndpointChoices ? { _mcpEndpointChoices: this._mcpEndpointChoices } : {})
|
|
205
|
+
};
|
|
206
|
+
existingEndpointAnswers = await this._runPhase(
|
|
207
|
+
infraExistingEndpointPrompts,
|
|
208
|
+
endpointPreviousAnswers,
|
|
209
|
+
explicitConfig,
|
|
210
|
+
existingConfig
|
|
211
|
+
);
|
|
212
|
+
|
|
213
|
+
// Resolve custom endpoint name
|
|
214
|
+
if (existingEndpointAnswers.customExistingEndpointName) {
|
|
215
|
+
existingEndpointAnswers.existingEndpointName = existingEndpointAnswers.customExistingEndpointName;
|
|
216
|
+
delete existingEndpointAnswers.customExistingEndpointName;
|
|
217
|
+
}
|
|
218
|
+
}
|
|
219
|
+
|
|
190
220
|
// 3b. Instance type — query instance-sizer with full context (model + profile + CUDA)
|
|
191
221
|
let instanceAnswers = {};
|
|
192
|
-
|
|
222
|
+
// Skip instance prompts when attaching to an existing endpoint (instance is inherited)
|
|
223
|
+
const useExistingEndpoint = !!(existingEndpointAnswers.existingEndpointName);
|
|
224
|
+
const needsInstance = !useExistingEndpoint && (regionAndTargetAnswers.deploymentTarget === 'realtime-inference' ||
|
|
193
225
|
regionAndTargetAnswers.deploymentTarget === 'async-inference' ||
|
|
194
226
|
regionAndTargetAnswers.deploymentTarget === 'batch-transform' ||
|
|
195
|
-
regionAndTargetAnswers.deploymentTarget === 'hyperpod-eks';
|
|
227
|
+
regionAndTargetAnswers.deploymentTarget === 'hyperpod-eks');
|
|
196
228
|
|
|
197
229
|
if (needsInstance) {
|
|
198
230
|
// Determine architecture type for heuristic fallback
|
|
@@ -230,6 +262,74 @@ export default class PromptRunner {
|
|
|
230
262
|
if (!instanceAnswers.instanceType && !explicitConfig.instanceType && this._architectureHeuristicDefault) {
|
|
231
263
|
instanceAnswers.instanceType = this._architectureHeuristicDefault;
|
|
232
264
|
}
|
|
265
|
+
|
|
266
|
+
// Process multi-select instance type results (Requirements: 6.4)
|
|
267
|
+
// When user selects multiple instances via checkbox, derive instanceType and instancePools
|
|
268
|
+
if (instanceAnswers.instanceTypeSelections && instanceAnswers.instanceTypeSelections.length > 0) {
|
|
269
|
+
let selections = instanceAnswers.instanceTypeSelections.slice(0, 5); // Cap at 5 (API limit)
|
|
270
|
+
|
|
271
|
+
// Resolve custom input: replace __custom_input__ sentinel with parsed instances
|
|
272
|
+
if (selections.includes('__custom_input__') && instanceAnswers.customInstanceTypeSelections) {
|
|
273
|
+
const customInstances = instanceAnswers.customInstanceTypeSelections
|
|
274
|
+
.split(',').map(s => s.trim()).filter(s => s.length > 0);
|
|
275
|
+
// Remove the sentinel, keep the other selections, and append the custom entries after them
|
|
276
|
+
selections = selections.filter(s => s !== '__custom_input__');
|
|
277
|
+
selections = [...selections, ...customInstances];
|
|
278
|
+
delete instanceAnswers.customInstanceTypeSelections;
|
|
279
|
+
} else if (selections.includes('__custom_input__')) {
|
|
280
|
+
// Sentinel selected but no custom input provided — remove it
|
|
281
|
+
selections = selections.filter(s => s !== '__custom_input__');
|
|
282
|
+
}
|
|
283
|
+
|
|
284
|
+
// Cap at 5 after custom expansion
|
|
285
|
+
if (selections.length > 5) {
|
|
286
|
+
console.log(' ⚠️ Maximum 5 instance types allowed. Using first 5 selections.');
|
|
287
|
+
selections = selections.slice(0, 5);
|
|
288
|
+
}
|
|
289
|
+
|
|
290
|
+
// Filter to same CUDA generation and warn about incompatible removals
|
|
291
|
+
const { filtered, generation, removed } = filterByCudaGeneration(selections);
|
|
292
|
+
if (removed.length > 0) {
|
|
293
|
+
console.log(` ⚠️ Removed incompatible instances (different CUDA generation): ${removed.join(', ')}`);
|
|
294
|
+
console.log(` Keeping ${generation} generation: ${filtered.join(', ')}`);
|
|
295
|
+
}
|
|
296
|
+
|
|
297
|
+
const finalSelections = filtered.length > 0 ? filtered : selections;
|
|
298
|
+
|
|
299
|
+
if (finalSelections.length === 1) {
|
|
300
|
+
// Single selection → standard single instance type (no pools)
|
|
301
|
+
instanceAnswers.instanceType = finalSelections[0];
|
|
302
|
+
console.log(` ✓ Single instance selected: ${finalSelections[0]}`);
|
|
303
|
+
} else {
|
|
304
|
+
// Multiple selections → instance pools with priority = selection order
|
|
305
|
+
instanceAnswers.instanceType = finalSelections[0]; // backward compat: first is primary
|
|
306
|
+
instanceAnswers.instancePools = finalSelections.map((it, idx) => ({
|
|
307
|
+
InstanceType: it,
|
|
308
|
+
Priority: idx + 1
|
|
309
|
+
}));
|
|
310
|
+
|
|
311
|
+
// Auto-generate multi-spec IC config from catalog
|
|
312
|
+
instanceAnswers.instancePoolSpecs = finalSelections.map(it => {
|
|
313
|
+
const entry = instanceCatalogRaw[it];
|
|
314
|
+
return {
|
|
315
|
+
instanceType: it,
|
|
316
|
+
gpuCount: entry?.gpus || 1,
|
|
317
|
+
minMemoryMb: entry?.gpuMemoryGb ? entry.gpuMemoryGb * 1024 : 1024
|
|
318
|
+
};
|
|
319
|
+
});
|
|
320
|
+
|
|
321
|
+
console.log(` ✓ Instance pools configured (${finalSelections.length} types):`);
|
|
322
|
+
finalSelections.forEach((it, idx) => {
|
|
323
|
+
const entry = instanceCatalogRaw[it];
|
|
324
|
+
const gpus = entry?.gpus || '?';
|
|
325
|
+
const mem = entry?.gpuMemoryGb || '?';
|
|
326
|
+
console.log(` Priority ${idx + 1}: ${it} (${gpus} GPUs, ${mem}GB GPU memory)`);
|
|
327
|
+
});
|
|
328
|
+
}
|
|
329
|
+
|
|
330
|
+
// Clean up the raw selections from answers (not needed downstream)
|
|
331
|
+
delete instanceAnswers.instanceTypeSelections;
|
|
332
|
+
}
|
|
233
333
|
}
|
|
234
334
|
|
|
235
335
|
// In auto-prompt mode, use instance-sizer's top recommendation as the instance type
|
|
@@ -318,6 +418,7 @@ export default class PromptRunner {
|
|
|
318
418
|
// Combine all infrastructure answers
|
|
319
419
|
const infraAnswers = {
|
|
320
420
|
...regionAndTargetAnswers,
|
|
421
|
+
...existingEndpointAnswers,
|
|
321
422
|
...instanceAnswers,
|
|
322
423
|
...asyncAnswers,
|
|
323
424
|
...batchTransformAnswers,
|
|
@@ -414,6 +515,14 @@ export default class PromptRunner {
|
|
|
414
515
|
}
|
|
415
516
|
}
|
|
416
517
|
|
|
518
|
+
// LoRA adapter prompts — only for transformers with vllm/sglang/djl-lmi
|
|
519
|
+
// Requirements: 1.1, 1.2, 1.4
|
|
520
|
+
let loraAnswers = {};
|
|
521
|
+
const loraSubAnswers = await this._runPhase(loraPrompts, { ...frameworkAnswers, ...engineAnswers }, explicitConfig, existingConfig);
|
|
522
|
+
if (loraSubAnswers.enableLora !== undefined) {
|
|
523
|
+
loraAnswers = loraSubAnswers;
|
|
524
|
+
}
|
|
525
|
+
|
|
417
526
|
// Validate instance type against framework requirements (now that framework version is known)
|
|
418
527
|
const finalInstanceType = infraAnswers.customInstanceType || infraAnswers.instanceType;
|
|
419
528
|
if (finalInstanceType && frameworkVersionAnswers.frameworkVersion) {
|
|
@@ -456,6 +565,7 @@ export default class PromptRunner {
|
|
|
456
565
|
...ngcApiKeyAnswers,
|
|
457
566
|
...moduleAnswers,
|
|
458
567
|
...benchmarkAnswers,
|
|
568
|
+
...loraAnswers,
|
|
459
569
|
...projectAnswers,
|
|
460
570
|
...destinationAnswers,
|
|
461
571
|
buildTimestamp
|
|
@@ -1083,6 +1193,11 @@ export default class PromptRunner {
|
|
|
1083
1193
|
|
|
1084
1194
|
console.log(` ✓ ${choices.length} compatible instance(s) found${vramInfo}`);
|
|
1085
1195
|
|
|
1196
|
+
// Warn if all instances had zero quota but were restored for visibility
|
|
1197
|
+
if (parsed.metadata?.allFilteredByQuota) {
|
|
1198
|
+
console.log(' ⚠️ All instances have zero quota — request a quota increase for your preferred type');
|
|
1199
|
+
}
|
|
1200
|
+
|
|
1086
1201
|
// Check if availability data is present (recommendations have capacityType)
|
|
1087
1202
|
const hasAvailabilityData = recommendations.some(r => r.capacityType);
|
|
1088
1203
|
|
|
@@ -1187,6 +1302,62 @@ export default class PromptRunner {
|
|
|
1187
1302
|
}
|
|
1188
1303
|
}
|
|
1189
1304
|
|
|
1305
|
+
/**
|
|
1306
|
+
* Query the endpoint-picker MCP server for available InService real-time endpoints.
|
|
1307
|
+
* Populates this._mcpEndpointChoices for the existing endpoint selection prompt.
|
|
1308
|
+
* Graceful fallback: if MCP server fails (no credentials, timeout), skip and create new endpoint.
|
|
1309
|
+
* Requirements: 3.3, 4.3, 4.4
|
|
1310
|
+
* @private
|
|
1311
|
+
*/
|
|
1312
|
+
async _queryMcpForEndpoints(infraAnswers, explicitConfig) {
|
|
1313
|
+
const cm = this.configManager;
|
|
1314
|
+
if (!cm) return;
|
|
1315
|
+
|
|
1316
|
+
const mcpServers = cm.getMcpServerNames();
|
|
1317
|
+
if (!mcpServers.includes('endpoint-picker')) return;
|
|
1318
|
+
|
|
1319
|
+
// Skip if existing endpoint already provided via CLI/config
|
|
1320
|
+
if (explicitConfig.existingEndpointName) return;
|
|
1321
|
+
|
|
1322
|
+
console.log(' 🔍 Querying endpoint-picker...');
|
|
1323
|
+
|
|
1324
|
+
try {
|
|
1325
|
+
const result = await cm.queryMcpServer('endpoint-picker', {
|
|
1326
|
+
awsRegion: infraAnswers.awsRegion,
|
|
1327
|
+
deploymentTarget: 'realtime-inference'
|
|
1328
|
+
});
|
|
1329
|
+
|
|
1330
|
+
if (result && result.choices?.endpointName?.length > 0) {
|
|
1331
|
+
const endpointNames = result.choices.endpointName;
|
|
1332
|
+
const metadata = result.metadata || {};
|
|
1333
|
+
|
|
1334
|
+
// Build choices with metadata annotations
|
|
1335
|
+
this._mcpEndpointChoices = endpointNames.map(name => {
|
|
1336
|
+
const meta = metadata[name];
|
|
1337
|
+
if (meta) {
|
|
1338
|
+
const gpuInfo = meta.availableGpus === '?' ? 'GPUs: ?' : `${meta.availableGpus} GPUs free`;
|
|
1339
|
+
return {
|
|
1340
|
+
name: `${name} (${meta.instanceType}, ${gpuInfo}, ${meta.icCount} IC${meta.icCount !== 1 ? 's' : ''})`,
|
|
1341
|
+
value: name
|
|
1342
|
+
};
|
|
1343
|
+
}
|
|
1344
|
+
return { name, value: name };
|
|
1345
|
+
});
|
|
1346
|
+
|
|
1347
|
+
console.log(` ✓ ${endpointNames.length} endpoint(s) with available capacity`);
|
|
1348
|
+
} else {
|
|
1349
|
+
if (result?.message) {
|
|
1350
|
+
console.log(` ↳ ${result.message}`);
|
|
1351
|
+
} else {
|
|
1352
|
+
console.log(' ↳ No endpoints with available capacity found');
|
|
1353
|
+
}
|
|
1354
|
+
}
|
|
1355
|
+
} catch (err) {
|
|
1356
|
+
// Graceful fallback: if MCP server fails, skip and create new endpoint
|
|
1357
|
+
console.log(` ⚠️ endpoint-picker: ${err.message || 'query failed'} — will create new endpoint`);
|
|
1358
|
+
}
|
|
1359
|
+
}
|
|
1360
|
+
|
|
1190
1361
|
/**
|
|
1191
1362
|
* Query MCP base-image-picker server after deployment config is selected.
|
|
1192
1363
|
* Populates _mcpBaseImageChoices for the base image selection prompt.
|