@aws/ml-container-creator 0.4.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/cli.js +5 -2
- package/config/bootstrap-stack.json +40 -9
- package/infra/ci-harness/buildspec.yml +60 -0
- package/infra/ci-harness/package-lock.json +5 -1
- package/package.json +1 -1
- package/servers/README.md +41 -1
- package/servers/instance-sizer/index.js +10 -4
- package/servers/instance-sizer/lib/model-resolver.js +1 -1
- package/servers/lib/catalogs/model-sizes.json +135 -90
- package/servers/lib/catalogs/models.json +483 -411
- package/src/app.js +33 -2
- package/src/lib/bootstrap-command-handler.js +6 -0
- package/src/lib/cli-handler.js +1 -1
- package/src/lib/config-manager.js +41 -2
- package/src/lib/deployment-entry-schema.js +16 -0
- package/src/lib/mcp-client.js +3 -3
- package/src/lib/prompt-runner.js +179 -8
- package/src/lib/prompts.js +253 -7
- package/src/lib/registry-command-handler.js +12 -0
- package/templates/Dockerfile +12 -0
- package/templates/code/serving.properties +14 -0
- package/templates/do/adapter +1230 -0
- package/templates/do/adapters/.gitkeep +2 -0
- package/templates/do/add-ic +130 -0
- package/templates/do/benchmark +81 -9
- package/templates/do/clean +507 -17
- package/templates/do/config +28 -5
- package/templates/do/deploy +513 -367
- package/templates/do/ic/default.conf +32 -0
- package/templates/do/lib/endpoint-config.sh +216 -0
- package/templates/do/lib/inference-component.sh +167 -0
- package/templates/do/lib/secrets.sh +44 -0
- package/templates/do/lib/wait.sh +131 -0
- package/templates/do/logs +107 -27
- package/templates/do/optimize +528 -0
- package/templates/do/register +111 -1
- package/templates/do/status +337 -0
- package/templates/do/test +80 -28
package/bin/cli.js
CHANGED
|
@@ -98,6 +98,9 @@ program
|
|
|
98
98
|
.addOption(new Option('--include-sample', 'Include sample model code'))
|
|
99
99
|
.addOption(new Option('--include-testing', 'Include test suite'))
|
|
100
100
|
.addOption(new Option('--test-types <types>', 'Comma-separated test types'))
|
|
101
|
+
.addOption(new Option('--enable-lora', 'Enable LoRA adapter serving (transformers with vllm/sglang/djl-lmi only)'))
|
|
102
|
+
.addOption(new Option('--max-loras <n>', 'Maximum concurrent LoRA adapters in GPU memory (default: 30)'))
|
|
103
|
+
.addOption(new Option('--max-lora-rank <n>', 'Maximum LoRA rank (default: 64)'))
|
|
101
104
|
|
|
102
105
|
// --- MCP & Discovery ---
|
|
103
106
|
.addOption(new Option('--smart', 'Enable Bedrock-powered smart mode on MCP servers'))
|
|
@@ -190,7 +193,7 @@ program.configureHelp({
|
|
|
190
193
|
groups.env.push(opt);
|
|
191
194
|
} else if (['--hf-token', '--hf-token-arn', '--ngc-token', '--ngc-token-arn'].includes(long)) {
|
|
192
195
|
groups.auth.push(opt);
|
|
193
|
-
} else if (['--include-sample', '--include-testing', '--test-types'].includes(long)) {
|
|
196
|
+
} else if (['--include-sample', '--include-testing', '--test-types', '--enable-lora', '--max-loras', '--max-lora-rank'].includes(long)) {
|
|
194
197
|
groups.features.push(opt);
|
|
195
198
|
} else if (['--smart', '--discover'].includes(long)) {
|
|
196
199
|
groups.mcp.push(opt);
|
|
@@ -307,7 +310,6 @@ program
|
|
|
307
310
|
program
|
|
308
311
|
.command('registry')
|
|
309
312
|
.description('Registry operations (list, get, remove, replay, export, import, search) — experimental, may be reconciled with do/register')
|
|
310
|
-
.passThroughOptions()
|
|
311
313
|
.argument('<action>', 'Registry action (log, list, get, remove, replay, export, import, search)')
|
|
312
314
|
.argument('[args...]', 'Additional arguments')
|
|
313
315
|
.option('--backend <backend>', 'Filter by backend')
|
|
@@ -328,6 +330,7 @@ program
|
|
|
328
330
|
.option('--notes <text>', 'Deployment notes')
|
|
329
331
|
.option('--project', 'Use project-level registry')
|
|
330
332
|
.option('--parameters <json>', 'Parameters JSON string')
|
|
333
|
+
.option('--ic-list <json>', 'IC list JSON string')
|
|
331
334
|
.option('--generator-version <version>', 'Generator version')
|
|
332
335
|
// Options used by `registry list-architectures`
|
|
333
336
|
.option('--server <name>', 'Filter by server name (for list-architectures)')
|
|
@@ -7,7 +7,7 @@
|
|
|
7
7
|
"Type": "String",
|
|
8
8
|
"Default": "false",
|
|
9
9
|
"AllowedValues": ["true", "false"],
|
|
10
|
-
"Description": "Whether to create S3 buckets for async inference
|
|
10
|
+
"Description": "Whether to create S3 buckets for async inference, batch transform, adapters, and benchmarks"
|
|
11
11
|
},
|
|
12
12
|
"UseExistingRoleArn": {
|
|
13
13
|
"Type": "String",
|
|
@@ -134,8 +134,8 @@
|
|
|
134
134
|
"s3:ListBucket"
|
|
135
135
|
],
|
|
136
136
|
"Resource": [
|
|
137
|
-
"arn:aws:s3:::
|
|
138
|
-
"arn:aws:s3:::
|
|
137
|
+
"arn:aws:s3:::mlcc-*",
|
|
138
|
+
"arn:aws:s3:::mlcc-*/*"
|
|
139
139
|
]
|
|
140
140
|
},
|
|
141
141
|
{
|
|
@@ -209,7 +209,7 @@
|
|
|
209
209
|
"DeletionPolicy": "Retain",
|
|
210
210
|
"UpdateReplacePolicy": "Retain",
|
|
211
211
|
"Properties": {
|
|
212
|
-
"BucketName": { "Fn::Sub": "
|
|
212
|
+
"BucketName": { "Fn::Sub": "mlcc-async-${AWS::AccountId}-${AWS::Region}" },
|
|
213
213
|
"VersioningConfiguration": { "Status": "Enabled" },
|
|
214
214
|
"BucketEncryption": {
|
|
215
215
|
"ServerSideEncryptionConfiguration": [
|
|
@@ -218,7 +218,8 @@
|
|
|
218
218
|
},
|
|
219
219
|
"Tags": [
|
|
220
220
|
{ "Key": "mlcc:managed-by", "Value": "ml-container-creator" },
|
|
221
|
-
{ "Key": "mlcc:created-by", "Value": "bootstrap" }
|
|
221
|
+
{ "Key": "mlcc:created-by", "Value": "bootstrap" },
|
|
222
|
+
{ "Key": "mlcc:purpose", "Value": "async-inference-output" }
|
|
222
223
|
]
|
|
223
224
|
}
|
|
224
225
|
},
|
|
@@ -229,7 +230,7 @@
|
|
|
229
230
|
"DeletionPolicy": "Retain",
|
|
230
231
|
"UpdateReplacePolicy": "Retain",
|
|
231
232
|
"Properties": {
|
|
232
|
-
"BucketName": { "Fn::Sub": "
|
|
233
|
+
"BucketName": { "Fn::Sub": "mlcc-batch-${AWS::AccountId}-${AWS::Region}" },
|
|
233
234
|
"VersioningConfiguration": { "Status": "Enabled" },
|
|
234
235
|
"BucketEncryption": {
|
|
235
236
|
"ServerSideEncryptionConfiguration": [
|
|
@@ -238,17 +239,40 @@
|
|
|
238
239
|
},
|
|
239
240
|
"Tags": [
|
|
240
241
|
{ "Key": "mlcc:managed-by", "Value": "ml-container-creator" },
|
|
241
|
-
{ "Key": "mlcc:created-by", "Value": "bootstrap" }
|
|
242
|
+
{ "Key": "mlcc:created-by", "Value": "bootstrap" },
|
|
243
|
+
{ "Key": "mlcc:purpose", "Value": "batch-transform-io" }
|
|
244
|
+
]
|
|
245
|
+
}
|
|
246
|
+
},
|
|
247
|
+
|
|
248
|
+
"AdapterS3Bucket": {
|
|
249
|
+
"Type": "AWS::S3::Bucket",
|
|
250
|
+
"Condition": "ShouldCreateS3Buckets",
|
|
251
|
+
"DeletionPolicy": "Retain",
|
|
252
|
+
"UpdateReplacePolicy": "Retain",
|
|
253
|
+
"Properties": {
|
|
254
|
+
"BucketName": { "Fn::Sub": "mlcc-adapters-${AWS::AccountId}-${AWS::Region}" },
|
|
255
|
+
"VersioningConfiguration": { "Status": "Enabled" },
|
|
256
|
+
"BucketEncryption": {
|
|
257
|
+
"ServerSideEncryptionConfiguration": [
|
|
258
|
+
{ "ServerSideEncryptionByDefault": { "SSEAlgorithm": "AES256" } }
|
|
259
|
+
]
|
|
260
|
+
},
|
|
261
|
+
"Tags": [
|
|
262
|
+
{ "Key": "mlcc:managed-by", "Value": "ml-container-creator" },
|
|
263
|
+
{ "Key": "mlcc:created-by", "Value": "bootstrap" },
|
|
264
|
+
{ "Key": "mlcc:purpose", "Value": "lora-adapter-storage" }
|
|
242
265
|
]
|
|
243
266
|
}
|
|
244
267
|
},
|
|
245
268
|
|
|
246
269
|
"BenchmarkS3Bucket": {
|
|
247
270
|
"Type": "AWS::S3::Bucket",
|
|
271
|
+
"Condition": "ShouldCreateS3Buckets",
|
|
248
272
|
"DeletionPolicy": "Retain",
|
|
249
273
|
"UpdateReplacePolicy": "Retain",
|
|
250
274
|
"Properties": {
|
|
251
|
-
"BucketName": { "Fn::Sub": "
|
|
275
|
+
"BucketName": { "Fn::Sub": "mlcc-benchmark-${AWS::AccountId}-${AWS::Region}" },
|
|
252
276
|
"VersioningConfiguration": { "Status": "Enabled" },
|
|
253
277
|
"BucketEncryption": {
|
|
254
278
|
"ServerSideEncryptionConfiguration": [
|
|
@@ -257,7 +281,8 @@
|
|
|
257
281
|
},
|
|
258
282
|
"Tags": [
|
|
259
283
|
{ "Key": "mlcc:managed-by", "Value": "ml-container-creator" },
|
|
260
|
-
{ "Key": "mlcc:created-by", "Value": "bootstrap" }
|
|
284
|
+
{ "Key": "mlcc:created-by", "Value": "bootstrap" },
|
|
285
|
+
{ "Key": "mlcc:purpose", "Value": "benchmark-results" }
|
|
261
286
|
]
|
|
262
287
|
}
|
|
263
288
|
}
|
|
@@ -292,7 +317,13 @@
|
|
|
292
317
|
"Description": "S3 bucket for batch transform I/O",
|
|
293
318
|
"Value": { "Ref": "BatchS3Bucket" }
|
|
294
319
|
},
|
|
320
|
+
"AdapterS3BucketName": {
|
|
321
|
+
"Condition": "ShouldCreateS3Buckets",
|
|
322
|
+
"Description": "S3 bucket for LoRA adapter storage",
|
|
323
|
+
"Value": { "Ref": "AdapterS3Bucket" }
|
|
324
|
+
},
|
|
295
325
|
"BenchmarkS3BucketName": {
|
|
326
|
+
"Condition": "ShouldCreateS3Buckets",
|
|
296
327
|
"Description": "S3 bucket for benchmark results output",
|
|
297
328
|
"Value": { "Ref": "BenchmarkS3Bucket" }
|
|
298
329
|
},
|
|
@@ -40,6 +40,10 @@ phases:
|
|
|
40
40
|
- REGISTER_DURATION=0
|
|
41
41
|
- REGISTER_LOG_POINTER=""
|
|
42
42
|
- REGISTER_ERROR_SUMMARY=""
|
|
43
|
+
- ADAPTER_TEST_STATUS="skip"
|
|
44
|
+
- ADAPTER_TEST_DURATION=0
|
|
45
|
+
- ADAPTER_TEST_LOG_POINTER=""
|
|
46
|
+
- ADAPTER_TEST_ERROR_SUMMARY=""
|
|
43
47
|
- TEARDOWN_STATUS="skip"
|
|
44
48
|
- TEARDOWN_DURATION=0
|
|
45
49
|
- TEARDOWN_LOG_POINTER=""
|
|
@@ -182,6 +186,54 @@ phases:
|
|
|
182
186
|
fi
|
|
183
187
|
- rm -f "$STAGE_STDERR_FILE"
|
|
184
188
|
|
|
189
|
+
# --- Stage: Adapter_Test (only if do/adapters/ has .conf files) ---
|
|
190
|
+
- echo "=== Stage: Adapter_Test ==="
|
|
191
|
+
- STAGE_START=$(date +%s)
|
|
192
|
+
- ADAPTER_TEST_LOG_POINTER="$LOG_POINTER_PREFIX"
|
|
193
|
+
- STAGE_STDERR_FILE=$(mktemp)
|
|
194
|
+
- |
|
|
195
|
+
if [ -n "$FIRST_FAILURE" ]; then
|
|
196
|
+
echo "Skipping Adapter_Test stage due to prior failure in $FIRST_FAILURE"
|
|
197
|
+
ADAPTER_TEST_STATUS="skip"
|
|
198
|
+
ADAPTER_TEST_DURATION=0
|
|
199
|
+
else
|
|
200
|
+
cd /tmp/ci-project
|
|
201
|
+
ADAPTER_CONFS=$(find do/adapters -name '*.conf' 2>/dev/null | grep -v '.gitkeep' || true)
|
|
202
|
+
if [ -z "$ADAPTER_CONFS" ]; then
|
|
203
|
+
echo "No adapter configs found in do/adapters/ — skipping"
|
|
204
|
+
ADAPTER_TEST_STATUS="skip"
|
|
205
|
+
ADAPTER_TEST_DURATION=0
|
|
206
|
+
else
|
|
207
|
+
(
|
|
208
|
+
set -e
|
|
209
|
+
cd /tmp/ci-project
|
|
210
|
+
for conf in do/adapters/*.conf; do
|
|
211
|
+
[ -f "$conf" ] || continue
|
|
212
|
+
[[ "$(basename "$conf")" == ".gitkeep" ]] && continue
|
|
213
|
+
ADAPTER_NAME=$(basename "$conf" .conf)
|
|
214
|
+
echo "Testing adapter: ${ADAPTER_NAME}"
|
|
215
|
+
# Source to get weights URI
|
|
216
|
+
source "$conf"
|
|
217
|
+
./do/adapter add "${ADAPTER_NAME}" --weights "${ADAPTER_WEIGHTS_URI}"
|
|
218
|
+
./do/test --ic "${ADAPTER_NAME}"
|
|
219
|
+
./do/adapter remove "${ADAPTER_NAME}"
|
|
220
|
+
done
|
|
221
|
+
) 2>"$STAGE_STDERR_FILE"; STAGE_EXIT=$?
|
|
222
|
+
STAGE_END=$(date +%s)
|
|
223
|
+
ADAPTER_TEST_DURATION=$((STAGE_END - STAGE_START))
|
|
224
|
+
if [ "$STAGE_EXIT" -eq 0 ]; then
|
|
225
|
+
ADAPTER_TEST_STATUS="pass"
|
|
226
|
+
echo "Adapter_Test stage passed in ${ADAPTER_TEST_DURATION}s"
|
|
227
|
+
else
|
|
228
|
+
ADAPTER_TEST_STATUS="fail"
|
|
229
|
+
ADAPTER_TEST_ERROR_SUMMARY=$(tail -c 500 "$STAGE_STDERR_FILE" | tr -d '\000' | tr '"' "'" | tr '\n' ' ')
|
|
230
|
+
FIRST_FAILURE="adapter_test"
|
|
231
|
+
echo "Adapter_Test stage FAILED (exit code $STAGE_EXIT) in ${ADAPTER_TEST_DURATION}s"
|
|
232
|
+
fi
|
|
233
|
+
fi
|
|
234
|
+
fi
|
|
235
|
+
- rm -f "$STAGE_STDERR_FILE"
|
|
236
|
+
|
|
185
237
|
# --- Stage: Register (placeholder) ---
|
|
186
238
|
- echo "=== Stage: Register ==="
|
|
187
239
|
- STAGE_START=$(date +%s)
|
|
@@ -260,6 +312,7 @@ phases:
|
|
|
260
312
|
validate) FINAL_ERROR_MESSAGE="$VALIDATE_ERROR_SUMMARY" ;;
|
|
261
313
|
build) FINAL_ERROR_MESSAGE="$BUILD_ERROR_SUMMARY" ;;
|
|
262
314
|
deploy_test) FINAL_ERROR_MESSAGE="$DEPLOY_TEST_ERROR_SUMMARY" ;;
|
|
315
|
+
adapter_test) FINAL_ERROR_MESSAGE="$ADAPTER_TEST_ERROR_SUMMARY" ;;
|
|
263
316
|
register) FINAL_ERROR_MESSAGE="$REGISTER_ERROR_SUMMARY" ;;
|
|
264
317
|
*) FINAL_ERROR_MESSAGE="Unknown failure stage" ;;
|
|
265
318
|
esac
|
|
@@ -272,6 +325,7 @@ phases:
|
|
|
272
325
|
ESCAPED_VALIDATE_ERROR=$(printf '%s' "$VALIDATE_ERROR_SUMMARY" | sed 's/\\/\\\\/g; s/"/\\"/g')
|
|
273
326
|
ESCAPED_BUILD_ERROR=$(printf '%s' "$BUILD_ERROR_SUMMARY" | sed 's/\\/\\\\/g; s/"/\\"/g')
|
|
274
327
|
ESCAPED_DEPLOY_TEST_ERROR=$(printf '%s' "$DEPLOY_TEST_ERROR_SUMMARY" | sed 's/\\/\\\\/g; s/"/\\"/g')
|
|
328
|
+
ESCAPED_ADAPTER_TEST_ERROR=$(printf '%s' "$ADAPTER_TEST_ERROR_SUMMARY" | sed 's/\\/\\\\/g; s/"/\\"/g')
|
|
275
329
|
ESCAPED_REGISTER_ERROR=$(printf '%s' "$REGISTER_ERROR_SUMMARY" | sed 's/\\/\\\\/g; s/"/\\"/g')
|
|
276
330
|
ESCAPED_TEARDOWN_ERROR=$(printf '%s' "$TEARDOWN_ERROR_SUMMARY" | sed 's/\\/\\\\/g; s/"/\\"/g')
|
|
277
331
|
ESCAPED_FINAL_ERROR=$(printf '%s' "$FINAL_ERROR_MESSAGE" | sed 's/\\/\\\\/g; s/"/\\"/g')
|
|
@@ -314,6 +368,12 @@ phases:
|
|
|
314
368
|
\"logPointer\": {\"S\": \"$DEPLOY_TEST_LOG_POINTER\"},
|
|
315
369
|
\"errorSummary\": {\"S\": \"$ESCAPED_DEPLOY_TEST_ERROR\"}
|
|
316
370
|
}},
|
|
371
|
+
\"adapter_test\": {\"M\": {
|
|
372
|
+
\"status\": {\"S\": \"$ADAPTER_TEST_STATUS\"},
|
|
373
|
+
\"durationSeconds\": {\"N\": \"$ADAPTER_TEST_DURATION\"},
|
|
374
|
+
\"logPointer\": {\"S\": \"$ADAPTER_TEST_LOG_POINTER\"},
|
|
375
|
+
\"errorSummary\": {\"S\": \"$ESCAPED_ADAPTER_TEST_ERROR\"}
|
|
376
|
+
}},
|
|
317
377
|
\"register\": {\"M\": {
|
|
318
378
|
\"status\": {\"S\": \"$REGISTER_STATUS\"},
|
|
319
379
|
\"durationSeconds\": {\"N\": \"$REGISTER_DURATION\"},
|
|
@@ -48,6 +48,7 @@
|
|
|
48
48
|
"semver"
|
|
49
49
|
],
|
|
50
50
|
"license": "Apache-2.0",
|
|
51
|
+
"peer": true,
|
|
51
52
|
"dependencies": {
|
|
52
53
|
"jsonschema": "~1.4.1",
|
|
53
54
|
"semver": "^7.7.4"
|
|
@@ -2150,6 +2151,7 @@
|
|
|
2150
2151
|
"integrity": "sha512-wGdMcf+vPYM6jikpS/qhg6WiqSV/OhG+jeeHT/KlVqxYfD40iYJf9/AE1uQxVWFvU7MipKRkRv8NSHiCGgPr8Q==",
|
|
2151
2152
|
"dev": true,
|
|
2152
2153
|
"license": "MIT",
|
|
2154
|
+
"peer": true,
|
|
2153
2155
|
"dependencies": {
|
|
2154
2156
|
"undici-types": "~6.21.0"
|
|
2155
2157
|
}
|
|
@@ -2789,7 +2791,8 @@
|
|
|
2789
2791
|
"version": "10.6.0",
|
|
2790
2792
|
"resolved": "https://registry.npmjs.org/constructs/-/constructs-10.6.0.tgz",
|
|
2791
2793
|
"integrity": "sha512-TxHOnBO5zMo/G76ykzGF/wMpEHu257TbWiIxP9K0Yv/+t70UzgBQiTqjkAsWOPC6jW91DzJI0+ehQV6xDRNBuQ==",
|
|
2792
|
-
"license": "Apache-2.0"
|
|
2794
|
+
"license": "Apache-2.0",
|
|
2795
|
+
"peer": true
|
|
2793
2796
|
},
|
|
2794
2797
|
"node_modules/create-require": {
|
|
2795
2798
|
"version": "1.1.1",
|
|
@@ -3694,6 +3697,7 @@
|
|
|
3694
3697
|
"integrity": "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==",
|
|
3695
3698
|
"dev": true,
|
|
3696
3699
|
"license": "Apache-2.0",
|
|
3700
|
+
"peer": true,
|
|
3697
3701
|
"bin": {
|
|
3698
3702
|
"tsc": "bin/tsc",
|
|
3699
3703
|
"tsserver": "bin/tsserver"
|
package/package.json
CHANGED
package/servers/README.md
CHANGED
|
@@ -15,7 +15,12 @@ servers/
|
|
|
15
15
|
│ ├── test.js # Standalone tests (node test.js)
|
|
16
16
|
│ ├── package.json
|
|
17
17
|
│ └── LICENSE
|
|
18
|
-
|
|
18
|
+
├── region-picker/ # AWS region suggestion server
|
|
19
|
+
│ ├── index.js # MCP server entry point
|
|
20
|
+
│ ├── test.js # Standalone tests (node test.js)
|
|
21
|
+
│ ├── package.json
|
|
22
|
+
│ └── LICENSE
|
|
23
|
+
└── endpoint-picker/ # SageMaker endpoint discovery server
|
|
19
24
|
├── index.js # MCP server entry point
|
|
20
25
|
├── test.js # Standalone tests (node test.js)
|
|
21
26
|
├── package.json
|
|
@@ -74,6 +79,39 @@ Suggests AWS regions for SageMaker deployments based on a search term. Filters t
|
|
|
74
79
|
}
|
|
75
80
|
```
|
|
76
81
|
|
|
82
|
+
### endpoint-picker
|
|
83
|
+
|
|
84
|
+
Discovers InService SageMaker real-time endpoints with available GPU capacity for attaching new inference components. Uses `ListEndpoints`, `DescribeEndpoint`, and `ListInferenceComponents` to calculate available capacity.
|
|
85
|
+
|
|
86
|
+
**Discover mode:** Queries the SageMaker API using a 3-strategy credential fallback (explicit profile → default chain → detect profiles). No static mode — always requires AWS credentials.
|
|
87
|
+
|
|
88
|
+
**Tool:** `get_inference_endpoints`
|
|
89
|
+
|
|
90
|
+
| Input Field | Type | Description |
|
|
91
|
+
|-------------|------|-------------|
|
|
92
|
+
| `parameters` | `string[]` | Must include `"endpointName"` to get results |
|
|
93
|
+
| `limit` | `number` | Max endpoints to return (default: 10) |
|
|
94
|
+
| `context` | `object` | `awsRegion`, `awsProfile`, `deploymentTarget` (must be `realtime-inference`) |
|
|
95
|
+
|
|
96
|
+
**Example response:**
|
|
97
|
+
|
|
98
|
+
```json
|
|
99
|
+
{
|
|
100
|
+
"values": { "endpointName": "my-endpoint-1234567890" },
|
|
101
|
+
"choices": { "endpointName": ["my-endpoint-1234567890", "prod-llm-endpoint"] },
|
|
102
|
+
"metadata": {
|
|
103
|
+
"my-endpoint-1234567890": {
|
|
104
|
+
"variantName": "AllTraffic",
|
|
105
|
+
"instanceType": "ml.g6e.48xlarge",
|
|
106
|
+
"instanceCount": 1,
|
|
107
|
+
"icCount": 2,
|
|
108
|
+
"availableGpus": 4,
|
|
109
|
+
"hasInstancePools": false
|
|
110
|
+
}
|
|
111
|
+
}
|
|
112
|
+
}
|
|
113
|
+
```
|
|
114
|
+
|
|
77
115
|
## Usage
|
|
78
116
|
|
|
79
117
|
### Adding a Bundled Server
|
|
@@ -297,6 +335,7 @@ The Bedrock API didn't respond within 10 seconds. This usually means network con
|
|
|
297
335
|
```bash
|
|
298
336
|
node servers/region-picker/test.js
|
|
299
337
|
node servers/instance-recommender/test.js
|
|
338
|
+
node servers/endpoint-picker/test.js
|
|
300
339
|
```
|
|
301
340
|
|
|
302
341
|
### Smart Mode Not Activating
|
|
@@ -313,6 +352,7 @@ Each server has standalone tests that run without AWS credentials or network acc
|
|
|
313
352
|
# Run individual server tests
|
|
314
353
|
node servers/region-picker/test.js
|
|
315
354
|
node servers/instance-recommender/test.js
|
|
355
|
+
node servers/endpoint-picker/test.js
|
|
316
356
|
|
|
317
357
|
# Run all server tests from the project root
|
|
318
358
|
npm run test:servers
|
|
@@ -51,7 +51,7 @@ try {
|
|
|
51
51
|
|
|
52
52
|
// ── Mode configuration ───────────────────────────────────────────────────────
|
|
53
53
|
|
|
54
|
-
const DISCOVER_MODE = process.
|
|
54
|
+
const DISCOVER_MODE = process.env.DISCOVER_MODE !== 'false' && !process.argv.includes('--no-discover')
|
|
55
55
|
const SMART_MODE = process.env.BEDROCK_SMART === 'true'
|
|
56
56
|
const BEDROCK_MODEL = process.env.BEDROCK_MODEL || 'global.anthropic.claude-sonnet-4-20250514-v1:0'
|
|
57
57
|
const BEDROCK_REGION = process.env.BEDROCK_REGION || process.env.AWS_REGION || 'us-east-1'
|
|
@@ -383,6 +383,7 @@ async function handleGetInstanceRecommendation(params) {
|
|
|
383
383
|
// Step 3a: Quota & availability filtering (discover mode only)
|
|
384
384
|
let preQuotaFilterCount = 0
|
|
385
385
|
let allFilteredByQuota = false
|
|
386
|
+
let preQuotaRecommendations = []
|
|
386
387
|
if (DISCOVER_MODE && recommendations.length > 0) {
|
|
387
388
|
try {
|
|
388
389
|
const region = process.env.AWS_REGION || process.env.AWS_DEFAULT_REGION || BEDROCK_REGION
|
|
@@ -396,6 +397,7 @@ async function handleGetInstanceRecommendation(params) {
|
|
|
396
397
|
])
|
|
397
398
|
|
|
398
399
|
preQuotaFilterCount = recommendations.length
|
|
400
|
+
preQuotaRecommendations = [...recommendations]
|
|
399
401
|
recommendations = applyAvailabilityRanking(
|
|
400
402
|
recommendations,
|
|
401
403
|
quotas.status === 'fulfilled' ? quotas.value : null,
|
|
@@ -404,6 +406,10 @@ async function handleGetInstanceRecommendation(params) {
|
|
|
404
406
|
)
|
|
405
407
|
if (recommendations.length === 0 && preQuotaFilterCount > 0) {
|
|
406
408
|
allFilteredByQuota = true
|
|
409
|
+
// Restore pre-filter recommendations so user can see compatible instances
|
|
410
|
+
// and request quota increases for the ones they want
|
|
411
|
+
recommendations = preQuotaRecommendations
|
|
412
|
+
log(`All ${preQuotaFilterCount} instances filtered by zero-quota — restoring unfiltered list`)
|
|
407
413
|
}
|
|
408
414
|
} catch (err) {
|
|
409
415
|
// Graceful degradation: if credentials are missing or any unexpected
|
|
@@ -587,10 +593,10 @@ const isMain = process.argv[1] && resolve(process.argv[1]) === __filename
|
|
|
587
593
|
if (isMain) {
|
|
588
594
|
if (SMART_MODE) {
|
|
589
595
|
log(`Smart mode enabled (model: ${BEDROCK_MODEL}, region: ${BEDROCK_REGION})`)
|
|
590
|
-
} else if (DISCOVER_MODE) {
|
|
591
|
-
log('
|
|
596
|
+
} else if (!DISCOVER_MODE) {
|
|
597
|
+
log('Static mode (catalog-only, no network calls) — use --no-discover to force this')
|
|
592
598
|
} else {
|
|
593
|
-
log('
|
|
599
|
+
log('Discover mode (HuggingFace API + quota lookups active)')
|
|
594
600
|
}
|
|
595
601
|
|
|
596
602
|
const transport = new StdioServerTransport()
|
|
@@ -207,7 +207,7 @@ const isHuggingFacePattern = (modelName) => {
|
|
|
207
207
|
* @returns {Promise<{ parameterCount: number, dtype: string, architecture: string, maxPositionEmbeddings: number, source: string } | null>}
|
|
208
208
|
*/
|
|
209
209
|
const resolveModelMetadata = async (modelName, options = {}) => {
|
|
210
|
-
const { discover =
|
|
210
|
+
const { discover = true, catalogPath } = options
|
|
211
211
|
|
|
212
212
|
// Tier 1: Catalog lookup
|
|
213
213
|
const catalog = await loadCatalog(catalogPath)
|