@aws/ml-container-creator 0.2.6 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/cli.js +38 -2
- package/config/bootstrap-stack.json +14 -0
- package/infra/ci-harness/package-lock.json +22 -9
- package/package.json +1 -1
- package/servers/instance-sizer/index.js +9 -6
- package/servers/instance-sizer/lib/instance-ranker.js +35 -10
- package/servers/instance-sizer/lib/model-resolver.js +10 -6
- package/servers/lib/catalogs/model-servers.json +283 -5
- package/servers/lib/catalogs/models.json +30 -0
- package/servers/lib/schemas/image-catalog.schema.json +6 -0
- package/servers/model-picker/index.js +2 -1
- package/src/app.js +19 -0
- package/src/lib/architecture-sync.js +171 -0
- package/src/lib/arn-detection.js +22 -0
- package/src/lib/bootstrap-command-handler.js +82 -0
- package/src/lib/config-manager.js +43 -0
- package/src/lib/cross-cutting-checker.js +119 -0
- package/src/lib/deployment-entry-schema.js +1 -2
- package/src/lib/prompt-runner.js +427 -20
- package/src/lib/prompts.js +1 -1
- package/src/lib/registry-command-handler.js +236 -0
- package/src/lib/secret-classification.js +56 -0
- package/src/lib/secrets-command-handler.js +550 -0
- package/src/lib/validate-runner.js +49 -0
- package/src/lib/validation-report.js +8 -1
- package/src/prompt-adapter.js +3 -2
- package/templates/do/build +22 -0
- package/templates/do/config +15 -3
- package/templates/do/deploy +60 -5
- package/templates/do/logs +18 -3
- package/templates/do/run +10 -0
|
@@ -64,7 +64,157 @@
|
|
|
64
64
|
"notes": "Requires instance with 4+ GPUs. Set TENSOR_PARALLEL_SIZE to match GPU count"
|
|
65
65
|
}
|
|
66
66
|
},
|
|
67
|
-
"notes": "vLLM 0.4.0 adds prefix caching and improved performance. Requires CUDA 12.0+"
|
|
67
|
+
"notes": "vLLM 0.4.0 adds prefix caching and improved performance. Requires CUDA 12.0+",
|
|
68
|
+
"supportedModelTypes": [
|
|
69
|
+
"arcee",
|
|
70
|
+
"arctic",
|
|
71
|
+
"aria",
|
|
72
|
+
"aya_vision",
|
|
73
|
+
"baichuan",
|
|
74
|
+
"bailing_moe",
|
|
75
|
+
"bamba",
|
|
76
|
+
"bart",
|
|
77
|
+
"bert",
|
|
78
|
+
"bert_with_rope",
|
|
79
|
+
"blip2",
|
|
80
|
+
"bloom",
|
|
81
|
+
"chameleon",
|
|
82
|
+
"chatglm",
|
|
83
|
+
"cohere2_vision",
|
|
84
|
+
"commandr",
|
|
85
|
+
"dbrx",
|
|
86
|
+
"deepseek",
|
|
87
|
+
"deepseek_mtp",
|
|
88
|
+
"deepseek_v2",
|
|
89
|
+
"deepseek_vl2",
|
|
90
|
+
"dots1",
|
|
91
|
+
"ernie45",
|
|
92
|
+
"ernie45_moe",
|
|
93
|
+
"exaone",
|
|
94
|
+
"exaone4",
|
|
95
|
+
"fairseq2_llama",
|
|
96
|
+
"falcon",
|
|
97
|
+
"falcon_h1",
|
|
98
|
+
"florence2",
|
|
99
|
+
"fuyu",
|
|
100
|
+
"gemma",
|
|
101
|
+
"gemma2",
|
|
102
|
+
"gemma3",
|
|
103
|
+
"gemma3_mm",
|
|
104
|
+
"gemma3n",
|
|
105
|
+
"gemma3n_mm",
|
|
106
|
+
"glm",
|
|
107
|
+
"glm4",
|
|
108
|
+
"glm4_1v",
|
|
109
|
+
"glm4_moe",
|
|
110
|
+
"glm4_moe_mtp",
|
|
111
|
+
"glm4v",
|
|
112
|
+
"gpt2",
|
|
113
|
+
"gpt_bigcode",
|
|
114
|
+
"gpt_j",
|
|
115
|
+
"gpt_neox",
|
|
116
|
+
"gpt_oss",
|
|
117
|
+
"granite",
|
|
118
|
+
"granite_speech",
|
|
119
|
+
"granitemoe",
|
|
120
|
+
"granitemoehybrid",
|
|
121
|
+
"granitemoeshared",
|
|
122
|
+
"gritlm",
|
|
123
|
+
"grok1",
|
|
124
|
+
"h2ovl",
|
|
125
|
+
"hunyuan_v1",
|
|
126
|
+
"hyperclovax_vision",
|
|
127
|
+
"idefics3",
|
|
128
|
+
"internlm2",
|
|
129
|
+
"internlm2_ve",
|
|
130
|
+
"interns1",
|
|
131
|
+
"internvl",
|
|
132
|
+
"jais",
|
|
133
|
+
"jamba",
|
|
134
|
+
"jina_vl",
|
|
135
|
+
"keye",
|
|
136
|
+
"kimi_vl",
|
|
137
|
+
"llama",
|
|
138
|
+
"llama4",
|
|
139
|
+
"llama4_eagle",
|
|
140
|
+
"llama_eagle",
|
|
141
|
+
"llama_eagle3",
|
|
142
|
+
"llava",
|
|
143
|
+
"llava_next",
|
|
144
|
+
"llava_next_video",
|
|
145
|
+
"llava_onevision",
|
|
146
|
+
"mamba",
|
|
147
|
+
"mamba2",
|
|
148
|
+
"medusa",
|
|
149
|
+
"mimo",
|
|
150
|
+
"mimo_mtp",
|
|
151
|
+
"minicpm",
|
|
152
|
+
"minicpm3",
|
|
153
|
+
"minicpm_eagle",
|
|
154
|
+
"minicpmo",
|
|
155
|
+
"minicpmv",
|
|
156
|
+
"minimax_text_01",
|
|
157
|
+
"minimax_vl_01",
|
|
158
|
+
"mistral3",
|
|
159
|
+
"mixtral",
|
|
160
|
+
"mixtral_quant",
|
|
161
|
+
"mllama",
|
|
162
|
+
"mllama4",
|
|
163
|
+
"mlp_speculator",
|
|
164
|
+
"modernbert",
|
|
165
|
+
"molmo",
|
|
166
|
+
"mpt",
|
|
167
|
+
"nemotron",
|
|
168
|
+
"nemotron_h",
|
|
169
|
+
"nemotron_nas",
|
|
170
|
+
"nemotron_vl",
|
|
171
|
+
"nvlm_d",
|
|
172
|
+
"olmo",
|
|
173
|
+
"olmo2",
|
|
174
|
+
"olmoe",
|
|
175
|
+
"opt",
|
|
176
|
+
"orion",
|
|
177
|
+
"ovis",
|
|
178
|
+
"paligemma",
|
|
179
|
+
"persimmon",
|
|
180
|
+
"phi",
|
|
181
|
+
"phi3",
|
|
182
|
+
"phi3v",
|
|
183
|
+
"phi4_multimodal",
|
|
184
|
+
"phi4flash",
|
|
185
|
+
"phi4mm",
|
|
186
|
+
"phimoe",
|
|
187
|
+
"pixtral",
|
|
188
|
+
"plamo2",
|
|
189
|
+
"prithvi_geospatial_mae",
|
|
190
|
+
"qwen",
|
|
191
|
+
"qwen2",
|
|
192
|
+
"qwen2_5_omni_thinker",
|
|
193
|
+
"qwen2_5_vl",
|
|
194
|
+
"qwen2_audio",
|
|
195
|
+
"qwen2_moe",
|
|
196
|
+
"qwen2_rm",
|
|
197
|
+
"qwen2_vl",
|
|
198
|
+
"qwen3",
|
|
199
|
+
"qwen3_moe",
|
|
200
|
+
"qwen_vl",
|
|
201
|
+
"roberta",
|
|
202
|
+
"skyworkr1v",
|
|
203
|
+
"smolvlm",
|
|
204
|
+
"solar",
|
|
205
|
+
"stablelm",
|
|
206
|
+
"starcoder2",
|
|
207
|
+
"step3_text",
|
|
208
|
+
"step3_vl",
|
|
209
|
+
"tarsier",
|
|
210
|
+
"telechat2",
|
|
211
|
+
"teleflm",
|
|
212
|
+
"transformers",
|
|
213
|
+
"ultravox",
|
|
214
|
+
"voxtral",
|
|
215
|
+
"whisper",
|
|
216
|
+
"zamba2"
|
|
217
|
+
]
|
|
68
218
|
},
|
|
69
219
|
{
|
|
70
220
|
"image": "vllm/vllm-openai:v0.9.1",
|
|
@@ -130,7 +280,133 @@
|
|
|
130
280
|
"notes": "Requires instance with 4+ GPUs. Set TENSOR_PARALLEL_SIZE to match GPU count"
|
|
131
281
|
}
|
|
132
282
|
},
|
|
133
|
-
"notes": "vLLM 0.4.0 adds prefix caching and improved performance. Requires CUDA 12.0+"
|
|
283
|
+
"notes": "vLLM 0.4.0 adds prefix caching and improved performance. Requires CUDA 12.0+",
|
|
284
|
+
"supportedModelTypes": [
|
|
285
|
+
"arctic",
|
|
286
|
+
"aria",
|
|
287
|
+
"aya_vision",
|
|
288
|
+
"baichuan",
|
|
289
|
+
"bamba",
|
|
290
|
+
"bart",
|
|
291
|
+
"bert",
|
|
292
|
+
"bert_with_rope",
|
|
293
|
+
"blip2",
|
|
294
|
+
"bloom",
|
|
295
|
+
"chameleon",
|
|
296
|
+
"chatglm",
|
|
297
|
+
"commandr",
|
|
298
|
+
"dbrx",
|
|
299
|
+
"deepseek",
|
|
300
|
+
"deepseek_mtp",
|
|
301
|
+
"deepseek_v2",
|
|
302
|
+
"deepseek_vl2",
|
|
303
|
+
"eagle",
|
|
304
|
+
"exaone",
|
|
305
|
+
"fairseq2_llama",
|
|
306
|
+
"falcon",
|
|
307
|
+
"falcon_h1",
|
|
308
|
+
"florence2",
|
|
309
|
+
"fuyu",
|
|
310
|
+
"gemma",
|
|
311
|
+
"gemma2",
|
|
312
|
+
"gemma3",
|
|
313
|
+
"gemma3_mm",
|
|
314
|
+
"glm",
|
|
315
|
+
"glm4",
|
|
316
|
+
"glm4v",
|
|
317
|
+
"gpt2",
|
|
318
|
+
"gpt_bigcode",
|
|
319
|
+
"gpt_j",
|
|
320
|
+
"gpt_neox",
|
|
321
|
+
"granite",
|
|
322
|
+
"granite_speech",
|
|
323
|
+
"granitemoe",
|
|
324
|
+
"granitemoehybrid",
|
|
325
|
+
"granitemoeshared",
|
|
326
|
+
"gritlm",
|
|
327
|
+
"grok1",
|
|
328
|
+
"h2ovl",
|
|
329
|
+
"idefics3",
|
|
330
|
+
"internlm2",
|
|
331
|
+
"internlm2_ve",
|
|
332
|
+
"internvl",
|
|
333
|
+
"jais",
|
|
334
|
+
"jamba",
|
|
335
|
+
"kimi_vl",
|
|
336
|
+
"llama",
|
|
337
|
+
"llama_eagle",
|
|
338
|
+
"llama_eagle3",
|
|
339
|
+
"llava",
|
|
340
|
+
"llava_next",
|
|
341
|
+
"llava_next_video",
|
|
342
|
+
"llava_onevision",
|
|
343
|
+
"mamba",
|
|
344
|
+
"mamba2",
|
|
345
|
+
"medusa",
|
|
346
|
+
"mimo",
|
|
347
|
+
"mimo_mtp",
|
|
348
|
+
"minicpm",
|
|
349
|
+
"minicpm3",
|
|
350
|
+
"minicpm_eagle",
|
|
351
|
+
"minicpmo",
|
|
352
|
+
"minicpmv",
|
|
353
|
+
"minimax_text_01",
|
|
354
|
+
"minimax_vl_01",
|
|
355
|
+
"mistral3",
|
|
356
|
+
"mixtral",
|
|
357
|
+
"mixtral_quant",
|
|
358
|
+
"mllama",
|
|
359
|
+
"mllama4",
|
|
360
|
+
"mlp_speculator",
|
|
361
|
+
"modernbert",
|
|
362
|
+
"molmo",
|
|
363
|
+
"mpt",
|
|
364
|
+
"nemotron",
|
|
365
|
+
"nemotron_h",
|
|
366
|
+
"nemotron_nas",
|
|
367
|
+
"nvlm_d",
|
|
368
|
+
"olmo",
|
|
369
|
+
"olmo2",
|
|
370
|
+
"olmoe",
|
|
371
|
+
"opt",
|
|
372
|
+
"orion",
|
|
373
|
+
"ovis",
|
|
374
|
+
"paligemma",
|
|
375
|
+
"persimmon",
|
|
376
|
+
"phi",
|
|
377
|
+
"phi3",
|
|
378
|
+
"phi3_small",
|
|
379
|
+
"phi3v",
|
|
380
|
+
"phi4mm",
|
|
381
|
+
"phimoe",
|
|
382
|
+
"pixtral",
|
|
383
|
+
"plamo2",
|
|
384
|
+
"prithvi_geospatial_mae",
|
|
385
|
+
"qwen",
|
|
386
|
+
"qwen2",
|
|
387
|
+
"qwen2_5_omni_thinker",
|
|
388
|
+
"qwen2_5_vl",
|
|
389
|
+
"qwen2_audio",
|
|
390
|
+
"qwen2_moe",
|
|
391
|
+
"qwen2_rm",
|
|
392
|
+
"qwen2_vl",
|
|
393
|
+
"qwen3",
|
|
394
|
+
"qwen3_moe",
|
|
395
|
+
"qwen_vl",
|
|
396
|
+
"roberta",
|
|
397
|
+
"skyworkr1v",
|
|
398
|
+
"smolvlm",
|
|
399
|
+
"solar",
|
|
400
|
+
"stablelm",
|
|
401
|
+
"starcoder2",
|
|
402
|
+
"tarsier",
|
|
403
|
+
"telechat2",
|
|
404
|
+
"teleflm",
|
|
405
|
+
"transformers",
|
|
406
|
+
"ultravox",
|
|
407
|
+
"whisper",
|
|
408
|
+
"zamba2"
|
|
409
|
+
]
|
|
134
410
|
}
|
|
135
411
|
],
|
|
136
412
|
"sglang": [
|
|
@@ -311,7 +587,8 @@
|
|
|
311
587
|
"notes": "Enables running larger models on smaller instances with acceptable accuracy"
|
|
312
588
|
}
|
|
313
589
|
},
|
|
314
|
-
"notes": "TensorRT-LLM 1.0.0 adds chunked context and INT4 support. Requires CUDA 12.1+"
|
|
590
|
+
"notes": "TensorRT-LLM 1.0.0 adds chunked context and INT4 support. Requires CUDA 12.1+",
|
|
591
|
+
"supportedModelTypes": []
|
|
315
592
|
},
|
|
316
593
|
{
|
|
317
594
|
"image": "nvcr.io/nvidia/tensorrt-llm/release:1.1.0",
|
|
@@ -380,7 +657,8 @@
|
|
|
380
657
|
"notes": "Enables running larger models on smaller instances with acceptable accuracy"
|
|
381
658
|
}
|
|
382
659
|
},
|
|
383
|
-
"notes": "TensorRT-LLM 1.0.0 adds chunked context and INT4 support. Requires CUDA 12.1+"
|
|
660
|
+
"notes": "TensorRT-LLM 1.0.0 adds chunked context and INT4 support. Requires CUDA 12.1+",
|
|
661
|
+
"supportedModelTypes": []
|
|
384
662
|
}
|
|
385
663
|
],
|
|
386
664
|
"lmi": [
|
|
@@ -958,4 +1236,4 @@
|
|
|
958
1236
|
"notes": "Triton Python backend for custom model serving with TritonPythonModel interface. GPU optional"
|
|
959
1237
|
}
|
|
960
1238
|
]
|
|
961
|
-
}
|
|
1239
|
+
}
|
|
@@ -1,6 +1,9 @@
|
|
|
1
1
|
{
|
|
2
2
|
"openai/gpt-oss-20b": {
|
|
3
3
|
"family": "gpt-oss",
|
|
4
|
+
"parameterCount": 20000000000,
|
|
5
|
+
"defaultDtype": "bfloat16",
|
|
6
|
+
"maxPositionEmbeddings": 8192,
|
|
4
7
|
"gated": false,
|
|
5
8
|
"tags": [
|
|
6
9
|
"text-generation",
|
|
@@ -99,6 +102,9 @@
|
|
|
99
102
|
},
|
|
100
103
|
"meta-llama/Llama-2-70b-chat-hf": {
|
|
101
104
|
"family": "llama-2",
|
|
105
|
+
"parameterCount": 70000000000,
|
|
106
|
+
"defaultDtype": "float16",
|
|
107
|
+
"maxPositionEmbeddings": 4096,
|
|
102
108
|
"gated": true,
|
|
103
109
|
"tags": [
|
|
104
110
|
"text-generation",
|
|
@@ -259,6 +265,30 @@
|
|
|
259
265
|
"text-generation"
|
|
260
266
|
]
|
|
261
267
|
},
|
|
268
|
+
"meta-llama/Llama-2-70b-hf": {
|
|
269
|
+
"family": "llama-2",
|
|
270
|
+
"parameterCount": 70000000000,
|
|
271
|
+
"defaultDtype": "float16",
|
|
272
|
+
"maxPositionEmbeddings": 4096,
|
|
273
|
+
"gated": true,
|
|
274
|
+
"tags": [
|
|
275
|
+
"text-generation",
|
|
276
|
+
"llama-2"
|
|
277
|
+
],
|
|
278
|
+
"architecture": "LlamaForCausalLM",
|
|
279
|
+
"notes": "Llama-2 70B base model (non-chat). Requires multi-GPU for inference.",
|
|
280
|
+
"chatTemplate": "",
|
|
281
|
+
"frameworkCompatibility": {
|
|
282
|
+
"vllm": ">=0.3.0",
|
|
283
|
+
"tensorrt-llm": ">=0.8.0",
|
|
284
|
+
"sglang": ">=0.2.0"
|
|
285
|
+
},
|
|
286
|
+
"validationLevel": "community-validated",
|
|
287
|
+
"modelType": "transformer",
|
|
288
|
+
"tasks": [
|
|
289
|
+
"text-generation"
|
|
290
|
+
]
|
|
291
|
+
},
|
|
262
292
|
"meta-llama/Llama-2-*": {
|
|
263
293
|
"family": "llama-2",
|
|
264
294
|
"gated": true,
|
|
@@ -195,11 +195,12 @@ class HuggingFaceResolver extends ModelResolver {
|
|
|
195
195
|
}
|
|
196
196
|
|
|
197
197
|
// Fetch model config (conditional)
|
|
198
|
-
if (!fields || fields.includes('architecture')) {
|
|
198
|
+
if (!fields || fields.includes('architecture') || fields.includes('model_type')) {
|
|
199
199
|
const modelConfig = await this._fetchJson(
|
|
200
200
|
`${this.baseUrl}/${modelId}/resolve/main/config.json`
|
|
201
201
|
)
|
|
202
202
|
metadata.architecture = modelConfig?.architectures?.[0] || null
|
|
203
|
+
metadata.model_type = modelConfig?.model_type || null
|
|
203
204
|
}
|
|
204
205
|
|
|
205
206
|
return Object.keys(metadata).length > 0 ? metadata : null
|
package/src/app.js
CHANGED
|
@@ -119,6 +119,23 @@ export async function run(projectName, options) {
|
|
|
119
119
|
let answers;
|
|
120
120
|
if (configManager.shouldSkipPrompts()) {
|
|
121
121
|
console.log('\n🚀 Skipping prompts - using configuration from other sources');
|
|
122
|
+
|
|
123
|
+
// Fail-fast if required parameters are missing
|
|
124
|
+
const missing = configManager.getMissingRequiredParameters();
|
|
125
|
+
if (missing.length > 0) {
|
|
126
|
+
console.error('\n❌ Cannot skip prompts — required parameters are missing:\n');
|
|
127
|
+
for (const param of missing) {
|
|
128
|
+
const matrix = configManager._getParameterMatrix()[param];
|
|
129
|
+
const cliFlag = matrix?.cliOption ? `--${matrix.cliOption}` : '';
|
|
130
|
+
const envVar = matrix?.envVar || '';
|
|
131
|
+
const hints = [cliFlag, envVar].filter(Boolean).join(' or ');
|
|
132
|
+
console.error(` • ${param}${hints ? ` (${hints})` : ''}`);
|
|
133
|
+
}
|
|
134
|
+
console.error('\n Provide these via CLI flags, environment variables, or a config file.');
|
|
135
|
+
console.error(' Run "ml-container-creator --help" for available options.\n');
|
|
136
|
+
process.exit(1);
|
|
137
|
+
}
|
|
138
|
+
|
|
122
139
|
answers = configManager.getFinalConfiguration();
|
|
123
140
|
|
|
124
141
|
// Infer modelSource from model name prefix if not set
|
|
@@ -493,7 +510,9 @@ async function _ensureTemplateVariables(answers, registryConfigManager = null) {
|
|
|
493
510
|
chatTemplate: null,
|
|
494
511
|
chatTemplateSource: null,
|
|
495
512
|
hfToken: null,
|
|
513
|
+
hfTokenArn: null,
|
|
496
514
|
ngcApiKey: null,
|
|
515
|
+
ngcTokenArn: null,
|
|
497
516
|
envVars: {},
|
|
498
517
|
inferenceAmiVersion: null,
|
|
499
518
|
accelerator: null,
|
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
|
2
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* Architecture Sync
|
|
6
|
+
*
|
|
7
|
+
* Fetches model registry source files from server GitHub repositories
|
|
8
|
+
* and extracts supported model_type values into the model-servers catalog.
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import { readFileSync, writeFileSync } from 'node:fs';
|
|
12
|
+
|
|
13
|
+
/**
 * Parse vLLM's model registry Python source to extract model_type keys.
 *
 * vLLM's registry maps architecture class names to (module, impl_class) tuples:
 *   "LlamaForCausalLM": ("llama", "LlamaForCausalLM"),
 *   "LlavaNextVideoForConditionalGeneration": (
 *       "llava_next_video",
 *       "LlavaNextVideoForConditionalGeneration",
 *   ),
 *
 * The module name (first tuple element) is reported as the model_type.
 * Tuples may be wrapped over several lines and may use either quote style.
 * Older formats where a lowercase model_type is used directly as a dict key
 * ("model_type": ( / ClassName / [ ) are collected as well.
 *
 * @param {string} source - Python source code content (coerced to a string)
 * @returns {string[]} De-duplicated, sorted array of model_type strings
 */
export const parseVllmRegistry = (source) => {
    const text = String(source);
    const patterns = [
        // Tuple value: ("module_name", "ClassName"). Whitespace (including a
        // line break) is allowed after "(" so wrapped tuples are not skipped.
        /\(\s*(["'])([a-z][a-z0-9_]*)\1\s*,\s*["'][A-Z]/g,
        // Direct lowercase key whose value starts with "(", an uppercase
        // class name, or "[".
        /(["'])([a-z][a-z0-9_]*)\1:\s*[A-Z(\[]/g
    ];

    const modelTypes = new Set();
    for (const pattern of patterns) {
        for (const match of text.matchAll(pattern)) {
            // Group 1 is the quote character, group 2 the identifier.
            modelTypes.add(match[2]);
        }
    }
    return [...modelTypes].sort();
};
|
|
46
|
+
|
|
47
|
+
/**
 * Parse SGLang's model_registry.py to extract model_type keys.
 *
 * Collects every quoted lowercase identifier used as a dict key whose value
 * starts with an uppercase class name, "(" or "[":
 *   "llama": ModelClass,
 *   "qwen2": (ModulePath, ClassName),
 *   'gemma': [GemmaModel],
 *
 * Both double- and single-quoted keys are accepted, matching what
 * parseTensorRTRegistry does for its registry file.
 *
 * @param {string} source - Python source code content (coerced to a string)
 * @returns {string[]} De-duplicated, sorted array of model_type strings
 */
export const parseSglangRegistry = (source) => {
    // Group 1 is the quote character (back-referenced so quotes must pair up),
    // group 2 is the model_type identifier.
    const keyPattern = /(["'])([a-z][a-z0-9_]*)\1:\s*[A-Z(\[]/g;

    const modelTypes = new Set();
    for (const match of String(source).matchAll(keyPattern)) {
        modelTypes.add(match[2]);
    }
    return [...modelTypes].sort();
};
|
|
72
|
+
|
|
73
|
+
/**
 * Extract lowercase dictionary keys from TensorRT-LLM's models __init__.py.
 *
 * A key is collected when it is a double- or single-quoted identifier
 * (a lowercase letter followed by lowercase letters, digits or underscores)
 * followed by a colon and a value that begins with an uppercase letter or "(":
 *   "llama": LlamaForCausalLM,
 *   'gpt2': GPT2LMHeadModel,
 *
 * NOTE(review): keys that start with an uppercase letter are never collected;
 * confirm the upstream mapping really is keyed by lowercase model_type.
 *
 * @param {string} source - Python source code content
 * @returns {string[]} De-duplicated model_type strings in default sort order
 */
export const parseTensorRTRegistry = (source) => {
    // Coerce the same way RegExp#exec would, so non-string input behaves alike.
    const text = `${source}`;
    const keyPatterns = [
        /"([a-z][a-z0-9_]*)":\s*[A-Z]/g,
        /"([a-z][a-z0-9_]*)":\s*\(/g,
        /'([a-z][a-z0-9_]*)':\s*[A-Z]/g,
        /'([a-z][a-z0-9_]*)':\s*\(/g
    ];

    const captured = keyPatterns.flatMap(
        (keyPattern) => Array.from(text.matchAll(keyPattern), (hit) => hit[1])
    );
    return Array.from(new Set(captured)).sort();
};
|
|
99
|
+
|
|
100
|
+
/**
 * Per-server lookup table describing where each inference server keeps its
 * model registry on GitHub and how to read it.
 *
 * Each value carries:
 *   - repo:      "owner/name" of the GitHub repository
 *   - file:      path of the registry source file inside that repository
 *   - tagPrefix: text placed in front of a framework version to form the git tag
 *   - parser:    function turning the file's source text into model_type strings
 */
export const SERVER_REGISTRY_SOURCES = {
    vllm: { repo: 'vllm-project/vllm', file: 'vllm/model_executor/models/registry.py', tagPrefix: 'v', parser: parseVllmRegistry },
    sglang: { repo: 'sgl-project/sglang', file: 'python/sglang/srt/models/model_registry.py', tagPrefix: 'v', parser: parseSglangRegistry },
    'tensorrt-llm': { repo: 'NVIDIA/TensorRT-LLM', file: 'tensorrt_llm/models/__init__.py', tagPrefix: 'v', parser: parseTensorRTRegistry }
};
|
|
124
|
+
|
|
125
|
+
/**
 * Sync supported model architectures from server GitHub repositories
 * into the model-servers catalog.
 *
 * For each catalog entry whose server has a config in SERVER_REGISTRY_SOURCES
 * and whose labels.framework_version is set, fetches the registry file from
 * raw.githubusercontent.com at the tag `${tagPrefix}${version}`, parses it,
 * and stores the result on entry.supportedModelTypes. Entries whose fetch or
 * parse fails are left untouched and recorded in the summary. The catalog
 * file is rewritten in place (4-space indent, trailing newline) afterwards.
 *
 * @param {string} catalogPath - Path to model-servers.json
 * @returns {Promise<{servers: Array<{server: string, version: string, count: number}>,
 *   failures: Array<{server: string, version: string, reason: string}>}>}
 *   Per-entry successes and failures
 * @throws If the catalog cannot be read, is not valid JSON, or cannot be written
 */
export const syncArchitectures = async (catalogPath) => {
    const catalog = JSON.parse(readFileSync(catalogPath, 'utf8'));
    const summary = { servers: [], failures: [] };
    // Several entries can share one server + framework version; download and
    // parse each registry file only once per run.
    const parsedByUrl = new Map();

    for (const [server, entries] of Object.entries(catalog)) {
        // hasOwn keeps inherited keys such as "constructor" from being
        // mistaken for a source config.
        if (!Object.hasOwn(SERVER_REGISTRY_SOURCES, server) || !Array.isArray(entries)) continue;
        const source = SERVER_REGISTRY_SOURCES[server];

        for (const entry of entries) {
            const version = entry?.labels?.framework_version;
            if (!version) continue;

            const tag = `${source.tagPrefix}${version}`;
            const url = `https://raw.githubusercontent.com/${source.repo}/${tag}/${source.file}`;

            try {
                let modelTypes = parsedByUrl.get(url);
                if (modelTypes === undefined) {
                    const response = await fetch(url);
                    if (!response.ok) {
                        summary.failures.push({ server, version, reason: `HTTP ${response.status}` });
                        console.log(` ⚠️ ${server} ${version}: fetch failed (HTTP ${response.status})`);
                        continue;
                    }
                    modelTypes = source.parser(await response.text());
                    parsedByUrl.set(url, modelTypes);
                }
                // Copy so entries never share one array instance.
                entry.supportedModelTypes = [...modelTypes];
                summary.servers.push({ server, version, count: entry.supportedModelTypes.length });
                console.log(` ✓ ${server} ${version}: ${entry.supportedModelTypes.length} architectures`);
            } catch (err) {
                summary.failures.push({ server, version, reason: err.message });
                console.log(` ⚠️ ${server} ${version}: fetch failed (${err.message})`);
            }
        }
    }

    // End the file with a newline so the rewrite does not produce a spurious
    // "no newline at end of file" change on the last line.
    writeFileSync(catalogPath, `${JSON.stringify(catalog, null, 4)}\n`);
    return summary;
};
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
|
2
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* ARN Detection Utility
|
|
6
|
+
*
|
|
7
|
+
* Provides a pure function for distinguishing AWS Secrets Manager ARNs
|
|
8
|
+
* from plaintext values. Used by the prompt flow and CLI to determine
|
|
9
|
+
* whether user input should be treated as a secret reference or a
|
|
10
|
+
* literal token value.
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
// Matches the leading "arn:<partition>:secretsmanager:" segment of an ARN.
// The partition is "aws" optionally followed by hyphenated suffixes
// (e.g. aws-cn, aws-us-gov), so secrets outside the commercial partition are
// recognised as ARNs instead of being treated as plaintext tokens.
const SECRETS_MANAGER_ARN_PATTERN = /^arn:aws(?:-[a-z]+)*:secretsmanager:/;

/**
 * Determines if a value is a Secrets Manager ARN.
 *
 * Only the leading "arn:<partition>:secretsmanager:" segment is checked; the
 * region, account and resource parts are not validated.
 *
 * @param {*} value - The input value to check
 * @returns {boolean} True if the value is a string that starts like a
 *   Secrets Manager ARN in any AWS partition; false for every non-string
 */
export function isSecretsManagerArn(value) {
    return typeof value === 'string' && SECRETS_MANAGER_ARN_PATTERN.test(value);
}
|