@aws/ml-container-creator 0.2.4 → 0.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. package/README.md +62 -298
  2. package/bin/cli.js +7 -2
  3. package/package.json +7 -8
  4. package/servers/base-image-picker/index.js +3 -3
  5. package/servers/base-image-picker/manifest.json +4 -2
  6. package/servers/instance-sizer/index.js +561 -0
  7. package/servers/instance-sizer/lib/instance-ranker.js +245 -0
  8. package/servers/instance-sizer/lib/model-resolver.js +265 -0
  9. package/servers/instance-sizer/lib/vram-estimator.js +177 -0
  10. package/servers/instance-sizer/manifest.json +17 -0
  11. package/servers/instance-sizer/package.json +15 -0
  12. package/servers/{instance-recommender → lib}/catalogs/instances.json +136 -34
  13. package/servers/{base-image-picker → lib}/catalogs/model-servers.json +19 -249
  14. package/servers/lib/catalogs/model-sizes.json +131 -0
  15. package/servers/lib/catalogs/models.json +602 -0
  16. package/servers/{model-picker → lib}/catalogs/popular-diffusors.json +32 -10
  17. package/servers/{model-picker → lib}/catalogs/popular-transformers.json +59 -26
  18. package/servers/{base-image-picker → lib}/catalogs/python-slim.json +12 -12
  19. package/servers/lib/schemas/image-catalog.schema.json +0 -12
  20. package/servers/lib/schemas/instances.schema.json +29 -0
  21. package/servers/lib/schemas/model-catalog.schema.json +12 -10
  22. package/servers/lib/schemas/unified-model-catalog.schema.json +129 -0
  23. package/servers/model-picker/index.js +2 -3
  24. package/servers/model-picker/manifest.json +2 -3
  25. package/servers/region-picker/index.js +1 -1
  26. package/servers/region-picker/manifest.json +1 -1
  27. package/src/app.js +17 -0
  28. package/src/lib/bootstrap-command-handler.js +38 -0
  29. package/src/lib/cli-handler.js +3 -3
  30. package/src/lib/config-manager.js +4 -1
  31. package/src/lib/configuration-manager.js +2 -2
  32. package/src/lib/cross-cutting-checker.js +341 -0
  33. package/src/lib/dry-run-validator.js +78 -0
  34. package/src/lib/generation-validator.js +102 -0
  35. package/src/lib/mcp-validator-config.js +89 -0
  36. package/src/lib/payload-builder.js +153 -0
  37. package/src/lib/prompt-runner.js +445 -135
  38. package/src/lib/prompts.js +1 -1
  39. package/src/lib/registry-loader.js +5 -5
  40. package/src/lib/schema-sync.js +203 -0
  41. package/src/lib/schema-validation-engine.js +195 -0
  42. package/src/lib/service-model-parser.js +102 -0
  43. package/src/lib/validate-runner.js +167 -0
  44. package/src/lib/validation-report.js +133 -0
  45. package/src/lib/validators/base-validator.js +36 -0
  46. package/src/lib/validators/catalog-validator.js +177 -0
  47. package/src/lib/validators/enum-validator.js +120 -0
  48. package/src/lib/validators/required-field-validator.js +150 -0
  49. package/src/lib/validators/type-validator.js +313 -0
  50. package/templates/Dockerfile +1 -1
  51. package/templates/do/build +15 -5
  52. package/templates/do/run +5 -1
  53. package/templates/do/validate +61 -0
  54. package/servers/instance-recommender/LICENSE +0 -202
  55. package/servers/instance-recommender/index.js +0 -284
  56. package/servers/instance-recommender/manifest.json +0 -16
  57. package/servers/instance-recommender/package.json +0 -15
  58. /package/servers/{model-picker → lib}/catalogs/jumpstart-public.json +0 -0
  59. /package/servers/{region-picker → lib}/catalogs/regions.json +0 -0
  60. /package/servers/{base-image-picker → lib}/catalogs/triton-backends.json +0 -0
  61. /package/servers/{base-image-picker → lib}/catalogs/triton.json +0 -0
@@ -54,6 +54,14 @@ export default class PromptRunner {
54
54
 
55
55
  /**
56
56
  * Runs all prompting phases and returns combined answers
57
+ *
58
+ * Phase ordering (MCP Catalog Consolidation):
59
+ * Phase 1 (What): deployment config + model name/ID + quantization
60
+ * Phase 2 (How): deployment target + serving profile + base image
61
+ * Phase 3 (Where): region + instance-sizer query + instance type + CUDA/AMI auto-resolution + HyperPod + build target
62
+ * Phase 4 (Details): framework version, model profile, modules
63
+ * Phase 5 (Project): project name + destination
64
+ *
57
65
  * @returns {Promise<Object>} Combined answers from all phases
58
66
  */
59
67
  async run() {
@@ -70,39 +78,184 @@ export default class PromptRunner {
70
78
  // Get only explicit configuration (not defaults) for prompt skipping
71
79
  const explicitConfig = this.configManager ? this.configManager.getExplicitConfiguration() : {};
72
80
 
73
- // Phase 1: Infrastructure & Deployment
74
- // Requirements: 3.1 — infrastructure prompts run first
75
- // Ordering: Region Deployment Target Instance (if managed) → HyperPod (if eks) → Build Target
81
+ // ══════════════════════════════════════════════════════════════════════
82
+ // Phase 1 — What (deployment config + model name/ID + quantization)
83
+ // Requirements: 4.1, 4.2 model selection drives instance sizing
84
+ // ══════════════════════════════════════════════════════════════════════
85
+ console.log('\n🔧 Core ML Configuration');
86
+ const deploymentConfigAnswers = await this._runPhase(deploymentConfigPrompts, {}, explicitConfig, existingConfig);
87
+
88
+ // Derive architecture, backend, and legacy framework/modelServer from deploymentConfig
89
+ let architecture, backend, framework, modelServer;
90
+ if (deploymentConfigAnswers.deploymentConfig) {
91
+ const parts = deploymentConfigAnswers.deploymentConfig.split('-');
92
+ architecture = parts[0];
93
+ backend = parts.slice(1).join('-');
94
+ // Legacy compatibility: derive framework and modelServer
95
+ framework = architecture;
96
+ modelServer = backend;
97
+ }
98
+
99
+ // Add derived values to answers
100
+ const frameworkAnswers = {
101
+ ...deploymentConfigAnswers,
102
+ architecture: architecture || deploymentConfigAnswers.architecture,
103
+ backend: backend || deploymentConfigAnswers.backend,
104
+ framework: framework || deploymentConfigAnswers.framework,
105
+ modelServer: modelServer || deploymentConfigAnswers.modelServer
106
+ };
107
+
108
+ // Engine prompt for http architecture
109
+ const engineAnswers = await this._runPhase(enginePrompts, { ...frameworkAnswers }, explicitConfig, existingConfig);
110
+
111
+ // Auto-set model format for Triton backends with single format
112
+ const tritonAutoFormat = this._getTritonAutoModelFormat(architecture, backend);
113
+
114
+ // Query model-picker MCP server for model choices
115
+ this._queryMcpForModels(frameworkAnswers.architecture);
116
+ if (this._mcpModelChoices) {
117
+ console.log(' 🔍 Querying model-picker...');
118
+ console.log(` ✓ ${this._mcpModelChoices.length} model(s) available from catalog`);
119
+ }
120
+ const modelFormatPreviousAnswers = {
121
+ ...frameworkAnswers,
122
+ ...engineAnswers,
123
+ ...(this._mcpModelChoices ? { _mcpModelChoices: this._mcpModelChoices } : {})
124
+ };
125
+ const modelFormatAnswers = await this._runPhase(
126
+ modelFormatPrompts,
127
+ modelFormatPreviousAnswers,
128
+ explicitConfig,
129
+ existingConfig
130
+ );
131
+
132
+ // Model server prompts are now deprecated (empty array)
133
+ const modelServerAnswers = await this._runPhase(
134
+ modelServerPrompts,
135
+ {...frameworkAnswers, ...engineAnswers},
136
+ explicitConfig,
137
+ existingConfig
138
+ );
139
+
140
+ // Resolve model ID early for instance-sizer query in Phase 3
141
+ const phase1ModelId = modelFormatAnswers.customModelName || modelFormatAnswers.modelName || explicitConfig.modelName;
142
+
143
+ // Fetch model information from HuggingFace and Model Registry
144
+ if (phase1ModelId && phase1ModelId !== 'Custom (enter manually)') {
145
+ await this._fetchAndDisplayModelInfo(phase1ModelId);
146
+ }
147
+
148
+ // ══════════════════════════════════════════════════════════════════════
149
+ // Phase 2 — How (deployment target + serving profile + base image)
150
+ // Requirements: 4.3 — instance prompt appears AFTER base image is known
151
+ // ══════════════════════════════════════════════════════════════════════
76
152
  console.log('\n💪 Infrastructure & Deployment');
77
153
 
78
- // 1a. Query region MCP, then prompt for region + deployment target
79
- await this._queryMcpForRegion({}, explicitConfig);
154
+ // 2a. Deployment target (realtime, async, batch, hyperpod, local)
80
155
  const bootstrapRegion = existingConfig.awsRegion || explicitConfig.awsRegion;
81
156
  const regionPreviousAnswers = bootstrapRegion ? { _bootstrapRegion: bootstrapRegion } : {};
82
- const regionAndTargetAnswers = await this._runPhase(infraRegionAndTargetPrompts, regionPreviousAnswers, explicitConfig, existingConfig);
157
+ const regionAndTargetAnswers = await this._runPhase(infraRegionAndTargetPrompts, { ...frameworkAnswers, ...regionPreviousAnswers }, explicitConfig, existingConfig);
158
+
159
+ // 2b. Query base-image-picker MCP server for base image choices
160
+ await this._queryMcpForBaseImage(frameworkAnswers, explicitConfig);
161
+ const baseImagePreviousAnswers = {
162
+ ...frameworkAnswers,
163
+ ...engineAnswers,
164
+ ...(this._mcpBaseImageChoices ? { _mcpBaseImageChoices: this._mcpBaseImageChoices } : {})
165
+ };
166
+ const baseImageAnswers = await this._runPhase(
167
+ baseImagePrompts,
168
+ baseImagePreviousAnswers,
169
+ explicitConfig,
170
+ existingConfig
171
+ );
83
172
 
84
- // 1b. Instance type query MCP and prompt for realtime-inference, async-inference, batch-transform, and hyperpod-eks
173
+ // Extract CUDA version from selected base image for instance-sizer context
174
+ const selectedBaseImageCuda = this._extractCudaFromBaseImage(baseImageAnswers);
175
+
176
+ // ══════════════════════════════════════════════════════════════════════
177
+ // Phase 3 — Where (region + instance [derived] + CUDA/AMI + HyperPod + build target)
178
+ // Requirements: 4.4, 4.5, 4.7, 3.6, 3.7 — sizer query with full context
179
+ // ══════════════════════════════════════════════════════════════════════
180
+
181
+ // 3a. Region query
182
+ await this._queryMcpForRegion(frameworkAnswers, explicitConfig);
183
+
184
+ // 3b. Instance type — query instance-sizer with full context (model + profile + CUDA)
85
185
  let instanceAnswers = {};
86
- if (regionAndTargetAnswers.deploymentTarget === 'realtime-inference' ||
186
+ const needsInstance = regionAndTargetAnswers.deploymentTarget === 'realtime-inference' ||
87
187
  regionAndTargetAnswers.deploymentTarget === 'async-inference' ||
88
188
  regionAndTargetAnswers.deploymentTarget === 'batch-transform' ||
89
- regionAndTargetAnswers.deploymentTarget === 'hyperpod-eks') {
90
- await this._queryMcpForInstance({}, explicitConfig);
91
- const mcpInstanceChoices = this.configManager?.mcpChoices?.instanceType;
189
+ regionAndTargetAnswers.deploymentTarget === 'hyperpod-eks';
190
+
191
+ if (needsInstance) {
192
+ // Determine architecture type for heuristic fallback
193
+ const modelArchitecture = frameworkAnswers.architecture || frameworkAnswers.deploymentConfig?.split('-')[0];
194
+
195
+ // Skip sizer query if --instance-type was provided via CLI
196
+ if (!explicitConfig.instanceType) {
197
+ // Skip sizer for predictor models (CPU-only)
198
+ if (modelArchitecture === 'predictor' || modelArchitecture === 'http') {
199
+ // Architecture heuristic: predictor → ml.m5.large
200
+ console.log(' ℹ️ Predictor model: defaulting to CPU instance (ml.m5.large)');
201
+ this._architectureHeuristicDefault = 'ml.m5.large';
202
+ } else if (phase1ModelId && phase1ModelId !== 'Custom (enter manually)') {
203
+ // Query instance-sizer with full context
204
+ await this._queryMcpForInstanceSizing(frameworkAnswers, modelFormatAnswers, explicitConfig, {
205
+ cudaVersion: selectedBaseImageCuda,
206
+ profileEnvVars: this._selectedProfileEnvVars || {}
207
+ });
208
+ } else {
209
+ // No model known — use architecture heuristic
210
+ await this._queryMcpForInstance(frameworkAnswers, explicitConfig);
211
+ }
212
+ }
213
+
214
+ // Build instance prompt choices from sizer results
215
+ const mcpInstanceChoices = this._mcpInstanceSizerChoices || this.configManager?.mcpChoices?.instanceType;
92
216
  const instancePreviousAnswers = {
93
217
  ...regionAndTargetAnswers,
94
- ...(mcpInstanceChoices && mcpInstanceChoices.length > 0 ? { _mcpInstanceChoices: mcpInstanceChoices } : {})
218
+ ...(mcpInstanceChoices && mcpInstanceChoices.length > 0 ? { _mcpInstanceChoices: mcpInstanceChoices } : {}),
219
+ ...(this._architectureHeuristicDefault ? { _architectureHeuristicDefault: this._architectureHeuristicDefault } : {})
95
220
  };
96
221
  instanceAnswers = await this._runPhase(infraInstancePrompts, instancePreviousAnswers, explicitConfig, existingConfig);
222
+
223
+ // Apply architecture heuristic fallback when sizer returns empty
224
+ if (!instanceAnswers.instanceType && !explicitConfig.instanceType && this._architectureHeuristicDefault) {
225
+ instanceAnswers.instanceType = this._architectureHeuristicDefault;
226
+ }
227
+ }
228
+
229
+ // In auto-prompt mode, use instance-sizer's top recommendation as the instance type
230
+ if (this.configManager?.isAutoPrompt() && this._mcpInstanceSizerChoices && this._mcpInstanceSizerChoices.length > 0) {
231
+ const sizerRecommendation = this._mcpInstanceSizerChoices[0];
232
+ if (!explicitConfig.instanceType) {
233
+ instanceAnswers.instanceType = sizerRecommendation;
234
+ console.log(` ✓ Auto-prompt: using instance-sizer recommendation: ${sizerRecommendation}`);
235
+ }
97
236
  }
98
237
 
99
- // 1b-async. Async-specific prompts (only when deploymentTarget === 'async-inference')
238
+ // Auto-set tensor parallelism when sizer recommends TP > 1
239
+ // Requirements: 4.8
240
+ if (this._instanceSizerMetadata) {
241
+ const sizerRecs = this._instanceSizerMetadata.recommendations || [];
242
+ const finalInstanceType = instanceAnswers.customInstanceType || instanceAnswers.instanceType;
243
+ const matchingRec = sizerRecs.find(r => r.instanceType === finalInstanceType);
244
+ const tpRec = matchingRec || sizerRecs[0];
245
+ if (tpRec && tpRec.tensorParallelism > 1) {
246
+ this._autoTensorParallelism = tpRec.tensorParallelism;
247
+ this._autoGpuCount = tpRec.gpuCount;
248
+ console.log(` ✓ Auto-set tensor parallelism: TP=${tpRec.tensorParallelism} (${tpRec.gpuCount} GPUs)`);
249
+ }
250
+ }
251
+
252
+ // 3c. Async-specific prompts (only when deploymentTarget === 'async-inference')
100
253
  let asyncAnswers = {};
101
254
  if (regionAndTargetAnswers.deploymentTarget === 'async-inference') {
102
255
  asyncAnswers = await this._runPhase(infraAsyncPrompts, { ...regionAndTargetAnswers }, explicitConfig, existingConfig);
103
256
  }
104
257
 
105
- // 1b-batch. Batch transform-specific prompts (only when deploymentTarget === 'batch-transform')
258
+ // 3d. Batch transform-specific prompts (only when deploymentTarget === 'batch-transform')
106
259
  let batchTransformAnswers = {};
107
260
  if (regionAndTargetAnswers.deploymentTarget === 'batch-transform') {
108
261
  batchTransformAnswers = await this._runPhase(
@@ -113,16 +266,24 @@ export default class PromptRunner {
113
266
  );
114
267
  }
115
268
 
116
- // 1c. HyperPod prompts — only query MCP and prompt when deployment target is hyperpod-eks
269
+ // 3e. CUDA/AMI auto-resolution
270
+ const instanceType = instanceAnswers.customInstanceType || instanceAnswers.instanceType;
271
+ const cudaAnswer = await this._promptCudaVersion(
272
+ instanceType,
273
+ frameworkAnswers.framework,
274
+ null, // frameworkVersion not yet known in Phase 3
275
+ selectedBaseImageCuda // base image CUDA version for intersection
276
+ );
277
+
278
+ // 3f. HyperPod prompts — only query MCP and prompt when deployment target is hyperpod-eks
117
279
  let hyperPodAnswers = {};
118
280
  if (regionAndTargetAnswers.deploymentTarget === 'hyperpod-eks') {
119
- // Resolve the actual region (handle 'custom' selection)
120
281
  const resolvedRegion = regionAndTargetAnswers.customAwsRegion || regionAndTargetAnswers.awsRegion;
121
282
  await this._queryMcpForHyperPod({ ...regionAndTargetAnswers, awsRegion: resolvedRegion }, explicitConfig);
122
283
  hyperPodAnswers = await this._runPhase(infraHyperPodPrompts, { ...regionAndTargetAnswers }, explicitConfig, existingConfig);
123
284
  }
124
285
 
125
- // 1d. Build target + role ARN (always)
286
+ // 3g. Build target + role ARN (always)
126
287
  const buildAnswers = await this._runPhase(infraBuildPrompts, { ...regionAndTargetAnswers, ...instanceAnswers, ...hyperPodAnswers }, explicitConfig, existingConfig);
127
288
 
128
289
  // Combine all infrastructure answers
@@ -135,54 +296,16 @@ export default class PromptRunner {
135
296
  ...buildAnswers
136
297
  };
137
298
 
138
- // Phase 2: Core ML Configuration
139
- // Requirements: 3.1, 3.2 — ML configuration prompts run after infrastructure
140
- console.log('\n🔧 Core ML Configuration');
141
- const deploymentConfigAnswers = await this._runPhase(deploymentConfigPrompts, { ...infraAnswers }, explicitConfig, existingConfig);
142
-
143
- // Derive architecture, backend, and legacy framework/modelServer from deploymentConfig
144
- // Requirements: 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7
145
- let architecture, backend, framework, modelServer;
146
- if (deploymentConfigAnswers.deploymentConfig) {
147
- const parts = deploymentConfigAnswers.deploymentConfig.split('-');
148
- architecture = parts[0];
149
- backend = parts.slice(1).join('-');
150
- // Legacy compatibility: derive framework and modelServer
151
- framework = architecture;
152
- modelServer = backend;
299
+ // Apply CUDA resolution to infra answers
300
+ if (cudaAnswer) {
301
+ infraAnswers._selectedCudaVersion = cudaAnswer.cudaVersion;
302
+ infraAnswers._resolvedInferenceAmiVersion = cudaAnswer.inferenceAmiVersion;
153
303
  }
154
-
155
- // Add derived values to answers
156
- const frameworkAnswers = {
157
- ...deploymentConfigAnswers,
158
- architecture: architecture || deploymentConfigAnswers.architecture,
159
- backend: backend || deploymentConfigAnswers.backend,
160
- framework: framework || deploymentConfigAnswers.framework,
161
- modelServer: modelServer || deploymentConfigAnswers.modelServer
162
- };
163
-
164
- // Engine prompt for http architecture
165
- // Requirements: 3.7
166
- const engineAnswers = await this._runPhase(enginePrompts, { ...frameworkAnswers }, explicitConfig, existingConfig);
167
-
168
- // Auto-set model format for Triton backends with single format
169
- // Requirements: 3.3, 3.4, 3.5
170
- const tritonAutoFormat = this._getTritonAutoModelFormat(architecture, backend);
171
-
172
- // Query base-image-picker MCP server for base image choices
173
- // Requirements: 5.1, 5.2, 5.3
174
- await this._queryMcpForBaseImage(frameworkAnswers, explicitConfig);
175
- const baseImagePreviousAnswers = {
176
- ...frameworkAnswers,
177
- ...engineAnswers,
178
- ...(this._mcpBaseImageChoices ? { _mcpBaseImageChoices: this._mcpBaseImageChoices } : {})
179
- };
180
- const baseImageAnswers = await this._runPhase(
181
- baseImagePrompts,
182
- baseImagePreviousAnswers,
183
- explicitConfig,
184
- existingConfig
185
- );
304
+
305
+ // ══════════════════════════════════════════════════════════════════════
306
+ // Phase 4 — Details (framework version, model profile, modules)
307
+ // ══════════════════════════════════════════════════════════════════════
308
+ console.log('\n📦 Module Selection');
186
309
 
187
310
  // Populate framework version choices from registry
188
311
  const frameworkVersionChoices = this._getFrameworkVersionChoices(frameworkAnswers.framework);
@@ -209,44 +332,10 @@ export default class PromptRunner {
209
332
  explicitConfig,
210
333
  existingConfig
211
334
  );
212
-
213
- // Query model-picker MCP server for model choices
214
- this._queryMcpForModels(frameworkAnswers.architecture);
215
- if (this._mcpModelChoices) {
216
- console.log(' 🔍 Querying model-picker...');
217
- console.log(` ✓ ${this._mcpModelChoices.length} model(s) available from catalog`);
218
- }
219
- const modelFormatPreviousAnswers = {
220
- ...frameworkAnswers,
221
- ...engineAnswers,
222
- ...frameworkVersionAnswers,
223
- ...frameworkProfileAnswers,
224
- ...(this._mcpModelChoices ? { _mcpModelChoices: this._mcpModelChoices } : {})
225
- };
226
- const modelFormatAnswers = await this._runPhase(
227
- modelFormatPrompts,
228
- modelFormatPreviousAnswers,
229
- explicitConfig,
230
- existingConfig
231
- );
232
-
233
- // Model server prompts are now deprecated (empty array)
234
- const modelServerAnswers = await this._runPhase(
235
- modelServerPrompts,
236
- {...frameworkAnswers, ...engineAnswers, ...frameworkVersionAnswers, ...frameworkProfileAnswers},
237
- explicitConfig,
238
- existingConfig
239
- );
240
-
335
+
241
336
  // Populate model profile choices from registry (if model ID is available)
337
+ const modelId = phase1ModelId;
242
338
  const currentAnswers = {...frameworkAnswers, ...engineAnswers, ...frameworkVersionAnswers, ...frameworkProfileAnswers, ...modelFormatAnswers, ...modelServerAnswers};
243
- const modelId = currentAnswers.customModelName || currentAnswers.modelName || explicitConfig.modelName;
244
-
245
- // Fetch model information from HuggingFace and Model Registry
246
- // Requirements: 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.11, 11.1, 11.2, 11.3, 11.5, 11.6, 11.7
247
- if (modelId && modelId !== 'Custom (enter manually)') {
248
- await this._fetchAndDisplayModelInfo(modelId);
249
- }
250
339
 
251
340
  const modelProfileChoices = this._getModelProfileChoices(modelId);
252
341
  const modelProfileAnswers = await this._runPhase(
@@ -257,7 +346,6 @@ export default class PromptRunner {
257
346
  );
258
347
 
259
348
  // Model loading strategy prompt (build-time vs runtime)
260
- // Requirements: 13.1, 13.2, 13.3, 13.4, 13.5
261
349
  const modelLoadStrategyAnswers = await this._runPhase(
262
350
  modelLoadStrategyPrompts,
263
351
  { ...frameworkAnswers, ...engineAnswers, ...modelFormatAnswers, ...modelServerAnswers, ...modelProfileAnswers },
@@ -273,32 +361,7 @@ export default class PromptRunner {
273
361
  { ...frameworkAnswers, ...engineAnswers, ...frameworkVersionAnswers, ...frameworkProfileAnswers, ...modelFormatAnswers, ...modelServerAnswers, ...modelProfileAnswers },
274
362
  explicitConfig, existingConfig);
275
363
 
276
- // Validate instance type against framework requirements (now that framework is known)
277
- // Requirements: 4.1, 4.2, 4.3, 4.4, 4.5, 4.6
278
- const instanceType = infraAnswers.customInstanceType || infraAnswers.instanceType;
279
- if (instanceType && frameworkVersionAnswers.frameworkVersion) {
280
- await this._validateAndDisplayInstanceType(
281
- instanceType,
282
- frameworkAnswers.framework,
283
- frameworkVersionAnswers.frameworkVersion
284
- );
285
- }
286
-
287
- // CUDA version selection: if the selected instance supports multiple CUDA versions,
288
- // let the user pick which one. This transparently sets the inference AMI version.
289
- const cudaAnswer = await this._promptCudaVersion(
290
- instanceType,
291
- frameworkAnswers.framework,
292
- frameworkVersionAnswers.frameworkVersion
293
- );
294
- if (cudaAnswer) {
295
- infraAnswers._selectedCudaVersion = cudaAnswer.cudaVersion;
296
- infraAnswers._resolvedInferenceAmiVersion = cudaAnswer.inferenceAmiVersion;
297
- }
298
-
299
- // Phase 3: Module Selection
300
- // Requirements: 3.3 — module selection after ML configuration
301
- console.log('\n📦 Module Selection');
364
+ // Module selection
302
365
  const moduleAnswers = await this._runPhase(modulePrompts, { ...frameworkAnswers, ...engineAnswers }, explicitConfig, existingConfig);
303
366
 
304
367
  // Ensure transformers, diffusors, and ineligible Triton backends don't get sample model
@@ -309,8 +372,19 @@ export default class PromptRunner {
309
372
  moduleAnswers.includeSampleModel = false;
310
373
  }
311
374
 
312
- // Phase 4: Project Configuration
313
- // Requirements: 3.4 project configuration last
375
+ // Validate instance type against framework requirements (now that framework version is known)
376
+ const finalInstanceType = infraAnswers.customInstanceType || infraAnswers.instanceType;
377
+ if (finalInstanceType && frameworkVersionAnswers.frameworkVersion) {
378
+ await this._validateAndDisplayInstanceType(
379
+ finalInstanceType,
380
+ frameworkAnswers.framework,
381
+ frameworkVersionAnswers.frameworkVersion
382
+ );
383
+ }
384
+
385
+ // ══════════════════════════════════════════════════════════════════════
386
+ // Phase 5 — Project (project name + destination)
387
+ // ══════════════════════════════════════════════════════════════════════
314
388
  console.log('\n📋 Project Configuration');
315
389
  const allTechnicalAnswers = {
316
390
  ...frameworkAnswers,
@@ -439,6 +513,21 @@ export default class PromptRunner {
439
513
  delete combinedAnswers.customInstanceType;
440
514
  }
441
515
 
516
+ // Propagate tensor parallelism from instance-sizer to templates
517
+ // Requirements: 4.8 — auto-set TP when sizer recommends > 1
518
+ if (this._autoTensorParallelism) {
519
+ combinedAnswers.tensorParallelSize = this._autoTensorParallelism;
520
+ combinedAnswers.gpuCount = this._autoGpuCount;
521
+ } else if (this._instanceSizerMetadata) {
522
+ const sizerInstanceType = combinedAnswers.instanceType;
523
+ const sizerRecs = this._instanceSizerMetadata.recommendations || [];
524
+ const matchingRec = sizerRecs.find(r => r.instanceType === sizerInstanceType);
525
+ if (matchingRec && matchingRec.tensorParallelism > 1) {
526
+ combinedAnswers.tensorParallelSize = matchingRec.tensorParallelism;
527
+ combinedAnswers.gpuCount = matchingRec.gpuCount;
528
+ }
529
+ }
530
+
442
531
  // Handle custom HyperPod cluster name
443
532
  if (combinedAnswers.customHyperPodCluster) {
444
533
  combinedAnswers.hyperPodCluster = combinedAnswers.customHyperPodCluster;
@@ -623,6 +712,55 @@ export default class PromptRunner {
623
712
  return null;
624
713
  }
625
714
 
715
+ /**
716
+ * Extract CUDA version from the selected base image.
717
+ * Looks at the MCP base image metadata for accelerator.version or labels.cuda_version.
718
+ * @param {object} baseImageAnswers - Answers from the base image prompt
719
+ * @returns {string|null} CUDA version string (e.g., "12.1") or null
720
+ * @private
721
+ */
722
+ _extractCudaFromBaseImage(baseImageAnswers) {
723
+ if (!this._mcpBaseImageChoices) return null;
724
+
725
+ const selectedImage = baseImageAnswers.baseImage || baseImageAnswers.customBaseImage;
726
+ if (!selectedImage) return null;
727
+
728
+ // Find the matching entry in the MCP choices
729
+ const matchingChoice = this._mcpBaseImageChoices.find(c => c.value === selectedImage);
730
+ if (!matchingChoice) return null;
731
+
732
+ // Try to extract CUDA version from the choice metadata
733
+ // The formatImageChoices function stores labels in the choice object
734
+ if (matchingChoice._meta?.labels?.cuda_version) {
735
+ return matchingChoice._meta.labels.cuda_version;
736
+ }
737
+ if (matchingChoice._meta?.accelerator?.version) {
738
+ return matchingChoice._meta.accelerator.version;
739
+ }
740
+
741
+ return null;
742
+ }
743
+
744
+ /**
745
+ * Get architecture-based heuristic default instance type.
746
+ * Used when the instance-sizer cannot produce a recommendation.
747
+ * Requirements: 3.9, 4.6
748
+ * @param {string} architecture - Model architecture type
749
+ * @returns {string} Default instance type
750
+ * @private
751
+ */
752
+ _getArchitectureHeuristicDefault(architecture) {
753
+ const HEURISTIC_DEFAULTS = {
754
+ 'transformers': 'ml.g5.xlarge',
755
+ 'transformer': 'ml.g5.xlarge',
756
+ 'diffusors': 'ml.g5.2xlarge',
757
+ 'diffusor': 'ml.g5.2xlarge',
758
+ 'predictor': 'ml.m5.large',
759
+ 'http': 'ml.m5.large'
760
+ };
761
+ return Object.hasOwn(HEURISTIC_DEFAULTS, architecture) ? HEURISTIC_DEFAULTS[architecture] : 'ml.g5.xlarge';
762
+ }
763
+
626
764
  /**
627
765
  * Query MCP region-picker server before infrastructure prompts.
628
766
  * Populates configManager.mcpChoices so _runPhase injects them into list prompts.
@@ -671,8 +809,8 @@ export default class PromptRunner {
671
809
  }
672
810
 
673
811
  /**
674
- * Query MCP instance-recommender server after deployment target is known.
675
- * Only runs when deploymentTarget is realtime-inference.
812
+ * Query MCP instance-sizer server with tag-based search after deployment target is known.
813
+ * Used when no model name is available for VRAM-based sizing.
676
814
  * Populates configManager.mcpChoices so _runPhase injects them into list prompts.
677
815
  * @private
678
816
  */
@@ -686,7 +824,7 @@ export default class PromptRunner {
686
824
  const smart = this.options.smart === true;
687
825
 
688
826
  // Instance type: query if not already provided via CLI/config
689
- if (!explicitConfig.instanceType && mcpServers.includes('instance-recommender')) {
827
+ if (!explicitConfig.instanceType && mcpServers.includes('instance-sizer')) {
690
828
  const { instanceSearch } = await this._runPrompts([{
691
829
  type: 'input',
692
830
  name: 'instanceSearch',
@@ -695,8 +833,8 @@ export default class PromptRunner {
695
833
  }]);
696
834
 
697
835
  if (instanceSearch && instanceSearch.trim()) {
698
- console.log(` 🔍 Querying instance-recommender${smart ? ' [smart]' : ''}...`);
699
- const result = await cm.queryMcpServer('instance-recommender', {
836
+ console.log(` 🔍 Querying instance-sizer [search]${smart ? ' [smart]' : ''}...`);
837
+ const result = await cm.queryMcpServer('instance-sizer', {
700
838
  ...frameworkAnswers,
701
839
  instanceSearch: instanceSearch.trim()
702
840
  });
@@ -713,6 +851,146 @@ export default class PromptRunner {
713
851
  }
714
852
  }
715
853
 
854
+ /**
855
+ * Query the instance-sizer MCP server after model is known.
856
+ * Estimates VRAM requirements and returns filtered, ranked instance recommendations.
857
+ * Stores results in this._mcpInstanceSizerChoices and this._instanceSizerMetadata.
858
+ * Requirements: 4.4, 4.5, 4.7, 3.6, 3.7
859
+ * @param {object} frameworkAnswers - Framework/architecture answers
860
+ * @param {object} modelFormatAnswers - Model format answers (contains modelName)
861
+ * @param {object} explicitConfig - Explicit CLI/config values
862
+ * @param {object} [sizerContext={}] - Additional context for the sizer query
863
+ * @param {string} [sizerContext.cudaVersion] - CUDA version from base image
864
+ * @param {object} [sizerContext.profileEnvVars] - Profile ENV overrides
865
+ * @private
866
+ */
867
+ async _queryMcpForInstanceSizing(frameworkAnswers, modelFormatAnswers, explicitConfig, sizerContext = {}) {
868
+ const cm = this.configManager;
869
+ if (!cm) return;
870
+
871
+ const mcpServers = cm.getMcpServerNames();
872
+ if (!mcpServers.includes('instance-sizer')) return;
873
+
874
+ // Resolve model name from answers or explicit config
875
+ const modelName = modelFormatAnswers.customModelName || modelFormatAnswers.modelName || explicitConfig.modelName;
876
+ if (!modelName || modelName === 'Custom (enter manually)') return;
877
+
878
+ const smart = this.options.smart === true;
879
+ const discover = this.options.discover === true;
880
+
881
+ const modeLabel = [smart && '[smart]', discover && '[discover]'].filter(Boolean).join(' ');
882
+ console.log(` 🔍 Querying instance-sizer${modeLabel ? ` ${modeLabel}` : ''}...`);
883
+
884
+ try {
885
+ const mcpConfigPath = path.join(GENERATOR_ROOT, 'config', 'mcp.json');
886
+ if (!fs.existsSync(mcpConfigPath)) return;
887
+
888
+ const mcpConfig = JSON.parse(fs.readFileSync(mcpConfigPath, 'utf8'));
889
+ const serverConfig = mcpConfig.mcpServers?.['instance-sizer'];
890
+ if (!serverConfig) return;
891
+
892
+ const { Client } = await import('@modelcontextprotocol/sdk/client/index.js');
893
+ const { StdioClientTransport } = await import('@modelcontextprotocol/sdk/client/stdio.js');
894
+
895
+ const serverArgs = [...(serverConfig.args || [])];
896
+ if (discover && !serverArgs.includes('--discover')) {
897
+ serverArgs.push('--discover');
898
+ }
899
+
900
+ const transport = new StdioClientTransport({
901
+ command: serverConfig.command,
902
+ args: serverArgs,
903
+ env: {
904
+ ...process.env,
905
+ ...(serverConfig.env || {}),
906
+ ...(smart ? { BEDROCK_SMART: 'true' } : {})
907
+ },
908
+ stderr: 'pipe'
909
+ });
910
+
911
+ const mcpClient = new Client(
912
+ { name: 'ml-container-creator', version: '1.0.0' },
913
+ { capabilities: {} }
914
+ );
915
+
916
+ await mcpClient.connect(transport);
917
+
918
+ const toolArgs = {
919
+ modelName,
920
+ limit: 8,
921
+ context: {
922
+ architecture: frameworkAnswers.architecture || undefined,
923
+ backend: frameworkAnswers.backend || undefined,
924
+ deploymentTarget: frameworkAnswers.deploymentTarget || undefined,
925
+ profileEnvVars: sizerContext.profileEnvVars || undefined
926
+ }
927
+ };
928
+
929
+ // Add CUDA version from base image for filtering
930
+ if (sizerContext.cudaVersion) {
931
+ toolArgs.cudaVersion = sizerContext.cudaVersion;
932
+ }
933
+
934
+ // Add quantization if available from model format answers
935
+ if (modelFormatAnswers.quantization) {
936
+ toolArgs.quantization = modelFormatAnswers.quantization;
937
+ }
938
+
939
+ const result = await mcpClient.callTool({
940
+ name: 'get_instance_recommendation',
941
+ arguments: toolArgs
942
+ });
943
+
944
+ await mcpClient.close();
945
+
946
+ // Parse the response
947
+ const textBlock = result?.content?.find(b => b.type === 'text');
948
+ if (textBlock) {
949
+ const parsed = JSON.parse(textBlock.text);
950
+
951
+ if (parsed.choices?.instanceType?.length > 0) {
952
+ this._instanceSizerMetadata = parsed.metadata || null;
953
+
954
+ // Build display labels with VRAM estimate and utilization percentage
955
+ const recommendations = parsed.metadata?.recommendations || [];
956
+ const estimatedVramGb = parsed.metadata?.estimatedVramGb;
957
+
958
+ // Store choices with display labels for the instance prompt
959
+ this._mcpInstanceSizerChoices = parsed.choices.instanceType;
960
+ this._mcpInstanceSizerDisplayChoices = recommendations.map(rec => ({
961
+ name: rec.displayLabel || `${rec.instanceType} (${estimatedVramGb ? estimatedVramGb.toFixed(1) : '?'}GB / ${rec.totalVramGb || '?'}GB — ${rec.utilizationPercent || '?'}% utilization)`,
962
+ value: rec.instanceType,
963
+ short: rec.instanceType
964
+ }));
965
+
966
+ const choices = parsed.choices.instanceType;
967
+ const topRec = recommendations[0];
968
+ const vramInfo = estimatedVramGb
969
+ ? ` (VRAM: ${estimatedVramGb.toFixed(1)}GB)`
970
+ : '';
971
+ const tpInfo = topRec?.tensorParallelism > 1
972
+ ? ` [TP=${topRec.tensorParallelism}]`
973
+ : '';
974
+
975
+ console.log(` ✓ ${choices.length} sized instance(s): ${choices[0]}${vramInfo}${tpInfo}`);
976
+ } else if (parsed.metadata?.warning) {
977
+ console.log(` ⚠️ ${parsed.metadata.warning}`);
978
+ } else {
979
+ // Apply architecture heuristic fallback when sizer returns empty
980
+ const archForHeuristic = frameworkAnswers.architecture || frameworkAnswers.deploymentConfig?.split('-')[0];
981
+ this._architectureHeuristicDefault = this._getArchitectureHeuristicDefault(archForHeuristic);
982
+ console.log(` ↳ No instance-sizer results, using heuristic default: ${this._architectureHeuristicDefault}`);
983
+ }
984
+ }
985
+ } catch (err) {
986
+ // Sizer unavailable — apply architecture heuristic fallback
987
+ const archForHeuristic = frameworkAnswers.architecture || frameworkAnswers.deploymentConfig?.split('-')[0];
988
+ this._architectureHeuristicDefault = this._getArchitectureHeuristicDefault(archForHeuristic);
989
+ console.log(` ⚠️ instance-sizer: ${err.message}`);
990
+ console.log(` ↳ Using heuristic default: ${this._architectureHeuristicDefault}`);
991
+ }
992
+ }
993
+
716
994
  /**
717
995
  * Query the hyperpod-cluster-picker MCP server for available HyperPod EKS clusters.
718
996
  * Populates configManager.mcpChoices.hyperPodCluster so _runPhase injects them into the list prompt.
@@ -1297,16 +1575,21 @@ export default class PromptRunner {
1297
1575
  * supports multiple versions. The choice transparently resolves to the
1298
1576
  * correct SageMaker inference AMI.
1299
1577
  *
1578
+ * When a base image CUDA version is provided, auto-resolves by intersecting
1579
+ * with the instance's supported versions. Removes the CUDA prompt from the
1580
+ * interactive flow when auto-resolution succeeds.
1581
+ *
1300
1582
  * Skipped for CPU instances, non-CUDA accelerators, or when only one
1301
1583
  * compatible CUDA version exists.
1302
1584
  *
1303
1585
  * @param {string} instanceType - Selected instance type (e.g. "ml.g5.2xlarge")
1304
1586
  * @param {string} framework - Selected framework name
1305
1587
  * @param {string} frameworkVersion - Selected framework version
1588
+ * @param {string} [baseImageCuda] - CUDA version from selected base image (for auto-resolution)
1306
1589
  * @returns {Promise<{cudaVersion: string, inferenceAmiVersion: string}|null>}
1307
1590
  * @private
1308
1591
  */
1309
- async _promptCudaVersion(instanceType, framework, frameworkVersion) {
1592
+ async _promptCudaVersion(instanceType, framework, frameworkVersion, baseImageCuda) {
1310
1593
  if (!instanceType) return null;
1311
1594
 
1312
1595
  // Look up instance in accelerator mapping
@@ -1316,6 +1599,33 @@ export default class PromptRunner {
1316
1599
  const instanceCudaVersions = instanceInfo.accelerator.versions;
1317
1600
  if (!instanceCudaVersions || instanceCudaVersions.length === 0) return null;
1318
1601
 
1602
+ // Auto-resolution: when base image specifies a CUDA version, intersect with instance support
1603
+ // Requirements: 3.11, 4.9, 4.10, 4.11
1604
+ if (baseImageCuda) {
1605
+ const majorRequired = baseImageCuda.split('.')[0];
1606
+ const intersection = instanceCudaVersions.filter(v => {
1607
+ if (v === baseImageCuda) return true;
1608
+ if (v.startsWith(`${majorRequired }.`)) return true;
1609
+ return false;
1610
+ });
1611
+
1612
+ if (intersection.length > 0) {
1613
+ // Auto-select: pick exact match or highest compatible
1614
+ const exactMatch = intersection.find(v => v === baseImageCuda);
1615
+ const selectedVersion = exactMatch || intersection.sort().pop();
1616
+ const inferenceAmiVersion = PromptRunner.CUDA_AMI_MAP[selectedVersion];
1617
+ if (inferenceAmiVersion) {
1618
+ console.log(`\n🔧 CUDA ${selectedVersion} auto-resolved from base image (requires ${baseImageCuda})`);
1619
+ console.log(` AMI: ${inferenceAmiVersion}`);
1620
+ return { cudaVersion: selectedVersion, inferenceAmiVersion };
1621
+ }
1622
+ } else {
1623
+ // No intersection — warn and fall through to manual prompt
1624
+ console.log(`\n ⚠️ Base image requires CUDA ${baseImageCuda} but instance ${instanceType} supports: ${instanceCudaVersions.join(', ')}`);
1625
+ console.log(' No compatible CUDA version found. Falling back to manual selection.');
1626
+ }
1627
+ }
1628
+
1319
1629
  // Get framework CUDA requirements (if available)
1320
1630
  const registryConfigManager = this.registryConfigManager;
1321
1631
  const frameworkConfig = registryConfigManager?.frameworkRegistry?.[framework]?.[frameworkVersion];