@aws/ml-container-creator 1.0.2 → 1.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. package/README.md +1 -1
  2. package/bin/cli.js +1 -1
  3. package/config/tune-catalog.json +303 -1
  4. package/infra/ci-harness/lib/ci-harness-stack.ts +43 -0
  5. package/package.json +3 -2
  6. package/servers/base-image-picker/index.js +65 -18
  7. package/servers/instance-sizer/index.js +32 -0
  8. package/servers/lib/catalogs/fleet-drivers.json +38 -0
  9. package/servers/lib/catalogs/model-arch-support.json +51 -0
  10. package/servers/lib/catalogs/model-servers.json +2842 -1516
  11. package/servers/lib/schemas/image-catalog.schema.json +12 -0
  12. package/src/app.js +6 -4
  13. package/src/lib/bootstrap-command-handler.js +12 -2
  14. package/src/lib/bootstrap-profile-manager.js +16 -0
  15. package/src/lib/cross-cutting-checker.js +6 -1
  16. package/src/lib/generated/cli-options.js +1 -1
  17. package/src/lib/generated/parameter-matrix.js +1 -1
  18. package/src/lib/generated/validation-rules.js +1 -1
  19. package/src/lib/mcp-query-runner.js +110 -3
  20. package/src/lib/prompt-runner.js +66 -22
  21. package/src/lib/template-variable-resolver.js +8 -0
  22. package/src/lib/train-config-builder.js +339 -0
  23. package/templates/do/.benchmark_writer.py +3 -0
  24. package/templates/do/.eval_helper.py +409 -0
  25. package/templates/do/.register_helper.py +185 -11
  26. package/templates/do/.train_build_request.py +102 -113
  27. package/templates/do/.train_helper.py +433 -0
  28. package/templates/do/__pycache__/.register_helper.cpython-312.pyc +0 -0
  29. package/templates/do/adapter +157 -0
  30. package/templates/do/benchmark +60 -3
  31. package/templates/do/deploy.d/managed-inference.ejs +83 -0
  32. package/templates/do/evaluate +272 -0
  33. package/templates/do/lib/resolve-instance.sh +155 -0
  34. package/templates/do/register +5 -0
  35. package/templates/do/test +1 -0
  36. package/templates/do/train +879 -126
  37. package/templates/do/training/config.yaml +83 -11
  38. package/templates/do/training/dpo/accelerate_config.yaml +24 -0
  39. package/templates/do/training/dpo/defaults.yaml +26 -0
  40. package/templates/do/training/dpo/prompts.json +8 -0
  41. package/templates/do/training/dpo/train.py +363 -0
  42. package/templates/do/training/sft/accelerate_config.yaml +22 -0
  43. package/templates/do/training/sft/defaults.yaml +18 -0
  44. package/templates/do/training/sft/prompts.json +7 -0
  45. package/templates/do/training/sft/train.py +310 -0
  46. package/templates/do/tune +11 -2
  47. package/templates/do/.train_poll_parser.py +0 -135
  48. package/templates/do/.train_status_parser.py +0 -187
  49. /package/templates/do/training/{train.py → custom/train.py} +0 -0
@@ -159,6 +159,18 @@
159
159
  "items": {
160
160
  "type": "string"
161
161
  }
162
+ },
163
+ "min_driver_version": {
164
+ "type": "string",
165
+ "description": "Minimum GPU driver version required (e.g., '550.54')"
166
+ },
167
+ "cuda_toolkit": {
168
+ "type": "string",
169
+ "description": "CUDA toolkit version bundled in the image (e.g., '12.4')"
170
+ },
171
+ "transformers_version": {
172
+ "type": "string",
173
+ "description": "Bundled transformers library version (e.g., '4.52.0')"
162
174
  }
163
175
  },
164
176
  "additionalProperties": false
package/src/app.js CHANGED
@@ -366,10 +366,11 @@ export async function writeProject(templateDir, destDir, answers, registryConfig
366
366
  const trainIncluded = answers.deploymentTarget !== 'batch-transform';
367
367
  if (!trainIncluded) {
368
368
  ignorePatterns.push('**/do/train');
369
+ ignorePatterns.push('**/do/.train_helper.py');
369
370
  ignorePatterns.push('**/do/.train_build_request.py');
370
- ignorePatterns.push('**/do/.train_status_parser.py');
371
- ignorePatterns.push('**/do/.train_poll_parser.py');
372
371
  ignorePatterns.push('**/do/training/**');
372
+ ignorePatterns.push('**/do/evaluate');
373
+ ignorePatterns.push('**/do/.eval_helper.py');
373
374
  }
374
375
 
375
376
  // Exclude feedback.sh when neither tune nor train is included
@@ -404,10 +405,11 @@ export async function writeProject(templateDir, destDir, answers, registryConfig
404
405
  ignorePatterns.push('**/do/.adapter_helper.py');
405
406
  ignorePatterns.push('**/do/.register_helper.py');
406
407
  ignorePatterns.push('**/do/train');
408
+ ignorePatterns.push('**/do/.train_helper.py');
407
409
  ignorePatterns.push('**/do/.train_build_request.py');
408
- ignorePatterns.push('**/do/.train_status_parser.py');
409
- ignorePatterns.push('**/do/.train_poll_parser.py');
410
410
  ignorePatterns.push('**/do/training/**');
411
+ ignorePatterns.push('**/do/evaluate');
412
+ ignorePatterns.push('**/do/.eval_helper.py');
411
413
  ignorePatterns.push('**/do/add-ic');
412
414
  ignorePatterns.push('**/do/run');
413
415
  ignorePatterns.push('**/sample_model/**');
@@ -64,6 +64,7 @@ export default class BootstrapCommandHandler {
64
64
  _handlePrune() { return this.profileManager._handlePrune(); }
65
65
  _handleSyncSchemas() { return this.profileManager._handleSyncSchemas(); }
66
66
  _handleSyncModelFamilies() { return this.profileManager._handleSyncModelFamilies(); }
67
+ _handleSyncServingVersions() { return this.profileManager._handleSyncServingVersions(); }
67
68
 
68
69
  /**
69
70
  * Dispatch bootstrap subcommands.
@@ -132,6 +133,9 @@ export default class BootstrapCommandHandler {
132
133
  case 'sync-model-families':
133
134
  await this._handleSyncModelFamilies();
134
135
  break;
136
+ case 'sync-serving-versions':
137
+ await this._handleSyncServingVersions();
138
+ break;
135
139
  // Migration path: upgrades legacy profiles to current naming conventions.
136
140
  // Corrects stackName to mlcc-bootstrap-{profileName}, renames sharedStackFrom
137
141
  // to sharedInfraFrom. Idempotent — safe to run multiple times.
@@ -1467,7 +1471,9 @@ SUBCOMMANDS:
1467
1471
  prune Remove deleted and unknown records from the deployment manifest
1468
1472
  update Re-deploy bootstrap stacks using active profile (no prompts)
1469
1473
  migrate Upgrade legacy profiles to current naming conventions
1474
+ sync-schemas Download AWS service model schemas (sagemaker, iam, ecr, s3)
1470
1475
  sync-model-families Discover tune-eligible models from JumpStart Hub and update catalog
1476
+ sync-serving-versions Discover latest vLLM/SGLang/TRT-LLM image versions and update catalog
1471
1477
 
1472
1478
  SETUP OPTIONS:
1473
1479
  --non-interactive Run without interactive prompts
@@ -1477,8 +1483,10 @@ SETUP OPTIONS:
1477
1483
  --role-arn <arn> Use existing IAM role ARN (skip role creation)
1478
1484
  --skip-s3 Skip S3 bucket creation
1479
1485
  --ci Provision CI testing infrastructure
1486
+ --benchmark-infra Provision Athena/Glue benchmark infrastructure (requires --ci)
1480
1487
  --skip-ci Skip CI infrastructure provisioning
1481
1488
  --skip-post-setup Skip post-setup chain (mcp init, sync-architectures, sync-schemas)
1489
+ --ignore-staleness Suppress schema staleness warnings
1482
1490
 
1483
1491
  STATUS OPTIONS:
1484
1492
  --verify Check each active resource against AWS APIs for drift detection
@@ -1495,13 +1503,15 @@ EXAMPLES:
1495
1503
  ml-container-creator bootstrap list
1496
1504
  ml-container-creator bootstrap remove dev
1497
1505
  ml-container-creator bootstrap remove dev --force --delete-stack
1506
+ ml-container-creator bootstrap update
1507
+ ml-container-creator bootstrap update --ci --benchmark-infra
1498
1508
  ml-container-creator bootstrap scan
1509
+ ml-container-creator bootstrap sync-schemas
1499
1510
  ml-container-creator bootstrap sync-model-families
1511
+ ml-container-creator bootstrap sync-serving-versions
1500
1512
  ml-container-creator bootstrap migrate
1501
1513
  ml-container-creator bootstrap --non-interactive --profile my-aws-profile --region us-west-2
1502
- ml-container-creator bootstrap --non-interactive --profile my-aws-profile --role-arn arn:aws:iam::123456789012:role/MyRole --skip-s3
1503
1514
  ml-container-creator bootstrap --non-interactive --profile my-aws-profile --region us-west-2 --ci
1504
- ml-container-creator bootstrap --non-interactive --profile my-aws-profile --region us-west-2 --skip-ci
1505
1515
  `);
1506
1516
  }
1507
1517
 
@@ -655,4 +655,20 @@ export default class BootstrapProfileManager {
655
655
  process.exit(1);
656
656
  }
657
657
  }
658
+
659
+ /**
660
+ * Handle sync-serving-versions subcommand: discover latest container image
661
+ * versions for vLLM, SGLang, and TensorRT-LLM and update the model-servers catalog.
662
+ */
663
+ async _handleSyncServingVersions() {
664
+ console.log('\n🔄 Sync Serving Versions — Discovering latest container images...\n');
665
+ try {
666
+ const { syncServingVersions } = await import('../../scripts/sync-serving-versions.js');
667
+ const result = await syncServingVersions();
668
+ console.log(`\n✅ Sync complete: ${result.totalAdded} new, ${result.totalRemoved} pruned\n`);
669
+ } catch (err) {
670
+ console.log(`❌ Sync failed: ${err.message}`);
671
+ process.exit(1);
672
+ }
673
+ }
658
674
  }
@@ -290,7 +290,12 @@ export default class CrossCuttingChecker {
290
290
  if (!modelType || !server || !serverVersion) return findings;
291
291
 
292
292
  const entries = modelServersCatalog[server] || [];
293
- const entry = entries.find(e => e.labels?.framework_version === serverVersion);
293
+ // Try exact version match first, then fall back to nearest entry with supportedModelTypes
294
+ let entry = entries.find(e => e.labels?.framework_version === serverVersion);
295
+ if (!entry?.supportedModelTypes?.length) {
296
+ // Fall back to any entry that has supportedModelTypes populated
297
+ entry = entries.find(e => e.supportedModelTypes?.length > 0);
298
+ }
294
299
  if (!entry?.supportedModelTypes?.length) return findings;
295
300
 
296
301
  if (!entry.supportedModelTypes.includes(modelType.toLowerCase())) {
@@ -1,6 +1,6 @@
1
1
  // AUTO-GENERATED by scripts/codegen-cli.js — DO NOT EDIT
2
2
  // Source: config/parameter-schema-v2.json
3
- // Generated: 2026-06-23T20:55:23.381Z
3
+ // Generated: 2026-06-30T16:45:56.916Z
4
4
 
5
5
  /**
6
6
  * CLI option definitions derived from parameter-schema-v2.json.
@@ -1,6 +1,6 @@
1
1
  // AUTO-GENERATED by scripts/codegen-parameter-matrix.js — DO NOT EDIT
2
2
  // Source: config/parameter-schema-v2.json
3
- // Generated: 2026-06-23T20:55:23.482Z
3
+ // Generated: 2026-06-30T16:45:57.021Z
4
4
 
5
5
  /**
6
6
  * Parameter matrix defining how each parameter is loaded from various sources.
@@ -1,6 +1,6 @@
1
1
  // AUTO-GENERATED by scripts/codegen-validator.js — DO NOT EDIT
2
2
  // Source: config/parameter-schema-v2.json
3
- // Generated: 2026-06-23T20:55:23.412Z
3
+ // Generated: 2026-06-30T16:45:56.949Z
4
4
 
5
5
  /**
6
6
  * Validation rules derived from parameter-schema-v2.json.
@@ -384,6 +384,9 @@ export default class McpQueryRunner {
384
384
  const endpointNames = result.choices.endpointName;
385
385
  const metadata = result.metadata || {};
386
386
 
387
+ // Store endpoint metadata for later instance type resolution (US-1)
388
+ this.runner._endpointPickerMetadata = metadata;
389
+
387
390
  // Build choices with metadata annotations
388
391
  this.runner._mcpEndpointChoices = endpointNames.map(name => {
389
392
  const meta = metadata[name];
@@ -412,12 +415,15 @@ export default class McpQueryRunner {
412
415
  }
413
416
 
414
417
  /**
415
- * Query MCP base-image-picker server after deployment config is selected.
418
+ * Query MCP base-image-picker server after deployment config and instance type are known.
416
419
  * Populates _mcpBaseImageChoices for the base image selection prompt.
417
- * Requirements: 5.1, 5.2, 5.3, 5.4, 9.1, 9.2, 9.3
420
+ * Requirements: 5.1, 5.2, 5.3, 5.4, 9.1, 9.2, 9.3, US-1 (ordering constraint)
421
+ * @param {Object} frameworkAnswers - Framework/architecture answers
422
+ * @param {Object} _explicitConfig - Explicit CLI/config values
423
+ * @param {Object} [infraContext] - Infrastructure context (instanceType, tensorParallelSize, modelId)
418
424
  * @private
419
425
  */
420
- async _queryMcpForBaseImage(frameworkAnswers, _explicitConfig) {
426
+ async _queryMcpForBaseImage(frameworkAnswers, _explicitConfig, infraContext = {}) {
421
427
  // Skip if base image provided via CLI --base-image flag
422
428
  if (this.runner.options['base-image']) return;
423
429
 
@@ -454,6 +460,17 @@ export default class McpQueryRunner {
454
460
  context.searchCriteria = searchCriteria.trim();
455
461
  }
456
462
 
463
+ // Pass infrastructure context for driver-aware filtering (US-1 ordering constraint)
464
+ if (infraContext.instanceType) {
465
+ context.instanceType = infraContext.instanceType;
466
+ }
467
+ if (infraContext.tensorParallelSize !== null && infraContext.tensorParallelSize !== undefined) {
468
+ context.tensorParallelSize = infraContext.tensorParallelSize;
469
+ }
470
+ if (infraContext.modelId) {
471
+ context.modelId = infraContext.modelId;
472
+ }
473
+
457
474
  const result = await cm.queryMcpServer('base-image-picker', context);
458
475
 
459
476
  if (result && result.metadata?.baseImage?.length > 0) {
@@ -716,6 +733,96 @@ export default class McpQueryRunner {
716
733
  }
717
734
  }
718
735
 
736
+ /**
737
+ * Resolve instance type from an existing endpoint.
738
+ * Priority:
739
+ * 1. Endpoint-picker metadata (already fetched, no network call)
740
+ * 2. Direct AWS SDK call: DescribeEndpoint → DescribeEndpointConfig
741
+ *
742
+ * Reuses the resolution pattern from do/lib/resolve-instance.sh:
743
+ * - Check ProductionVariants[0].CurrentInstanceType or InstanceType
744
+ * - Fallback: DescribeEndpointConfig → ProductionVariants[0].InstanceType
745
+ * - Final fallback: InstancePools[0] (highest priority)
746
+ *
747
+ * Requirements: US-1 (ordering constraint — resolve instance type before base image picker)
748
+ * @param {string} endpointName - The existing endpoint name
749
+ * @param {string} awsRegion - AWS region for API calls
750
+ * @returns {Promise<string|null>} Resolved instance type or null on failure
751
+ * @private
752
+ */
753
+ async _resolveEndpointInstanceType(endpointName, awsRegion) {
754
+ // Strategy 1: Use endpoint-picker metadata (already fetched, no network call)
755
+ if (this.runner._endpointPickerMetadata) {
756
+ const meta = this.runner._endpointPickerMetadata[endpointName];
757
+ if (meta?.instanceType) {
758
+ // Strip pool annotation if present: "ml.g5.12xlarge (pool: 3 types)" → "ml.g5.12xlarge"
759
+ const rawInstanceType = meta.instanceType.includes(' (pool:')
760
+ ? meta.instanceType.split(' (pool:')[0]
761
+ : meta.instanceType;
762
+ if (rawInstanceType && rawInstanceType !== 'unknown') {
763
+ console.log(` ✓ Resolved instance type from endpoint metadata: ${rawInstanceType}`);
764
+ return rawInstanceType;
765
+ }
766
+ }
767
+ }
768
+
769
+ // Strategy 2: Direct AWS SDK call (for custom endpoint names not in picker results)
770
+ console.log(' 🔍 Resolving instance type from existing endpoint...');
771
+ try {
772
+ const { SageMakerClient, DescribeEndpointCommand, DescribeEndpointConfigCommand } = await import('@aws-sdk/client-sagemaker');
773
+
774
+ const region = awsRegion || process.env.AWS_REGION || 'us-east-1';
775
+ const clientOptions = { region };
776
+
777
+ // Use AWS profile if available
778
+ const awsProfile = this.runner.configManager?.config?.awsProfile
779
+ || this.runner.options?.profile || process.env.AWS_PROFILE || null;
780
+ if (awsProfile) {
781
+ try {
782
+ const { fromIni } = await import('@aws-sdk/credential-providers');
783
+ clientOptions.credentials = fromIni({ profile: awsProfile });
784
+ } catch {
785
+ // credential-providers not available, use default chain
786
+ }
787
+ }
788
+
789
+ const client = new SageMakerClient(clientOptions);
790
+
791
+ // DescribeEndpoint — check ProductionVariants for instance type
792
+ const epResponse = await client.send(new DescribeEndpointCommand({ EndpointName: endpointName }));
793
+
794
+ const primaryVariant = (epResponse.ProductionVariants || [])[0] || {};
795
+ let instanceType = primaryVariant.CurrentInstanceType || primaryVariant.InstanceType || null;
796
+
797
+ // Fallback: DescribeEndpointConfig
798
+ if (!instanceType && epResponse.EndpointConfigName) {
799
+ const ecResponse = await client.send(
800
+ new DescribeEndpointConfigCommand({ EndpointConfigName: epResponse.EndpointConfigName })
801
+ );
802
+ const ecVariant = (ecResponse.ProductionVariants || [])[0];
803
+ if (ecVariant?.InstanceType) {
804
+ instanceType = ecVariant.InstanceType;
805
+ } else if (ecVariant?.InstancePools?.length > 0) {
806
+ // Use highest-priority pool entry (lowest Priority number)
807
+ const sorted = [...ecVariant.InstancePools].sort((a, b) => (a.Priority || 99) - (b.Priority || 99));
808
+ instanceType = sorted[0].InstanceType || null;
809
+ }
810
+ }
811
+
812
+ if (instanceType) {
813
+ console.log(` ✓ Resolved instance type from endpoint: ${instanceType}`);
814
+ return instanceType;
815
+ }
816
+
817
+ console.log(' ↳ Could not determine instance type from endpoint');
818
+ return null;
819
+ } catch (err) {
820
+ // Graceful fallback: if AWS call fails, skip filtering (no driver-aware filter)
821
+ console.log(` ⚠️ Could not resolve instance type from endpoint: ${err.message}`);
822
+ return null;
823
+ }
824
+ }
825
+
719
826
  /**
720
827
  * Validate and display instance type compatibility
721
828
  * Requirements: 4.1, 4.2, 4.3, 4.4, 4.5, 4.6
@@ -68,6 +68,7 @@ export default class PromptRunner {
68
68
  _queryMcpForInstance(...args) { return this.mcpQueryRunner._queryMcpForInstance(...args); }
69
69
  _queryMcpForInstanceSizing(...args) { return this.mcpQueryRunner._queryMcpForInstanceSizing(...args); }
70
70
  _queryMcpForEndpoints(...args) { return this.mcpQueryRunner._queryMcpForEndpoints(...args); }
71
+ _resolveEndpointInstanceType(...args) { return this.mcpQueryRunner._resolveEndpointInstanceType(...args); }
71
72
  _queryMcpForHyperPod(...args) { return this.mcpQueryRunner._queryMcpForHyperPod(...args); }
72
73
  _fetchAndDisplayModelInfo(...args) { return this.mcpQueryRunner._fetchAndDisplayModelInfo(...args); }
73
74
  _validateAndDisplayInstanceType(...args) { return this.mcpQueryRunner._validateAndDisplayInstanceType(...args); }
@@ -182,8 +183,8 @@ export default class PromptRunner {
182
183
  }
183
184
 
184
185
  // ══════════════════════════════════════════════════════════════════════
185
- // Phase 2 — How (deployment target + serving profile + base image)
186
- // Requirements: 4.3 — instance prompt appears AFTER base image is known
186
+ // Phase 2 — How (deployment target + serving profile)
187
+ // Requirements: US-1 — base image selection moved AFTER instance resolution
187
188
  // ══════════════════════════════════════════════════════════════════════
188
189
  console.log('\n💪 Infrastructure & Deployment');
189
190
 
@@ -192,25 +193,8 @@ export default class PromptRunner {
192
193
  const regionPreviousAnswers = bootstrapRegion ? { _bootstrapRegion: bootstrapRegion } : {};
193
194
  const regionAndTargetAnswers = await this._runPhase(infraRegionAndTargetPrompts, { ...frameworkAnswers, ...regionPreviousAnswers }, explicitConfig, existingConfig);
194
195
 
195
- // 2b. Query base-image-picker MCP server for base image choices
196
- await this.mcpQueryRunner._queryMcpForBaseImage(frameworkAnswers, explicitConfig);
197
- const baseImagePreviousAnswers = {
198
- ...frameworkAnswers,
199
- ...engineAnswers,
200
- ...(this._mcpBaseImageChoices ? { _mcpBaseImageChoices: this._mcpBaseImageChoices } : {})
201
- };
202
- const baseImageAnswers = await this._runPhase(
203
- baseImagePrompts,
204
- baseImagePreviousAnswers,
205
- explicitConfig,
206
- existingConfig
207
- );
208
-
209
- // Requirements: 4.2-4.5 — Check model architecture compatibility after base image selection
210
- this._checkModelArchitectureCompatibility(baseImageAnswers, frameworkAnswers);
211
-
212
- // Extract CUDA version from selected base image for instance-sizer context
213
- const selectedBaseImageCuda = this._extractCudaFromBaseImage(baseImageAnswers);
196
+ // NOTE: Base image selection moved to Phase 3 (after instance type resolution)
197
+ // to enable driver-aware filtering. See US-1 ordering constraint in requirements.
214
198
 
215
199
  // ══════════════════════════════════════════════════════════════════════
216
200
  // Phase 3 — Where (region + instance [derived] + CUDA/AMI + HyperPod + build target)
@@ -283,7 +267,7 @@ export default class PromptRunner {
283
267
  } else if (phase1ModelId && phase1ModelId !== 'Custom (enter manually)') {
284
268
  // Query instance-sizer with full context
285
269
  await this.mcpQueryRunner._queryMcpForInstanceSizing(frameworkAnswers, modelFormatAnswers, explicitConfig, {
286
- cudaVersion: selectedBaseImageCuda,
270
+ cudaVersion: null, // base image not yet selected (moved after instance resolution)
287
271
  profileEnvVars: this._selectedProfileEnvVars || {}
288
272
  });
289
273
  } else {
@@ -422,6 +406,66 @@ export default class PromptRunner {
422
406
  }
423
407
  }
424
408
 
409
+ // 3b2. Base image selection — AFTER instance type resolved (US-1 ordering constraint)
410
+ // Pass resolved instanceType and tensorParallelSize for driver-aware filtering
411
+ let resolvedInstanceType = instanceAnswers.customInstanceType || instanceAnswers.instanceType;
412
+ let resolvedTensorParallelSize = this._autoTensorParallelism || 1;
413
+
414
+ // For existing endpoints: resolve instance type from the endpoint (US-1 ordering constraint)
415
+ // The instance type is needed for driver-aware base image filtering even though the user
416
+ // doesn't select it manually. Pattern reused from do/lib/resolve-instance.sh.
417
+ if (!resolvedInstanceType && existingEndpointAnswers.existingEndpointName) {
418
+ const resolvedRegion = regionAndTargetAnswers.customAwsRegion || regionAndTargetAnswers.awsRegion;
419
+ resolvedInstanceType = await this.mcpQueryRunner._resolveEndpointInstanceType(
420
+ existingEndpointAnswers.existingEndpointName,
421
+ resolvedRegion
422
+ );
423
+ // Store resolved instance type for downstream use (IC config, GPU count derivation)
424
+ if (resolvedInstanceType) {
425
+ existingEndpointAnswers._resolvedEndpointInstanceType = resolvedInstanceType;
426
+ // Propagate as instanceType so template-variable-resolver derives
427
+ // icGpuCount and tensorParallelSize from the instance catalog.
428
+ // Without this, IC_GPU_COUNT defaults to 1 even for multi-GPU instances.
429
+ existingEndpointAnswers.instanceType = resolvedInstanceType;
430
+
431
+ // Derive GPU count from instance catalog for immediate use (TP for base image filtering)
432
+ const endpointInstanceEntry = instanceCatalogRaw[resolvedInstanceType];
433
+ if (endpointInstanceEntry?.gpus && endpointInstanceEntry.gpus > 1) {
434
+ existingEndpointAnswers.gpuCount = endpointInstanceEntry.gpus;
435
+ existingEndpointAnswers.tensorParallelSize = endpointInstanceEntry.gpus;
436
+ this._autoTensorParallelism = endpointInstanceEntry.gpus;
437
+ this._autoGpuCount = endpointInstanceEntry.gpus;
438
+ console.log(` ✓ Endpoint instance ${resolvedInstanceType}: ${endpointInstanceEntry.gpus} GPUs (TP=${endpointInstanceEntry.gpus})`);
439
+ }
440
+ }
441
+ }
442
+
443
+ // Re-read tensor parallel size after potential endpoint resolution update
444
+ resolvedTensorParallelSize = this._autoTensorParallelism || 1;
445
+
446
+ await this.mcpQueryRunner._queryMcpForBaseImage(frameworkAnswers, explicitConfig, {
447
+ instanceType: resolvedInstanceType,
448
+ tensorParallelSize: resolvedTensorParallelSize,
449
+ modelId: phase1ModelId || undefined
450
+ });
451
+ const baseImagePreviousAnswers = {
452
+ ...frameworkAnswers,
453
+ ...engineAnswers,
454
+ ...(this._mcpBaseImageChoices ? { _mcpBaseImageChoices: this._mcpBaseImageChoices } : {})
455
+ };
456
+ const baseImageAnswers = await this._runPhase(
457
+ baseImagePrompts,
458
+ baseImagePreviousAnswers,
459
+ explicitConfig,
460
+ existingConfig
461
+ );
462
+
463
+ // Requirements: 4.2-4.5 — Check model architecture compatibility after base image selection
464
+ this._checkModelArchitectureCompatibility(baseImageAnswers, frameworkAnswers);
465
+
466
+ // Extract CUDA version from selected base image for CUDA/AMI auto-resolution
467
+ const selectedBaseImageCuda = this._extractCudaFromBaseImage(baseImageAnswers);
468
+
425
469
  // 3c. Async-specific prompts (only when deploymentTarget === 'async-inference')
426
470
  let asyncAnswers = {};
427
471
  if (regionAndTargetAnswers.deploymentTarget === 'async-inference') {
@@ -454,6 +454,14 @@ export async function _ensureTemplateVariables(answers, registryConfigManager =
454
454
  answers.tensorParallelSize = instanceGpuCount;
455
455
  answers._tpAutoResolved = true;
456
456
  answers._tpAutoResolvedFrom = answers.instanceType;
457
+
458
+ // Also propagate to icEnvVars so IC_ENV_VLLM_TENSOR_PARALLEL_SIZE
459
+ // (or equivalent) is written in do/config for deploy-time IC creation.
460
+ if (!answers.icEnvVars) {
461
+ answers.icEnvVars = {};
462
+ }
463
+ answers.icEnvVars[tpEnvKey] = String(instanceGpuCount);
464
+
457
465
  console.log(` ℹ️ TP degree: ${instanceGpuCount} (auto-detected from ${answers.instanceType})`);
458
466
  }
459
467
  }