@aws/ml-container-creator 0.13.5 → 0.15.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. package/config/parameter-schema-v2.json +33 -5
  2. package/infra/ci-harness/lib/ci-harness-stack.ts +13 -5
  3. package/infra/ci-harness/package-lock.json +121 -111
  4. package/infra/ci-harness/package.json +1 -1
  5. package/package.json +2 -2
  6. package/servers/endpoint-picker/index.js +23 -14
  7. package/servers/instance-sizer/index.js +72 -4
  8. package/servers/instance-sizer/lib/model-resolver.js +28 -2
  9. package/src/app.js +15 -0
  10. package/src/lib/config-loader.js +18 -0
  11. package/src/lib/config-manager.js +6 -1
  12. package/src/lib/dataset-slug.js +152 -0
  13. package/src/lib/generated/cli-options.js +9 -3
  14. package/src/lib/generated/parameter-matrix.js +15 -4
  15. package/src/lib/generated/validation-rules.js +1 -1
  16. package/src/lib/mcp-client.js +15 -1
  17. package/src/lib/mcp-query-runner.js +11 -1
  18. package/src/lib/prompt-runner.js +40 -20
  19. package/src/lib/prompts/feature-prompts.js +1 -1
  20. package/src/lib/template-manager.js +0 -7
  21. package/src/lib/template-variable-resolver.js +51 -1
  22. package/src/lib/tune-config-state.js +14 -1
  23. package/templates/do/.benchmark_writer.py +43 -0
  24. package/templates/do/.register_helper.py +1185 -0
  25. package/templates/do/.tune_helper.py +168 -2
  26. package/templates/do/__pycache__/.adapter_helper.cpython-312.pyc +0 -0
  27. package/templates/do/__pycache__/.benchmark_writer.cpython-312.pyc +0 -0
  28. package/templates/do/__pycache__/.register_helper.cpython-312.pyc +0 -0
  29. package/templates/do/__pycache__/.tune_helper.cpython-312.pyc +0 -0
  30. package/templates/do/adapter +319 -27
  31. package/templates/do/add-ic +85 -3
  32. package/templates/do/benchmark +28 -8
  33. package/templates/do/config +20 -0
  34. package/templates/do/lib/inference-component.sh +56 -3
  35. package/templates/do/register +557 -6
  36. package/templates/do/test +12 -2
  37. package/templates/do/tune +219 -6
@@ -25,8 +25,8 @@ import { readFileSync } from 'node:fs';
25
25
  import { fileURLToPath } from 'node:url';
26
26
  import { resolve, dirname } from 'node:path';
27
27
  import { resolveModelMetadata } from './lib/model-resolver.js';
28
- import { estimateVram } from './lib/vram-estimator.js';
29
- import { filterAndRankInstances, applyAvailabilityRanking } from './lib/instance-ranker.js';
28
+ import { estimateVram, computeMaxModelLen } from './lib/vram-estimator.js';
29
+ import { filterAndRankInstances, applyAvailabilityRanking, getPerGpuMemoryGb } from './lib/instance-ranker.js';
30
30
  import { QuotaResolver } from './lib/quota-resolver.js';
31
31
  import { queryBedrock } from '../lib/bedrock-client.js';
32
32
 
@@ -393,6 +393,66 @@ async function handleGetInstanceRecommendation(params) {
393
393
  { limit }
394
394
  );
395
395
 
396
+ // Step 3-max_model_len: When no instance fits at full context, try capping context length
397
+ // NFR-1 guard: skip this logic for models with recommendedInstances in catalog
398
+ let suggestedMaxModelLen = null;
399
+ let contextLengthCapped = false;
400
+ let originalMaxPositionEmbeddings = null;
401
+
402
+ if (recommendations.length === 0 && !modelMetadata.recommendedInstances && modelMetadata.maxPositionEmbeddings) {
403
+ // Find the largest available GPU instance
404
+ const gpuInstances = Object.entries(effectiveCatalog)
405
+ .filter(([, meta]) => meta.category === 'gpu' && meta.gpus > 0)
406
+ .map(([name, meta]) => {
407
+ const perGpu = getPerGpuMemoryGb(meta);
408
+ return { name, meta, totalVramGb: perGpu ? perGpu * meta.gpus : 0 };
409
+ })
410
+ .filter(i => i.totalVramGb > 0)
411
+ .sort((a, b) => b.totalVramGb - a.totalVramGb);
412
+
413
+ if (gpuInstances.length > 0) {
414
+ const bestInstance = gpuInstances[0];
415
+
416
+ // Compute model weight memory for computeMaxModelLen
417
+ const weightsGb = vramEstimate.breakdown.weightsGb;
418
+
419
+ const safeLen = computeMaxModelLen({
420
+ modelWeightGb: weightsGb,
421
+ totalGpuMemoryGb: bestInstance.meta.gpuMemoryGb || (bestInstance.totalVramGb / bestInstance.meta.gpus),
422
+ gpuCount: bestInstance.meta.gpus,
423
+ numLayers: modelMetadata.numLayers,
424
+ numKvHeads: modelMetadata.numKvHeads,
425
+ headDim: modelMetadata.headDim
426
+ });
427
+
428
+ if (safeLen && safeLen.maxModelLen >= 2048) {
429
+ // Re-estimate VRAM with capped sequence length
430
+ const cappedEstimate = estimateVram({
431
+ parameterCount: modelMetadata.parameterCount,
432
+ dtype: modelMetadata.dtype,
433
+ quantization: quantization || undefined,
434
+ maxSequenceLength: safeLen.maxModelLen,
435
+ batchSize: effectiveBatchSize || undefined
436
+ });
437
+
438
+ // Re-filter instances with the reduced VRAM requirement
439
+ recommendations = filterAndRankInstances(
440
+ cappedEstimate.vramGb,
441
+ effectiveCatalog,
442
+ { limit }
443
+ );
444
+
445
+ suggestedMaxModelLen = safeLen.maxModelLen;
446
+ contextLengthCapped = true;
447
+ originalMaxPositionEmbeddings = modelMetadata.maxPositionEmbeddings;
448
+ log(`Context capped: ${modelMetadata.maxPositionEmbeddings} → ${safeLen.maxModelLen} for ${modelName}`);
449
+ } else {
450
+ // AC-1.6: safeLen < 2048 or null — recommend larger instance instead
451
+ log(`Model ${modelName} cannot fit 2048 context on ${bestInstance.name}, recommending larger instance`);
452
+ }
453
+ }
454
+ }
455
+
396
456
  // Step 3a: Quota & availability filtering (discover mode only)
397
457
  let preQuotaFilterCount = 0;
398
458
  let allFilteredByQuota = false;
@@ -521,7 +581,10 @@ async function handleGetInstanceRecommendation(params) {
521
581
  content: [{
522
582
  type: 'text',
523
583
  text: JSON.stringify({
524
- values: { instanceType: topRecommendation },
584
+ values: {
585
+ instanceType: topRecommendation,
586
+ ...(suggestedMaxModelLen ? { maxModelLen: suggestedMaxModelLen } : {})
587
+ },
525
588
  choices: { instanceType: rankedList },
526
589
  metadata: {
527
590
  modelName,
@@ -533,7 +596,12 @@ async function handleGetInstanceRecommendation(params) {
533
596
  recommendations: finalRecommendations,
534
597
  source: modelMetadata.source,
535
598
  smartModeUsed,
536
- allFilteredByQuota
599
+ allFilteredByQuota,
600
+ ...(contextLengthCapped ? {
601
+ suggestedMaxModelLen,
602
+ contextLengthCapped: true,
603
+ originalMaxPositionEmbeddings
604
+ } : {})
537
605
  }
538
606
  })
539
607
  }]
@@ -142,13 +142,27 @@ export function extractFromHuggingFaceConfig(config) {
142
142
  const architecture = (config.architectures && config.architectures[0]) || 'unknown';
143
143
  const maxPositionEmbeddings = config.max_position_embeddings || 4096;
144
144
 
145
- return {
145
+ // Extract architecture params for KV cache computation (computeMaxModelLen)
146
+ const numLayers = config.num_hidden_layers || null;
147
+ const numKvHeads = config.num_key_value_heads || config.num_attention_heads || null;
148
+ const headDim = config.head_dim || (config.hidden_size && config.num_attention_heads
149
+ ? Math.floor(config.hidden_size / config.num_attention_heads)
150
+ : null);
151
+
152
+ const result = {
146
153
  parameterCount,
147
154
  dtype,
148
155
  architecture,
149
156
  maxPositionEmbeddings,
150
157
  source: 'huggingface_api'
151
158
  };
159
+
160
+ // Only include architecture params if available (graceful degradation)
161
+ if (numLayers) result.numLayers = numLayers;
162
+ if (numKvHeads) result.numKvHeads = numKvHeads;
163
+ if (headDim) result.headDim = headDim;
164
+
165
+ return result;
152
166
  }
153
167
 
154
168
  /**
@@ -175,13 +189,25 @@ export async function resolveModelMetadata(modelName, options = {}) {
175
189
  const catalogEntry = catalogLookup(modelName, catalog);
176
190
 
177
191
  if (catalogEntry) {
178
- return {
192
+ const result = {
179
193
  parameterCount: catalogEntry.parameterCount,
180
194
  dtype: catalogEntry.defaultDtype || 'float16',
181
195
  architecture: catalogEntry.architecture || 'unknown',
182
196
  maxPositionEmbeddings: catalogEntry.maxPositionEmbeddings || 4096,
183
197
  source: 'catalog'
184
198
  };
199
+
200
+ // Pass through recommendedInstances for NFR-1 guard
201
+ if (catalogEntry.recommendedInstances) {
202
+ result.recommendedInstances = catalogEntry.recommendedInstances;
203
+ }
204
+
205
+ // Pass through architecture params if available in catalog
206
+ if (catalogEntry.numLayers) result.numLayers = catalogEntry.numLayers;
207
+ if (catalogEntry.numKvHeads) result.numKvHeads = catalogEntry.numKvHeads;
208
+ if (catalogEntry.headDim) result.headDim = catalogEntry.headDim;
209
+
210
+ return result;
185
211
  }
186
212
 
187
213
  // Step 2: If discover mode, try HuggingFace Hub
package/src/app.js CHANGED
@@ -402,6 +402,7 @@ export async function writeProject(templateDir, destDir, answers, registryConfig
402
402
  ignorePatterns.push('**/do/.tune_helper.py');
403
403
  ignorePatterns.push('**/do/.stage_helper.py');
404
404
  ignorePatterns.push('**/do/.adapter_helper.py');
405
+ ignorePatterns.push('**/do/.register_helper.py');
405
406
  ignorePatterns.push('**/do/train');
406
407
  ignorePatterns.push('**/do/.train_build_request.py');
407
408
  ignorePatterns.push('**/do/.train_status_parser.py');
@@ -578,6 +579,20 @@ export async function writeProject(templateDir, destDir, answers, registryConfig
578
579
  fs.writeFileSync(gitignorePath, mlccIgnore);
579
580
  }
580
581
  }
582
+
583
+ // Add __pycache__/ and *.pyc to .gitignore (Python helpers leave bytecode behind)
584
+ {
585
+ const gitignorePath = path.join(destDir, '.gitignore');
586
+ const pycacheIgnore = '# Python bytecode (generated by do/ helper scripts)\n__pycache__/\n*.pyc\n';
587
+ if (fs.existsSync(gitignorePath)) {
588
+ const existing = fs.readFileSync(gitignorePath, 'utf8');
589
+ if (!existing.includes('__pycache__')) {
590
+ fs.appendFileSync(gitignorePath, `\n${pycacheIgnore}`);
591
+ }
592
+ } else {
593
+ fs.writeFileSync(gitignorePath, pycacheIgnore);
594
+ }
595
+ }
581
596
  }
582
597
 
583
598
  /**
@@ -265,6 +265,21 @@ export default class ConfigLoader {
265
265
  return;
266
266
  }
267
267
 
268
+ // Handle icEnvVars object (deploy-time IC environment variables)
269
+ if (key === 'icEnvVars' && typeof value === 'object' && value !== null) {
270
+ if (!this.manager.config.icEnvVars) {
271
+ this.manager.config.icEnvVars = {};
272
+ }
273
+ const cliIcEnvVars = (this.manager.explicitConfig && this.manager.explicitConfig.icEnvVars) || {};
274
+ Object.entries(value).forEach(([envKey, envValue]) => {
275
+ if (!(envKey in cliIcEnvVars)) {
276
+ this.manager.config.icEnvVars[envKey] = envValue;
277
+ this.manager._recordSource(`icEnvVars.${envKey}`, envValue, 'config-file');
278
+ }
279
+ });
280
+ return;
281
+ }
282
+
268
283
  if (this.manager._isSourceSupported(key, 'configFile')) {
269
284
  filteredConfig[key] = this.manager._parseValue(key, value);
270
285
  this.manager._recordSource(key, this.manager._parseValue(key, value), 'config-file');
@@ -342,6 +357,9 @@ export default class ConfigLoader {
342
357
 
343
358
  // Parse --server-env KEY=VALUE pairs
344
359
  this._parseEnvVarOptions('server-env', 'serverEnvVars');
360
+
361
+ // Parse --ic-env KEY=VALUE pairs (deploy-time IC environment variables)
362
+ this._parseEnvVarOptions('ic-env', 'icEnvVars');
345
363
  }
346
364
 
347
365
  /**
@@ -183,6 +183,9 @@ export default class ConfigManager {
183
183
  if (this.config.serverEnvVars && typeof this.config.serverEnvVars === 'object') {
184
184
  finalConfig.serverEnvVars = { ...this.config.serverEnvVars };
185
185
  }
186
+ if (this.config.icEnvVars && typeof this.config.icEnvVars === 'object') {
187
+ finalConfig.icEnvVars = { ...this.config.icEnvVars };
188
+ }
186
189
 
187
190
  // Ensure all parameters from the matrix are included in final config
188
191
  // This is important for optional parameters that might be null
@@ -411,7 +414,8 @@ export default class ConfigManager {
411
414
  ...endpointParams,
412
415
  ...icParams,
413
416
  'modelEnvVars',
414
- 'serverEnvVars'
417
+ 'serverEnvVars',
418
+ 'icEnvVars'
415
419
  ]);
416
420
  const core = {};
417
421
  for (const [key, value] of Object.entries(this.config)) {
@@ -426,6 +430,7 @@ export default class ConfigManager {
426
430
  icConfig,
427
431
  modelEnvVars: { ...(this.config.modelEnvVars || {}) },
428
432
  serverEnvVars: { ...(this.config.serverEnvVars || {}) },
433
+ icEnvVars: { ...(this.config.icEnvVars || {}) },
429
434
  manifest: [...this._sourceManifest]
430
435
  };
431
436
  }
@@ -0,0 +1,152 @@
1
+ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2
+ // SPDX-License-Identifier: Apache-2.0
3
+
4
+ /**
5
+ * Dataset Slug Derivation
6
+ *
7
+ * Derives a deterministic, short slug from a dataset URI for use in
8
+ * tuning-job-aware adapter naming conventions.
9
+ *
10
+ * Slugification rules:
11
+ * - Lowercase
12
+ * - Strip non-alphanumeric characters (keep hyphens)
13
+ * - Truncate to 20 characters
14
+ * - Replace consecutive hyphens with single hyphen
15
+ * - Strip leading/trailing hyphens
16
+ *
17
+ * Examples:
18
+ * hf://org/name -> "name"
19
+ * hf://tatsu-lab/alpaca -> "alpaca"
20
+ * hf://Open-Orca/OpenOrca -> "openorca"
21
+ * s3://bucket/path/file.jsonl -> "file"
22
+ *
23
+ * Requirements: US-4 (AC-4.2)
24
+ */
25
+
26
/**
 * Derive a dataset slug from a dataset URI.
 *
 * Supported shapes:
 *   - hf://org/name[/split][?file=pattern] -> slug of the dataset name
 *     (second path component, falling back to the first)
 *   - s3://bucket/path/file.ext            -> slug of the last non-empty path
 *     segment with its final extension removed
 *   - anything else                        -> slug of the last non-empty
 *     "/"-separated segment (extension is kept and then stripped of its dot
 *     by slugify)
 *
 * Trailing slashes are ignored, so a prefix-style URI such as
 * "s3://bucket/datasets/alpaca/" yields "alpaca" rather than an empty slug.
 *
 * @param {string} datasetUri - Dataset URI (s3://... or hf://...)
 * @returns {string} The derived slug, or empty string if extraction fails
 */
export function deriveDatasetSlug(datasetUri) {
    if (!datasetUri || typeof datasetUri !== 'string') {
        return '';
    }

    let rawName = '';

    if (datasetUri.startsWith('hf://')) {
        // Drop the scheme and any "?file=..." query before splitting the path.
        const withoutQuery = datasetUri.slice(5).split('?')[0];
        const parts = withoutQuery.split('/');
        // parts[0] = org, parts[1] = name, parts[2+] = split
        rawName = parts[1] || parts[0] || '';
    } else if (datasetUri.startsWith('s3://')) {
        // Skip empty segments so a trailing "/" does not produce an empty name.
        const segments = datasetUri.slice(5).split('/').filter(Boolean);
        const filename = segments[segments.length - 1] || '';
        // dotIndex > 0 keeps dot-files such as ".hidden" intact.
        const dotIndex = filename.lastIndexOf('.');
        rawName = dotIndex > 0 ? filename.substring(0, dotIndex) : filename;
    } else {
        // Unknown format: fall back to the last non-empty path component.
        const segments = datasetUri.split('/').filter(Boolean);
        rawName = segments[segments.length - 1] || '';
    }

    return slugify(rawName);
}

/**
 * Apply slugification rules to a raw name.
 *
 * Steps, in order: lowercase, drop every character outside [a-z0-9-],
 * collapse runs of hyphens, strip leading/trailing hyphens, truncate to
 * 20 characters, and strip any hyphen left dangling by the truncation.
 *
 * @param {string} raw - Raw name to slugify
 * @returns {string} Slugified string; empty string for empty or non-string input
 */
export function slugify(raw) {
    // Non-string input would otherwise throw on toLowerCase(); mirror the
    // type guard used by deriveDatasetSlug and report "no slug" instead.
    if (!raw || typeof raw !== 'string') return '';

    const maxSlugLength = 20;

    const cleaned = raw
        .toLowerCase()
        .replace(/[^a-z0-9-]/g, '')
        .replace(/-{2,}/g, '-')
        .replace(/^-+/, '')
        .replace(/-+$/, '');

    if (cleaned.length <= maxSlugLength) {
        return cleaned;
    }

    // Truncation can expose a hyphen at the cut point; never end on one.
    return cleaned.substring(0, maxSlugLength).replace(/-+$/, '');
}
89
+
90
/**
 * Resolve a --from-tune argument to the appropriate config variable name.
 *
 * Resolution rules:
 *   - No arg (empty/null) -> TUNE_OUTPUT_PATH_LATEST
 *   - technique only (e.g., "sft") -> TUNE_ADAPTER_PATH_SFT
 *   - technique-dataset compound (e.g., "sft-alpaca") -> TUNE_ADAPTER_PATH_SFT_ALPACA
 *     when that variable exists; otherwise the technique-only variable is
 *     returned and `fallback` names the compound variable that was tried.
 *
 * The argument is split on its FIRST hyphen; remaining hyphens in the slug
 * become underscores in the variable name. An argument whose slug part is
 * empty (e.g. "sft-") is treated as technique-only, so no malformed
 * variable name is probed or reported.
 *
 * @param {string} fromTuneArg - The --from-tune argument value
 * @param {function} configVarExists - Function that checks if a config var exists.
 *   When it is not a function, compound variables are treated as absent.
 * @returns {{ varName: string, technique: string, slug: string, isCompound: boolean, fallback: string|null }}
 */
export function resolveFromTuneVar(fromTuneArg, configVarExists) {
    if (!fromTuneArg) {
        return {
            varName: 'TUNE_OUTPUT_PATH_LATEST',
            technique: '',
            slug: '',
            isCompound: false,
            fallback: null
        };
    }

    // A hyphen at index 0 is not a separator: there is no technique before it.
    const hyphenIndex = fromTuneArg.indexOf('-');
    const hasSeparator = hyphenIndex > 0;
    const technique = hasSeparator ? fromTuneArg.substring(0, hyphenIndex) : fromTuneArg;
    const slug = hasSeparator ? fromTuneArg.substring(hyphenIndex + 1) : '';
    const techniqueVar = `TUNE_ADAPTER_PATH_${technique.toUpperCase()}`;

    // Technique-only: no separator, or nothing after it ("sft-").
    if (slug === '') {
        return {
            varName: techniqueVar,
            technique,
            slug: '',
            isCompound: false,
            fallback: null
        };
    }

    const slugUpper = slug.toUpperCase().replace(/-/g, '_');
    const compoundVar = `${techniqueVar}_${slugUpper}`;
    const compoundExists = typeof configVarExists === 'function' && configVarExists(compoundVar);

    if (compoundExists) {
        return {
            varName: compoundVar,
            technique,
            slug,
            isCompound: true,
            fallback: null
        };
    }

    // Compound key doesn't exist — fall back to technique-only and report
    // which compound variable was tried.
    return {
        varName: techniqueVar,
        technique,
        slug,
        isCompound: false,
        fallback: compoundVar
    };
}
@@ -1,6 +1,6 @@
1
1
  // AUTO-GENERATED by scripts/codegen-cli.js — DO NOT EDIT
2
2
  // Source: config/parameter-schema-v2.json
3
- // Generated: 2026-06-15T20:16:03.840Z
3
+ // Generated: 2026-06-23T20:55:23.381Z
4
4
 
5
5
  /**
6
6
  * CLI option definitions derived from parameter-schema-v2.json.
@@ -70,7 +70,7 @@ export const cliOptions = [
70
70
  {
71
71
  'flag': '--enable-lora',
72
72
  'description': 'Enable LoRA adapter serving',
73
- 'defaultValue': false
73
+ 'defaultValue': true
74
74
  },
75
75
  {
76
76
  'flag': '--max-loras <n>',
@@ -85,7 +85,7 @@ export const cliOptions = [
85
85
  {
86
86
  'flag': '--include-benchmark',
87
87
  'description': 'Include SageMaker AI Benchmarking scripts (do/benchmark, do/optimize). Workload configuration is specified at runtime via --workload flag.',
88
- 'defaultValue': false
88
+ 'defaultValue': true
89
89
  },
90
90
  {
91
91
  'flag': '--benchmark-concurrency <n>',
@@ -353,6 +353,11 @@ export const cliOptions = [
353
353
  'description': 'Server env var, repeatable (e.g. SGLANG_MEM_FRACTION=0.9)',
354
354
  'repeatable': true
355
355
  },
356
+ {
357
+ 'flag': '--ic-env <KEY=VALUE>',
358
+ 'description': 'Deploy-time environment variable for inference components (IC_ENV_* prefix), repeatable (e.g. VLLM_MAX_MODEL_LEN=8192)',
359
+ 'repeatable': true
360
+ },
356
361
  {
357
362
  'flag': '--include-sample',
358
363
  'description': 'Include sample model code',
@@ -464,6 +469,7 @@ export const helpGroups = {
464
469
  '--fsx-volume-handle': 'hyperpod',
465
470
  '--model-env': 'env',
466
471
  '--server-env': 'env',
472
+ '--ic-env': 'ic',
467
473
  '--include-sample': 'features',
468
474
  '--include-testing': 'features',
469
475
  '--test-types': 'features',
@@ -1,6 +1,6 @@
1
1
  // AUTO-GENERATED by scripts/codegen-parameter-matrix.js — DO NOT EDIT
2
2
  // Source: config/parameter-schema-v2.json
3
- // Generated: 2026-06-15T20:16:03.952Z
3
+ // Generated: 2026-06-23T20:55:23.482Z
4
4
 
5
5
  /**
6
6
  * Parameter matrix defining how each parameter is loaded from various sources.
@@ -106,7 +106,7 @@ export const parameterMatrix = {
106
106
  'mcp': false,
107
107
  'promptable': true,
108
108
  'required': false,
109
- 'default': false,
109
+ 'default': true,
110
110
  'valueSpace': 'bounded'
111
111
  },
112
112
  'maxLoras': {
@@ -139,7 +139,7 @@ export const parameterMatrix = {
139
139
  'mcp': false,
140
140
  'promptable': true,
141
141
  'required': false,
142
- 'default': false,
142
+ 'default': true,
143
143
  'valueSpace': 'bounded'
144
144
  },
145
145
  'benchmarkConcurrency': {
@@ -225,7 +225,7 @@ export const parameterMatrix = {
225
225
  'configFile': true,
226
226
  'packageJson': false,
227
227
  'mcp': true,
228
- 'promptable': false,
228
+ 'promptable': true,
229
229
  'required': false,
230
230
  'default': null,
231
231
  'valueSpace': 'unbounded'
@@ -569,6 +569,17 @@ export const parameterMatrix = {
569
569
  'default': null,
570
570
  'valueSpace': 'unbounded'
571
571
  },
572
+ 'icEnv': {
573
+ 'cliOption': 'ic-env',
574
+ 'envVar': null,
575
+ 'configFile': true,
576
+ 'packageJson': false,
577
+ 'mcp': false,
578
+ 'promptable': false,
579
+ 'required': false,
580
+ 'default': [],
581
+ 'valueSpace': 'unbounded'
582
+ },
572
583
  'includeSampleModel': {
573
584
  'cliOption': 'include-sample',
574
585
  'envVar': 'ML_INCLUDE_SAMPLE',
@@ -1,6 +1,6 @@
1
1
  // AUTO-GENERATED by scripts/codegen-validator.js — DO NOT EDIT
2
2
  // Source: config/parameter-schema-v2.json
3
- // Generated: 2026-06-15T20:16:03.877Z
3
+ // Generated: 2026-06-23T20:55:23.412Z
4
4
 
5
5
  /**
6
6
  * Validation rules derived from parameter-schema-v2.json.
@@ -143,9 +143,23 @@ class McpClient {
143
143
  // Build context from bounded parameters that have defaults
144
144
  const context = this._buildContext();
145
145
 
146
+ // Auto-discover tool name if using the default (get_ml_config)
147
+ // Each server registers its own tool name (e.g. get_base_images, get_inference_endpoints)
148
+ let toolName = this.toolName;
149
+ if (toolName === DEFAULT_TOOL_NAME) {
150
+ try {
151
+ const toolList = await this._client.listTools();
152
+ if (toolList && toolList.tools && toolList.tools.length > 0) {
153
+ toolName = toolList.tools[0].name;
154
+ }
155
+ } catch (_listErr) {
156
+ // Fall through to use default tool name
157
+ }
158
+ }
159
+
146
160
  // Call the configured tool
147
161
  const result = await this._client.callTool({
148
- name: this.toolName,
162
+ name: toolName,
149
163
  arguments: {
150
164
  parameters: unboundedParams,
151
165
  limit: this.limit,
@@ -216,6 +216,12 @@ export default class McpQueryRunner {
216
216
  if (parsed.choices?.instanceType?.length > 0) {
217
217
  this.runner._instanceSizerMetadata = parsed.metadata || null;
218
218
 
219
+ // Store maxModelLen from sizer if context was capped (AC-1.7)
220
+ if (parsed.values?.maxModelLen) {
221
+ this.runner._sizerMaxModelLen = parsed.values.maxModelLen;
222
+ console.log(` ✓ Context length capped: max_model_len=${parsed.values.maxModelLen}`);
223
+ }
224
+
219
225
  // Build display labels with VRAM estimate and utilization percentage
220
226
  const recommendations = parsed.metadata?.recommendations || [];
221
227
  const estimatedVramGb = parsed.metadata?.estimatedVramGb;
@@ -365,9 +371,13 @@ export default class McpQueryRunner {
365
371
  console.log(' 🔍 Querying endpoint-picker...');
366
372
 
367
373
  try {
374
+ // Pass awsProfile from bootstrap config for credential resolution
375
+ const awsProfile = this.runner.configManager?.config?.awsProfile
376
+ || this.runner.options?.profile || process.env.AWS_PROFILE || null;
368
377
  const result = await cm.queryMcpServer('endpoint-picker', {
369
378
  awsRegion: infraAnswers.awsRegion,
370
- deploymentTarget: 'realtime-inference'
379
+ deploymentTarget: 'realtime-inference',
380
+ ...(awsProfile ? { awsProfile } : {})
371
381
  });
372
382
 
373
383
  if (result && result.choices?.endpointName?.length > 0) {
@@ -224,25 +224,39 @@ export default class PromptRunner {
224
224
  // Requirements: 3.3, 4.3, 4.4 — endpoint-picker MCP query
225
225
  let existingEndpointAnswers = {};
226
226
  if (regionAndTargetAnswers.deploymentTarget === 'realtime-inference') {
227
- // Query endpoint-picker MCP server for available endpoints
228
- const resolvedRegion = regionAndTargetAnswers.customAwsRegion || regionAndTargetAnswers.awsRegion;
229
- await this.mcpQueryRunner._queryMcpForEndpoints({ ...regionAndTargetAnswers, awsRegion: resolvedRegion }, explicitConfig);
230
-
231
- const endpointPreviousAnswers = {
232
- ...regionAndTargetAnswers,
233
- ...(this._mcpEndpointChoices ? { _mcpEndpointChoices: this._mcpEndpointChoices } : {})
234
- };
235
- existingEndpointAnswers = await this._runPhase(
236
- infraExistingEndpointPrompts,
237
- endpointPreviousAnswers,
227
+ // First ask if user wants to attach to existing endpoint (no MCP call yet)
228
+ const attachAnswer = await this._runPhase(
229
+ [infraExistingEndpointPrompts[0]],
230
+ { ...regionAndTargetAnswers },
238
231
  explicitConfig,
239
232
  existingConfig
240
233
  );
241
234
 
242
- // Resolve custom endpoint name
243
- if (existingEndpointAnswers.customExistingEndpointName) {
244
- existingEndpointAnswers.existingEndpointName = existingEndpointAnswers.customExistingEndpointName;
245
- delete existingEndpointAnswers.customExistingEndpointName;
235
+ if (attachAnswer.useExistingEndpoint === 'yes') {
236
+ // Only now query endpoint-picker MCP server
237
+ const resolvedRegion = regionAndTargetAnswers.customAwsRegion || regionAndTargetAnswers.awsRegion;
238
+ await this.mcpQueryRunner._queryMcpForEndpoints({ ...regionAndTargetAnswers, awsRegion: resolvedRegion }, explicitConfig);
239
+
240
+ const endpointPreviousAnswers = {
241
+ ...regionAndTargetAnswers,
242
+ ...attachAnswer,
243
+ ...(this._mcpEndpointChoices ? { _mcpEndpointChoices: this._mcpEndpointChoices } : {})
244
+ };
245
+ existingEndpointAnswers = await this._runPhase(
246
+ infraExistingEndpointPrompts.slice(1),
247
+ endpointPreviousAnswers,
248
+ explicitConfig,
249
+ existingConfig
250
+ );
251
+ existingEndpointAnswers.useExistingEndpoint = 'yes';
252
+
253
+ // Resolve custom endpoint name
254
+ if (existingEndpointAnswers.customExistingEndpointName) {
255
+ existingEndpointAnswers.existingEndpointName = existingEndpointAnswers.customExistingEndpointName;
256
+ delete existingEndpointAnswers.customExistingEndpointName;
257
+ }
258
+ } else {
259
+ existingEndpointAnswers = attachAnswer;
246
260
  }
247
261
  }
248
262
 
@@ -376,11 +390,12 @@ export default class PromptRunner {
376
390
  const sizerRecs = this._instanceSizerMetadata.recommendations || [];
377
391
  const finalInstanceType = instanceAnswers.customInstanceType || instanceAnswers.instanceType;
378
392
  const matchingRec = sizerRecs.find(r => r.instanceType === finalInstanceType);
379
- const tpRec = matchingRec || sizerRecs[0];
380
- if (tpRec && tpRec.tensorParallelism > 1) {
381
- this._autoTensorParallelism = tpRec.tensorParallelism;
382
- this._autoGpuCount = tpRec.gpuCount;
383
- console.log(` ✓ Auto-set tensor parallelism: TP=${tpRec.tensorParallelism} (${tpRec.gpuCount} GPUs)`);
393
+ // Only use sizer TP recommendation if user selected a recommended instance
394
+ // Custom instances resolve TP from the instance catalog in template-variable-resolver
395
+ if (matchingRec && matchingRec.tensorParallelism > 1) {
396
+ this._autoTensorParallelism = matchingRec.tensorParallelism;
397
+ this._autoGpuCount = matchingRec.gpuCount;
398
+ console.log(` ✓ Auto-set tensor parallelism: TP=${matchingRec.tensorParallelism} (${matchingRec.gpuCount} GPUs)`);
384
399
  }
385
400
 
386
401
  // Display capacity type confirmation for selected instance
@@ -710,6 +725,11 @@ export default class PromptRunner {
710
725
  delete combinedAnswers.customHyperPodCluster;
711
726
  }
712
727
 
728
+ // Propagate max_model_len from instance-sizer context capping (AC-1.7)
729
+ if (this._sizerMaxModelLen) {
730
+ combinedAnswers.sizerMaxModelLen = this._sizerMaxModelLen;
731
+ }
732
+
713
733
  // Apply CUDA version selection → inference AMI override
714
734
  if (combinedAnswers._resolvedInferenceAmiVersion) {
715
735
  combinedAnswers.inferenceAmiVersion = combinedAnswers._resolvedInferenceAmiVersion;
@@ -90,7 +90,7 @@ const loraPrompts = [
90
90
  type: 'confirm',
91
91
  name: 'enableLora',
92
92
  message: 'Enable LoRA adapter serving?',
93
- default: false,
93
+ default: true,
94
94
  when: (answers) => {
95
95
  const architecture = answers.architecture || answers.deploymentConfig?.split('-')[0];
96
96
  const backend = answers.backend || answers.deploymentConfig?.split('-').slice(1).join('-');