@aws/ml-container-creator 0.13.5 → 0.15.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/parameter-schema-v2.json +33 -5
- package/infra/ci-harness/lib/ci-harness-stack.ts +13 -5
- package/infra/ci-harness/package-lock.json +121 -111
- package/infra/ci-harness/package.json +1 -1
- package/package.json +2 -2
- package/servers/endpoint-picker/index.js +23 -14
- package/servers/instance-sizer/index.js +72 -4
- package/servers/instance-sizer/lib/model-resolver.js +28 -2
- package/src/app.js +15 -0
- package/src/lib/config-loader.js +18 -0
- package/src/lib/config-manager.js +6 -1
- package/src/lib/dataset-slug.js +152 -0
- package/src/lib/generated/cli-options.js +9 -3
- package/src/lib/generated/parameter-matrix.js +15 -4
- package/src/lib/generated/validation-rules.js +1 -1
- package/src/lib/mcp-client.js +15 -1
- package/src/lib/mcp-query-runner.js +11 -1
- package/src/lib/prompt-runner.js +40 -20
- package/src/lib/prompts/feature-prompts.js +1 -1
- package/src/lib/template-manager.js +0 -7
- package/src/lib/template-variable-resolver.js +51 -1
- package/src/lib/tune-config-state.js +14 -1
- package/templates/do/.benchmark_writer.py +43 -0
- package/templates/do/.register_helper.py +1185 -0
- package/templates/do/.tune_helper.py +168 -2
- package/templates/do/__pycache__/.adapter_helper.cpython-312.pyc +0 -0
- package/templates/do/__pycache__/.benchmark_writer.cpython-312.pyc +0 -0
- package/templates/do/__pycache__/.register_helper.cpython-312.pyc +0 -0
- package/templates/do/__pycache__/.tune_helper.cpython-312.pyc +0 -0
- package/templates/do/adapter +319 -27
- package/templates/do/add-ic +85 -3
- package/templates/do/benchmark +28 -8
- package/templates/do/config +20 -0
- package/templates/do/lib/inference-component.sh +56 -3
- package/templates/do/register +557 -6
- package/templates/do/test +12 -2
- package/templates/do/tune +219 -6
|
@@ -25,8 +25,8 @@ import { readFileSync } from 'node:fs';
|
|
|
25
25
|
import { fileURLToPath } from 'node:url';
|
|
26
26
|
import { resolve, dirname } from 'node:path';
|
|
27
27
|
import { resolveModelMetadata } from './lib/model-resolver.js';
|
|
28
|
-
import { estimateVram } from './lib/vram-estimator.js';
|
|
29
|
-
import { filterAndRankInstances, applyAvailabilityRanking } from './lib/instance-ranker.js';
|
|
28
|
+
import { estimateVram, computeMaxModelLen } from './lib/vram-estimator.js';
|
|
29
|
+
import { filterAndRankInstances, applyAvailabilityRanking, getPerGpuMemoryGb } from './lib/instance-ranker.js';
|
|
30
30
|
import { QuotaResolver } from './lib/quota-resolver.js';
|
|
31
31
|
import { queryBedrock } from '../lib/bedrock-client.js';
|
|
32
32
|
|
|
@@ -393,6 +393,66 @@ async function handleGetInstanceRecommendation(params) {
|
|
|
393
393
|
{ limit }
|
|
394
394
|
);
|
|
395
395
|
|
|
396
|
+
// Step 3-max_model_len: When no instance fits at full context, try capping context length
|
|
397
|
+
// NFR-1 guard: skip this logic for models with recommendedInstances in catalog
|
|
398
|
+
let suggestedMaxModelLen = null;
|
|
399
|
+
let contextLengthCapped = false;
|
|
400
|
+
let originalMaxPositionEmbeddings = null;
|
|
401
|
+
|
|
402
|
+
if (recommendations.length === 0 && !modelMetadata.recommendedInstances && modelMetadata.maxPositionEmbeddings) {
|
|
403
|
+
// Find the largest available GPU instance
|
|
404
|
+
const gpuInstances = Object.entries(effectiveCatalog)
|
|
405
|
+
.filter(([, meta]) => meta.category === 'gpu' && meta.gpus > 0)
|
|
406
|
+
.map(([name, meta]) => {
|
|
407
|
+
const perGpu = getPerGpuMemoryGb(meta);
|
|
408
|
+
return { name, meta, totalVramGb: perGpu ? perGpu * meta.gpus : 0 };
|
|
409
|
+
})
|
|
410
|
+
.filter(i => i.totalVramGb > 0)
|
|
411
|
+
.sort((a, b) => b.totalVramGb - a.totalVramGb);
|
|
412
|
+
|
|
413
|
+
if (gpuInstances.length > 0) {
|
|
414
|
+
const bestInstance = gpuInstances[0];
|
|
415
|
+
|
|
416
|
+
// Compute model weight memory for computeMaxModelLen
|
|
417
|
+
const weightsGb = vramEstimate.breakdown.weightsGb;
|
|
418
|
+
|
|
419
|
+
const safeLen = computeMaxModelLen({
|
|
420
|
+
modelWeightGb: weightsGb,
|
|
421
|
+
totalGpuMemoryGb: bestInstance.meta.gpuMemoryGb || (bestInstance.totalVramGb / bestInstance.meta.gpus),
|
|
422
|
+
gpuCount: bestInstance.meta.gpus,
|
|
423
|
+
numLayers: modelMetadata.numLayers,
|
|
424
|
+
numKvHeads: modelMetadata.numKvHeads,
|
|
425
|
+
headDim: modelMetadata.headDim
|
|
426
|
+
});
|
|
427
|
+
|
|
428
|
+
if (safeLen && safeLen.maxModelLen >= 2048) {
|
|
429
|
+
// Re-estimate VRAM with capped sequence length
|
|
430
|
+
const cappedEstimate = estimateVram({
|
|
431
|
+
parameterCount: modelMetadata.parameterCount,
|
|
432
|
+
dtype: modelMetadata.dtype,
|
|
433
|
+
quantization: quantization || undefined,
|
|
434
|
+
maxSequenceLength: safeLen.maxModelLen,
|
|
435
|
+
batchSize: effectiveBatchSize || undefined
|
|
436
|
+
});
|
|
437
|
+
|
|
438
|
+
// Re-filter instances with the reduced VRAM requirement
|
|
439
|
+
recommendations = filterAndRankInstances(
|
|
440
|
+
cappedEstimate.vramGb,
|
|
441
|
+
effectiveCatalog,
|
|
442
|
+
{ limit }
|
|
443
|
+
);
|
|
444
|
+
|
|
445
|
+
suggestedMaxModelLen = safeLen.maxModelLen;
|
|
446
|
+
contextLengthCapped = true;
|
|
447
|
+
originalMaxPositionEmbeddings = modelMetadata.maxPositionEmbeddings;
|
|
448
|
+
log(`Context capped: ${modelMetadata.maxPositionEmbeddings} → ${safeLen.maxModelLen} for ${modelName}`);
|
|
449
|
+
} else {
|
|
450
|
+
// AC-1.6: safeLen < 2048 or null — recommend larger instance instead
|
|
451
|
+
log(`Model ${modelName} cannot fit 2048 context on ${bestInstance.name}, recommending larger instance`);
|
|
452
|
+
}
|
|
453
|
+
}
|
|
454
|
+
}
|
|
455
|
+
|
|
396
456
|
// Step 3a: Quota & availability filtering (discover mode only)
|
|
397
457
|
let preQuotaFilterCount = 0;
|
|
398
458
|
let allFilteredByQuota = false;
|
|
@@ -521,7 +581,10 @@ async function handleGetInstanceRecommendation(params) {
|
|
|
521
581
|
content: [{
|
|
522
582
|
type: 'text',
|
|
523
583
|
text: JSON.stringify({
|
|
524
|
-
values: {
|
|
584
|
+
values: {
|
|
585
|
+
instanceType: topRecommendation,
|
|
586
|
+
...(suggestedMaxModelLen ? { maxModelLen: suggestedMaxModelLen } : {})
|
|
587
|
+
},
|
|
525
588
|
choices: { instanceType: rankedList },
|
|
526
589
|
metadata: {
|
|
527
590
|
modelName,
|
|
@@ -533,7 +596,12 @@ async function handleGetInstanceRecommendation(params) {
|
|
|
533
596
|
recommendations: finalRecommendations,
|
|
534
597
|
source: modelMetadata.source,
|
|
535
598
|
smartModeUsed,
|
|
536
|
-
allFilteredByQuota
|
|
599
|
+
allFilteredByQuota,
|
|
600
|
+
...(contextLengthCapped ? {
|
|
601
|
+
suggestedMaxModelLen,
|
|
602
|
+
contextLengthCapped: true,
|
|
603
|
+
originalMaxPositionEmbeddings
|
|
604
|
+
} : {})
|
|
537
605
|
}
|
|
538
606
|
})
|
|
539
607
|
}]
|
|
@@ -142,13 +142,27 @@ export function extractFromHuggingFaceConfig(config) {
|
|
|
142
142
|
const architecture = (config.architectures && config.architectures[0]) || 'unknown';
|
|
143
143
|
const maxPositionEmbeddings = config.max_position_embeddings || 4096;
|
|
144
144
|
|
|
145
|
-
|
|
145
|
+
// Extract architecture params for KV cache computation (computeMaxModelLen)
|
|
146
|
+
const numLayers = config.num_hidden_layers || null;
|
|
147
|
+
const numKvHeads = config.num_key_value_heads || config.num_attention_heads || null;
|
|
148
|
+
const headDim = config.head_dim || (config.hidden_size && config.num_attention_heads
|
|
149
|
+
? Math.floor(config.hidden_size / config.num_attention_heads)
|
|
150
|
+
: null);
|
|
151
|
+
|
|
152
|
+
const result = {
|
|
146
153
|
parameterCount,
|
|
147
154
|
dtype,
|
|
148
155
|
architecture,
|
|
149
156
|
maxPositionEmbeddings,
|
|
150
157
|
source: 'huggingface_api'
|
|
151
158
|
};
|
|
159
|
+
|
|
160
|
+
// Only include architecture params if available (graceful degradation)
|
|
161
|
+
if (numLayers) result.numLayers = numLayers;
|
|
162
|
+
if (numKvHeads) result.numKvHeads = numKvHeads;
|
|
163
|
+
if (headDim) result.headDim = headDim;
|
|
164
|
+
|
|
165
|
+
return result;
|
|
152
166
|
}
|
|
153
167
|
|
|
154
168
|
/**
|
|
@@ -175,13 +189,25 @@ export async function resolveModelMetadata(modelName, options = {}) {
|
|
|
175
189
|
const catalogEntry = catalogLookup(modelName, catalog);
|
|
176
190
|
|
|
177
191
|
if (catalogEntry) {
|
|
178
|
-
|
|
192
|
+
const result = {
|
|
179
193
|
parameterCount: catalogEntry.parameterCount,
|
|
180
194
|
dtype: catalogEntry.defaultDtype || 'float16',
|
|
181
195
|
architecture: catalogEntry.architecture || 'unknown',
|
|
182
196
|
maxPositionEmbeddings: catalogEntry.maxPositionEmbeddings || 4096,
|
|
183
197
|
source: 'catalog'
|
|
184
198
|
};
|
|
199
|
+
|
|
200
|
+
// Pass through recommendedInstances for NFR-1 guard
|
|
201
|
+
if (catalogEntry.recommendedInstances) {
|
|
202
|
+
result.recommendedInstances = catalogEntry.recommendedInstances;
|
|
203
|
+
}
|
|
204
|
+
|
|
205
|
+
// Pass through architecture params if available in catalog
|
|
206
|
+
if (catalogEntry.numLayers) result.numLayers = catalogEntry.numLayers;
|
|
207
|
+
if (catalogEntry.numKvHeads) result.numKvHeads = catalogEntry.numKvHeads;
|
|
208
|
+
if (catalogEntry.headDim) result.headDim = catalogEntry.headDim;
|
|
209
|
+
|
|
210
|
+
return result;
|
|
185
211
|
}
|
|
186
212
|
|
|
187
213
|
// Step 2: If discover mode, try HuggingFace Hub
|
package/src/app.js
CHANGED
|
@@ -402,6 +402,7 @@ export async function writeProject(templateDir, destDir, answers, registryConfig
|
|
|
402
402
|
ignorePatterns.push('**/do/.tune_helper.py');
|
|
403
403
|
ignorePatterns.push('**/do/.stage_helper.py');
|
|
404
404
|
ignorePatterns.push('**/do/.adapter_helper.py');
|
|
405
|
+
ignorePatterns.push('**/do/.register_helper.py');
|
|
405
406
|
ignorePatterns.push('**/do/train');
|
|
406
407
|
ignorePatterns.push('**/do/.train_build_request.py');
|
|
407
408
|
ignorePatterns.push('**/do/.train_status_parser.py');
|
|
@@ -578,6 +579,20 @@ export async function writeProject(templateDir, destDir, answers, registryConfig
|
|
|
578
579
|
fs.writeFileSync(gitignorePath, mlccIgnore);
|
|
579
580
|
}
|
|
580
581
|
}
|
|
582
|
+
|
|
583
|
+
// Add __pycache__/ and *.pyc to .gitignore (Python helpers leave bytecode behind)
|
|
584
|
+
{
|
|
585
|
+
const gitignorePath = path.join(destDir, '.gitignore');
|
|
586
|
+
const pycacheIgnore = '# Python bytecode (generated by do/ helper scripts)\n__pycache__/\n*.pyc\n';
|
|
587
|
+
if (fs.existsSync(gitignorePath)) {
|
|
588
|
+
const existing = fs.readFileSync(gitignorePath, 'utf8');
|
|
589
|
+
if (!existing.includes('__pycache__')) {
|
|
590
|
+
fs.appendFileSync(gitignorePath, `\n${pycacheIgnore}`);
|
|
591
|
+
}
|
|
592
|
+
} else {
|
|
593
|
+
fs.writeFileSync(gitignorePath, pycacheIgnore);
|
|
594
|
+
}
|
|
595
|
+
}
|
|
581
596
|
}
|
|
582
597
|
|
|
583
598
|
/**
|
package/src/lib/config-loader.js
CHANGED
|
@@ -265,6 +265,21 @@ export default class ConfigLoader {
|
|
|
265
265
|
return;
|
|
266
266
|
}
|
|
267
267
|
|
|
268
|
+
// Handle icEnvVars object (deploy-time IC environment variables)
|
|
269
|
+
if (key === 'icEnvVars' && typeof value === 'object' && value !== null) {
|
|
270
|
+
if (!this.manager.config.icEnvVars) {
|
|
271
|
+
this.manager.config.icEnvVars = {};
|
|
272
|
+
}
|
|
273
|
+
const cliIcEnvVars = (this.manager.explicitConfig && this.manager.explicitConfig.icEnvVars) || {};
|
|
274
|
+
Object.entries(value).forEach(([envKey, envValue]) => {
|
|
275
|
+
if (!(envKey in cliIcEnvVars)) {
|
|
276
|
+
this.manager.config.icEnvVars[envKey] = envValue;
|
|
277
|
+
this.manager._recordSource(`icEnvVars.${envKey}`, envValue, 'config-file');
|
|
278
|
+
}
|
|
279
|
+
});
|
|
280
|
+
return;
|
|
281
|
+
}
|
|
282
|
+
|
|
268
283
|
if (this.manager._isSourceSupported(key, 'configFile')) {
|
|
269
284
|
filteredConfig[key] = this.manager._parseValue(key, value);
|
|
270
285
|
this.manager._recordSource(key, this.manager._parseValue(key, value), 'config-file');
|
|
@@ -342,6 +357,9 @@ export default class ConfigLoader {
|
|
|
342
357
|
|
|
343
358
|
// Parse --server-env KEY=VALUE pairs
|
|
344
359
|
this._parseEnvVarOptions('server-env', 'serverEnvVars');
|
|
360
|
+
|
|
361
|
+
// Parse --ic-env KEY=VALUE pairs (deploy-time IC environment variables)
|
|
362
|
+
this._parseEnvVarOptions('ic-env', 'icEnvVars');
|
|
345
363
|
}
|
|
346
364
|
|
|
347
365
|
/**
|
|
@@ -183,6 +183,9 @@ export default class ConfigManager {
|
|
|
183
183
|
if (this.config.serverEnvVars && typeof this.config.serverEnvVars === 'object') {
|
|
184
184
|
finalConfig.serverEnvVars = { ...this.config.serverEnvVars };
|
|
185
185
|
}
|
|
186
|
+
if (this.config.icEnvVars && typeof this.config.icEnvVars === 'object') {
|
|
187
|
+
finalConfig.icEnvVars = { ...this.config.icEnvVars };
|
|
188
|
+
}
|
|
186
189
|
|
|
187
190
|
// Ensure all parameters from the matrix are included in final config
|
|
188
191
|
// This is important for optional parameters that might be null
|
|
@@ -411,7 +414,8 @@ export default class ConfigManager {
|
|
|
411
414
|
...endpointParams,
|
|
412
415
|
...icParams,
|
|
413
416
|
'modelEnvVars',
|
|
414
|
-
'serverEnvVars'
|
|
417
|
+
'serverEnvVars',
|
|
418
|
+
'icEnvVars'
|
|
415
419
|
]);
|
|
416
420
|
const core = {};
|
|
417
421
|
for (const [key, value] of Object.entries(this.config)) {
|
|
@@ -426,6 +430,7 @@ export default class ConfigManager {
|
|
|
426
430
|
icConfig,
|
|
427
431
|
modelEnvVars: { ...(this.config.modelEnvVars || {}) },
|
|
428
432
|
serverEnvVars: { ...(this.config.serverEnvVars || {}) },
|
|
433
|
+
icEnvVars: { ...(this.config.icEnvVars || {}) },
|
|
429
434
|
manifest: [...this._sourceManifest]
|
|
430
435
|
};
|
|
431
436
|
}
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
|
2
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* Dataset Slug Derivation
|
|
6
|
+
*
|
|
7
|
+
* Derives a deterministic, short slug from a dataset URI for use in
|
|
8
|
+
* tuning-job-aware adapter naming conventions.
|
|
9
|
+
*
|
|
10
|
+
* Slugification rules:
|
|
11
|
+
* - Lowercase
|
|
12
|
+
* - Strip non-alphanumeric characters (keep hyphens)
|
|
13
|
+
* - Truncate to 20 characters (applied last; a trailing hyphen left by the cut is stripped)
|
|
14
|
+
* - Replace consecutive hyphens with single hyphen
|
|
15
|
+
* - Strip leading/trailing hyphens
|
|
16
|
+
*
|
|
17
|
+
* Examples:
|
|
18
|
+
* hf://org/name -> "name"
|
|
19
|
+
* hf://tatsu-lab/alpaca -> "alpaca"
|
|
20
|
+
* hf://Open-Orca/OpenOrca -> "openorca"
|
|
21
|
+
* s3://bucket/path/file.jsonl -> "file"
|
|
22
|
+
*
|
|
23
|
+
* Requirements: US-4 (AC-4.2)
|
|
24
|
+
*/
|
|
25
|
+
|
|
26
|
+
/**
 * Produce the short slug used for adapter naming from a dataset URI.
 *
 * The raw name is picked according to the URI scheme and then handed to
 * `slugify`:
 *   - `hf://org/name[/split][?file=pattern]` -> second path segment (`name`),
 *     or the first segment when there is no second one
 *   - `s3://bucket/path/file.ext`            -> last path segment with its
 *     final extension removed
 *   - anything else                          -> whatever follows the last `/`
 *
 * @param {string} datasetUri - Dataset URI (s3://... or hf://...)
 * @returns {string} The derived slug; '' when the URI is missing, is not a
 *   string, or yields no usable name
 */
export function deriveDatasetSlug(datasetUri) {
    if (typeof datasetUri !== 'string' || datasetUri.length === 0) {
        return '';
    }

    if (datasetUri.startsWith('hf://')) {
        // Drop the scheme and any ?file=... query before splitting into segments
        const [org, name] = datasetUri.slice('hf://'.length).split('?')[0].split('/');
        return slugify(name || org || '');
    }

    if (datasetUri.startsWith('s3://')) {
        const filename = datasetUri.slice('s3://'.length).split('/').pop() || '';
        // A dot at index 0 (dotfile) is not treated as an extension separator
        const extStart = filename.lastIndexOf('.');
        return slugify(extStart > 0 ? filename.substring(0, extStart) : filename);
    }

    // Unknown scheme: fall back to the last path component, extension included
    return slugify(datasetUri.split('/').pop() || '');
}
|
|
63
|
+
|
|
64
|
+
/**
 * Apply the slugification rules to a raw name.
 *
 * Steps, in order: lowercase; drop every character outside `a-z`, `0-9`
 * and `-`; collapse runs of hyphens into one; strip leading and trailing
 * hyphens; cut to 20 characters; strip any trailing hyphen left by the cut.
 *
 * @param {string} raw - Raw name to slugify
 * @returns {string} Slugified string; '' for empty or non-string input
 */
export function slugify(raw) {
    // Same contract as deriveDatasetSlug: non-string input yields '' instead
    // of a TypeError from calling .toLowerCase() on it.
    if (typeof raw !== 'string' || raw === '') return '';

    const maxLength = 20;

    const cleaned = raw
        .toLowerCase()
        .replace(/[^a-z0-9-]/g, '') // keep only alphanumerics and hyphens
        .replace(/-{2,}/g, '-')     // collapse consecutive hyphens
        .replace(/^-+|-+$/g, '');   // trim hyphens at both ends

    // Truncation can expose a hyphen at the cut point, so trim the tail again.
    // For strings already within the limit both calls are no-ops.
    return cleaned.substring(0, maxLength).replace(/-+$/, '');
}
|
|
89
|
+
|
|
90
|
+
/**
 * Resolve a --from-tune argument to the appropriate config variable name.
 *
 * Resolution rules:
 *   - No arg (empty/null) -> TUNE_OUTPUT_PATH_LATEST
 *   - technique only (e.g., "sft") -> TUNE_ADAPTER_PATH_SFT
 *   - technique-dataset compound (e.g., "sft-alpaca") -> TUNE_ADAPTER_PATH_SFT_ALPACA
 *     when that variable exists, otherwise the technique-only variable with
 *     the compound name reported in `fallback`
 *   - technique followed only by hyphens (e.g., "sft-") -> treated as
 *     technique only; no compound variable is built or probed
 *
 * The argument is split at its first hyphen; hyphens inside the dataset part
 * become underscores in the variable name. An argument that starts with a
 * hyphen is not split and is handled as technique-only.
 *
 * @param {string} fromTuneArg - The --from-tune argument value
 * @param {function} configVarExists - Function that checks if a config var exists;
 *   only called when the argument has a non-empty dataset part
 * @returns {{ varName: string, technique: string, slug: string, isCompound: boolean, fallback: string|null }}
 */
export function resolveFromTuneVar(fromTuneArg, configVarExists) {
    if (!fromTuneArg) {
        return {
            varName: 'TUNE_OUTPUT_PATH_LATEST',
            technique: '',
            slug: '',
            isCompound: false,
            fallback: null
        };
    }

    // hyphenIndex > 0 means "technique-dataset"; 0 or -1 means no split
    const hyphenIndex = fromTuneArg.indexOf('-');
    const hasSplit = hyphenIndex > 0;
    const technique = hasSplit ? fromTuneArg.substring(0, hyphenIndex) : fromTuneArg;
    const rawSlug = hasSplit ? fromTuneArg.substring(hyphenIndex + 1) : '';

    // A dataset part made only of hyphens carries no name. Building a variable
    // from it would give a malformed key such as TUNE_ADAPTER_PATH_SFT_.
    const slug = /^-*$/.test(rawSlug) ? '' : rawSlug;

    const techniqueVar = `TUNE_ADAPTER_PATH_${technique.toUpperCase()}`;

    if (slug === '') {
        return {
            varName: techniqueVar,
            technique,
            slug: '',
            isCompound: false,
            fallback: null
        };
    }

    const compoundVar = `${techniqueVar}_${slug.toUpperCase().replace(/-/g, '_')}`;

    if (configVarExists(compoundVar)) {
        return {
            varName: compoundVar,
            technique,
            slug,
            isCompound: true,
            fallback: null
        };
    }

    // Compound key doesn't exist — fall back to technique-only and report
    // which compound variable was tried
    return {
        varName: techniqueVar,
        technique,
        slug,
        isCompound: false,
        fallback: compoundVar
    };
}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
// AUTO-GENERATED by scripts/codegen-cli.js — DO NOT EDIT
|
|
2
2
|
// Source: config/parameter-schema-v2.json
|
|
3
|
-
// Generated: 2026-06-
|
|
3
|
+
// Generated: 2026-06-23T20:55:23.381Z
|
|
4
4
|
|
|
5
5
|
/**
|
|
6
6
|
* CLI option definitions derived from parameter-schema-v2.json.
|
|
@@ -70,7 +70,7 @@ export const cliOptions = [
|
|
|
70
70
|
{
|
|
71
71
|
'flag': '--enable-lora',
|
|
72
72
|
'description': 'Enable LoRA adapter serving',
|
|
73
|
-
'defaultValue':
|
|
73
|
+
'defaultValue': true
|
|
74
74
|
},
|
|
75
75
|
{
|
|
76
76
|
'flag': '--max-loras <n>',
|
|
@@ -85,7 +85,7 @@ export const cliOptions = [
|
|
|
85
85
|
{
|
|
86
86
|
'flag': '--include-benchmark',
|
|
87
87
|
'description': 'Include SageMaker AI Benchmarking scripts (do/benchmark, do/optimize). Workload configuration is specified at runtime via --workload flag.',
|
|
88
|
-
'defaultValue':
|
|
88
|
+
'defaultValue': true
|
|
89
89
|
},
|
|
90
90
|
{
|
|
91
91
|
'flag': '--benchmark-concurrency <n>',
|
|
@@ -353,6 +353,11 @@ export const cliOptions = [
|
|
|
353
353
|
'description': 'Server env var, repeatable (e.g. SGLANG_MEM_FRACTION=0.9)',
|
|
354
354
|
'repeatable': true
|
|
355
355
|
},
|
|
356
|
+
{
|
|
357
|
+
'flag': '--ic-env <KEY=VALUE>',
|
|
358
|
+
'description': 'Deploy-time environment variable for inference components (IC_ENV_* prefix), repeatable (e.g. VLLM_MAX_MODEL_LEN=8192)',
|
|
359
|
+
'repeatable': true
|
|
360
|
+
},
|
|
356
361
|
{
|
|
357
362
|
'flag': '--include-sample',
|
|
358
363
|
'description': 'Include sample model code',
|
|
@@ -464,6 +469,7 @@ export const helpGroups = {
|
|
|
464
469
|
'--fsx-volume-handle': 'hyperpod',
|
|
465
470
|
'--model-env': 'env',
|
|
466
471
|
'--server-env': 'env',
|
|
472
|
+
'--ic-env': 'ic',
|
|
467
473
|
'--include-sample': 'features',
|
|
468
474
|
'--include-testing': 'features',
|
|
469
475
|
'--test-types': 'features',
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
// AUTO-GENERATED by scripts/codegen-parameter-matrix.js — DO NOT EDIT
|
|
2
2
|
// Source: config/parameter-schema-v2.json
|
|
3
|
-
// Generated: 2026-06-
|
|
3
|
+
// Generated: 2026-06-23T20:55:23.482Z
|
|
4
4
|
|
|
5
5
|
/**
|
|
6
6
|
* Parameter matrix defining how each parameter is loaded from various sources.
|
|
@@ -106,7 +106,7 @@ export const parameterMatrix = {
|
|
|
106
106
|
'mcp': false,
|
|
107
107
|
'promptable': true,
|
|
108
108
|
'required': false,
|
|
109
|
-
'default':
|
|
109
|
+
'default': true,
|
|
110
110
|
'valueSpace': 'bounded'
|
|
111
111
|
},
|
|
112
112
|
'maxLoras': {
|
|
@@ -139,7 +139,7 @@ export const parameterMatrix = {
|
|
|
139
139
|
'mcp': false,
|
|
140
140
|
'promptable': true,
|
|
141
141
|
'required': false,
|
|
142
|
-
'default':
|
|
142
|
+
'default': true,
|
|
143
143
|
'valueSpace': 'bounded'
|
|
144
144
|
},
|
|
145
145
|
'benchmarkConcurrency': {
|
|
@@ -225,7 +225,7 @@ export const parameterMatrix = {
|
|
|
225
225
|
'configFile': true,
|
|
226
226
|
'packageJson': false,
|
|
227
227
|
'mcp': true,
|
|
228
|
-
'promptable':
|
|
228
|
+
'promptable': true,
|
|
229
229
|
'required': false,
|
|
230
230
|
'default': null,
|
|
231
231
|
'valueSpace': 'unbounded'
|
|
@@ -569,6 +569,17 @@ export const parameterMatrix = {
|
|
|
569
569
|
'default': null,
|
|
570
570
|
'valueSpace': 'unbounded'
|
|
571
571
|
},
|
|
572
|
+
'icEnv': {
|
|
573
|
+
'cliOption': 'ic-env',
|
|
574
|
+
'envVar': null,
|
|
575
|
+
'configFile': true,
|
|
576
|
+
'packageJson': false,
|
|
577
|
+
'mcp': false,
|
|
578
|
+
'promptable': false,
|
|
579
|
+
'required': false,
|
|
580
|
+
'default': [],
|
|
581
|
+
'valueSpace': 'unbounded'
|
|
582
|
+
},
|
|
572
583
|
'includeSampleModel': {
|
|
573
584
|
'cliOption': 'include-sample',
|
|
574
585
|
'envVar': 'ML_INCLUDE_SAMPLE',
|
package/src/lib/mcp-client.js
CHANGED
|
@@ -143,9 +143,23 @@ class McpClient {
|
|
|
143
143
|
// Build context from bounded parameters that have defaults
|
|
144
144
|
const context = this._buildContext();
|
|
145
145
|
|
|
146
|
+
// Auto-discover tool name if using the default (get_ml_config)
|
|
147
|
+
// Each server registers its own tool name (e.g. get_base_images, get_inference_endpoints)
|
|
148
|
+
let toolName = this.toolName;
|
|
149
|
+
if (toolName === DEFAULT_TOOL_NAME) {
|
|
150
|
+
try {
|
|
151
|
+
const toolList = await this._client.listTools();
|
|
152
|
+
if (toolList && toolList.tools && toolList.tools.length > 0) {
|
|
153
|
+
toolName = toolList.tools[0].name;
|
|
154
|
+
}
|
|
155
|
+
} catch (_listErr) {
|
|
156
|
+
// Fall through to use default tool name
|
|
157
|
+
}
|
|
158
|
+
}
|
|
159
|
+
|
|
146
160
|
// Call the configured tool
|
|
147
161
|
const result = await this._client.callTool({
|
|
148
|
-
name:
|
|
162
|
+
name: toolName,
|
|
149
163
|
arguments: {
|
|
150
164
|
parameters: unboundedParams,
|
|
151
165
|
limit: this.limit,
|
|
@@ -216,6 +216,12 @@ export default class McpQueryRunner {
|
|
|
216
216
|
if (parsed.choices?.instanceType?.length > 0) {
|
|
217
217
|
this.runner._instanceSizerMetadata = parsed.metadata || null;
|
|
218
218
|
|
|
219
|
+
// Store maxModelLen from sizer if context was capped (AC-1.7)
|
|
220
|
+
if (parsed.values?.maxModelLen) {
|
|
221
|
+
this.runner._sizerMaxModelLen = parsed.values.maxModelLen;
|
|
222
|
+
console.log(` ✓ Context length capped: max_model_len=${parsed.values.maxModelLen}`);
|
|
223
|
+
}
|
|
224
|
+
|
|
219
225
|
// Build display labels with VRAM estimate and utilization percentage
|
|
220
226
|
const recommendations = parsed.metadata?.recommendations || [];
|
|
221
227
|
const estimatedVramGb = parsed.metadata?.estimatedVramGb;
|
|
@@ -365,9 +371,13 @@ export default class McpQueryRunner {
|
|
|
365
371
|
console.log(' 🔍 Querying endpoint-picker...');
|
|
366
372
|
|
|
367
373
|
try {
|
|
374
|
+
// Pass awsProfile from bootstrap config for credential resolution
|
|
375
|
+
const awsProfile = this.runner.configManager?.config?.awsProfile
|
|
376
|
+
|| this.runner.options?.profile || process.env.AWS_PROFILE || null;
|
|
368
377
|
const result = await cm.queryMcpServer('endpoint-picker', {
|
|
369
378
|
awsRegion: infraAnswers.awsRegion,
|
|
370
|
-
deploymentTarget: 'realtime-inference'
|
|
379
|
+
deploymentTarget: 'realtime-inference',
|
|
380
|
+
...(awsProfile ? { awsProfile } : {})
|
|
371
381
|
});
|
|
372
382
|
|
|
373
383
|
if (result && result.choices?.endpointName?.length > 0) {
|
package/src/lib/prompt-runner.js
CHANGED
|
@@ -224,25 +224,39 @@ export default class PromptRunner {
|
|
|
224
224
|
// Requirements: 3.3, 4.3, 4.4 — endpoint-picker MCP query
|
|
225
225
|
let existingEndpointAnswers = {};
|
|
226
226
|
if (regionAndTargetAnswers.deploymentTarget === 'realtime-inference') {
|
|
227
|
-
//
|
|
228
|
-
const
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
const endpointPreviousAnswers = {
|
|
232
|
-
...regionAndTargetAnswers,
|
|
233
|
-
...(this._mcpEndpointChoices ? { _mcpEndpointChoices: this._mcpEndpointChoices } : {})
|
|
234
|
-
};
|
|
235
|
-
existingEndpointAnswers = await this._runPhase(
|
|
236
|
-
infraExistingEndpointPrompts,
|
|
237
|
-
endpointPreviousAnswers,
|
|
227
|
+
// First ask if user wants to attach to existing endpoint (no MCP call yet)
|
|
228
|
+
const attachAnswer = await this._runPhase(
|
|
229
|
+
[infraExistingEndpointPrompts[0]],
|
|
230
|
+
{ ...regionAndTargetAnswers },
|
|
238
231
|
explicitConfig,
|
|
239
232
|
existingConfig
|
|
240
233
|
);
|
|
241
234
|
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
235
|
+
if (attachAnswer.useExistingEndpoint === 'yes') {
|
|
236
|
+
// Only now query endpoint-picker MCP server
|
|
237
|
+
const resolvedRegion = regionAndTargetAnswers.customAwsRegion || regionAndTargetAnswers.awsRegion;
|
|
238
|
+
await this.mcpQueryRunner._queryMcpForEndpoints({ ...regionAndTargetAnswers, awsRegion: resolvedRegion }, explicitConfig);
|
|
239
|
+
|
|
240
|
+
const endpointPreviousAnswers = {
|
|
241
|
+
...regionAndTargetAnswers,
|
|
242
|
+
...attachAnswer,
|
|
243
|
+
...(this._mcpEndpointChoices ? { _mcpEndpointChoices: this._mcpEndpointChoices } : {})
|
|
244
|
+
};
|
|
245
|
+
existingEndpointAnswers = await this._runPhase(
|
|
246
|
+
infraExistingEndpointPrompts.slice(1),
|
|
247
|
+
endpointPreviousAnswers,
|
|
248
|
+
explicitConfig,
|
|
249
|
+
existingConfig
|
|
250
|
+
);
|
|
251
|
+
existingEndpointAnswers.useExistingEndpoint = 'yes';
|
|
252
|
+
|
|
253
|
+
// Resolve custom endpoint name
|
|
254
|
+
if (existingEndpointAnswers.customExistingEndpointName) {
|
|
255
|
+
existingEndpointAnswers.existingEndpointName = existingEndpointAnswers.customExistingEndpointName;
|
|
256
|
+
delete existingEndpointAnswers.customExistingEndpointName;
|
|
257
|
+
}
|
|
258
|
+
} else {
|
|
259
|
+
existingEndpointAnswers = attachAnswer;
|
|
246
260
|
}
|
|
247
261
|
}
|
|
248
262
|
|
|
@@ -376,11 +390,12 @@ export default class PromptRunner {
|
|
|
376
390
|
const sizerRecs = this._instanceSizerMetadata.recommendations || [];
|
|
377
391
|
const finalInstanceType = instanceAnswers.customInstanceType || instanceAnswers.instanceType;
|
|
378
392
|
const matchingRec = sizerRecs.find(r => r.instanceType === finalInstanceType);
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
this.
|
|
383
|
-
|
|
393
|
+
// Only use sizer TP recommendation if user selected a recommended instance
|
|
394
|
+
// Custom instances resolve TP from the instance catalog in template-variable-resolver
|
|
395
|
+
if (matchingRec && matchingRec.tensorParallelism > 1) {
|
|
396
|
+
this._autoTensorParallelism = matchingRec.tensorParallelism;
|
|
397
|
+
this._autoGpuCount = matchingRec.gpuCount;
|
|
398
|
+
console.log(` ✓ Auto-set tensor parallelism: TP=${matchingRec.tensorParallelism} (${matchingRec.gpuCount} GPUs)`);
|
|
384
399
|
}
|
|
385
400
|
|
|
386
401
|
// Display capacity type confirmation for selected instance
|
|
@@ -710,6 +725,11 @@ export default class PromptRunner {
|
|
|
710
725
|
delete combinedAnswers.customHyperPodCluster;
|
|
711
726
|
}
|
|
712
727
|
|
|
728
|
+
// Propagate max_model_len from instance-sizer context capping (AC-1.7)
|
|
729
|
+
if (this._sizerMaxModelLen) {
|
|
730
|
+
combinedAnswers.sizerMaxModelLen = this._sizerMaxModelLen;
|
|
731
|
+
}
|
|
732
|
+
|
|
713
733
|
// Apply CUDA version selection → inference AMI override
|
|
714
734
|
if (combinedAnswers._resolvedInferenceAmiVersion) {
|
|
715
735
|
combinedAnswers.inferenceAmiVersion = combinedAnswers._resolvedInferenceAmiVersion;
|
|
@@ -90,7 +90,7 @@ const loraPrompts = [
|
|
|
90
90
|
type: 'confirm',
|
|
91
91
|
name: 'enableLora',
|
|
92
92
|
message: 'Enable LoRA adapter serving?',
|
|
93
|
-
default:
|
|
93
|
+
default: true,
|
|
94
94
|
when: (answers) => {
|
|
95
95
|
const architecture = answers.architecture || answers.deploymentConfig?.split('-')[0];
|
|
96
96
|
const backend = answers.backend || answers.deploymentConfig?.split('-').slice(1).join('-');
|