@aws/ml-container-creator 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/bootstrap-stack.json +86 -7
- package/config/defaults.json +1 -1
- package/package.json +3 -1
- package/servers/instance-sizer/index.js +36 -2
- package/servers/instance-sizer/lib/instance-ranker.js +114 -10
- package/servers/instance-sizer/lib/quota-resolver.js +368 -0
- package/servers/instance-sizer/package.json +2 -0
- package/servers/lib/catalogs/instances.json +527 -12
- package/servers/lib/catalogs/model-servers.json +15 -15
- package/servers/lib/catalogs/model-sizes.json +27 -0
- package/servers/lib/catalogs/models.json +71 -0
- package/servers/lib/schemas/image-catalog.schema.json +9 -1
- package/src/app.js +77 -2
- package/src/lib/bootstrap-command-handler.js +96 -3
- package/src/lib/cli-handler.js +2 -2
- package/src/lib/config-manager.js +78 -1
- package/src/lib/prompt-runner.js +96 -9
- package/src/lib/prompts.js +66 -4
- package/src/lib/schema-sync.js +31 -0
- package/src/lib/template-manager.js +49 -1
- package/src/lib/validate-runner.js +125 -2
- package/templates/Dockerfile +10 -2
- package/templates/code/cuda_compat.sh +22 -0
- package/templates/code/serve +3 -0
- package/templates/code/start_server.sh +3 -0
- package/templates/diffusors/Dockerfile +2 -1
- package/templates/diffusors/serve +3 -0
- package/templates/do/README.md +33 -0
- package/templates/do/benchmark +646 -0
- package/templates/do/clean +86 -0
- package/templates/do/config +26 -3
- package/templates/do/deploy +6 -1
- package/templates/do/register +8 -1
- package/templates/triton/Dockerfile +5 -0
package/src/lib/prompt-runner.js
CHANGED
|
@@ -18,6 +18,7 @@ import {
|
|
|
18
18
|
modelLoadStrategyPrompts,
|
|
19
19
|
modelProfilePrompts,
|
|
20
20
|
modulePrompts,
|
|
21
|
+
benchmarkPrompts,
|
|
21
22
|
infraRegionAndTargetPrompts,
|
|
22
23
|
infraInstancePrompts,
|
|
23
24
|
infraAsyncPrompts,
|
|
@@ -252,6 +253,29 @@ export default class PromptRunner {
|
|
|
252
253
|
this._autoGpuCount = tpRec.gpuCount;
|
|
253
254
|
console.log(` ✓ Auto-set tensor parallelism: TP=${tpRec.tensorParallelism} (${tpRec.gpuCount} GPUs)`);
|
|
254
255
|
}
|
|
256
|
+
|
|
257
|
+
// Display capacity type confirmation for selected instance
|
|
258
|
+
// Requirements: 5.4
|
|
259
|
+
if (matchingRec && matchingRec.capacityType) {
|
|
260
|
+
if (matchingRec.capacityType === 'reserved') {
|
|
261
|
+
const resType = matchingRec.reservationType === 'capacity-block' ? 'Capacity Block' : 'ODCR';
|
|
262
|
+
const endInfo = matchingRec.reservationType === 'capacity-block' && matchingRec.reservationInfo?.endDate
|
|
263
|
+
? `, ends ${new Date(matchingRec.reservationInfo.endDate).toLocaleDateString()}`
|
|
264
|
+
: '';
|
|
265
|
+
console.log(` ✓ Using reserved capacity — ${resType} (reservation ${matchingRec.reservationInfo?.reservationId || 'unknown'}${endInfo})`);
|
|
266
|
+
} else if (matchingRec.capacityType === 'ftp') {
|
|
267
|
+
console.log(` ✓ Using reserved capacity (plan ${matchingRec.ftpInfo?.planName || 'unknown'})`);
|
|
268
|
+
} else {
|
|
269
|
+
const headroom = matchingRec.quotaHeadroom;
|
|
270
|
+
console.log(` ✓ Using on-demand capacity (quota headroom: ${headroom ?? 'unknown'})`);
|
|
271
|
+
}
|
|
272
|
+
}
|
|
273
|
+
|
|
274
|
+
// Extract reservation ARN from selected instance for deployment config
|
|
275
|
+
// Requirements: 2.3
|
|
276
|
+
if (matchingRec && matchingRec.capacityType === 'reserved' && matchingRec.reservationInfo?.reservationArn) {
|
|
277
|
+
this._selectedCapacityReservationArn = matchingRec.reservationInfo.reservationArn;
|
|
278
|
+
}
|
|
255
279
|
}
|
|
256
280
|
|
|
257
281
|
// 3c. Async-specific prompts (only when deploymentTarget === 'async-inference')
|
|
@@ -375,6 +399,21 @@ export default class PromptRunner {
|
|
|
375
399
|
moduleAnswers.includeSampleModel = false;
|
|
376
400
|
}
|
|
377
401
|
|
|
402
|
+
// Benchmark prompts — derive includeBenchmark from testTypes selection or CLI flag
|
|
403
|
+
// Requirements: 1.1, 1.2
|
|
404
|
+
let benchmarkAnswers = {};
|
|
405
|
+
if (frameworkAnswers.architecture === 'transformers' || frameworkAnswers.architecture === 'diffusors') {
|
|
406
|
+
const testTypes = moduleAnswers.testTypes || [];
|
|
407
|
+
const includeBenchmark = testTypes.includes('sagemaker-ai-automated-benchmarking') ||
|
|
408
|
+
explicitConfig.includeBenchmark === true ||
|
|
409
|
+
explicitConfig.includeBenchmark === 'true';
|
|
410
|
+
benchmarkAnswers.includeBenchmark = includeBenchmark;
|
|
411
|
+
if (includeBenchmark) {
|
|
412
|
+
const subAnswers = await this._runPhase(benchmarkPrompts, { ...frameworkAnswers, ...moduleAnswers, includeBenchmark }, explicitConfig, existingConfig);
|
|
413
|
+
benchmarkAnswers = { ...benchmarkAnswers, ...subAnswers };
|
|
414
|
+
}
|
|
415
|
+
}
|
|
416
|
+
|
|
378
417
|
// Validate instance type against framework requirements (now that framework version is known)
|
|
379
418
|
const finalInstanceType = infraAnswers.customInstanceType || infraAnswers.instanceType;
|
|
380
419
|
if (finalInstanceType && frameworkVersionAnswers.frameworkVersion) {
|
|
@@ -416,6 +455,7 @@ export default class PromptRunner {
|
|
|
416
455
|
...hfTokenAnswers,
|
|
417
456
|
...ngcApiKeyAnswers,
|
|
418
457
|
...moduleAnswers,
|
|
458
|
+
...benchmarkAnswers,
|
|
419
459
|
...projectAnswers,
|
|
420
460
|
...destinationAnswers,
|
|
421
461
|
buildTimestamp
|
|
@@ -435,6 +475,12 @@ export default class PromptRunner {
|
|
|
435
475
|
combinedAnswers.artifactUri = this._mcpArtifactUri;
|
|
436
476
|
}
|
|
437
477
|
|
|
478
|
+
// Flow capacity reservation ARN from instance-sizer selection
|
|
479
|
+
// Requirements: 2.3
|
|
480
|
+
if (this._selectedCapacityReservationArn) {
|
|
481
|
+
combinedAnswers.capacityReservationArn = this._selectedCapacityReservationArn;
|
|
482
|
+
}
|
|
483
|
+
|
|
438
484
|
// Validate: non-HF model sources require an artifact URI
|
|
439
485
|
// Without it, the serve script can't download the model at runtime
|
|
440
486
|
// Infer modelSource from model name prefix if not set by MCP
|
|
@@ -1036,13 +1082,53 @@ export default class PromptRunner {
|
|
|
1036
1082
|
: '';
|
|
1037
1083
|
|
|
1038
1084
|
console.log(` ✓ ${choices.length} compatible instance(s) found${vramInfo}`);
|
|
1039
|
-
|
|
1040
|
-
|
|
1041
|
-
|
|
1042
|
-
|
|
1043
|
-
|
|
1044
|
-
|
|
1085
|
+
|
|
1086
|
+
// Check if availability data is present (recommendations have capacityType)
|
|
1087
|
+
const hasAvailabilityData = recommendations.some(r => r.capacityType);
|
|
1088
|
+
|
|
1089
|
+
if (hasAvailabilityData) {
|
|
1090
|
+
// Group by capacityType for display
|
|
1091
|
+
const reserved = recommendations.filter(r => r.capacityType === 'reserved' || r.capacityType === 'ftp');
|
|
1092
|
+
const onDemand = recommendations.filter(r => r.capacityType === 'on-demand');
|
|
1093
|
+
|
|
1094
|
+
if (reserved.length > 0) {
|
|
1095
|
+
console.log(' ── Reserved Capacity ──');
|
|
1096
|
+
for (const rec of reserved) {
|
|
1097
|
+
const tp = rec.tensorParallelism > 1 ? ` TP=${rec.tensorParallelism}` : '';
|
|
1098
|
+
const vram = rec.totalVramGb ? `${rec.totalVramGb}GB` : '?';
|
|
1099
|
+
const util = rec.utilizationPercent ? `${rec.utilizationPercent}%` : '?';
|
|
1100
|
+
const tag = rec.capacityType === 'reserved'
|
|
1101
|
+
? ` [CR] ${rec.reservationInfo?.planName || rec.reservationInfo?.reservationId || ''}`
|
|
1102
|
+
: ` [FTP] ${rec.ftpInfo?.planName || ''}`;
|
|
1103
|
+
console.log(` ${rec === topRec ? '→' : ' '} ${rec.instanceType.padEnd(20)} ${vram.padStart(5)} VRAM ${util.padStart(4)} util${tp}${tag}`);
|
|
1104
|
+
}
|
|
1105
|
+
}
|
|
1106
|
+
|
|
1107
|
+
if (onDemand.length > 0) {
|
|
1108
|
+
console.log(' ── On-Demand ──');
|
|
1109
|
+
for (const rec of onDemand) {
|
|
1110
|
+
const tp = rec.tensorParallelism > 1 ? ` TP=${rec.tensorParallelism}` : '';
|
|
1111
|
+
const vram = rec.totalVramGb ? `${rec.totalVramGb}GB` : '?';
|
|
1112
|
+
const util = rec.utilizationPercent ? `${rec.utilizationPercent}%` : '?';
|
|
1113
|
+
const deployed = rec.quotaDeployed;
|
|
1114
|
+
const quota = rec.quotaLimit;
|
|
1115
|
+
const tag = quota !== null && quota !== undefined ? ` [Q:${deployed ?? 0}/${quota}]` : '';
|
|
1116
|
+
console.log(` ${rec === topRec ? '→' : ' '} ${rec.instanceType.padEnd(20)} ${vram.padStart(5)} VRAM ${util.padStart(4)} util${tp}${tag}`);
|
|
1117
|
+
}
|
|
1118
|
+
}
|
|
1119
|
+
} else {
|
|
1120
|
+
// Fallback: display compact recommendation table (no availability data)
|
|
1121
|
+
for (const rec of recommendations) {
|
|
1122
|
+
const tp = rec.tensorParallelism > 1 ? ` TP=${rec.tensorParallelism}` : '';
|
|
1123
|
+
const vram = rec.totalVramGb ? `${rec.totalVramGb}GB` : '?';
|
|
1124
|
+
const util = rec.utilizationPercent ? `${rec.utilizationPercent}%` : '?';
|
|
1125
|
+
console.log(` ${rec === topRec ? '→' : ' '} ${rec.instanceType.padEnd(20)} ${vram.padStart(5)} VRAM ${util.padStart(4)} util${tp}`);
|
|
1126
|
+
}
|
|
1045
1127
|
}
|
|
1128
|
+
} else if (parsed.metadata?.allFilteredByQuota) {
|
|
1129
|
+
// All VRAM-compatible instances had zero quota
|
|
1130
|
+
console.log(' ⚠️ No quota available for compatible instances. Request a quota increase.');
|
|
1131
|
+
this._instanceSizerMetadata = parsed.metadata || null;
|
|
1046
1132
|
} else if (parsed.metadata?.warning) {
|
|
1047
1133
|
console.log(` ⚠️ ${parsed.metadata.warning}`);
|
|
1048
1134
|
} else {
|
|
@@ -1972,9 +2058,10 @@ export default class PromptRunner {
|
|
|
1972
2058
|
'11.4': 'al2-ami-sagemaker-inference-gpu-2-1',
|
|
1973
2059
|
'11.8': 'al2-ami-sagemaker-inference-gpu-2-1',
|
|
1974
2060
|
'12.1': 'al2-ami-sagemaker-inference-gpu-3-1',
|
|
1975
|
-
'12.2': '
|
|
1976
|
-
'12.4': '
|
|
1977
|
-
'12.6': '
|
|
2061
|
+
'12.2': 'al2-ami-sagemaker-inference-gpu-3-1',
|
|
2062
|
+
'12.4': 'al2-ami-sagemaker-inference-gpu-3-1',
|
|
2063
|
+
'12.6': 'al2-ami-sagemaker-inference-gpu-3-1',
|
|
2064
|
+
'13.0': 'al2023-ami-sagemaker-inference-gpu-4-1'
|
|
1978
2065
|
};
|
|
1979
2066
|
|
|
1980
2067
|
/**
|
package/src/lib/prompts.js
CHANGED
|
@@ -583,7 +583,7 @@ const modulePrompts = [
|
|
|
583
583
|
type: 'confirm',
|
|
584
584
|
name: 'includeSampleModel',
|
|
585
585
|
message: 'Include sample Abalone classifier?',
|
|
586
|
-
default:
|
|
586
|
+
default: true,
|
|
587
587
|
when: (answers) => {
|
|
588
588
|
const architecture = answers.architecture || answers.deploymentConfig?.split('-')[0];
|
|
589
589
|
const backend = answers.backend || answers.deploymentConfig?.split('-').slice(1).join('-');
|
|
@@ -622,7 +622,10 @@ const modulePrompts = [
|
|
|
622
622
|
|
|
623
623
|
// Transformers and Triton LLM backends only support hosted endpoint tests
|
|
624
624
|
if (architecture === 'transformers') {
|
|
625
|
-
return ['hosted-model-endpoint'];
|
|
625
|
+
return ['hosted-model-endpoint', 'sagemaker-ai-automated-benchmarking'];
|
|
626
|
+
}
|
|
627
|
+
if (architecture === 'diffusors') {
|
|
628
|
+
return ['hosted-model-endpoint', 'sagemaker-ai-automated-benchmarking'];
|
|
626
629
|
}
|
|
627
630
|
if (architecture === 'triton' && (backend === 'vllm' || backend === 'tensorrtllm')) {
|
|
628
631
|
return ['hosted-model-endpoint'];
|
|
@@ -635,7 +638,10 @@ const modulePrompts = [
|
|
|
635
638
|
const backend = answers.backend || answers.deploymentConfig?.split('-').slice(1).join('-');
|
|
636
639
|
|
|
637
640
|
if (architecture === 'transformers') {
|
|
638
|
-
return ['hosted-model-endpoint'];
|
|
641
|
+
return ['hosted-model-endpoint', 'sagemaker-ai-automated-benchmarking'];
|
|
642
|
+
}
|
|
643
|
+
if (architecture === 'diffusors') {
|
|
644
|
+
return ['hosted-model-endpoint', 'sagemaker-ai-automated-benchmarking'];
|
|
639
645
|
}
|
|
640
646
|
if (architecture === 'triton' && (backend === 'vllm' || backend === 'tensorrtllm')) {
|
|
641
647
|
return ['hosted-model-endpoint'];
|
|
@@ -700,7 +706,12 @@ const infraInstancePrompts = [
|
|
|
700
706
|
when: answers => answers.deploymentTarget === 'realtime-inference' || answers.deploymentTarget === 'async-inference' || answers.deploymentTarget === 'batch-transform' || answers.deploymentTarget === 'hyperpod-eks',
|
|
701
707
|
message: (answers) => {
|
|
702
708
|
const framework = answers.framework || answers.deploymentConfig?.split('-')[0];
|
|
703
|
-
|
|
709
|
+
|
|
710
|
+
// Skip table when MCP sizer already displayed annotated results
|
|
711
|
+
if (answers._mcpInstanceChoices && answers._mcpInstanceChoices.length > 0) {
|
|
712
|
+
return 'Select instance type:';
|
|
713
|
+
}
|
|
714
|
+
|
|
704
715
|
const table = new Table({
|
|
705
716
|
head: [
|
|
706
717
|
chalk.cyan('Instance Type'),
|
|
@@ -1110,6 +1121,56 @@ const baseImagePrompts = [
|
|
|
1110
1121
|
}
|
|
1111
1122
|
];
|
|
1112
1123
|
|
|
1124
|
+
/**
 * Follow-up questions for SageMaker AI Benchmarking (NVIDIA AIPerf).
 * They are asked once 'sagemaker-ai-automated-benchmarking' has been picked in testTypes.
 * Requirements: 2.1, 2.2, 2.3, 2.4, 2.5
 *
 * Each entry is declared without its `when` guard; the guard is identical for
 * every question, so it is attached in a single pass below.
 */
const benchmarkPrompts = [
    {
        type: 'number',
        name: 'benchmarkConcurrency',
        message: 'Concurrent requests for benchmark:',
        default: 10
    },
    {
        type: 'number',
        name: 'benchmarkInputTokensMean',
        message: 'Mean input tokens per request:',
        default: 550
    },
    {
        type: 'number',
        name: 'benchmarkOutputTokensMean',
        message: 'Mean output tokens per request:',
        default: 150
    },
    {
        type: 'confirm',
        name: 'benchmarkStreaming',
        message: 'Enable streaming for benchmark?',
        default: true
    },
    {
        // Free-text on purpose: an empty answer means "use the service default".
        type: 'input',
        name: 'benchmarkRequestCount',
        message: 'Total request count (leave empty for service default):',
        default: ''
    },
    {
        // Free-text on purpose: an empty answer means "use the auto-created bucket".
        type: 'input',
        name: 'benchmarkS3OutputPath',
        message: 'Benchmark results S3 path (leave empty for auto-created bucket):',
        default: ''
    }
].map((question) => ({
    ...question,
    // Only ask when benchmarking was explicitly enabled (strict boolean true).
    when: (answers) => answers.includeBenchmark === true
}));
|
|
1173
|
+
|
|
1113
1174
|
export {
|
|
1114
1175
|
deploymentConfigPrompts,
|
|
1115
1176
|
frameworkPrompts, // Deprecated: kept for backward compatibility
|
|
@@ -1123,6 +1184,7 @@ export {
|
|
|
1123
1184
|
hfTokenPrompts,
|
|
1124
1185
|
ngcApiKeyPrompts,
|
|
1125
1186
|
modulePrompts,
|
|
1187
|
+
benchmarkPrompts,
|
|
1126
1188
|
infrastructurePrompts,
|
|
1127
1189
|
infraRegionAndTargetPrompts,
|
|
1128
1190
|
infraInstancePrompts,
|
package/src/lib/schema-sync.js
CHANGED
|
@@ -188,6 +188,37 @@ export function loadServiceModel(serviceName, registryPath) {
|
|
|
188
188
|
return readFileSync(modelPath, 'utf8');
|
|
189
189
|
}
|
|
190
190
|
|
|
191
|
+
/**
 * Report whether the locally stored SageMaker service model knows about
 * AI benchmark jobs, i.e. whether it declares the CreateAIBenchmarkJob
 * operation or the CreateAIBenchmarkJobRequest shape.
 * Callers use the answer to decide if benchmark parameter validation can run.
 *
 * @param {string} [registryPath] - Override registry path; falls back to getRegistryPath()
 * @returns {{ available: boolean, reason?: string }} `reason` is set only when unavailable
 */
export function hasBenchmarkShape(registryPath) {
    const rawModel = loadServiceModel('sagemaker', registryPath || getRegistryPath());
    if (!rawModel) {
        return { available: false, reason: 'SageMaker service model not found in registry' };
    }

    try {
        const parsed = JSON.parse(rawModel);
        // Property access stays inside the try so that a non-object document
        // (e.g. JSON `null`) is reported as a parse failure as well.
        const declaredOperations = parsed.operations || {};
        const declaredShapes = parsed.shapes || {};
        const supported = Boolean(
            declaredOperations.CreateAIBenchmarkJob || declaredShapes.CreateAIBenchmarkJobRequest
        );

        return supported
            ? { available: true }
            : { available: false, reason: 'service model does not include AI Benchmark operations' };
    } catch {
        return { available: false, reason: 'Failed to parse SageMaker service model' };
    }
}
|
|
221
|
+
|
|
191
222
|
/**
|
|
192
223
|
* Store a service model in the registry.
|
|
193
224
|
* @param {string} serviceName - Service name (e.g., 'sagemaker')
|
|
@@ -65,7 +65,7 @@ export default class TemplateManager {
|
|
|
65
65
|
],
|
|
66
66
|
buildTargets: ['codebuild'],
|
|
67
67
|
deploymentTargets: ['realtime-inference', 'async-inference', 'batch-transform', 'hyperpod-eks'],
|
|
68
|
-
testTypes: ['local-model-cli', 'local-model-server', 'hosted-model-endpoint'],
|
|
68
|
+
testTypes: ['local-model-cli', 'local-model-server', 'hosted-model-endpoint', 'sagemaker-ai-automated-benchmarking'],
|
|
69
69
|
awsRegions: [
|
|
70
70
|
'us-east-1', 'us-east-2', 'us-west-1', 'us-west-2',
|
|
71
71
|
'eu-west-1', 'eu-west-2', 'eu-central-1', 'eu-north-1',
|
|
@@ -134,6 +134,9 @@ export default class TemplateManager {
|
|
|
134
134
|
|
|
135
135
|
// Validate batch transform specific fields
|
|
136
136
|
this._validateBatchTransformConfig();
|
|
137
|
+
|
|
138
|
+
// Validate benchmark specific fields
|
|
139
|
+
this._validateBenchmarkConfig();
|
|
137
140
|
|
|
138
141
|
// Validate instance type format (ml.*.*) - only for realtime-inference
|
|
139
142
|
if (this.answers.instanceType && this.answers.instanceType !== 'custom') {
|
|
@@ -297,6 +300,51 @@ export default class TemplateManager {
|
|
|
297
300
|
}
|
|
298
301
|
}
|
|
299
302
|
|
|
303
|
+
/**
|
|
304
|
+
* Validates benchmark configuration parameters
|
|
305
|
+
* @private
|
|
306
|
+
* @throws {Error} If benchmark configuration is invalid
|
|
307
|
+
*/
|
|
308
|
+
_validateBenchmarkConfig() {
|
|
309
|
+
if (!this.answers.includeBenchmark) return;
|
|
310
|
+
|
|
311
|
+
// Gate to supported architectures
|
|
312
|
+
const dc = this.answers.deploymentConfig;
|
|
313
|
+
const arch = dc ? dc.split('-')[0] : this.answers.architecture;
|
|
314
|
+
if (arch !== 'transformers' && arch !== 'diffusors') {
|
|
315
|
+
throw new Error('⚠️ Benchmarking is only supported with transformers and diffusors architectures.');
|
|
316
|
+
}
|
|
317
|
+
|
|
318
|
+
// Gate to supported deployment targets
|
|
319
|
+
if (this.answers.deploymentTarget === 'hyperpod-eks') {
|
|
320
|
+
throw new Error('⚠️ Benchmarking is only supported with managed-inference, async-inference, and batch-transform deployment targets');
|
|
321
|
+
}
|
|
322
|
+
|
|
323
|
+
// Validate numeric parameters
|
|
324
|
+
if (this.answers.benchmarkConcurrency !== undefined) {
|
|
325
|
+
if (!Number.isInteger(this.answers.benchmarkConcurrency) || this.answers.benchmarkConcurrency < 1) {
|
|
326
|
+
throw new Error('⚠️ benchmarkConcurrency must be an integer >= 1');
|
|
327
|
+
}
|
|
328
|
+
}
|
|
329
|
+
if (this.answers.benchmarkInputTokensMean !== undefined) {
|
|
330
|
+
if (!Number.isInteger(this.answers.benchmarkInputTokensMean) || this.answers.benchmarkInputTokensMean < 1) {
|
|
331
|
+
throw new Error('⚠️ benchmarkInputTokensMean must be an integer >= 1');
|
|
332
|
+
}
|
|
333
|
+
}
|
|
334
|
+
if (this.answers.benchmarkOutputTokensMean !== undefined) {
|
|
335
|
+
if (!Number.isInteger(this.answers.benchmarkOutputTokensMean) || this.answers.benchmarkOutputTokensMean < 1) {
|
|
336
|
+
throw new Error('⚠️ benchmarkOutputTokensMean must be an integer >= 1');
|
|
337
|
+
}
|
|
338
|
+
}
|
|
339
|
+
|
|
340
|
+
// Validate S3 path format
|
|
341
|
+
if (this.answers.benchmarkS3OutputPath && this.answers.benchmarkS3OutputPath.trim() !== '') {
|
|
342
|
+
if (!this.answers.benchmarkS3OutputPath.startsWith('s3://')) {
|
|
343
|
+
throw new Error('⚠️ benchmarkS3OutputPath must start with "s3://". Example: s3://my-bucket/benchmark-results/');
|
|
344
|
+
}
|
|
345
|
+
}
|
|
346
|
+
}
|
|
347
|
+
|
|
300
348
|
/**
|
|
301
349
|
* Validates GPU instance type requirement for GPU-requiring backends.
|
|
302
350
|
* Called when deploymentConfig is present.
|
|
@@ -21,7 +21,7 @@ import SchemaValidationEngine from './schema-validation-engine.js';
|
|
|
21
21
|
import ServiceModelParser from './service-model-parser.js';
|
|
22
22
|
import CrossCuttingChecker from './cross-cutting-checker.js';
|
|
23
23
|
import HuggingFaceClient from './huggingface-client.js';
|
|
24
|
-
import { getRegistryPath, loadManifest } from './schema-sync.js';
|
|
24
|
+
import { getRegistryPath, loadManifest, hasBenchmarkShape } from './schema-sync.js';
|
|
25
25
|
|
|
26
26
|
const __filename = fileURLToPath(import.meta.url);
|
|
27
27
|
const __dirname = path.dirname(__filename);
|
|
@@ -52,6 +52,115 @@ export function parseDoConfig(configPath) {
|
|
|
52
52
|
return config;
|
|
53
53
|
}
|
|
54
54
|
|
|
55
|
+
/**
 * Validate benchmark parameters against service model constraints.
 * Called when the CreateAIBenchmarkJob shape is available in the synced schema.
 *
 * Validates (each only when the key is present and non-empty):
 *   - BENCHMARK_CONCURRENCY: integer, min 1
 *   - BENCHMARK_S3_OUTPUT_PATH: starts with s3://
 *   - BENCHMARK_JOB_NAME: pattern ^[a-zA-Z0-9](-*[a-zA-Z0-9])*$, max 63 chars
 *   - BENCHMARK_INPUT_TOKENS_MEAN / BENCHMARK_OUTPUT_TOKENS_MEAN: integer, min 1
 *
 * String values containing a shell expansion (`$(` or `${`) are skipped, because
 * their final value is not known at validation time.
 *
 * Requirements: 8.1, 8.2, 8.3
 *
 * @param {Object} config - Parsed do/config values
 * @returns {Array<Object>} Array of validation findings, in the order the checks run:
 *   concurrency, S3 path, job name, input tokens, output tokens
 */
export function validateBenchmarkParams(config) {
    const findings = [];

    // A key counts as "provided" unless it is null, undefined or the empty string.
    const isProvided = (value) => value !== null && value !== undefined && value !== '';

    // True when the text still contains an unexpanded shell expression.
    const isDynamicShellValue = (text) => text.includes('$(') || text.includes('${');

    // Record an error finding when config[key] is provided but is not an integer >= 1.
    const checkPositiveInteger = (key, operation, fieldPath) => {
        const rawValue = config[key];
        if (!isProvided(rawValue)) return;

        const numeric = Number(rawValue);
        if (Number.isInteger(numeric) && numeric >= 1) return;

        findings.push({
            severity: 'error',
            operation,
            fieldPath,
            constraint: 'integer >= 1',
            invalidValue: rawValue,
            remediationHint: `${key} must be a positive integer (>= 1)`
        });
    };

    // Validate Concurrency (integer, min 1)
    checkPositiveInteger('BENCHMARK_CONCURRENCY', 'CreateAIBenchmarkJob', 'Concurrency');

    // Validate S3OutputLocation (string, starts with s3://)
    const rawS3Path = config.BENCHMARK_S3_OUTPUT_PATH;
    if (isProvided(rawS3Path)) {
        // Coerce so that a non-string value yields a finding instead of a TypeError.
        const s3Path = String(rawS3Path);
        if (!isDynamicShellValue(s3Path) && !s3Path.startsWith('s3://')) {
            findings.push({
                severity: 'error',
                operation: 'CreateAIBenchmarkJob',
                fieldPath: 'OutputConfig.S3OutputLocation',
                constraint: 'must start with s3://',
                invalidValue: rawS3Path,
                remediationHint: 'BENCHMARK_S3_OUTPUT_PATH must start with "s3://". Example: s3://my-bucket/benchmark-results/'
            });
        }
    }

    // Validate AIBenchmarkJobName pattern (^[a-zA-Z0-9](-*[a-zA-Z0-9])*$, max 63 chars)
    const rawJobName = config.BENCHMARK_JOB_NAME;
    if (isProvided(rawJobName)) {
        const jobName = String(rawJobName);
        if (!isDynamicShellValue(jobName)) {
            const namePattern = /^[a-zA-Z0-9](-*[a-zA-Z0-9])*$/;
            // Length is checked first so an over-long name reports only the length problem.
            if (jobName.length > 63) {
                findings.push({
                    severity: 'error',
                    operation: 'CreateAIBenchmarkJob',
                    fieldPath: 'AIBenchmarkJobName',
                    constraint: 'max 63 characters',
                    invalidValue: rawJobName,
                    remediationHint: 'AIBenchmarkJobName must be at most 63 characters'
                });
            } else if (!namePattern.test(jobName)) {
                findings.push({
                    severity: 'error',
                    operation: 'CreateAIBenchmarkJob',
                    fieldPath: 'AIBenchmarkJobName',
                    constraint: 'pattern: ^[a-zA-Z0-9](-*[a-zA-Z0-9])*',
                    invalidValue: rawJobName,
                    remediationHint: 'AIBenchmarkJobName must start with alphanumeric and contain only alphanumeric characters and hyphens'
                });
            }
        }
    }

    // Validate input tokens mean (integer, min 1)
    checkPositiveInteger(
        'BENCHMARK_INPUT_TOKENS_MEAN',
        'CreateAIWorkloadConfig',
        'WorkloadSpec.parameters.prompt_input_tokens_mean'
    );

    // Validate output tokens mean (integer, min 1)
    checkPositiveInteger(
        'BENCHMARK_OUTPUT_TOKENS_MEAN',
        'CreateAIWorkloadConfig',
        'WorkloadSpec.parameters.output_tokens_mean'
    );

    return findings;
}
|
|
163
|
+
|
|
55
164
|
/**
|
|
56
165
|
* Run the full validation pipeline.
|
|
57
166
|
*
|
|
@@ -171,6 +280,20 @@ export async function run(options = {}) {
|
|
|
171
280
|
}
|
|
172
281
|
}
|
|
173
282
|
|
|
283
|
+
// Run benchmark parameter validation (Requirements 8.1, 8.2, 8.3)
|
|
284
|
+
if (config.BENCHMARK_CONCURRENCY || config.BENCHMARK_INPUT_TOKENS_MEAN ||
|
|
285
|
+
config.BENCHMARK_OUTPUT_TOKENS_MEAN || config.BENCHMARK_S3_OUTPUT_PATH) {
|
|
286
|
+
const benchmarkCheck = hasBenchmarkShape(registryPath);
|
|
287
|
+
if (benchmarkCheck.available) {
|
|
288
|
+
const benchmarkFindings = validateBenchmarkParams(config);
|
|
289
|
+
for (const finding of benchmarkFindings) {
|
|
290
|
+
report.addFinding(finding);
|
|
291
|
+
}
|
|
292
|
+
} else {
|
|
293
|
+
console.log('⚠️ Benchmark validation skipped: service model does not include AI Benchmark operations. Run `bootstrap sync-schemas` to update.');
|
|
294
|
+
}
|
|
295
|
+
}
|
|
296
|
+
|
|
174
297
|
const summary = report.getSummary();
|
|
175
298
|
|
|
176
299
|
// Load manifest for version info
|
|
@@ -213,4 +336,4 @@ export async function run(options = {}) {
|
|
|
213
336
|
return exitCode;
|
|
214
337
|
}
|
|
215
338
|
|
|
216
|
-
export default { run, parseDoConfig };
|
|
339
|
+
export default { run, parseDoConfig, validateBenchmarkParams };
|
package/templates/Dockerfile
CHANGED
|
@@ -12,6 +12,9 @@
|
|
|
12
12
|
<% if (framework !== 'transformers') { %>
|
|
13
13
|
FROM <%= baseImage || 'public.ecr.aws/docker/library/python:3.12-slim' %>
|
|
14
14
|
|
|
15
|
+
# Ensure Python output is unbuffered so SageMaker can capture logs in CloudWatch
|
|
16
|
+
ENV PYTHONUNBUFFERED=1
|
|
17
|
+
|
|
15
18
|
# Set a docker label to name this project, postpended with the build time
|
|
16
19
|
LABEL project.name="<%= projectName %>-<%= buildTimestamp %>" \
|
|
17
20
|
project.base-name="<%= projectName %>" \
|
|
@@ -143,6 +146,9 @@ ARG BASE_IMAGE=<%= baseImage || 'deepjavalibrary/djl-serving:0.36.0-pytorch-gpu'
|
|
|
143
146
|
|
|
144
147
|
FROM ${BASE_IMAGE}
|
|
145
148
|
|
|
149
|
+
# Ensure Python output is unbuffered so SageMaker can capture logs in CloudWatch
|
|
150
|
+
ENV PYTHONUNBUFFERED=1
|
|
151
|
+
|
|
146
152
|
<% if (comments && comments.chatTemplate) { %>
|
|
147
153
|
<%= comments.chatTemplate %>
|
|
148
154
|
<% } %>
|
|
@@ -271,8 +277,9 @@ COPY code/serve /usr/bin/serve_trtllm
|
|
|
271
277
|
RUN chmod +x /usr/bin/serve_trtllm
|
|
272
278
|
|
|
273
279
|
# Copy startup script
|
|
280
|
+
COPY code/cuda_compat.sh /usr/bin/cuda_compat.sh
|
|
274
281
|
COPY code/start_server.sh /usr/bin/start_server.sh
|
|
275
|
-
RUN chmod +x /usr/bin/start_server.sh
|
|
282
|
+
RUN chmod +x /usr/bin/start_server.sh /usr/bin/cuda_compat.sh
|
|
276
283
|
|
|
277
284
|
ENTRYPOINT [ "/usr/bin/start_server.sh" ]
|
|
278
285
|
<% } else if (modelServer === 'lmi' || modelServer === 'djl') { %>
|
|
@@ -287,8 +294,9 @@ COPY code/serving.properties /opt/ml/model/serving.properties
|
|
|
287
294
|
# LMI/DJL containers use their own entrypoint
|
|
288
295
|
# The container will automatically start DJL Serving with the configuration
|
|
289
296
|
<% } else { %>
|
|
297
|
+
COPY code/cuda_compat.sh /usr/bin/cuda_compat.sh
|
|
290
298
|
COPY code/serve /usr/bin/serve
|
|
291
|
-
RUN chmod 777 /usr/bin/serve
|
|
299
|
+
RUN chmod 777 /usr/bin/serve /usr/bin/cuda_compat.sh
|
|
292
300
|
|
|
293
301
|
<% if (comments && comments.troubleshooting) { %>
|
|
294
302
|
<%= comments.troubleshooting %>
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
#!/bin/bash
# CUDA Compatibility Setup
# Required for SageMaker inference AMIs using NVIDIA Container Toolkit 1.17.4+
# (al2-ami-sagemaker-inference-gpu-2-1, al2-ami-sagemaker-inference-gpu-3-1,
#  al2023-ami-sagemaker-inference-gpu-4-1)
#
# These AMIs no longer auto-mount CUDA compat libraries. This script detects
# whether the host NVIDIA driver is older than what the container's CUDA toolkit
# requires, and adds the compat libraries to LD_LIBRARY_PATH if needed.
#
# This file is meant to be sourced by the container entrypoint so that the
# exported LD_LIBRARY_PATH is visible to the server process started afterwards.

# _verlt A B: succeeds (exit 0) only when version A sorts strictly before
# version B under `sort -V`; equal versions fail.
_verlt() {
    [ "$1" = "$2" ] && return 1
    # printf instead of `echo -e` so the version strings are never interpreted
    # for escape sequences.
    [ "$1" = "$(printf '%s\n%s\n' "$1" "$2" | sort -V | head -n1)" ]
}

if [ -f /usr/local/cuda/compat/libcuda.so.1 ]; then
    # The symlink target looks like libcuda.so.<driver version>; keep everything
    # after "libcuda.so.".
    CUDA_COMPAT_MAX_DRIVER_VERSION=$(readlink /usr/local/cuda/compat/libcuda.so.1 | cut -d'.' -f 3-)
    # Empty when no NVIDIA kernel module is loaded (e.g. CPU instance).
    NVIDIA_DRIVER_VERSION=$(sed -n 's/^NVRM.*Kernel Module *\([0-9.]*\).*$/\1/p' /proc/driver/nvidia/version 2>/dev/null || true)
    if [ -n "$NVIDIA_DRIVER_VERSION" ] && [ -n "$CUDA_COMPAT_MAX_DRIVER_VERSION" ] \
        && _verlt "$NVIDIA_DRIVER_VERSION" "$CUDA_COMPAT_MAX_DRIVER_VERSION"; then
        echo "CUDA compat: driver ${NVIDIA_DRIVER_VERSION} < ${CUDA_COMPAT_MAX_DRIVER_VERSION}, adding compat libs"
        # Append the previous value only when it is non-empty: a trailing ':'
        # would add an empty entry, which the dynamic loader resolves to the
        # current working directory.
        export LD_LIBRARY_PATH="/usr/local/cuda/compat${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
    fi
fi
|
package/templates/code/serve
CHANGED
|
@@ -2,6 +2,9 @@
|
|
|
2
2
|
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
|
|
5
|
+
# CUDA compatibility setup (required for newer SageMaker inference AMIs)
|
|
6
|
+
source /usr/bin/cuda_compat.sh 2>/dev/null || true
|
|
7
|
+
|
|
5
8
|
<% if (modelServer === 'vllm') { %>
|
|
6
9
|
echo "Starting vLLM server"
|
|
7
10
|
<% } else if (modelServer === 'sglang') { %>
|
|
@@ -2,6 +2,9 @@
|
|
|
2
2
|
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
|
|
5
|
+
# CUDA compatibility setup (required for newer SageMaker inference AMIs)
|
|
6
|
+
source /usr/bin/cuda_compat.sh 2>/dev/null || true
|
|
7
|
+
|
|
5
8
|
set -e
|
|
6
9
|
|
|
7
10
|
echo "Starting TensorRT-LLM server on port 8081..."
|
|
@@ -59,8 +59,9 @@ RUN apt-get update && apt-get install -y --no-install-recommends nginx \
|
|
|
59
59
|
COPY nginx-diffusors.conf /etc/nginx/nginx.conf
|
|
60
60
|
|
|
61
61
|
# Copy serve entrypoint and startup scripts
|
|
62
|
+
COPY code/cuda_compat.sh /usr/bin/cuda_compat.sh
|
|
62
63
|
COPY code/serve /usr/bin/serve
|
|
63
|
-
RUN chmod 777 /usr/bin/serve
|
|
64
|
+
RUN chmod 777 /usr/bin/serve /usr/bin/cuda_compat.sh
|
|
64
65
|
|
|
65
66
|
COPY code/start_server.sh /usr/bin/start_server.sh
|
|
66
67
|
RUN chmod +x /usr/bin/start_server.sh
|
|
@@ -2,6 +2,9 @@
|
|
|
2
2
|
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
|
|
5
|
+
# CUDA compatibility setup (required for newer SageMaker inference AMIs)
|
|
6
|
+
source /usr/bin/cuda_compat.sh 2>/dev/null || true
|
|
7
|
+
|
|
5
8
|
echo "Starting vLLM-Omni server (diffusion model serving)"
|
|
6
9
|
|
|
7
10
|
# Resolve model URI prefixes that engines cannot handle natively.
|