@aws/ml-container-creator 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -18,6 +18,7 @@ import {
18
18
  modelLoadStrategyPrompts,
19
19
  modelProfilePrompts,
20
20
  modulePrompts,
21
+ benchmarkPrompts,
21
22
  infraRegionAndTargetPrompts,
22
23
  infraInstancePrompts,
23
24
  infraAsyncPrompts,
@@ -252,6 +253,29 @@ export default class PromptRunner {
252
253
  this._autoGpuCount = tpRec.gpuCount;
253
254
  console.log(` ✓ Auto-set tensor parallelism: TP=${tpRec.tensorParallelism} (${tpRec.gpuCount} GPUs)`);
254
255
  }
256
+
257
+ // Display capacity type confirmation for selected instance
258
+ // Requirements: 5.4
259
+ if (matchingRec && matchingRec.capacityType) {
260
+ if (matchingRec.capacityType === 'reserved') {
261
+ const resType = matchingRec.reservationType === 'capacity-block' ? 'Capacity Block' : 'ODCR';
262
+ const endInfo = matchingRec.reservationType === 'capacity-block' && matchingRec.reservationInfo?.endDate
263
+ ? `, ends ${new Date(matchingRec.reservationInfo.endDate).toLocaleDateString()}`
264
+ : '';
265
+ console.log(` ✓ Using reserved capacity — ${resType} (reservation ${matchingRec.reservationInfo?.reservationId || 'unknown'}${endInfo})`);
266
+ } else if (matchingRec.capacityType === 'ftp') {
267
+ console.log(` ✓ Using reserved capacity (plan ${matchingRec.ftpInfo?.planName || 'unknown'})`);
268
+ } else {
269
+ const headroom = matchingRec.quotaHeadroom;
270
+ console.log(` ✓ Using on-demand capacity (quota headroom: ${headroom ?? 'unknown'})`);
271
+ }
272
+ }
273
+
274
+ // Extract reservation ARN from selected instance for deployment config
275
+ // Requirements: 2.3
276
+ if (matchingRec && matchingRec.capacityType === 'reserved' && matchingRec.reservationInfo?.reservationArn) {
277
+ this._selectedCapacityReservationArn = matchingRec.reservationInfo.reservationArn;
278
+ }
255
279
  }
256
280
 
257
281
  // 3c. Async-specific prompts (only when deploymentTarget === 'async-inference')
@@ -375,6 +399,21 @@ export default class PromptRunner {
375
399
  moduleAnswers.includeSampleModel = false;
376
400
  }
377
401
 
402
+ // Benchmark prompts — derive includeBenchmark from testTypes selection or CLI flag
403
+ // Requirements: 1.1, 1.2
404
+ let benchmarkAnswers = {};
405
+ if (frameworkAnswers.architecture === 'transformers' || frameworkAnswers.architecture === 'diffusors') {
406
+ const testTypes = moduleAnswers.testTypes || [];
407
+ const includeBenchmark = testTypes.includes('sagemaker-ai-automated-benchmarking') ||
408
+ explicitConfig.includeBenchmark === true ||
409
+ explicitConfig.includeBenchmark === 'true';
410
+ benchmarkAnswers.includeBenchmark = includeBenchmark;
411
+ if (includeBenchmark) {
412
+ const subAnswers = await this._runPhase(benchmarkPrompts, { ...frameworkAnswers, ...moduleAnswers, includeBenchmark }, explicitConfig, existingConfig);
413
+ benchmarkAnswers = { ...benchmarkAnswers, ...subAnswers };
414
+ }
415
+ }
416
+
378
417
  // Validate instance type against framework requirements (now that framework version is known)
379
418
  const finalInstanceType = infraAnswers.customInstanceType || infraAnswers.instanceType;
380
419
  if (finalInstanceType && frameworkVersionAnswers.frameworkVersion) {
@@ -416,6 +455,7 @@ export default class PromptRunner {
416
455
  ...hfTokenAnswers,
417
456
  ...ngcApiKeyAnswers,
418
457
  ...moduleAnswers,
458
+ ...benchmarkAnswers,
419
459
  ...projectAnswers,
420
460
  ...destinationAnswers,
421
461
  buildTimestamp
@@ -435,6 +475,12 @@ export default class PromptRunner {
435
475
  combinedAnswers.artifactUri = this._mcpArtifactUri;
436
476
  }
437
477
 
478
+ // Flow capacity reservation ARN from instance-sizer selection
479
+ // Requirements: 2.3
480
+ if (this._selectedCapacityReservationArn) {
481
+ combinedAnswers.capacityReservationArn = this._selectedCapacityReservationArn;
482
+ }
483
+
438
484
  // Validate: non-HF model sources require an artifact URI
439
485
  // Without it, the serve script can't download the model at runtime
440
486
  // Infer modelSource from model name prefix if not set by MCP
@@ -1036,13 +1082,53 @@ export default class PromptRunner {
1036
1082
  : '';
1037
1083
 
1038
1084
  console.log(` ✓ ${choices.length} compatible instance(s) found${vramInfo}`);
1039
- // Display compact recommendation table
1040
- for (const rec of recommendations) {
1041
- const tp = rec.tensorParallelism > 1 ? ` TP=${rec.tensorParallelism}` : '';
1042
- const vram = rec.totalVramGb ? `${rec.totalVramGb}GB` : '?';
1043
- const util = rec.utilizationPercent ? `${rec.utilizationPercent}%` : '?';
1044
- console.log(` ${rec === topRec ? '→' : ' '} ${rec.instanceType.padEnd(20)} ${vram.padStart(5)} VRAM ${util.padStart(4)} util${tp}`);
1085
+
1086
+ // Check if availability data is present (recommendations have capacityType)
1087
+ const hasAvailabilityData = recommendations.some(r => r.capacityType);
1088
+
1089
+ if (hasAvailabilityData) {
1090
+ // Group by capacityType for display
1091
+ const reserved = recommendations.filter(r => r.capacityType === 'reserved' || r.capacityType === 'ftp');
1092
+ const onDemand = recommendations.filter(r => r.capacityType === 'on-demand');
1093
+
1094
+ if (reserved.length > 0) {
1095
+ console.log(' ── Reserved Capacity ──');
1096
+ for (const rec of reserved) {
1097
+ const tp = rec.tensorParallelism > 1 ? ` TP=${rec.tensorParallelism}` : '';
1098
+ const vram = rec.totalVramGb ? `${rec.totalVramGb}GB` : '?';
1099
+ const util = rec.utilizationPercent ? `${rec.utilizationPercent}%` : '?';
1100
+ const tag = rec.capacityType === 'reserved'
1101
+ ? ` [CR] ${rec.reservationInfo?.planName || rec.reservationInfo?.reservationId || ''}`
1102
+ : ` [FTP] ${rec.ftpInfo?.planName || ''}`;
1103
+ console.log(` ${rec === topRec ? '→' : ' '} ${rec.instanceType.padEnd(20)} ${vram.padStart(5)} VRAM ${util.padStart(4)} util${tp}${tag}`);
1104
+ }
1105
+ }
1106
+
1107
+ if (onDemand.length > 0) {
1108
+ console.log(' ── On-Demand ──');
1109
+ for (const rec of onDemand) {
1110
+ const tp = rec.tensorParallelism > 1 ? ` TP=${rec.tensorParallelism}` : '';
1111
+ const vram = rec.totalVramGb ? `${rec.totalVramGb}GB` : '?';
1112
+ const util = rec.utilizationPercent ? `${rec.utilizationPercent}%` : '?';
1113
+ const deployed = rec.quotaDeployed;
1114
+ const quota = rec.quotaLimit;
1115
+ const tag = quota !== null && quota !== undefined ? ` [Q:${deployed ?? 0}/${quota}]` : '';
1116
+ console.log(` ${rec === topRec ? '→' : ' '} ${rec.instanceType.padEnd(20)} ${vram.padStart(5)} VRAM ${util.padStart(4)} util${tp}${tag}`);
1117
+ }
1118
+ }
1119
+ } else {
1120
+ // Fallback: display compact recommendation table (no availability data)
1121
+ for (const rec of recommendations) {
1122
+ const tp = rec.tensorParallelism > 1 ? ` TP=${rec.tensorParallelism}` : '';
1123
+ const vram = rec.totalVramGb ? `${rec.totalVramGb}GB` : '?';
1124
+ const util = rec.utilizationPercent ? `${rec.utilizationPercent}%` : '?';
1125
+ console.log(` ${rec === topRec ? '→' : ' '} ${rec.instanceType.padEnd(20)} ${vram.padStart(5)} VRAM ${util.padStart(4)} util${tp}`);
1126
+ }
1045
1127
  }
1128
+ } else if (parsed.metadata?.allFilteredByQuota) {
1129
+ // All VRAM-compatible instances had zero quota
1130
+ console.log(' ⚠️ No quota available for compatible instances. Request a quota increase.');
1131
+ this._instanceSizerMetadata = parsed.metadata || null;
1046
1132
  } else if (parsed.metadata?.warning) {
1047
1133
  console.log(` ⚠️ ${parsed.metadata.warning}`);
1048
1134
  } else {
@@ -1972,9 +2058,10 @@ export default class PromptRunner {
1972
2058
  '11.4': 'al2-ami-sagemaker-inference-gpu-2-1',
1973
2059
  '11.8': 'al2-ami-sagemaker-inference-gpu-2-1',
1974
2060
  '12.1': 'al2-ami-sagemaker-inference-gpu-3-1',
1975
- '12.2': 'al2023-ami-sagemaker-inference-gpu-4-1',
1976
- '12.4': 'al2023-ami-sagemaker-inference-gpu-4-1',
1977
- '12.6': 'al2023-ami-sagemaker-inference-gpu-4-1'
2061
+ '12.2': 'al2-ami-sagemaker-inference-gpu-3-1',
2062
+ '12.4': 'al2-ami-sagemaker-inference-gpu-3-1',
2063
+ '12.6': 'al2-ami-sagemaker-inference-gpu-3-1',
2064
+ '13.0': 'al2023-ami-sagemaker-inference-gpu-4-1'
1978
2065
  };
1979
2066
 
1980
2067
  /**
@@ -583,7 +583,7 @@ const modulePrompts = [
583
583
  type: 'confirm',
584
584
  name: 'includeSampleModel',
585
585
  message: 'Include sample Abalone classifier?',
586
- default: false,
586
+ default: true,
587
587
  when: (answers) => {
588
588
  const architecture = answers.architecture || answers.deploymentConfig?.split('-')[0];
589
589
  const backend = answers.backend || answers.deploymentConfig?.split('-').slice(1).join('-');
@@ -622,7 +622,10 @@ const modulePrompts = [
622
622
 
623
623
  // Transformers and Triton LLM backends only support hosted endpoint tests
624
624
  if (architecture === 'transformers') {
625
- return ['hosted-model-endpoint'];
625
+ return ['hosted-model-endpoint', 'sagemaker-ai-automated-benchmarking'];
626
+ }
627
+ if (architecture === 'diffusors') {
628
+ return ['hosted-model-endpoint', 'sagemaker-ai-automated-benchmarking'];
626
629
  }
627
630
  if (architecture === 'triton' && (backend === 'vllm' || backend === 'tensorrtllm')) {
628
631
  return ['hosted-model-endpoint'];
@@ -635,7 +638,10 @@ const modulePrompts = [
635
638
  const backend = answers.backend || answers.deploymentConfig?.split('-').slice(1).join('-');
636
639
 
637
640
  if (architecture === 'transformers') {
638
- return ['hosted-model-endpoint'];
641
+ return ['hosted-model-endpoint', 'sagemaker-ai-automated-benchmarking'];
642
+ }
643
+ if (architecture === 'diffusors') {
644
+ return ['hosted-model-endpoint', 'sagemaker-ai-automated-benchmarking'];
639
645
  }
640
646
  if (architecture === 'triton' && (backend === 'vllm' || backend === 'tensorrtllm')) {
641
647
  return ['hosted-model-endpoint'];
@@ -700,7 +706,12 @@ const infraInstancePrompts = [
700
706
  when: answers => answers.deploymentTarget === 'realtime-inference' || answers.deploymentTarget === 'async-inference' || answers.deploymentTarget === 'batch-transform' || answers.deploymentTarget === 'hyperpod-eks',
701
707
  message: (answers) => {
702
708
  const framework = answers.framework || answers.deploymentConfig?.split('-')[0];
703
-
709
+
710
+ // Skip table when MCP sizer already displayed annotated results
711
+ if (answers._mcpInstanceChoices && answers._mcpInstanceChoices.length > 0) {
712
+ return 'Select instance type:';
713
+ }
714
+
704
715
  const table = new Table({
705
716
  head: [
706
717
  chalk.cyan('Instance Type'),
@@ -1110,6 +1121,56 @@ const baseImagePrompts = [
1110
1121
  }
1111
1122
  ];
1112
1123
 
1124
/**
 * Benchmark prompts for SageMaker AI Benchmarking (NVIDIA AIPerf)
 * Sub-prompts shown when 'sagemaker-ai-automated-benchmarking' is selected in testTypes.
 * Requirements: 2.1, 2.2, 2.3, 2.4, 2.5
 *
 * Every prompt is gated on `answers.includeBenchmark === true` (strict boolean, so the
 * string 'true' does not enable them). The two free-text prompts carry a `validate`
 * callback so that an invalid entry is reported while the user can still retype it:
 *   - benchmarkRequestCount: empty, or a positive whole number
 *   - benchmarkS3OutputPath: blank, or a value starting with "s3://"
 */
const benchmarkPrompts = [
  {
    type: 'number',
    name: 'benchmarkConcurrency',
    message: 'Concurrent requests for benchmark:',
    default: 10,
    when: (answers) => answers.includeBenchmark === true
  },
  {
    type: 'number',
    name: 'benchmarkInputTokensMean',
    message: 'Mean input tokens per request:',
    default: 550,
    when: (answers) => answers.includeBenchmark === true
  },
  {
    type: 'number',
    name: 'benchmarkOutputTokensMean',
    message: 'Mean output tokens per request:',
    default: 150,
    when: (answers) => answers.includeBenchmark === true
  },
  {
    type: 'confirm',
    name: 'benchmarkStreaming',
    message: 'Enable streaming for benchmark?',
    default: true,
    when: (answers) => answers.includeBenchmark === true
  },
  {
    type: 'input',
    name: 'benchmarkRequestCount',
    message: 'Total request count (leave empty for service default):',
    default: '',
    when: (answers) => answers.includeBenchmark === true,
    // Empty keeps the service default; anything else must be a plain positive integer.
    validate: (value) => {
      const text = String(value ?? '');
      if (text === '' || /^[1-9]\d*$/.test(text)) {
        return true;
      }
      return 'Enter a positive whole number, or leave empty for the service default.';
    }
  },
  {
    type: 'input',
    name: 'benchmarkS3OutputPath',
    message: 'Benchmark results S3 path (leave empty for auto-created bucket):',
    default: '',
    when: (answers) => answers.includeBenchmark === true,
    // Blank means "auto-create a bucket"; otherwise require the s3:// scheme.
    validate: (value) => {
      const text = String(value ?? '');
      if (text.trim() === '' || text.startsWith('s3://')) {
        return true;
      }
      return 'Path must start with "s3://". Example: s3://my-bucket/benchmark-results/';
    }
  }
];
1173
+
1113
1174
  export {
1114
1175
  deploymentConfigPrompts,
1115
1176
  frameworkPrompts, // Deprecated: kept for backward compatibility
@@ -1123,6 +1184,7 @@ export {
1123
1184
  hfTokenPrompts,
1124
1185
  ngcApiKeyPrompts,
1125
1186
  modulePrompts,
1187
+ benchmarkPrompts,
1126
1188
  infrastructurePrompts,
1127
1189
  infraRegionAndTargetPrompts,
1128
1190
  infraInstancePrompts,
@@ -188,6 +188,37 @@ export function loadServiceModel(serviceName, registryPath) {
188
188
  return readFileSync(modelPath, 'utf8');
189
189
  }
190
190
 
191
/**
 * Report whether the stored SageMaker service model defines AI benchmark jobs.
 *
 * The model text is loaded via `loadServiceModel('sagemaker', ...)` and parsed as JSON.
 * Benchmarking counts as available when the model has either a
 * `CreateAIBenchmarkJob` operation or a `CreateAIBenchmarkJobRequest` shape.
 *
 * @param {string} [registryPath] - Registry path to use instead of `getRegistryPath()`
 * @returns {{ available: boolean, reason?: string }} `reason` is set only when
 *   `available` is false (model missing, unparsable, or lacking the benchmark entries)
 */
export function hasBenchmarkShape(registryPath) {
  const rawModel = loadServiceModel('sagemaker', registryPath || getRegistryPath());
  if (!rawModel) {
    return { available: false, reason: 'SageMaker service model not found in registry' };
  }

  try {
    const parsed = JSON.parse(rawModel);
    // Either the operation entry or its request shape is enough.
    const benchmarkDefined = Boolean(
      parsed.operations?.CreateAIBenchmarkJob || parsed.shapes?.CreateAIBenchmarkJobRequest
    );
    return benchmarkDefined
      ? { available: true }
      : { available: false, reason: 'service model does not include AI Benchmark operations' };
  } catch {
    // Covers both invalid JSON and a parsed value that is not an object (e.g. null).
    return { available: false, reason: 'Failed to parse SageMaker service model' };
  }
}
221
+
191
222
  /**
192
223
  * Store a service model in the registry.
193
224
  * @param {string} serviceName - Service name (e.g., 'sagemaker')
@@ -65,7 +65,7 @@ export default class TemplateManager {
65
65
  ],
66
66
  buildTargets: ['codebuild'],
67
67
  deploymentTargets: ['realtime-inference', 'async-inference', 'batch-transform', 'hyperpod-eks'],
68
- testTypes: ['local-model-cli', 'local-model-server', 'hosted-model-endpoint'],
68
+ testTypes: ['local-model-cli', 'local-model-server', 'hosted-model-endpoint', 'sagemaker-ai-automated-benchmarking'],
69
69
  awsRegions: [
70
70
  'us-east-1', 'us-east-2', 'us-west-1', 'us-west-2',
71
71
  'eu-west-1', 'eu-west-2', 'eu-central-1', 'eu-north-1',
@@ -134,6 +134,9 @@ export default class TemplateManager {
134
134
 
135
135
  // Validate batch transform specific fields
136
136
  this._validateBatchTransformConfig();
137
+
138
+ // Validate benchmark specific fields
139
+ this._validateBenchmarkConfig();
137
140
 
138
141
  // Validate instance type format (ml.*.*) - only for realtime-inference
139
142
  if (this.answers.instanceType && this.answers.instanceType !== 'custom') {
@@ -297,6 +300,51 @@ export default class TemplateManager {
297
300
  }
298
301
  }
299
302
 
303
+ /**
304
+ * Validates benchmark configuration parameters
305
+ * @private
306
+ * @throws {Error} If benchmark configuration is invalid
307
+ */
308
+ _validateBenchmarkConfig() {
309
+ if (!this.answers.includeBenchmark) return;
310
+
311
+ // Gate to supported architectures
312
+ const dc = this.answers.deploymentConfig;
313
+ const arch = dc ? dc.split('-')[0] : this.answers.architecture;
314
+ if (arch !== 'transformers' && arch !== 'diffusors') {
315
+ throw new Error('⚠️ Benchmarking is only supported with transformers and diffusors architectures.');
316
+ }
317
+
318
+ // Gate to supported deployment targets
319
+ if (this.answers.deploymentTarget === 'hyperpod-eks') {
320
+ throw new Error('⚠️ Benchmarking is only supported with managed-inference, async-inference, and batch-transform deployment targets');
321
+ }
322
+
323
+ // Validate numeric parameters
324
+ if (this.answers.benchmarkConcurrency !== undefined) {
325
+ if (!Number.isInteger(this.answers.benchmarkConcurrency) || this.answers.benchmarkConcurrency < 1) {
326
+ throw new Error('⚠️ benchmarkConcurrency must be an integer >= 1');
327
+ }
328
+ }
329
+ if (this.answers.benchmarkInputTokensMean !== undefined) {
330
+ if (!Number.isInteger(this.answers.benchmarkInputTokensMean) || this.answers.benchmarkInputTokensMean < 1) {
331
+ throw new Error('⚠️ benchmarkInputTokensMean must be an integer >= 1');
332
+ }
333
+ }
334
+ if (this.answers.benchmarkOutputTokensMean !== undefined) {
335
+ if (!Number.isInteger(this.answers.benchmarkOutputTokensMean) || this.answers.benchmarkOutputTokensMean < 1) {
336
+ throw new Error('⚠️ benchmarkOutputTokensMean must be an integer >= 1');
337
+ }
338
+ }
339
+
340
+ // Validate S3 path format
341
+ if (this.answers.benchmarkS3OutputPath && this.answers.benchmarkS3OutputPath.trim() !== '') {
342
+ if (!this.answers.benchmarkS3OutputPath.startsWith('s3://')) {
343
+ throw new Error('⚠️ benchmarkS3OutputPath must start with "s3://". Example: s3://my-bucket/benchmark-results/');
344
+ }
345
+ }
346
+ }
347
+
300
348
  /**
301
349
  * Validates GPU instance type requirement for GPU-requiring backends.
302
350
  * Called when deploymentConfig is present.
@@ -21,7 +21,7 @@ import SchemaValidationEngine from './schema-validation-engine.js';
21
21
  import ServiceModelParser from './service-model-parser.js';
22
22
  import CrossCuttingChecker from './cross-cutting-checker.js';
23
23
  import HuggingFaceClient from './huggingface-client.js';
24
- import { getRegistryPath, loadManifest } from './schema-sync.js';
24
+ import { getRegistryPath, loadManifest, hasBenchmarkShape } from './schema-sync.js';
25
25
 
26
26
  const __filename = fileURLToPath(import.meta.url);
27
27
  const __dirname = path.dirname(__filename);
@@ -52,6 +52,115 @@ export function parseDoConfig(configPath) {
52
52
  return config;
53
53
  }
54
54
 
55
/**
 * True when a do/config value was actually supplied
 * (i.e. it is not null, undefined, or the empty string).
 *
 * @param {*} value - Raw config value
 * @returns {boolean}
 */
function isBenchmarkValueSet(value) {
  return value !== null && value !== undefined && value !== '';
}

/**
 * True when the text contains a shell command substitution `$(` or a
 * variable expansion `${`, meaning its final value is not known statically.
 *
 * @param {string} text
 * @returns {boolean}
 */
function hasShellExpansion(text) {
  return text.includes('$(') || text.includes('${');
}

/**
 * Build an error finding when a raw value does not convert (via `Number`) to an integer >= 1.
 *
 * @param {*} rawValue - Value as read from config
 * @param {string} configKey - Config key name, used in the remediation hint
 * @param {string} operation - Service operation the field belongs to
 * @param {string} fieldPath - Field path within that operation
 * @returns {Object|null} A finding, or null when the value is a valid positive integer
 */
function positiveIntegerFinding(rawValue, configKey, operation, fieldPath) {
  const numeric = Number(rawValue);
  if (Number.isInteger(numeric) && numeric >= 1) {
    return null;
  }
  return {
    severity: 'error',
    operation,
    fieldPath,
    constraint: 'integer >= 1',
    invalidValue: rawValue,
    remediationHint: `${configKey} must be a positive integer (>= 1)`
  };
}

/**
 * Validate benchmark parameters against service model constraints.
 * Called when the CreateAIBenchmarkJob shape is available in the synced schema.
 *
 * Validates (each key only when it is set to a non-empty value):
 *   - BENCHMARK_CONCURRENCY: integer, min 1
 *   - BENCHMARK_S3_OUTPUT_PATH: starts with s3://
 *   - BENCHMARK_JOB_NAME: pattern ^[a-zA-Z0-9](-*[a-zA-Z0-9])*$, max 63 chars
 *   - BENCHMARK_INPUT_TOKENS_MEAN / BENCHMARK_OUTPUT_TOKENS_MEAN: integer, min 1
 *
 * The S3 path and job name are skipped when they contain a shell expansion
 * (`$(...)` or `${...}`), because their final value cannot be checked here.
 * Both are coerced to strings first so a non-string config value does not throw.
 *
 * Requirements: 8.1, 8.2, 8.3
 *
 * @param {Object} config - Parsed do/config values
 * @returns {Array<Object>} Array of validation findings (empty when everything is valid)
 */
export function validateBenchmarkParams(config) {
  const findings = [];

  // Validate Concurrency (integer, min 1)
  if (isBenchmarkValueSet(config.BENCHMARK_CONCURRENCY)) {
    const finding = positiveIntegerFinding(
      config.BENCHMARK_CONCURRENCY,
      'BENCHMARK_CONCURRENCY',
      'CreateAIBenchmarkJob',
      'Concurrency'
    );
    if (finding) findings.push(finding);
  }

  // Validate S3OutputLocation (string, starts with s3://)
  if (isBenchmarkValueSet(config.BENCHMARK_S3_OUTPUT_PATH)) {
    const s3Path = String(config.BENCHMARK_S3_OUTPUT_PATH);
    if (!hasShellExpansion(s3Path) && !s3Path.startsWith('s3://')) {
      findings.push({
        severity: 'error',
        operation: 'CreateAIBenchmarkJob',
        fieldPath: 'OutputConfig.S3OutputLocation',
        constraint: 'must start with s3://',
        invalidValue: config.BENCHMARK_S3_OUTPUT_PATH,
        remediationHint: 'BENCHMARK_S3_OUTPUT_PATH must start with "s3://". Example: s3://my-bucket/benchmark-results/'
      });
    }
  }

  // Validate AIBenchmarkJobName (pattern + max 63 chars); length is reported first
  if (isBenchmarkValueSet(config.BENCHMARK_JOB_NAME)) {
    const jobName = String(config.BENCHMARK_JOB_NAME);
    if (!hasShellExpansion(jobName)) {
      const namePattern = /^[a-zA-Z0-9](-*[a-zA-Z0-9])*$/;
      if (jobName.length > 63) {
        findings.push({
          severity: 'error',
          operation: 'CreateAIBenchmarkJob',
          fieldPath: 'AIBenchmarkJobName',
          constraint: 'max 63 characters',
          invalidValue: config.BENCHMARK_JOB_NAME,
          remediationHint: 'AIBenchmarkJobName must be at most 63 characters'
        });
      } else if (!namePattern.test(jobName)) {
        findings.push({
          severity: 'error',
          operation: 'CreateAIBenchmarkJob',
          fieldPath: 'AIBenchmarkJobName',
          constraint: 'pattern: ^[a-zA-Z0-9](-*[a-zA-Z0-9])*',
          invalidValue: config.BENCHMARK_JOB_NAME,
          remediationHint: 'AIBenchmarkJobName must start with alphanumeric and contain only alphanumeric characters and hyphens'
        });
      }
    }
  }

  // Validate input tokens mean (integer, min 1)
  if (isBenchmarkValueSet(config.BENCHMARK_INPUT_TOKENS_MEAN)) {
    const finding = positiveIntegerFinding(
      config.BENCHMARK_INPUT_TOKENS_MEAN,
      'BENCHMARK_INPUT_TOKENS_MEAN',
      'CreateAIWorkloadConfig',
      'WorkloadSpec.parameters.prompt_input_tokens_mean'
    );
    if (finding) findings.push(finding);
  }

  // Validate output tokens mean (integer, min 1)
  if (isBenchmarkValueSet(config.BENCHMARK_OUTPUT_TOKENS_MEAN)) {
    const finding = positiveIntegerFinding(
      config.BENCHMARK_OUTPUT_TOKENS_MEAN,
      'BENCHMARK_OUTPUT_TOKENS_MEAN',
      'CreateAIWorkloadConfig',
      'WorkloadSpec.parameters.output_tokens_mean'
    );
    if (finding) findings.push(finding);
  }

  return findings;
}
163
+
55
164
  /**
56
165
  * Run the full validation pipeline.
57
166
  *
@@ -171,6 +280,20 @@ export async function run(options = {}) {
171
280
  }
172
281
  }
173
282
 
283
+ // Run benchmark parameter validation (Requirements 8.1, 8.2, 8.3)
284
+ if (config.BENCHMARK_CONCURRENCY || config.BENCHMARK_INPUT_TOKENS_MEAN ||
285
+ config.BENCHMARK_OUTPUT_TOKENS_MEAN || config.BENCHMARK_S3_OUTPUT_PATH) {
286
+ const benchmarkCheck = hasBenchmarkShape(registryPath);
287
+ if (benchmarkCheck.available) {
288
+ const benchmarkFindings = validateBenchmarkParams(config);
289
+ for (const finding of benchmarkFindings) {
290
+ report.addFinding(finding);
291
+ }
292
+ } else {
293
+ console.log('⚠️ Benchmark validation skipped: service model does not include AI Benchmark operations. Run `bootstrap sync-schemas` to update.');
294
+ }
295
+ }
296
+
174
297
  const summary = report.getSummary();
175
298
 
176
299
  // Load manifest for version info
@@ -213,4 +336,4 @@ export async function run(options = {}) {
213
336
  return exitCode;
214
337
  }
215
338
 
216
- export default { run, parseDoConfig };
339
+ export default { run, parseDoConfig, validateBenchmarkParams };
@@ -12,6 +12,9 @@
12
12
  <% if (framework !== 'transformers') { %>
13
13
  FROM <%= baseImage || 'public.ecr.aws/docker/library/python:3.12-slim' %>
14
14
 
15
+ # Ensure Python output is unbuffered so SageMaker can capture logs in CloudWatch
16
+ ENV PYTHONUNBUFFERED=1
17
+
15
18
  # Set a docker label to name this project, postpended with the build time
16
19
  LABEL project.name="<%= projectName %>-<%= buildTimestamp %>" \
17
20
  project.base-name="<%= projectName %>" \
@@ -143,6 +146,9 @@ ARG BASE_IMAGE=<%= baseImage || 'deepjavalibrary/djl-serving:0.36.0-pytorch-gpu'
143
146
 
144
147
  FROM ${BASE_IMAGE}
145
148
 
149
+ # Ensure Python output is unbuffered so SageMaker can capture logs in CloudWatch
150
+ ENV PYTHONUNBUFFERED=1
151
+
146
152
  <% if (comments && comments.chatTemplate) { %>
147
153
  <%= comments.chatTemplate %>
148
154
  <% } %>
@@ -271,8 +277,9 @@ COPY code/serve /usr/bin/serve_trtllm
271
277
  RUN chmod +x /usr/bin/serve_trtllm
272
278
 
273
279
  # Copy startup script
280
+ COPY code/cuda_compat.sh /usr/bin/cuda_compat.sh
274
281
  COPY code/start_server.sh /usr/bin/start_server.sh
275
- RUN chmod +x /usr/bin/start_server.sh
282
+ RUN chmod +x /usr/bin/start_server.sh /usr/bin/cuda_compat.sh
276
283
 
277
284
  ENTRYPOINT [ "/usr/bin/start_server.sh" ]
278
285
  <% } else if (modelServer === 'lmi' || modelServer === 'djl') { %>
@@ -287,8 +294,9 @@ COPY code/serving.properties /opt/ml/model/serving.properties
287
294
  # LMI/DJL containers use their own entrypoint
288
295
  # The container will automatically start DJL Serving with the configuration
289
296
  <% } else { %>
297
+ COPY code/cuda_compat.sh /usr/bin/cuda_compat.sh
290
298
  COPY code/serve /usr/bin/serve
291
- RUN chmod 777 /usr/bin/serve
299
+ RUN chmod 777 /usr/bin/serve /usr/bin/cuda_compat.sh
292
300
 
293
301
  <% if (comments && comments.troubleshooting) { %>
294
302
  <%= comments.troubleshooting %>
@@ -0,0 +1,22 @@
1
#!/bin/bash
# CUDA Compatibility Setup
# Required for SageMaker inference AMIs using NVIDIA Container Toolkit 1.17.4+
# (al2-ami-sagemaker-inference-gpu-2-1, al2-ami-sagemaker-inference-gpu-3-1,
# al2023-ami-sagemaker-inference-gpu-4-1)
#
# These AMIs no longer auto-mount CUDA compat libraries. This script detects
# whether the host NVIDIA driver is older than what the container's CUDA toolkit
# requires, and adds the compat libraries to LD_LIBRARY_PATH if needed.
#
# Intended to be sourced (not executed) so the exported LD_LIBRARY_PATH reaches
# the calling start-up script.

# _verlt A B: succeeds when version A is strictly lower than version B,
# using `sort -V` for the comparison. Equal versions are not "lower".
_verlt() {
    [ "$1" != "$2" ] && [ "$1" = "$(printf '%s\n%s\n' "$1" "$2" | sort -V | head -n1)" ]
}

if [ -f /usr/local/cuda/compat/libcuda.so.1 ]; then
    # The symlink target's name carries the version after the second dot
    # (fields 3+ of the dot-separated name).
    CUDA_COMPAT_MAX_DRIVER_VERSION=$(readlink /usr/local/cuda/compat/libcuda.so.1 | cut -d'.' -f 3-)
    # Empty when the driver version file is missing or has no matching NVRM line.
    NVIDIA_DRIVER_VERSION=$(sed -n 's/^NVRM.*Kernel Module *\([0-9.]*\).*$/\1/p' /proc/driver/nvidia/version 2>/dev/null || true)
    if [ -n "$NVIDIA_DRIVER_VERSION" ] && _verlt "$NVIDIA_DRIVER_VERSION" "$CUDA_COMPAT_MAX_DRIVER_VERSION"; then
        echo "CUDA compat: driver ${NVIDIA_DRIVER_VERSION} < ${CUDA_COMPAT_MAX_DRIVER_VERSION}, adding compat libs"
        # Append the previous value only when it is non-empty: a trailing ':' would add
        # an empty entry, which the dynamic loader treats as the current directory.
        export LD_LIBRARY_PATH="/usr/local/cuda/compat${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
    fi
fi
@@ -2,6 +2,9 @@
2
2
  # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
3
3
  # SPDX-License-Identifier: Apache-2.0
4
4
 
5
+ # CUDA compatibility setup (required for newer SageMaker inference AMIs)
6
+ source /usr/bin/cuda_compat.sh 2>/dev/null || true
7
+
5
8
  <% if (modelServer === 'vllm') { %>
6
9
  echo "Starting vLLM server"
7
10
  <% } else if (modelServer === 'sglang') { %>
@@ -2,6 +2,9 @@
2
2
  # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
3
3
  # SPDX-License-Identifier: Apache-2.0
4
4
 
5
+ # CUDA compatibility setup (required for newer SageMaker inference AMIs)
6
+ source /usr/bin/cuda_compat.sh 2>/dev/null || true
7
+
5
8
  set -e
6
9
 
7
10
  echo "Starting TensorRT-LLM server on port 8081..."
@@ -59,8 +59,9 @@ RUN apt-get update && apt-get install -y --no-install-recommends nginx \
59
59
  COPY nginx-diffusors.conf /etc/nginx/nginx.conf
60
60
 
61
61
  # Copy serve entrypoint and startup scripts
62
+ COPY code/cuda_compat.sh /usr/bin/cuda_compat.sh
62
63
  COPY code/serve /usr/bin/serve
63
- RUN chmod 777 /usr/bin/serve
64
+ RUN chmod 777 /usr/bin/serve /usr/bin/cuda_compat.sh
64
65
 
65
66
  COPY code/start_server.sh /usr/bin/start_server.sh
66
67
  RUN chmod +x /usr/bin/start_server.sh
@@ -2,6 +2,9 @@
2
2
  # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
3
3
  # SPDX-License-Identifier: Apache-2.0
4
4
 
5
+ # CUDA compatibility setup (required for newer SageMaker inference AMIs)
6
+ source /usr/bin/cuda_compat.sh 2>/dev/null || true
7
+
5
8
  echo "Starting vLLM-Omni server (diffusion model serving)"
6
9
 
7
10
  # Resolve model URI prefixes that engines cannot handle natively.