@aws/ml-container-creator 0.13.5 → 0.15.1

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (37)
  1. package/config/parameter-schema-v2.json +33 -5
  2. package/infra/ci-harness/lib/ci-harness-stack.ts +13 -5
  3. package/infra/ci-harness/package-lock.json +121 -111
  4. package/infra/ci-harness/package.json +1 -1
  5. package/package.json +2 -2
  6. package/servers/endpoint-picker/index.js +23 -14
  7. package/servers/instance-sizer/index.js +72 -4
  8. package/servers/instance-sizer/lib/model-resolver.js +28 -2
  9. package/src/app.js +15 -0
  10. package/src/lib/config-loader.js +18 -0
  11. package/src/lib/config-manager.js +6 -1
  12. package/src/lib/dataset-slug.js +152 -0
  13. package/src/lib/generated/cli-options.js +9 -3
  14. package/src/lib/generated/parameter-matrix.js +15 -4
  15. package/src/lib/generated/validation-rules.js +1 -1
  16. package/src/lib/mcp-client.js +15 -1
  17. package/src/lib/mcp-query-runner.js +11 -1
  18. package/src/lib/prompt-runner.js +40 -20
  19. package/src/lib/prompts/feature-prompts.js +1 -1
  20. package/src/lib/template-manager.js +0 -7
  21. package/src/lib/template-variable-resolver.js +51 -1
  22. package/src/lib/tune-config-state.js +14 -1
  23. package/templates/do/.benchmark_writer.py +43 -0
  24. package/templates/do/.register_helper.py +1185 -0
  25. package/templates/do/.tune_helper.py +168 -2
  26. package/templates/do/__pycache__/.adapter_helper.cpython-312.pyc +0 -0
  27. package/templates/do/__pycache__/.benchmark_writer.cpython-312.pyc +0 -0
  28. package/templates/do/__pycache__/.register_helper.cpython-312.pyc +0 -0
  29. package/templates/do/__pycache__/.tune_helper.cpython-312.pyc +0 -0
  30. package/templates/do/adapter +319 -27
  31. package/templates/do/add-ic +85 -3
  32. package/templates/do/benchmark +28 -8
  33. package/templates/do/config +20 -0
  34. package/templates/do/lib/inference-component.sh +56 -3
  35. package/templates/do/register +557 -6
  36. package/templates/do/test +12 -2
  37. package/templates/do/tune +219 -6
@@ -314,13 +314,6 @@ export default class TemplateManager {
314
314
  _validateBenchmarkConfig() {
315
315
  if (!this.answers.includeBenchmark) return;
316
316
 
317
- // Gate to supported architectures
318
- const dc = this.answers.deploymentConfig;
319
- const arch = dc ? dc.split('-')[0] : this.answers.architecture;
320
- if (arch !== 'transformers' && arch !== 'diffusors') {
321
- throw new Error('⚠️ Benchmarking is only supported with transformers and diffusors architectures.');
322
- }
323
-
324
317
  // Gate to supported deployment targets
325
318
  if (this.answers.deploymentTarget === 'hyperpod-eks') {
326
319
  throw new Error('⚠️ Benchmarking is only supported with managed-inference, async-inference, and batch-transform deployment targets');
@@ -232,7 +232,7 @@ export async function _ensureTemplateVariables(answers, registryConfigManager =
232
232
  artifactUri: '',
233
233
  modelLoadStrategy: 'runtime',
234
234
  existingEndpointName: null,
235
- enableLora: false,
235
+ enableLora: true,
236
236
  maxLoras: 30,
237
237
  maxLoraRank: 64
238
238
  };
@@ -261,6 +261,20 @@ export async function _ensureTemplateVariables(answers, registryConfigManager =
261
261
  }
262
262
  }
263
263
 
264
+ // Always include benchmarking by default (AC-2.3 — enabled for all architectures).
265
+ // Only set when not explicitly provided by user (AC-2.4, AC-2.7 — respect explicit opt-out).
266
+ if (answers.includeBenchmark === undefined) {
267
+ answers.includeBenchmark = true;
268
+ }
269
+
270
+ // Enforce enableLora scoping: only LoRA-capable servers get enableLora=true
271
+ // (AC-2.1, NFR-2). All incompatible backends are forced to false.
272
+ const loraCapableServers = ['vllm', 'sglang', 'djl-lmi', 'lmi', 'djl'];
273
+ const resolvedBackend = answers.backend || answers.modelServer;
274
+ if (!loraCapableServers.includes(resolvedBackend)) {
275
+ answers.enableLora = false;
276
+ }
277
+
264
278
  // Merge catalog env vars into answers.envVars with correct precedence
265
279
  await _mergeEnvVarsWithPrecedence(answers, registryConfigManager);
266
280
 
@@ -445,6 +459,35 @@ export async function _ensureTemplateVariables(answers, registryConfigManager =
445
459
  }
446
460
  }
447
461
 
462
+ // Propagate max_model_len from instance-sizer context capping to env vars (AC-1.7).
463
+ // The instance-sizer sets sizerMaxModelLen when the model's full context doesn't fit
464
+ // on the recommended instance. Write as VLLM_MAX_MODEL_LEN or SGLANG_MAX_MODEL_LEN.
465
+ const _MAX_MODEL_LEN_ENGINE_MAP = {
466
+ 'vllm': 'VLLM_MAX_MODEL_LEN',
467
+ 'vllm-omni': 'VLLM_MAX_MODEL_LEN',
468
+ 'sglang': 'SGLANG_MAX_MODEL_LEN'
469
+ };
470
+
471
+ if (answers.sizerMaxModelLen) {
472
+ const maxLenEngine = answers.backend || answers.modelServer;
473
+ const maxLenEnvKey = maxLenEngine ? _MAX_MODEL_LEN_ENGINE_MAP[maxLenEngine] : null;
474
+ if (maxLenEnvKey) {
475
+ // Only set if user hasn't explicitly provided this env var
476
+ const userServerEnvVars = answers.serverEnvVars || {};
477
+ const userExplicitlySetMaxLen = (
478
+ userServerEnvVars['MAX_MODEL_LEN'] !== undefined ||
479
+ userServerEnvVars[maxLenEnvKey] !== undefined
480
+ );
481
+ if (!userExplicitlySetMaxLen && (!answers.envVars || !answers.envVars[maxLenEnvKey])) {
482
+ if (!answers.envVars) {
483
+ answers.envVars = {};
484
+ }
485
+ answers.envVars[maxLenEnvKey] = String(answers.sizerMaxModelLen);
486
+ console.log(` ℹ️ max_model_len: ${answers.sizerMaxModelLen} (context capped by instance-sizer)`);
487
+ }
488
+ }
489
+ }
490
+
448
491
  // Determine tune support based on model presence in the tune catalog.
449
492
  // Used by the do/config template to write TUNE_SUPPORTED=true|false.
450
493
  if (answers.tuneSupported === undefined) {
@@ -481,4 +524,11 @@ export async function _ensureTemplateVariables(answers, registryConfigManager =
481
524
  answers.tuneModelId = null;
482
525
  }
483
526
  }
527
+
528
+ // Propagate --ic-env KEY=VALUE pairs to icEnvVars for do/config template rendering.
529
+ // These are rendered as IC_ENV_* exports in do/config, which inference-component.sh
530
+ // reads at deploy time and passes as the Environment field in InferenceComponent.create().
531
+ if (!answers.icEnvVars) {
532
+ answers.icEnvVars = {};
533
+ }
484
534
  }
@@ -74,22 +74,35 @@ export function persistSubmissionState(configPath, { technique, trainingType, da
74
74
  * Simulate the config writes that happen after a job completes successfully.
75
75
  * This mirrors the behavior in do/tune's _handle_completion() function.
76
76
  *
77
+ * Writes three levels of tracking (AC-4.1, AC-4.2):
78
+ * - Level 1: TUNE_OUTPUT_PATH_LATEST (always the last run, any technique)
79
+ * - Level 2: TUNE_ADAPTER_PATH_<TECHNIQUE> (last run per technique)
80
+ * - Level 3: TUNE_ADAPTER_PATH_<TECHNIQUE>_<SLUG> (per technique + dataset slug)
81
+ *
77
82
  * @param {string} configPath - Path to the config file
78
83
  * @param {object} params - Completion parameters
79
84
  * @param {string} params.technique - Technique (sft, dpo, rlaif, rlvr)
80
85
  * @param {string} params.trainingType - Training type (lora, full-rank)
81
86
  * @param {string} params.artifactPath - S3 path to the output artifact
82
87
  * @param {string} params.outputType - Output type (adapter, full-model)
88
+ * @param {string} [params.datasetSlug] - Optional dataset slug for per-technique-per-dataset tracking
83
89
  */
84
- export function persistCompletionState(configPath, { technique, trainingType, artifactPath, outputType }) {
90
+ export function persistCompletionState(configPath, { technique, trainingType, artifactPath, outputType, datasetSlug }) {
85
91
  const techniqueUpper = technique.toUpperCase();
86
92
 
87
93
  if (trainingType === 'lora') {
94
+ // Level 2: per-technique
88
95
  updateConfigVar(configPath, `TUNE_ADAPTER_PATH_${techniqueUpper}`, artifactPath);
96
+ // Level 3: per-technique + per-dataset (if slug available)
97
+ if (datasetSlug) {
98
+ const slugUpper = datasetSlug.toUpperCase().replace(/-/g, '_');
99
+ updateConfigVar(configPath, `TUNE_ADAPTER_PATH_${techniqueUpper}_${slugUpper}`, artifactPath);
100
+ }
89
101
  } else if (trainingType === 'full-rank') {
90
102
  updateConfigVar(configPath, `TUNE_MODEL_PATH_${techniqueUpper}`, artifactPath);
91
103
  }
92
104
 
105
+ // Level 1: latest
93
106
  updateConfigVar(configPath, 'TUNE_OUTPUT_PATH_LATEST', artifactPath);
94
107
  updateConfigVar(configPath, 'TUNE_OUTPUT_TYPE_LATEST', outputType);
95
108
  }
@@ -487,6 +487,7 @@ def enrich_records(config, results, run_timestamp=None):
487
487
  'mcc_version': mcc_version,
488
488
  'run_timestamp': run_timestamp.isoformat(),
489
489
  'region': region,
490
+ 'adapter_name': config.get('adapter_name', ''),
490
491
  }
491
492
  records.append(record)
492
493
 
@@ -859,6 +860,7 @@ def get_parquet_schema():
859
860
  pa.field("mcc_version", pa.string()),
860
861
  pa.field("run_timestamp", pa.string()),
861
862
  pa.field("region", pa.string()),
863
+ pa.field("adapter_name", pa.string()),
862
864
  ])
863
865
 
864
866
 
@@ -1177,6 +1179,8 @@ def cmd_write(args):
1177
1179
  input_data['workload'] = args.workload
1178
1180
  if args.region:
1179
1181
  input_data['region'] = args.region
1182
+ if args.adapter_name:
1183
+ input_data['adapter_name'] = args.adapter_name
1180
1184
 
1181
1185
  # ── Validate before any S3 interaction ────────────────────────────────
1182
1186
  errors = validate_benchmark_input(input_data)
@@ -1397,8 +1401,18 @@ def _load_config_file(config_path):
1397
1401
  'BASE_IMAGE_VERSION': 'base_image_version',
1398
1402
  'BENCHMARK_CONCURRENCY': 'benchmark_concurrency',
1399
1403
  }
1404
+ # Also capture IC_ENV_* serving config vars
1405
+ ic_env_map = {
1406
+ 'IC_ENV_VLLM_MAX_MODEL_LEN': 'max_model_len',
1407
+ 'IC_ENV_VLLM_QUANTIZATION': 'quantization',
1408
+ 'IC_ENV_VLLM_GPU_MEMORY_UTILIZATION': 'gpu_memory_utilization',
1409
+ 'IC_ENV_VLLM_KV_CACHE_DTYPE': 'kv_cache_dtype',
1410
+ 'IC_ENV_VLLM_TENSOR_PARALLEL_SIZE': 'tensor_parallel_degree',
1411
+ }
1400
1412
  if key in shell_map:
1401
1413
  context[shell_map[key]] = value
1414
+ elif key in ic_env_map:
1415
+ context[ic_env_map[key]] = value
1402
1416
 
1403
1417
  except Exception:
1404
1418
  pass
@@ -1415,6 +1429,30 @@ def _load_config_file(config_path):
1415
1429
  parts = context['model_name'].rstrip('/').split('/')
1416
1430
  context['model_name'] = parts[-1] if parts else context['model_name']
1417
1431
 
1432
+ # Also scan IC config files (do/ic/*.conf) for IC_ENV_* serving params
1433
+ # These override do/config values for serving-specific settings
1434
+ try:
1435
+ import glob
1436
+ config_dir = os.path.dirname(os.path.abspath(config_path))
1437
+ ic_dir = os.path.join(config_dir, 'ic')
1438
+ ic_env_map = {
1439
+ 'IC_ENV_VLLM_MAX_MODEL_LEN': 'max_model_len',
1440
+ 'IC_ENV_VLLM_QUANTIZATION': 'quantization',
1441
+ 'IC_ENV_VLLM_GPU_MEMORY_UTILIZATION': 'gpu_memory_utilization',
1442
+ 'IC_ENV_VLLM_KV_CACHE_DTYPE': 'kv_cache_dtype',
1443
+ 'IC_ENV_VLLM_TENSOR_PARALLEL_SIZE': 'tensor_parallel_degree',
1444
+ }
1445
+ for conf_file in sorted(glob.glob(os.path.join(ic_dir, '*.conf'))):
1446
+ with open(conf_file, 'r') as f:
1447
+ for line in f:
1448
+ match = re.match(r'^export\s+([A-Z_][A-Z0-9_]*)=["\']?([^"\']*)["\']?\s*$', line.strip())
1449
+ if match:
1450
+ key, value = match.group(1), match.group(2)
1451
+ if key in ic_env_map and value:
1452
+ context[ic_env_map[key]] = value
1453
+ except Exception:
1454
+ pass # IC config scanning is best-effort
1455
+
1418
1456
  return context
1419
1457
 
1420
1458
 
@@ -1462,6 +1500,11 @@ def main():
1462
1500
  '--region',
1463
1501
  help='AWS region'
1464
1502
  )
1503
+ write_parser.add_argument(
1504
+ '--adapter-name', dest='adapter_name', default=None,
1505
+ help='LoRA adapter name (differentiates adapter benchmarks from base model in Athena)'
1506
+ )
1507
+
1465
1508
  write_parser.add_argument(
1466
1509
  '--dry-run', dest='dry_run', action='store_true',
1467
1510
  help='Output enriched records as JSON without writing to S3'