@aws/ml-container-creator 0.13.5 → 0.15.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/parameter-schema-v2.json +33 -5
- package/infra/ci-harness/lib/ci-harness-stack.ts +13 -5
- package/infra/ci-harness/package-lock.json +121 -111
- package/infra/ci-harness/package.json +1 -1
- package/package.json +2 -2
- package/servers/endpoint-picker/index.js +23 -14
- package/servers/instance-sizer/index.js +72 -4
- package/servers/instance-sizer/lib/model-resolver.js +28 -2
- package/src/app.js +15 -0
- package/src/lib/config-loader.js +18 -0
- package/src/lib/config-manager.js +6 -1
- package/src/lib/dataset-slug.js +152 -0
- package/src/lib/generated/cli-options.js +9 -3
- package/src/lib/generated/parameter-matrix.js +15 -4
- package/src/lib/generated/validation-rules.js +1 -1
- package/src/lib/mcp-client.js +15 -1
- package/src/lib/mcp-query-runner.js +11 -1
- package/src/lib/prompt-runner.js +40 -20
- package/src/lib/prompts/feature-prompts.js +1 -1
- package/src/lib/template-manager.js +0 -7
- package/src/lib/template-variable-resolver.js +51 -1
- package/src/lib/tune-config-state.js +14 -1
- package/templates/do/.benchmark_writer.py +43 -0
- package/templates/do/.register_helper.py +1185 -0
- package/templates/do/.tune_helper.py +168 -2
- package/templates/do/__pycache__/.adapter_helper.cpython-312.pyc +0 -0
- package/templates/do/__pycache__/.benchmark_writer.cpython-312.pyc +0 -0
- package/templates/do/__pycache__/.register_helper.cpython-312.pyc +0 -0
- package/templates/do/__pycache__/.tune_helper.cpython-312.pyc +0 -0
- package/templates/do/adapter +319 -27
- package/templates/do/add-ic +85 -3
- package/templates/do/benchmark +28 -8
- package/templates/do/config +20 -0
- package/templates/do/lib/inference-component.sh +56 -3
- package/templates/do/register +557 -6
- package/templates/do/test +12 -2
- package/templates/do/tune +219 -6
|
@@ -314,13 +314,6 @@ export default class TemplateManager {
|
|
|
314
314
|
_validateBenchmarkConfig() {
|
|
315
315
|
if (!this.answers.includeBenchmark) return;
|
|
316
316
|
|
|
317
|
-
// Gate to supported architectures
|
|
318
|
-
const dc = this.answers.deploymentConfig;
|
|
319
|
-
const arch = dc ? dc.split('-')[0] : this.answers.architecture;
|
|
320
|
-
if (arch !== 'transformers' && arch !== 'diffusors') {
|
|
321
|
-
throw new Error('⚠️ Benchmarking is only supported with transformers and diffusors architectures.');
|
|
322
|
-
}
|
|
323
|
-
|
|
324
317
|
// Gate to supported deployment targets
|
|
325
318
|
if (this.answers.deploymentTarget === 'hyperpod-eks') {
|
|
326
319
|
throw new Error('⚠️ Benchmarking is only supported with managed-inference, async-inference, and batch-transform deployment targets');
|
|
@@ -232,7 +232,7 @@ export async function _ensureTemplateVariables(answers, registryConfigManager =
|
|
|
232
232
|
artifactUri: '',
|
|
233
233
|
modelLoadStrategy: 'runtime',
|
|
234
234
|
existingEndpointName: null,
|
|
235
|
-
enableLora:
|
|
235
|
+
enableLora: true,
|
|
236
236
|
maxLoras: 30,
|
|
237
237
|
maxLoraRank: 64
|
|
238
238
|
};
|
|
@@ -261,6 +261,20 @@ export async function _ensureTemplateVariables(answers, registryConfigManager =
|
|
|
261
261
|
}
|
|
262
262
|
}
|
|
263
263
|
|
|
264
|
+
// Always include benchmarking by default (AC-2.3 — enabled for all architectures).
|
|
265
|
+
// Only set when not explicitly provided by user (AC-2.4, AC-2.7 — respect explicit opt-out).
|
|
266
|
+
if (answers.includeBenchmark === undefined) {
|
|
267
|
+
answers.includeBenchmark = true;
|
|
268
|
+
}
|
|
269
|
+
|
|
270
|
+
// Enforce enableLora scoping: only LoRA-capable servers get enableLora=true
|
|
271
|
+
// (AC-2.1, NFR-2). All incompatible backends are forced to false.
|
|
272
|
+
const loraCapableServers = ['vllm', 'sglang', 'djl-lmi', 'lmi', 'djl'];
|
|
273
|
+
const resolvedBackend = answers.backend || answers.modelServer;
|
|
274
|
+
if (!loraCapableServers.includes(resolvedBackend)) {
|
|
275
|
+
answers.enableLora = false;
|
|
276
|
+
}
|
|
277
|
+
|
|
264
278
|
// Merge catalog env vars into answers.envVars with correct precedence
|
|
265
279
|
await _mergeEnvVarsWithPrecedence(answers, registryConfigManager);
|
|
266
280
|
|
|
@@ -445,6 +459,35 @@ export async function _ensureTemplateVariables(answers, registryConfigManager =
|
|
|
445
459
|
}
|
|
446
460
|
}
|
|
447
461
|
|
|
462
|
+
// Propagate max_model_len from instance-sizer context capping to env vars (AC-1.7).
|
|
463
|
+
// The instance-sizer sets sizerMaxModelLen when the model's full context doesn't fit
|
|
464
|
+
// on the recommended instance. Write as VLLM_MAX_MODEL_LEN or SGLANG_MAX_MODEL_LEN.
|
|
465
|
+
const _MAX_MODEL_LEN_ENGINE_MAP = {
|
|
466
|
+
'vllm': 'VLLM_MAX_MODEL_LEN',
|
|
467
|
+
'vllm-omni': 'VLLM_MAX_MODEL_LEN',
|
|
468
|
+
'sglang': 'SGLANG_MAX_MODEL_LEN'
|
|
469
|
+
};
|
|
470
|
+
|
|
471
|
+
if (answers.sizerMaxModelLen) {
|
|
472
|
+
const maxLenEngine = answers.backend || answers.modelServer;
|
|
473
|
+
const maxLenEnvKey = maxLenEngine ? _MAX_MODEL_LEN_ENGINE_MAP[maxLenEngine] : null;
|
|
474
|
+
if (maxLenEnvKey) {
|
|
475
|
+
// Only set if user hasn't explicitly provided this env var
|
|
476
|
+
const userServerEnvVars = answers.serverEnvVars || {};
|
|
477
|
+
const userExplicitlySetMaxLen = (
|
|
478
|
+
userServerEnvVars['MAX_MODEL_LEN'] !== undefined ||
|
|
479
|
+
userServerEnvVars[maxLenEnvKey] !== undefined
|
|
480
|
+
);
|
|
481
|
+
if (!userExplicitlySetMaxLen && (!answers.envVars || !answers.envVars[maxLenEnvKey])) {
|
|
482
|
+
if (!answers.envVars) {
|
|
483
|
+
answers.envVars = {};
|
|
484
|
+
}
|
|
485
|
+
answers.envVars[maxLenEnvKey] = String(answers.sizerMaxModelLen);
|
|
486
|
+
console.log(` ℹ️ max_model_len: ${answers.sizerMaxModelLen} (context capped by instance-sizer)`);
|
|
487
|
+
}
|
|
488
|
+
}
|
|
489
|
+
}
|
|
490
|
+
|
|
448
491
|
// Determine tune support based on model presence in the tune catalog.
|
|
449
492
|
// Used by the do/config template to write TUNE_SUPPORTED=true|false.
|
|
450
493
|
if (answers.tuneSupported === undefined) {
|
|
@@ -481,4 +524,11 @@ export async function _ensureTemplateVariables(answers, registryConfigManager =
|
|
|
481
524
|
answers.tuneModelId = null;
|
|
482
525
|
}
|
|
483
526
|
}
|
|
527
|
+
|
|
528
|
+
// Propagate --ic-env KEY=VALUE pairs to icEnvVars for do/config template rendering.
|
|
529
|
+
// These are rendered as IC_ENV_* exports in do/config, which inference-component.sh
|
|
530
|
+
// reads at deploy time and passes as the Environment field in InferenceComponent.create().
|
|
531
|
+
if (!answers.icEnvVars) {
|
|
532
|
+
answers.icEnvVars = {};
|
|
533
|
+
}
|
|
484
534
|
}
|
|
@@ -74,22 +74,35 @@ export function persistSubmissionState(configPath, { technique, trainingType, da
|
|
|
74
74
|
* Simulate the config writes that happen after a job completes successfully.
|
|
75
75
|
* This mirrors the behavior in do/tune's _handle_completion() function.
|
|
76
76
|
*
|
|
77
|
+
* Writes three levels of tracking (AC-4.1, AC-4.2):
|
|
78
|
+
* - Level 1: TUNE_OUTPUT_PATH_LATEST (always the last run, any technique)
|
|
79
|
+
* - Level 2: TUNE_ADAPTER_PATH_<TECHNIQUE> (last run per technique)
|
|
80
|
+
* - Level 3: TUNE_ADAPTER_PATH_<TECHNIQUE>_<SLUG> (per technique + dataset slug)
|
|
81
|
+
*
|
|
77
82
|
* @param {string} configPath - Path to the config file
|
|
78
83
|
* @param {object} params - Completion parameters
|
|
79
84
|
* @param {string} params.technique - Technique (sft, dpo, rlaif, rlvr)
|
|
80
85
|
* @param {string} params.trainingType - Training type (lora, full-rank)
|
|
81
86
|
* @param {string} params.artifactPath - S3 path to the output artifact
|
|
82
87
|
* @param {string} params.outputType - Output type (adapter, full-model)
|
|
88
|
+
* @param {string} [params.datasetSlug] - Optional dataset slug for per-technique-per-dataset tracking
|
|
83
89
|
*/
|
|
84
|
-
export function persistCompletionState(configPath, { technique, trainingType, artifactPath, outputType }) {
|
|
90
|
+
export function persistCompletionState(configPath, { technique, trainingType, artifactPath, outputType, datasetSlug }) {
|
|
85
91
|
const techniqueUpper = technique.toUpperCase();
|
|
86
92
|
|
|
87
93
|
if (trainingType === 'lora') {
|
|
94
|
+
// Level 2: per-technique
|
|
88
95
|
updateConfigVar(configPath, `TUNE_ADAPTER_PATH_${techniqueUpper}`, artifactPath);
|
|
96
|
+
// Level 3: per-technique + per-dataset (if slug available)
|
|
97
|
+
if (datasetSlug) {
|
|
98
|
+
const slugUpper = datasetSlug.toUpperCase().replace(/-/g, '_');
|
|
99
|
+
updateConfigVar(configPath, `TUNE_ADAPTER_PATH_${techniqueUpper}_${slugUpper}`, artifactPath);
|
|
100
|
+
}
|
|
89
101
|
} else if (trainingType === 'full-rank') {
|
|
90
102
|
updateConfigVar(configPath, `TUNE_MODEL_PATH_${techniqueUpper}`, artifactPath);
|
|
91
103
|
}
|
|
92
104
|
|
|
105
|
+
// Level 1: latest
|
|
93
106
|
updateConfigVar(configPath, 'TUNE_OUTPUT_PATH_LATEST', artifactPath);
|
|
94
107
|
updateConfigVar(configPath, 'TUNE_OUTPUT_TYPE_LATEST', outputType);
|
|
95
108
|
}
|
|
@@ -487,6 +487,7 @@ def enrich_records(config, results, run_timestamp=None):
|
|
|
487
487
|
'mcc_version': mcc_version,
|
|
488
488
|
'run_timestamp': run_timestamp.isoformat(),
|
|
489
489
|
'region': region,
|
|
490
|
+
'adapter_name': config.get('adapter_name', ''),
|
|
490
491
|
}
|
|
491
492
|
records.append(record)
|
|
492
493
|
|
|
@@ -859,6 +860,7 @@ def get_parquet_schema():
|
|
|
859
860
|
pa.field("mcc_version", pa.string()),
|
|
860
861
|
pa.field("run_timestamp", pa.string()),
|
|
861
862
|
pa.field("region", pa.string()),
|
|
863
|
+
pa.field("adapter_name", pa.string()),
|
|
862
864
|
])
|
|
863
865
|
|
|
864
866
|
|
|
@@ -1177,6 +1179,8 @@ def cmd_write(args):
|
|
|
1177
1179
|
input_data['workload'] = args.workload
|
|
1178
1180
|
if args.region:
|
|
1179
1181
|
input_data['region'] = args.region
|
|
1182
|
+
if args.adapter_name:
|
|
1183
|
+
input_data['adapter_name'] = args.adapter_name
|
|
1180
1184
|
|
|
1181
1185
|
# ── Validate before any S3 interaction ────────────────────────────────
|
|
1182
1186
|
errors = validate_benchmark_input(input_data)
|
|
@@ -1397,8 +1401,18 @@ def _load_config_file(config_path):
|
|
|
1397
1401
|
'BASE_IMAGE_VERSION': 'base_image_version',
|
|
1398
1402
|
'BENCHMARK_CONCURRENCY': 'benchmark_concurrency',
|
|
1399
1403
|
}
|
|
1404
|
+
# Also capture IC_ENV_* serving config vars
|
|
1405
|
+
ic_env_map = {
|
|
1406
|
+
'IC_ENV_VLLM_MAX_MODEL_LEN': 'max_model_len',
|
|
1407
|
+
'IC_ENV_VLLM_QUANTIZATION': 'quantization',
|
|
1408
|
+
'IC_ENV_VLLM_GPU_MEMORY_UTILIZATION': 'gpu_memory_utilization',
|
|
1409
|
+
'IC_ENV_VLLM_KV_CACHE_DTYPE': 'kv_cache_dtype',
|
|
1410
|
+
'IC_ENV_VLLM_TENSOR_PARALLEL_SIZE': 'tensor_parallel_degree',
|
|
1411
|
+
}
|
|
1400
1412
|
if key in shell_map:
|
|
1401
1413
|
context[shell_map[key]] = value
|
|
1414
|
+
elif key in ic_env_map:
|
|
1415
|
+
context[ic_env_map[key]] = value
|
|
1402
1416
|
|
|
1403
1417
|
except Exception:
|
|
1404
1418
|
pass
|
|
@@ -1415,6 +1429,30 @@ def _load_config_file(config_path):
|
|
|
1415
1429
|
parts = context['model_name'].rstrip('/').split('/')
|
|
1416
1430
|
context['model_name'] = parts[-1] if parts else context['model_name']
|
|
1417
1431
|
|
|
1432
|
+
# Also scan IC config files (do/ic/*.conf) for IC_ENV_* serving params
|
|
1433
|
+
# These override do/config values for serving-specific settings
|
|
1434
|
+
try:
|
|
1435
|
+
import glob
|
|
1436
|
+
config_dir = os.path.dirname(os.path.abspath(config_path))
|
|
1437
|
+
ic_dir = os.path.join(config_dir, 'ic')
|
|
1438
|
+
ic_env_map = {
|
|
1439
|
+
'IC_ENV_VLLM_MAX_MODEL_LEN': 'max_model_len',
|
|
1440
|
+
'IC_ENV_VLLM_QUANTIZATION': 'quantization',
|
|
1441
|
+
'IC_ENV_VLLM_GPU_MEMORY_UTILIZATION': 'gpu_memory_utilization',
|
|
1442
|
+
'IC_ENV_VLLM_KV_CACHE_DTYPE': 'kv_cache_dtype',
|
|
1443
|
+
'IC_ENV_VLLM_TENSOR_PARALLEL_SIZE': 'tensor_parallel_degree',
|
|
1444
|
+
}
|
|
1445
|
+
for conf_file in sorted(glob.glob(os.path.join(ic_dir, '*.conf'))):
|
|
1446
|
+
with open(conf_file, 'r') as f:
|
|
1447
|
+
for line in f:
|
|
1448
|
+
match = re.match(r'^export\s+([A-Z_][A-Z0-9_]*)=["\']?([^"\']*)["\']?\s*$', line.strip())
|
|
1449
|
+
if match:
|
|
1450
|
+
key, value = match.group(1), match.group(2)
|
|
1451
|
+
if key in ic_env_map and value:
|
|
1452
|
+
context[ic_env_map[key]] = value
|
|
1453
|
+
except Exception:
|
|
1454
|
+
pass # IC config scanning is best-effort
|
|
1455
|
+
|
|
1418
1456
|
return context
|
|
1419
1457
|
|
|
1420
1458
|
|
|
@@ -1462,6 +1500,11 @@ def main():
|
|
|
1462
1500
|
'--region',
|
|
1463
1501
|
help='AWS region'
|
|
1464
1502
|
)
|
|
1503
|
+
write_parser.add_argument(
|
|
1504
|
+
'--adapter-name', dest='adapter_name', default=None,
|
|
1505
|
+
help='LoRA adapter name (differentiates adapter benchmarks from base model in Athena)'
|
|
1506
|
+
)
|
|
1507
|
+
|
|
1465
1508
|
write_parser.add_argument(
|
|
1466
1509
|
'--dry-run', dest='dry_run', action='store_true',
|
|
1467
1510
|
help='Output enriched records as JSON without writing to S3'
|