npm - @aws/ml-container-creator - Versions diffs - 0.13.5 → 0.15.1 - Mend

@aws/ml-container-creator 0.13.5 → 0.15.1

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (37) hide show

package/config/parameter-schema-v2.json +33 -5
package/infra/ci-harness/lib/ci-harness-stack.ts +13 -5
package/infra/ci-harness/package-lock.json +121 -111
package/infra/ci-harness/package.json +1 -1
package/package.json +2 -2
package/servers/endpoint-picker/index.js +23 -14
package/servers/instance-sizer/index.js +72 -4
package/servers/instance-sizer/lib/model-resolver.js +28 -2
package/src/app.js +15 -0
package/src/lib/config-loader.js +18 -0
package/src/lib/config-manager.js +6 -1
package/src/lib/dataset-slug.js +152 -0
package/src/lib/generated/cli-options.js +9 -3
package/src/lib/generated/parameter-matrix.js +15 -4
package/src/lib/generated/validation-rules.js +1 -1
package/src/lib/mcp-client.js +15 -1
package/src/lib/mcp-query-runner.js +11 -1
package/src/lib/prompt-runner.js +40 -20
package/src/lib/prompts/feature-prompts.js +1 -1
package/src/lib/template-manager.js +0 -7
package/src/lib/template-variable-resolver.js +51 -1
package/src/lib/tune-config-state.js +14 -1
package/templates/do/.benchmark_writer.py +43 -0
package/templates/do/.register_helper.py +1185 -0
package/templates/do/.tune_helper.py +168 -2
package/templates/do/__pycache__/.adapter_helper.cpython-312.pyc +0 -0
package/templates/do/__pycache__/.benchmark_writer.cpython-312.pyc +0 -0
package/templates/do/__pycache__/.register_helper.cpython-312.pyc +0 -0
package/templates/do/__pycache__/.tune_helper.cpython-312.pyc +0 -0
package/templates/do/adapter +319 -27
package/templates/do/add-ic +85 -3
package/templates/do/benchmark +28 -8
package/templates/do/config +20 -0
package/templates/do/lib/inference-component.sh +56 -3
package/templates/do/register +557 -6
package/templates/do/test +12 -2
package/templates/do/tune +219 -6

package/src/lib/template-manager.js CHANGED Viewed

@@ -314,13 +314,6 @@ export default class TemplateManager {
     _validateBenchmarkConfig() {
         if (!this.answers.includeBenchmark) return;
-        // Gate to supported architectures
-        const dc = this.answers.deploymentConfig;
-        const arch = dc ? dc.split('-')[0] : this.answers.architecture;
-        if (arch !== 'transformers' && arch !== 'diffusors') {
-            throw new Error('⚠️  Benchmarking is only supported with transformers and diffusors architectures.');
-        }
         // Gate to supported deployment targets
         if (this.answers.deploymentTarget === 'hyperpod-eks') {
             throw new Error('⚠️  Benchmarking is only supported with managed-inference, async-inference, and batch-transform deployment targets');

package/src/lib/template-variable-resolver.js CHANGED Viewed

@@ -232,7 +232,7 @@ export async function _ensureTemplateVariables(answers, registryConfigManager =
         artifactUri: '',
         modelLoadStrategy: 'runtime',
         existingEndpointName: null,
-        enableLora: false,
+        enableLora: true,
         maxLoras: 30,
         maxLoraRank: 64
     };
@@ -261,6 +261,20 @@ export async function _ensureTemplateVariables(answers, registryConfigManager =
         }
     }
+    // Always include benchmarking by default (AC-2.3 — enabled for all architectures).
+    // Only set when not explicitly provided by user (AC-2.4, AC-2.7 — respect explicit opt-out).
+    if (answers.includeBenchmark === undefined) {
+        answers.includeBenchmark = true;
+    }
+    // Enforce enableLora scoping: only LoRA-capable servers get enableLora=true
+    // (AC-2.1, NFR-2). All incompatible backends are forced to false.
+    const loraCapableServers = ['vllm', 'sglang', 'djl-lmi', 'lmi', 'djl'];
+    const resolvedBackend = answers.backend || answers.modelServer;
+    if (!loraCapableServers.includes(resolvedBackend)) {
+        answers.enableLora = false;
+    }
     // Merge catalog env vars into answers.envVars with correct precedence
     await _mergeEnvVarsWithPrecedence(answers, registryConfigManager);
@@ -445,6 +459,35 @@ export async function _ensureTemplateVariables(answers, registryConfigManager =
         }
     }
+    // Propagate max_model_len from instance-sizer context capping to env vars (AC-1.7).
+    // The instance-sizer sets sizerMaxModelLen when the model's full context doesn't fit
+    // on the recommended instance. Write as VLLM_MAX_MODEL_LEN or SGLANG_MAX_MODEL_LEN.
+    const _MAX_MODEL_LEN_ENGINE_MAP = {
+        'vllm': 'VLLM_MAX_MODEL_LEN',
+        'vllm-omni': 'VLLM_MAX_MODEL_LEN',
+        'sglang': 'SGLANG_MAX_MODEL_LEN'
+    };
+    if (answers.sizerMaxModelLen) {
+        const maxLenEngine = answers.backend || answers.modelServer;
+        const maxLenEnvKey = maxLenEngine ? _MAX_MODEL_LEN_ENGINE_MAP[maxLenEngine] : null;
+        if (maxLenEnvKey) {
+            // Only set if user hasn't explicitly provided this env var
+            const userServerEnvVars = answers.serverEnvVars || {};
+            const userExplicitlySetMaxLen = (
+                userServerEnvVars['MAX_MODEL_LEN'] !== undefined ||
+                userServerEnvVars[maxLenEnvKey] !== undefined
+            );
+            if (!userExplicitlySetMaxLen && (!answers.envVars || !answers.envVars[maxLenEnvKey])) {
+                if (!answers.envVars) {
+                    answers.envVars = {};
+                }
+                answers.envVars[maxLenEnvKey] = String(answers.sizerMaxModelLen);
+                console.log(`    ℹ️  max_model_len: ${answers.sizerMaxModelLen} (context capped by instance-sizer)`);
+            }
+        }
+    }
     // Determine tune support based on model presence in the tune catalog.
     // Used by the do/config template to write TUNE_SUPPORTED=true|false.
     if (answers.tuneSupported === undefined) {
@@ -481,4 +524,11 @@ export async function _ensureTemplateVariables(answers, registryConfigManager =
             answers.tuneModelId = null;
         }
     }
+    // Propagate --ic-env KEY=VALUE pairs to icEnvVars for do/config template rendering.
+    // These are rendered as IC_ENV_* exports in do/config, which inference-component.sh
+    // reads at deploy time and passes as the Environment field in InferenceComponent.create().
+    if (!answers.icEnvVars) {
+        answers.icEnvVars = {};
+    }
 }

package/src/lib/tune-config-state.js CHANGED Viewed

@@ -74,22 +74,35 @@ export function persistSubmissionState(configPath, { technique, trainingType, da
  * Simulate the config writes that happen after a job completes successfully.
  * This mirrors the behavior in do/tune's _handle_completion() function.
  *
+ * Writes three levels of tracking (AC-4.1, AC-4.2):
+ * - Level 1: TUNE_OUTPUT_PATH_LATEST (always the last run, any technique)
+ * - Level 2: TUNE_ADAPTER_PATH_<TECHNIQUE> (last run per technique)
+ * - Level 3: TUNE_ADAPTER_PATH_<TECHNIQUE>_<SLUG> (per technique + dataset slug)
+ *
  * @param {string} configPath - Path to the config file
  * @param {object} params - Completion parameters
  * @param {string} params.technique - Technique (sft, dpo, rlaif, rlvr)
  * @param {string} params.trainingType - Training type (lora, full-rank)
  * @param {string} params.artifactPath - S3 path to the output artifact
  * @param {string} params.outputType - Output type (adapter, full-model)
+ * @param {string} [params.datasetSlug] - Optional dataset slug for per-technique-per-dataset tracking
  */
-export function persistCompletionState(configPath, { technique, trainingType, artifactPath, outputType }) {
+export function persistCompletionState(configPath, { technique, trainingType, artifactPath, outputType, datasetSlug }) {
     const techniqueUpper = technique.toUpperCase();
     if (trainingType === 'lora') {
+        // Level 2: per-technique
         updateConfigVar(configPath, `TUNE_ADAPTER_PATH_${techniqueUpper}`, artifactPath);
+        // Level 3: per-technique + per-dataset (if slug available)
+        if (datasetSlug) {
+            const slugUpper = datasetSlug.toUpperCase().replace(/-/g, '_');
+            updateConfigVar(configPath, `TUNE_ADAPTER_PATH_${techniqueUpper}_${slugUpper}`, artifactPath);
+        }
     } else if (trainingType === 'full-rank') {
         updateConfigVar(configPath, `TUNE_MODEL_PATH_${techniqueUpper}`, artifactPath);
     }
+    // Level 1: latest
     updateConfigVar(configPath, 'TUNE_OUTPUT_PATH_LATEST', artifactPath);
     updateConfigVar(configPath, 'TUNE_OUTPUT_TYPE_LATEST', outputType);
 }

package/templates/do/.benchmark_writer.py CHANGED Viewed

@@ -487,6 +487,7 @@ def enrich_records(config, results, run_timestamp=None):
             'mcc_version': mcc_version,
             'run_timestamp': run_timestamp.isoformat(),
             'region': region,
+            'adapter_name': config.get('adapter_name', ''),
         }
         records.append(record)
@@ -859,6 +860,7 @@ def get_parquet_schema():
         pa.field("mcc_version", pa.string()),
         pa.field("run_timestamp", pa.string()),
         pa.field("region", pa.string()),
+        pa.field("adapter_name", pa.string()),
     ])
@@ -1177,6 +1179,8 @@ def cmd_write(args):
         input_data['workload'] = args.workload
     if args.region:
         input_data['region'] = args.region
+    if args.adapter_name:
+        input_data['adapter_name'] = args.adapter_name
     # ── Validate before any S3 interaction ────────────────────────────────
     errors = validate_benchmark_input(input_data)
@@ -1397,8 +1401,18 @@ def _load_config_file(config_path):
                     'BASE_IMAGE_VERSION': 'base_image_version',
                     'BENCHMARK_CONCURRENCY': 'benchmark_concurrency',
                 }
+                # Also capture IC_ENV_* serving config vars
+                ic_env_map = {
+                    'IC_ENV_VLLM_MAX_MODEL_LEN': 'max_model_len',
+                    'IC_ENV_VLLM_QUANTIZATION': 'quantization',
+                    'IC_ENV_VLLM_GPU_MEMORY_UTILIZATION': 'gpu_memory_utilization',
+                    'IC_ENV_VLLM_KV_CACHE_DTYPE': 'kv_cache_dtype',
+                    'IC_ENV_VLLM_TENSOR_PARALLEL_SIZE': 'tensor_parallel_degree',
+                }
                 if key in shell_map:
                     context[shell_map[key]] = value
+                elif key in ic_env_map:
+                    context[ic_env_map[key]] = value
     except Exception:
         pass
@@ -1415,6 +1429,30 @@ def _load_config_file(config_path):
         parts = context['model_name'].rstrip('/').split('/')
         context['model_name'] = parts[-1] if parts else context['model_name']
+    # Also scan IC config files (do/ic/*.conf) for IC_ENV_* serving params
+    # These override do/config values for serving-specific settings
+    try:
+        import glob
+        config_dir = os.path.dirname(os.path.abspath(config_path))
+        ic_dir = os.path.join(config_dir, 'ic')
+        ic_env_map = {
+            'IC_ENV_VLLM_MAX_MODEL_LEN': 'max_model_len',
+            'IC_ENV_VLLM_QUANTIZATION': 'quantization',
+            'IC_ENV_VLLM_GPU_MEMORY_UTILIZATION': 'gpu_memory_utilization',
+            'IC_ENV_VLLM_KV_CACHE_DTYPE': 'kv_cache_dtype',
+            'IC_ENV_VLLM_TENSOR_PARALLEL_SIZE': 'tensor_parallel_degree',
+        }
+        for conf_file in sorted(glob.glob(os.path.join(ic_dir, '*.conf'))):
+            with open(conf_file, 'r') as f:
+                for line in f:
+                    match = re.match(r'^export\s+([A-Z_][A-Z0-9_]*)=["\']?([^"\']*)["\']?\s*$', line.strip())
+                    if match:
+                        key, value = match.group(1), match.group(2)
+                        if key in ic_env_map and value:
+                            context[ic_env_map[key]] = value
+    except Exception:
+        pass  # IC config scanning is best-effort
     return context
@@ -1462,6 +1500,11 @@ def main():
         '--region',
         help='AWS region'
     )
+    write_parser.add_argument(
+        '--adapter-name', dest='adapter_name', default=None,
+        help='LoRA adapter name (differentiates adapter benchmarks from base model in Athena)'
+    )
     write_parser.add_argument(
         '--dry-run', dest='dry_run', action='store_true',
         help='Output enriched records as JSON without writing to S3'