npm - @aws/ml-container-creator - Versions diffs - 0.10.0 → 0.12.1 - Mend

@aws/ml-container-creator 0.10.0 → 0.12.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (90)

package/LICENSE-THIRD-PARTY +9304 -0
package/bin/cli.js +2 -0
package/config/bootstrap-e2e-stack.json +341 -0
package/config/bootstrap-stack.json +40 -3
package/config/parameter-schema-v2.json +33 -22
package/config/tune-catalog.json +1781 -0
package/infra/ci-harness/buildspec.yml +1 -0
package/infra/ci-harness/lambda/path-prover/brain.ts +306 -0
package/infra/ci-harness/lambda/path-prover/write-results.ts +152 -0
package/infra/ci-harness/lib/ci-harness-stack.ts +851 -7
package/infra/ci-harness/state-machines/path-prover.asl.json +496 -0
package/package.json +53 -67
package/servers/base-image-picker/index.js +121 -121
package/servers/e2e-status/index.js +297 -0
package/servers/e2e-status/manifest.json +14 -0
package/servers/e2e-status/package.json +15 -0
package/servers/endpoint-picker/LICENSE +202 -0
package/servers/endpoint-picker/index.js +536 -0
package/servers/endpoint-picker/manifest.json +14 -0
package/servers/endpoint-picker/package.json +18 -0
package/servers/hyperpod-cluster-picker/index.js +125 -125
package/servers/instance-sizer/index.js +166 -153
package/servers/instance-sizer/lib/instance-ranker.js +120 -76
package/servers/instance-sizer/lib/model-resolver.js +61 -61
package/servers/instance-sizer/lib/quota-resolver.js +113 -113
package/servers/instance-sizer/lib/vram-estimator.js +31 -31
package/servers/lib/bedrock-client.js +38 -38
package/servers/lib/catalogs/instances.json +27 -0
package/servers/lib/catalogs/model-servers.json +201 -3
package/servers/lib/custom-validators.js +13 -13
package/servers/lib/dynamic-resolver.js +4 -4
package/servers/marketplace-picker/index.js +342 -0
package/servers/marketplace-picker/manifest.json +14 -0
package/servers/marketplace-picker/package.json +18 -0
package/servers/model-picker/index.js +382 -382
package/servers/region-picker/index.js +56 -56
package/servers/workload-picker/LICENSE +202 -0
package/servers/workload-picker/catalogs/workload-profiles.json +67 -0
package/servers/workload-picker/index.js +171 -0
package/servers/workload-picker/manifest.json +16 -0
package/servers/workload-picker/package.json +16 -0
package/src/app.js +12 -3
package/src/lib/bootstrap-command-handler.js +609 -15
package/src/lib/bootstrap-config.js +36 -0
package/src/lib/bootstrap-profile-manager.js +48 -41
package/src/lib/ci-register-helpers.js +74 -0
package/src/lib/config-loader.js +3 -0
package/src/lib/config-manager.js +7 -0
package/src/lib/config-validator.js +1 -1
package/src/lib/cuda-resolver.js +17 -8
package/src/lib/generated/cli-options.js +319 -314
package/src/lib/generated/parameter-matrix.js +672 -661
package/src/lib/generated/validation-rules.js +76 -72
package/src/lib/path-prover-brain.js +664 -0
package/src/lib/prompts/infrastructure-prompts.js +2 -2
package/src/lib/prompts/model-prompts.js +6 -0
package/src/lib/prompts/project-prompts.js +12 -0
package/src/lib/secrets-prompt-runner.js +4 -0
package/src/lib/template-manager.js +1 -1
package/src/lib/template-variable-resolver.js +87 -1
package/src/lib/tune-catalog-validator.js +37 -4
package/templates/Dockerfile +9 -0
package/templates/code/adapter_sidecar.py +444 -0
package/templates/code/serve +6 -0
package/templates/code/serve.d/vllm.ejs +1 -1
package/templates/do/.benchmark_writer.py +1476 -0
package/templates/do/.tune_helper.py +982 -57
package/templates/do/__pycache__/.benchmark_writer.cpython-312.pyc +0 -0
package/templates/do/adapter +154 -0
package/templates/do/benchmark +639 -85
package/templates/do/build +5 -0
package/templates/do/clean.d/async-inference.ejs +5 -0
package/templates/do/clean.d/batch-transform.ejs +5 -0
package/templates/do/clean.d/hyperpod-eks.ejs +5 -0
package/templates/do/clean.d/managed-inference.ejs +5 -0
package/templates/do/config +115 -45
package/templates/do/deploy.d/async-inference.ejs +30 -3
package/templates/do/deploy.d/batch-transform.ejs +29 -3
package/templates/do/deploy.d/hyperpod-eks.ejs +4 -0
package/templates/do/deploy.d/managed-inference.ejs +216 -14
package/templates/do/lib/endpoint-config.sh +1 -1
package/templates/do/lib/profile.sh +44 -0
package/templates/do/optimize +106 -37
package/templates/do/push +5 -0
package/templates/do/register +94 -0
package/templates/do/stage +567 -0
package/templates/do/submit +7 -0
package/templates/do/test +14 -0
package/templates/do/tune +382 -59
package/templates/do/validate +44 -4

package/templates/do/benchmark CHANGED Viewed

@@ -1,4 +1,3 @@
-#!/bin/bash
 # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 # SPDX-License-Identifier: Apache-2.0
@@ -19,22 +18,27 @@ CLEAN_AFTER=false
 FORCE=false
 IC_ARG=""
 ADAPTER_ARG=""
+ARG_NO_STALE_WARNING=false
+ARG_WORKLOAD=""
 while [ $# -gt 0 ]; do
     case "$1" in
         --clean) CLEAN_AFTER=true; shift ;;
         --force) FORCE=true; shift ;;
+        --no-stale-warning) ARG_NO_STALE_WARNING=true; shift ;;
+        --workload) shift; ARG_WORKLOAD="${1:-}"; shift ;;
         --ic) shift; IC_ARG="${1:-}"; shift ;;
         --adapter) shift; ADAPTER_ARG="${1:-}"; shift ;;
         --help|-h)
-            echo "Usage: ./do/benchmark [--ic <name>] [--adapter <name>] [--force] [--clean]"
+            echo "Usage: ./do/benchmark [--workload <name>] [--ic <name>] [--adapter <name>] [--force] [--clean] [--no-stale-warning]"
             echo ""
             echo "Run SageMaker AI Benchmark against the deployed endpoint."
             echo ""
             echo "Options:"
-            echo "  --ic <name>      Benchmark a specific inference component"
-            echo "  --adapter <name> Benchmark a specific LoRA adapter IC"
-            echo "  --force          Create a new benchmark job even if one is already running"
-            echo "  --clean          Delete workload config and benchmark job after displaying results"
+            echo "  --ic <name>         Benchmark a specific inference component"
+            echo "  --adapter <name>    Benchmark a specific LoRA adapter IC"
+            echo "  --force             Create a new benchmark job even if one is already running"
+            echo "  --clean             Delete workload config and benchmark job after displaying results"
+            echo "  --no-stale-warning  Suppress schema registry staleness warning"
             echo ""
             echo "IC resolution:"
             echo "  --adapter <name> Use ADAPTER_IC_NAME from do/adapters/<name>.conf"
@@ -54,6 +58,433 @@ while [ $# -gt 0 ]; do
     esac
 done
+# ── Require --workload flag ───────────────────────────────────────────────────
+if [ -z "${ARG_WORKLOAD}" ]; then
+    echo "❌ --workload <name> is required"
+    echo ""
+    # List available workloads from the MCP catalog
+    _CATALOG_FOR_HELP=""
+    if command -v npm &>/dev/null; then
+        _NPM_ROOT=$(npm root -g 2>/dev/null) || _NPM_ROOT=""
+        if [ -n "${_NPM_ROOT}" ] && [ -f "${_NPM_ROOT}/@aws/ml-container-creator/servers/workload-picker/catalogs/workload-profiles.json" ]; then
+            _CATALOG_FOR_HELP="${_NPM_ROOT}/@aws/ml-container-creator/servers/workload-picker/catalogs/workload-profiles.json"
+        fi
+    fi
+    if [ -n "${_CATALOG_FOR_HELP}" ]; then
+        echo "   Available workloads:"
+        python3 -c "
+import json
+with open('${_CATALOG_FOR_HELP}') as f:
+    catalog = json.load(f)
+for name, wl in catalog.get('workloads', {}).items():
+    print(f'     {name:30s} {wl.get("description", "")[:50]}')
+" 2>/dev/null || echo "   (could not read workload catalog)"
+    else
+        echo "   Run 'ml-container-creator mcp init' to install workload profiles"
+    fi
+    echo ""
+    echo "   Usage: ./do/benchmark --workload multi_turn_chat"
+    exit 1
+fi
+# ── Workload Resolution (from workload-picker MCP server catalog) ─────────────
+# When --workload names a profile other than "manual", look it up in the
+# workload-picker catalog (project-local node_modules first, then the global
+# npm root). A matching profile always overrides BENCHMARK_INPUT_TOKENS_MEAN,
+# BENCHMARK_OUTPUT_TOKENS_MEAN, BENCHMARK_STREAMING and BENCHMARK_CONCURRENCY;
+# BENCHMARK_CONCURRENCY_LEVELS is only filled in when it is not already set.
+# An unknown workload or a missing catalog prints a warning and leaves the
+# existing values untouched.
+BENCHMARK_WORKLOAD="${ARG_WORKLOAD:-manual}"
+if [ "${BENCHMARK_WORKLOAD}" != "manual" ]; then
+    # Locate the workload catalog (npm global or local)
+    _WORKLOAD_CATALOG=""
+    if [ -f "$(dirname "${BASH_SOURCE[0]}")/../node_modules/@aws/ml-container-creator/servers/workload-picker/catalogs/workload-profiles.json" ]; then
+        _WORKLOAD_CATALOG="$(dirname "${BASH_SOURCE[0]}")/../node_modules/@aws/ml-container-creator/servers/workload-picker/catalogs/workload-profiles.json"
+    elif command -v npm &>/dev/null; then
+        _NPM_ROOT=$(npm root -g 2>/dev/null) || _NPM_ROOT=""
+        if [ -n "${_NPM_ROOT}" ] && [ -f "${_NPM_ROOT}/@aws/ml-container-creator/servers/workload-picker/catalogs/workload-profiles.json" ]; then
+            _WORKLOAD_CATALOG="${_NPM_ROOT}/@aws/ml-container-creator/servers/workload-picker/catalogs/workload-profiles.json"
+        fi
+    fi
+    if [ -n "${_WORKLOAD_CATALOG}" ]; then
+        # The catalog path and the user-supplied workload name are passed as
+        # arguments (sys.argv) rather than interpolated into the Python source,
+        # so quotes or other special characters cannot break or alter the program.
+        _WL_PARAMS=$(python3 -c '
+import json, sys
+with open(sys.argv[1]) as f:
+    catalog = json.load(f)
+wl = catalog.get("workloads", {}).get(sys.argv[2])
+if wl:
+    print(json.dumps(wl))
+else:
+    print("null")
+' "${_WORKLOAD_CATALOG}" "${BENCHMARK_WORKLOAD}" 2>/dev/null) || _WL_PARAMS="null"
+        if [ "${_WL_PARAMS}" != "null" ] && [ -n "${_WL_PARAMS}" ]; then
+            echo "📋 Workload profile: ${BENCHMARK_WORKLOAD}"
+            BENCHMARK_INPUT_TOKENS_MEAN=$(echo "${_WL_PARAMS}" | python3 -c "import sys,json; print(json.load(sys.stdin)['input_tokens_mean'])")
+            BENCHMARK_OUTPUT_TOKENS_MEAN=$(echo "${_WL_PARAMS}" | python3 -c "import sys,json; print(json.load(sys.stdin)['output_tokens_mean'])")
+            BENCHMARK_STREAMING=$(echo "${_WL_PARAMS}" | python3 -c "import sys,json; print(str(json.load(sys.stdin)['streaming']).lower())")
+            # Set concurrency levels for multi-level mode if not already overridden
+            if [ -z "${BENCHMARK_CONCURRENCY_LEVELS:-}" ]; then
+                BENCHMARK_CONCURRENCY_LEVELS=$(echo "${_WL_PARAMS}" | python3 -c "import sys,json; print(','.join(str(x) for x in json.load(sys.stdin)['concurrency_levels']))")
+            fi
+            # Also override single-level BENCHMARK_CONCURRENCY with first level from workload
+            BENCHMARK_CONCURRENCY=$(echo "${_WL_PARAMS}" | python3 -c "import sys,json; print(json.load(sys.stdin)['concurrency_levels'][0])")
+            echo "   Input tokens: ${BENCHMARK_INPUT_TOKENS_MEAN}, Output tokens: ${BENCHMARK_OUTPUT_TOKENS_MEAN}"
+            echo "   Streaming: ${BENCHMARK_STREAMING}, Concurrency: ${BENCHMARK_CONCURRENCY_LEVELS:-${BENCHMARK_CONCURRENCY}}"
+            echo ""
+        else
+            echo "⚠️  Unknown workload '${BENCHMARK_WORKLOAD}' — using do/config defaults"
+        fi
+    else
+        echo "⚠️  Workload catalog not found — using do/config defaults"
+    fi
+fi
+# ── Resolve profile-level values ──────────────────────────────────────────────
+# Read S3 buckets and account info from the bootstrap profile
+_PROFILE_JSON=""
+if command -v python3 &>/dev/null; then
+    _PROFILE_JSON=$(python3 -c "
+import json, os
+config_path = os.path.expanduser('~/.ml-container-creator/config.json')
+try:
+    with open(config_path) as f:
+        config = json.load(f)
+    profile = config['profiles'][config['activeProfile']]
+    print(json.dumps(profile))
+except:
+    print('{}')
+" 2>/dev/null) || _PROFILE_JSON="{}"
+fi
+# Extract benchmark-relevant profile values
+BENCHMARK_S3_OUTPUT_PATH=$(echo "${_PROFILE_JSON}" | python3 -c "
+import sys, json
+p = json.load(sys.stdin)
+bucket = p.get('benchmarkS3Bucket', '')
+if not bucket:
+    acct = p.get('accountId', 'unknown')
+    region = p.get('awsRegion', 'us-east-1')
+    bucket = f'mlcc-benchmark-{acct}-{region}'
+print(f's3://{bucket}/${PROJECT_NAME}/')
+" 2>/dev/null) || BENCHMARK_S3_OUTPUT_PATH=""
+CI_BENCHMARK_RESULTS_BUCKET=$(echo "${_PROFILE_JSON}" | python3 -c "import sys,json; print(json.load(sys.stdin).get('ciBenchmarkResultsBucket', ''))" 2>/dev/null) || CI_BENCHMARK_RESULTS_BUCKET=""
+# Derive job names at runtime (unique per invocation)
+BENCHMARK_JOB_NAME="${PROJECT_NAME}-benchmark-$(date +%Y%m%d-%H%M%S)"
+BENCHMARK_WORKLOAD_CONFIG_NAME="${PROJECT_NAME}-benchmark-config-$(date +%Y%m%d-%H%M%S)"
+# Ensure benchmark params have defaults (in case workload catalog wasn't found)
+BENCHMARK_CONCURRENCY=${BENCHMARK_CONCURRENCY:-10}
+BENCHMARK_INPUT_TOKENS_MEAN=${BENCHMARK_INPUT_TOKENS_MEAN:-550}
+BENCHMARK_OUTPUT_TOKENS_MEAN=${BENCHMARK_OUTPUT_TOKENS_MEAN:-150}
+BENCHMARK_STREAMING=${BENCHMARK_STREAMING:-true}
+# ── Multi-level concurrency support (CI Stage 2) ─────────────────────────────
+# When BENCHMARK_CONCURRENCY_LEVELS is set (comma-separated integers, e.g. "1,4,8"
+# or JSON array string, e.g. "[1,4,8]"), and we are NOT already in single-level
+# execution mode (_BENCHMARK_SINGLE_LEVEL), the script iterates over each level,
+# re-invoking itself for each one.
+# Results from all levels are aggregated into a combined JSON for the benchmark writer.
+# This supports Requirement 1.5: configurable concurrency levels per config.
+if [ -n "${BENCHMARK_CONCURRENCY_LEVELS:-}" ] && [ -z "${_BENCHMARK_SINGLE_LEVEL:-}" ]; then
+    # Normalize: strip brackets and spaces, convert to comma-separated
+    _NORMALIZED_LEVELS=$(echo "${BENCHMARK_CONCURRENCY_LEVELS}" | tr -d '[] ' )
+    # Skip if empty after normalization
+    if [ -n "${_NORMALIZED_LEVELS}" ]; then
+        echo "📊 Multi-level benchmark: running concurrency levels [${_NORMALIZED_LEVELS}]"
+        echo ""
+        IFS=',' read -ra _LEVELS <<< "${_NORMALIZED_LEVELS}"
+        _ALL_RESULTS_DIR="${SCRIPT_DIR}/../benchmarks/multi-level-$(date +%Y%m%d-%H%M%S)"
+        mkdir -p "${_ALL_RESULTS_DIR}"
+        _LEVEL_FAILURES=0
+        for _LEVEL in "${_LEVELS[@]}"; do
+            _LEVEL=$(echo "${_LEVEL}" | tr -d ' ')
+            # Skip non-numeric values
+            if ! [[ "${_LEVEL}" =~ ^[0-9]+$ ]]; then
+                echo "⚠️  Skipping invalid concurrency level: ${_LEVEL}"
+                continue
+            fi
+            echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+            echo "  Running benchmark at concurrency level: ${_LEVEL}"
+            echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+            echo ""
+            # Re-invoke self with overridden concurrency and single-level flag
+            export BENCHMARK_CONCURRENCY="${_LEVEL}"
+            export _BENCHMARK_SINGLE_LEVEL=1
+            # Build argument list for re-invocation
+            _REINVOKE_ARGS="--force"
+            if [ "${CLEAN_AFTER}" = true ]; then _REINVOKE_ARGS="${_REINVOKE_ARGS} --clean"; fi
+            if [ "${ARG_NO_STALE_WARNING}" = true ]; then _REINVOKE_ARGS="${_REINVOKE_ARGS} --no-stale-warning"; fi
+            if [ -n "${ARG_WORKLOAD}" ]; then _REINVOKE_ARGS="${_REINVOKE_ARGS} --workload ${ARG_WORKLOAD}"; fi
+            if [ -n "${IC_ARG}" ]; then _REINVOKE_ARGS="${_REINVOKE_ARGS} --ic ${IC_ARG}"; fi
+            if [ -n "${ADAPTER_ARG}" ]; then _REINVOKE_ARGS="${_REINVOKE_ARGS} --adapter ${ADAPTER_ARG}"; fi
+            if "${BASH_SOURCE[0]}" ${_REINVOKE_ARGS}; then
+                # Copy results to aggregation directory — find the child's results
+                # Try the marker file first (set by child), then fall back to ls -td
+                _LATEST_JOB_DIR=""
+                if [ -f "/tmp/.mlcc-benchmark-latest-${PROJECT_NAME}" ]; then
+                    _LATEST_JOB_DIR=$(cat "/tmp/.mlcc-benchmark-latest-${PROJECT_NAME}" 2>/dev/null)
+                fi
+                if [ -z "${_LATEST_JOB_DIR}" ] || [ ! -d "${_LATEST_JOB_DIR}" ]; then
+                    _LATEST_JOB_DIR=$(ls -td "${SCRIPT_DIR}/../benchmarks/${PROJECT_NAME}-benchmark-"* 2>/dev/null | head -1)
+                fi
+                if [ -n "${_LATEST_JOB_DIR}" ] && [ -d "${_LATEST_JOB_DIR}" ] && [ -f "${_LATEST_JOB_DIR}/output/profile_export.jsonl" ]; then
+                    cp "${_LATEST_JOB_DIR}/output/profile_export.jsonl" "${_ALL_RESULTS_DIR}/profile-concurrency-${_LEVEL}.jsonl"
+                elif [ -n "${_LATEST_JOB_DIR}" ] && [ -f "${_LATEST_JOB_DIR}/output/profile_export_aiperf.json" ]; then
+                    cp "${_LATEST_JOB_DIR}/output/profile_export_aiperf.json" "${_ALL_RESULTS_DIR}/results-concurrency-${_LEVEL}.json"
+                fi
+            else
+                echo "⚠️  Benchmark at concurrency ${_LEVEL} failed (non-fatal, continuing)"
+                _LEVEL_FAILURES=$((_LEVEL_FAILURES + 1))
+            fi
+            unset _BENCHMARK_SINGLE_LEVEL
+            echo ""
+        done
+        # Aggregate results into a combined JSON file for the benchmark writer
+        # Reads per-level JSONL files and computes aggregate metrics per concurrency level
+        echo "📊 Aggregating multi-level results..."
+        _COMBINED_FILE="${_ALL_RESULTS_DIR}/results.json"
+        python3 -c "
+import json, glob, sys, os, math
+def percentile(sorted_vals, pct):
+    if not sorted_vals:
+        return 0.0
+    idx = (pct / 100.0) * (len(sorted_vals) - 1)
+    lower = int(math.floor(idx))
+    upper = int(math.ceil(idx))
+    if lower == upper:
+        return sorted_vals[lower]
+    frac = idx - lower
+    return sorted_vals[lower] * (1 - frac) + sorted_vals[upper] * frac
+def get_val(metrics, key):
+    m = metrics.get(key)
+    if isinstance(m, dict):
+        return m.get('value')
+    return m
+results_dir = '${_ALL_RESULTS_DIR}'
+combined = {'metrics': []}
+# Process JSONL files (preferred)
+for f in sorted(glob.glob(os.path.join(results_dir, 'profile-concurrency-*.jsonl'))):
+    try:
+        level = int(os.path.basename(f).replace('profile-concurrency-', '').replace('.jsonl', ''))
+        records = []
+        with open(f) as fp:
+            for line in fp:
+                line = line.strip()
+                if line:
+                    records.append(json.loads(line))
+        if not records:
+            continue
+        # Aggregate per-request metrics
+        latencies, ttfts, itls, ttsts, out_tokens = [], [], [], [], []
+        start_times, end_times, in_tokens = [], [], []
+        prefill_tps, output_tps = [], []
+        for rec in records:
+            meta = rec.get('metadata', {})
+            metrics = rec.get('metrics', {})
+            lat = get_val(metrics, 'request_latency')
+            if lat is not None: latencies.append(lat)
+            ttft = get_val(metrics, 'time_to_first_token') or get_val(metrics, 'time_to_first_output_token')
+            if ttft is not None: ttfts.append(ttft)
+            itl = get_val(metrics, 'inter_token_latency')
+            if itl is not None: itls.append(itl)
+            ttst = get_val(metrics, 'time_to_second_token')
+            if ttst is not None: ttsts.append(ttst)
+            otc = get_val(metrics, 'output_token_count')
+            if otc is not None: out_tokens.append(otc)
+            isl = get_val(metrics, 'input_sequence_length')
+            if isl is not None: in_tokens.append(isl)
+            ptps = get_val(metrics, 'prefill_throughput_per_user')
+            if ptps is not None: prefill_tps.append(ptps)
+            otps = get_val(metrics, 'output_token_throughput_per_user')
+            if otps is not None: output_tps.append(otps)
+            rs = meta.get('request_start_ns')
+            re_ = meta.get('request_end_ns')
+            if rs: start_times.append(rs)
+            if re_: end_times.append(re_)
+        # Sort for percentiles
+        latencies.sort()
+        ttfts.sort()
+        itls.sort()
+        ttsts.sort()
+        prefill_tps.sort()
+        output_tps.sort()
+        # Compute throughput
+        duration_s = (max(end_times) - min(start_times)) / 1e9 if start_times and end_times else 1.0
+        duration_s = max(duration_s, 0.001)
+        req_throughput = len(records) / duration_s
+        token_throughput = sum(out_tokens) / duration_s if out_tokens else 0.0
+        entry = {
+            'concurrency': level,
+            'request_throughput': req_throughput,
+            'output_token_throughput': token_throughput,
+            'total_requests': len(records),
+            'duration_seconds': duration_s,
+            'time_to_first_token': {
+                'avg': sum(ttfts)/len(ttfts) if ttfts else 0.0,
+                'p50': percentile(ttfts, 50),
+                'p90': percentile(ttfts, 90),
+                'p99': percentile(ttfts, 99),
+            },
+            'inter_token_latency': {
+                'avg': sum(itls)/len(itls) if itls else 0.0,
+                'p50': percentile(itls, 50),
+                'p90': percentile(itls, 90),
+                'p99': percentile(itls, 99),
+            },
+            'e2e_latency': {
+                'avg': sum(latencies)/len(latencies) if latencies else 0.0,
+                'p50': percentile(latencies, 50),
+                'p90': percentile(latencies, 90),
+                'p99': percentile(latencies, 99),
+            },
+            'time_to_second_token': {
+                'p50': percentile(ttsts, 50),
+                'p90': percentile(ttsts, 90),
+            },
+            'prefill_throughput': {
+                'avg': sum(prefill_tps)/len(prefill_tps) if prefill_tps else 0.0,
+                'p50': percentile(prefill_tps, 50),
+            },
+            'output_token_throughput_detail': {
+                'avg': sum(output_tps)/len(output_tps) if output_tps else 0.0,
+                'p50': percentile(output_tps, 50),
+                'p90': percentile(output_tps, 90),
+            },
+            'total_token_throughput': (sum(out_tokens) + sum(in_tokens)) / duration_s if (out_tokens or in_tokens) else 0.0,
+            'output_sequence_length': sum(out_tokens)/len(out_tokens) if out_tokens else 0.0,
+            'input_sequence_length': sum(in_tokens)/len(in_tokens) if in_tokens else 0.0,
+            'request_count': len(records),
+            'input_tokens_mean': ${BENCHMARK_INPUT_TOKENS_MEAN:-0},
+            'output_tokens_mean': ${BENCHMARK_OUTPUT_TOKENS_MEAN:-0},
+        }
+        combined['metrics'].append(entry)
+    except Exception as e:
+        print(f'Warning: Could not parse {f}: {e}', file=sys.stderr)
+# Fallback: process old-style JSON files if no JSONL found
+if not combined['metrics']:
+    for f in sorted(glob.glob(os.path.join(results_dir, 'results-concurrency-*.json'))):
+        try:
+            with open(f) as fp:
+                data = json.load(fp)
+            level = int(os.path.basename(f).replace('results-concurrency-', '').replace('.json', ''))
+            if isinstance(data, dict):
+                data['concurrency'] = level
+                combined['metrics'].append(data)
+        except Exception as e:
+            print(f'Warning: Could not parse {f}: {e}', file=sys.stderr)
+with open('${_COMBINED_FILE}', 'w') as fp:
+    try:
+        json.dump(combined, fp, indent=2)
+    except TypeError as te:
+        print(f'Warning: JSON serialize error: {str(te)}', file=sys.stderr)
+        fp.write(json.dumps({'metrics': []}, indent=2))
+n_metrics = len(combined.get('metrics', []))
+print(f'Combined {n_metrics} concurrency level results')
+" 2>&1
+        # Persist the combined results via .benchmark_writer.py when CI_BENCHMARK_RESULTS_BUCKET is set
+        if [ -n "${CI_BENCHMARK_RESULTS_BUCKET:-}" ] && [ -f "${_COMBINED_FILE}" ]; then
+            echo ""
+            echo "📊 Persisting multi-level benchmark results to Athena..."
+            _compute_config_id() {
+                local input="${DEPLOYMENT_CONFIG}:${MODEL_NAME:-none}:${INSTANCE_TYPE}:${AWS_REGION}:${DEPLOYMENT_TARGET}:ic${IC_COUNT:-1}:adapt${ADAPTER_COUNT:-0}"
+                if command -v sha256sum &> /dev/null; then
+                    echo -n "$input" | sha256sum | cut -c1-16
+                else
+                    echo -n "$input" | shasum -a 256 | cut -c1-16
+                fi
+            }
+            CONFIG_ID=$(_compute_config_id)
+            if python3 "$(dirname "${BASH_SOURCE[0]}")/.benchmark_writer.py" write \
+                --results-file "${_COMBINED_FILE}" \
+                --config-file "$(dirname "${BASH_SOURCE[0]}")/config" \
+                --project-name "${PROJECT_NAME}" \
+                --workload "${BENCHMARK_WORKLOAD:-manual}" \
+                --bucket "${CI_BENCHMARK_RESULTS_BUCKET}" \
+                --region "${AWS_REGION:-${REGION}}"; then
+                echo "✅ Multi-level benchmark results persisted to S3"
+            else
+                echo "⚠️  Failed to persist multi-level benchmark results to Athena (non-fatal)"
+            fi
+        fi
+        echo ""
+        echo "📋 Multi-level Summary:"
+        echo "   Levels tested: ${_NORMALIZED_LEVELS}"
+        echo "   Failures: ${_LEVEL_FAILURES} / ${#_LEVELS[@]}"
+        echo "   Results: ${_ALL_RESULTS_DIR}/"
+        if [ ${_LEVEL_FAILURES} -ge ${#_LEVELS[@]} ]; then
+            echo "❌ All concurrency levels failed"
+            exit 1
+        fi
+        exit 0
+    fi
+fi
+# ── _check_schema_registry_staleness() ────────────────────────────────────────
+# Warn if the schema registry manifest's lastSynced timestamp is older than threshold.
+# Configurable via MCC_CATALOG_STALENESS_DAYS (default: 90).
+# Suppressed by --no-stale-warning flag or MCC_NO_STALE_WARNING=true env var.
+#
+# Globals:  MCC_NO_STALE_WARNING, ARG_NO_STALE_WARNING, MCC_CATALOG_STALENESS_DAYS,
+#           HOME (all read-only)
+# Outputs:  a single warning line on stdout when the manifest is older than the
+#           threshold; nothing otherwise
+# Returns:  always 0 — the check is best-effort; a missing manifest, a missing
+#           python3, or an unparsable timestamp or threshold produce no warning
+_check_schema_registry_staleness() {
+    if [ "${MCC_NO_STALE_WARNING:-}" = "true" ] || [ "${ARG_NO_STALE_WARNING:-false}" = true ]; then
+        return 0
+    fi
+    local threshold="${MCC_CATALOG_STALENESS_DAYS:-90}"
+    local manifest_file="${HOME}/.ml-container-creator/schemas/manifest.json"
+    if [ ! -f "${manifest_file}" ]; then
+        return 0
+    fi
+    # Holds the manifest age in days, and only when that age exceeds the threshold.
+    local days_old
+    # The manifest path and threshold are passed as arguments instead of being
+    # interpolated into the Python source. "|| days_old=" keeps the check
+    # non-fatal if python3 itself cannot be started.
+    days_old=$(python3 -c '
+import json, sys
+from datetime import datetime, timezone
+try:
+    with open(sys.argv[1]) as f:
+        manifest = json.load(f)
+    ls = manifest.get("lastSynced", "")
+    if not ls:
+        sys.exit(0)
+    synced = datetime.fromisoformat(ls.replace("Z", "+00:00"))
+    if synced.tzinfo is None:
+        # No UTC offset in the timestamp: assume UTC so the subtraction is valid
+        synced = synced.replace(tzinfo=timezone.utc)
+    days = (datetime.now(timezone.utc) - synced).days
+    if days > int(sys.argv[2]):
+        print(days)
+except Exception:
+    pass
+' "${manifest_file}" "${threshold}" 2>/dev/null) || days_old=""
+    if [ -n "${days_old}" ]; then
+        echo "⚠️  Schema registry is ${days_old} days old. Run 'ml-container-creator bootstrap sync-schemas' to update."
+    fi
+    return 0
+}
+_check_schema_registry_staleness
 # ── Verify AWS CLI v2 ─────────────────────────────────────────────────────────
 if ! aws --version 2>&1 | grep -q "aws-cli/2"; then
     echo "❌ AWS CLI v2 is required for benchmarking."
@@ -185,7 +616,7 @@ if [ "${FORCE}" = false ] && [ -n "${BENCHMARK_JOB_NAME:-}" ]; then
 fi
 # ── Configuration ─────────────────────────────────────────────────────────────
-WORKLOAD_CONFIG_NAME="${PROJECT_NAME}-benchmark-config"
+WORKLOAD_CONFIG_NAME="${PROJECT_NAME}-benchmark-config-$(date +%Y%m%d-%H%M%S)"
 if [ "${RESUME_EXISTING}" = false ]; then
     BENCHMARK_JOB_NAME="${PROJECT_NAME}-benchmark-$(date +%Y%m%d-%H%M%S)"
 fi
@@ -357,6 +788,7 @@ if [ -n "${EXISTING_CONFIG_SPEC}" ]; then
     if [ "${EXISTING_NORMALIZED}" = "${DESIRED_NORMALIZED}" ]; then
         echo "   ✅ Existing workload config matches current parameters — reusing"
+        CREATE_WORKLOAD_CONFIG=false
     else
         echo "   ⚠️  Workload config parameters changed — recreating..."
         aws sagemaker delete-ai-workload-config \
@@ -484,10 +916,11 @@ if [ "${JOB_STATUS}" = "Completed" ]; then
     # Persist results locally to benchmarks/<job-name>/
     PROJECT_ROOT="${SCRIPT_DIR}/.."
     LOCAL_RESULTS_DIR="${PROJECT_ROOT}/benchmarks/${BENCHMARK_JOB_NAME}"
-    RESULTS_FILE="${LOCAL_RESULTS_DIR}/results.json"
+    RESULTS_JSONL="${LOCAL_RESULTS_DIR}/output/profile_export.jsonl"
+    RESULTS_FILE="${LOCAL_RESULTS_DIR}/output/profile_export_aiperf.json"
     # Check if results already exist locally (idempotency: skip S3 download)
-    if [ -f "${RESULTS_FILE}" ]; then
+    if [ -f "${RESULTS_JSONL}" ] || [ -f "${RESULTS_FILE}" ]; then
         echo "📥 Step 4: Results already available locally"
         RESULTS_DOWNLOADED=true
     else
@@ -513,12 +946,27 @@ if [ "${JOB_STATUS}" = "Completed" ]; then
         # This is the most reliable approach — handles any subdirectory structure
         echo "   Syncing results from S3..."
         if aws s3 sync "${RESULTS_S3_PATH}" "${LOCAL_RESULTS_DIR}/" --region "${AWS_REGION}" 2>/dev/null; then
-            # Look for any JSON file in the synced directory tree
-            FOUND_FILE=$(find "${LOCAL_RESULTS_DIR}" -name "*.json" -type f 2>/dev/null | head -1)
-            if [ -n "${FOUND_FILE}" ]; then
-                # If the found file isn't already at our canonical path, copy it there
-                if [ "${FOUND_FILE}" != "${RESULTS_FILE}" ]; then
-                    cp "${FOUND_FILE}" "${RESULTS_FILE}"
+            # Extract any tar.gz archives (benchmark service packages results as output.tar.gz)
+            for ARCHIVE in $(find "${LOCAL_RESULTS_DIR}" -name "*.tar.gz" -type f 2>/dev/null); do
+                ARCHIVE_DIR=$(dirname "${ARCHIVE}")
+                tar -xzf "${ARCHIVE}" -C "${ARCHIVE_DIR}" 2>/dev/null || true
+            done
+            # Look for specific result files (priority: JSONL > aiperf JSON)
+            _FOUND_JSONL=$(find "${LOCAL_RESULTS_DIR}" -name "profile_export.jsonl" -type f 2>/dev/null | head -1)
+            _FOUND_JSON=$(find "${LOCAL_RESULTS_DIR}" -name "profile_export_aiperf.json" -type f 2>/dev/null | head -1)
+            if [ -n "${_FOUND_JSONL}" ]; then
+                if [ "${_FOUND_JSONL}" != "${RESULTS_JSONL}" ]; then
+                    mkdir -p "$(dirname "${RESULTS_JSONL}")"
+                    cp "${_FOUND_JSONL}" "${RESULTS_JSONL}"
+                fi
+                RESULTS_DOWNLOADED=true
+            fi
+            if [ -n "${_FOUND_JSON}" ]; then
+                if [ "${_FOUND_JSON}" != "${RESULTS_FILE}" ]; then
+                    mkdir -p "$(dirname "${RESULTS_FILE}")"
+                    cp "${_FOUND_JSON}" "${RESULTS_FILE}"
                 fi
                 RESULTS_DOWNLOADED=true
             fi
@@ -531,33 +979,25 @@ if [ "${JOB_STATUS}" = "Completed" ]; then
             RESULTS_BUCKET=$(echo "${RESULTS_S3_PATH}" | sed 's|s3://||' | cut -d'/' -f1)
             RESULTS_PREFIX=$(echo "${RESULTS_S3_PATH}" | sed "s|s3://${RESULTS_BUCKET}/||")
-            # List all objects under the output path and find data files
-            # aws s3api list-objects-v2 is more reliable than aws s3 ls --recursive
-            FOUND_KEY=$(aws s3api list-objects-v2 \
+            # List all objects and look for our target files
+            _ALL_KEYS=$(aws s3api list-objects-v2 \
                 --bucket "${RESULTS_BUCKET}" \
                 --prefix "${RESULTS_PREFIX}" \
                 --region "${AWS_REGION}" \
                 --query 'Contents[].Key' \
-                --output text 2>/dev/null \
-                | tr '\t' '\n' \
-                | grep -E '\.(json|jsonl|csv)$' \
-                | head -1)
-            if [ -n "${FOUND_KEY}" ] && [ "${FOUND_KEY}" != "None" ]; then
-                if aws s3 cp "s3://${RESULTS_BUCKET}/${FOUND_KEY}" "${RESULTS_FILE}" --region "${AWS_REGION}" 2>/dev/null; then
-                    RESULTS_DOWNLOADED=true
-                fi
-            fi
-        fi
+                --output text 2>/dev/null | tr '\t' '\n')
-        # Strategy 3: If still nothing, try direct path patterns the service might use
-        if [ "${RESULTS_DOWNLOADED}" = false ]; then
-            for PATTERN in "results.json" "benchmark_results.json" "output.json"; do
-                if aws s3 cp "${RESULTS_S3_PATH}${PATTERN}" "${RESULTS_FILE}" --region "${AWS_REGION}" 2>/dev/null; then
-                    RESULTS_DOWNLOADED=true
-                    break
-                fi
-            done
+            _JSONL_KEY=$(echo "${_ALL_KEYS}" | grep "profile_export\.jsonl$" | head -1)
+            _JSON_KEY=$(echo "${_ALL_KEYS}" | grep "profile_export_aiperf\.json$" | head -1)
+            if [ -n "${_JSONL_KEY}" ] && [ "${_JSONL_KEY}" != "None" ]; then
+                mkdir -p "$(dirname "${RESULTS_JSONL}")"
+                aws s3 cp "s3://${RESULTS_BUCKET}/${_JSONL_KEY}" "${RESULTS_JSONL}" --region "${AWS_REGION}" 2>/dev/null && RESULTS_DOWNLOADED=true
+            fi
+            if [ -n "${_JSON_KEY}" ] && [ "${_JSON_KEY}" != "None" ]; then
+                mkdir -p "$(dirname "${RESULTS_FILE}")"
+                aws s3 cp "s3://${RESULTS_BUCKET}/${_JSON_KEY}" "${RESULTS_FILE}" --region "${AWS_REGION}" 2>/dev/null && RESULTS_DOWNLOADED=true
+            fi
         fi
     fi
@@ -573,72 +1013,156 @@ if [ "${JOB_STATUS}" = "Completed" ]; then
         echo "║  Endpoint: ${ENDPOINT_NAME}"
         echo "╠══════════════════════════════════════════════════════════════════╣"
-        # Parse and display metrics using built-in tools
-        # Extract key metrics from the results JSON
+        # Parse and display metrics from profile_export.jsonl (rich per-request data)
         if command -v python3 &>/dev/null; then
             python3 -c "
-import json, sys
+import json, sys, os, math
+def percentile(sorted_vals, pct):
+    # Percentile by linear interpolation between the two nearest ranks.
+    # sorted_vals must already be in ascending order (callers sort first);
+    # an empty list yields None. pct is given on a 0-100 scale.
+    if not sorted_vals:
+        return None
+    last = len(sorted_vals) - 1
+    rank = (pct / 100.0) * last
+    lo = int(math.floor(rank))
+    hi = int(math.ceil(rank))
+    if lo == hi:
+        # rank landed exactly on an element, nothing to interpolate
+        return sorted_vals[lo]
+    weight = rank - lo
+    return sorted_vals[lo] * (1 - weight) + sorted_vals[hi] * weight
+def fmt(val, suffix=''):
+    # Render a metric with two decimals followed by suffix;
+    # a missing metric (None) is shown as N/A without the suffix.
+    return 'N/A' if val is None else '{:.2f}{}'.format(val, suffix)
 try:
-    with open('${RESULTS_FILE}') as f:
-        data = json.load(f)
-    metrics = data if isinstance(data, dict) else {}
-    # Helper to safely get nested values
-    def get_metric(d, *keys):
-        for k in keys:
-            if isinstance(d, dict):
-                d = d.get(k, 'N/A')
-            else:
-                return 'N/A'
-        return d
-    # Display throughput
-    throughput = get_metric(metrics, 'request_throughput')
-    output_throughput = get_metric(metrics, 'output_token_throughput')
-    print(f'║  Request Throughput:      {throughput} req/s')
-    print(f'║  Output Token Throughput: {output_throughput} tokens/s')
-    print('║')
-    # Display request latency
-    lat_p50 = get_metric(metrics, 'request_latency', 'p50')
-    lat_p90 = get_metric(metrics, 'request_latency', 'p90')
-    lat_p99 = get_metric(metrics, 'request_latency', 'p99')
-    print(f'║  Request Latency (ms):')
-    print(f'║    P50: {lat_p50}  P90: {lat_p90}  P99: {lat_p99}')
-    print('║')
-    # Display TTFT (time to first token)
-    ttft_p50 = get_metric(metrics, 'time_to_first_token', 'p50')
-    ttft_p90 = get_metric(metrics, 'time_to_first_token', 'p90')
-    ttft_p99 = get_metric(metrics, 'time_to_first_token', 'p99')
-    print(f'║  Time to First Token (ms):')
-    print(f'║    P50: {ttft_p50}  P90: {ttft_p90}  P99: {ttft_p99}')
-    print('║')
-    # Display ITL (inter-token latency)
-    itl_p50 = get_metric(metrics, 'inter_token_latency', 'p50')
-    itl_p90 = get_metric(metrics, 'inter_token_latency', 'p90')
-    itl_p99 = get_metric(metrics, 'inter_token_latency', 'p99')
-    print(f'║  Inter-Token Latency (ms):')
-    print(f'║    P50: {itl_p50}  P90: {itl_p90}  P99: {itl_p99}')
+    jsonl_path = '${RESULTS_JSONL}'
+    json_path = '${RESULTS_FILE}'
+    records = []
+    # Primary: read profile_export.jsonl (rich per-request data)
+    if os.path.exists(jsonl_path):
+        with open(jsonl_path) as f:
+            for line in f:
+                line = line.strip()
+                if line:
+                    try:
+                        records.append(json.loads(line))
+                    except json.JSONDecodeError:
+                        continue
+    if records:
+        # Extract scalar values from metric dicts like {'value': X, 'unit': '...'}.
+        # NOTE: this program is embedded in a double-quoted shell string, so
+        # double quotes must not appear anywhere in it, not even in comments.
+        def get_val(metrics, key):
+            # Returns the numeric value stored under key, or None when the
+            # metric is absent or not a number.
+            m = metrics.get(key)
+            if isinstance(m, dict):
+                m = m.get('value')
+            # Keep only real numbers: a single malformed record (string, list,
+            # bool, ...) must not break sum()/sort() for the whole report.
+            if isinstance(m, bool) or not isinstance(m, (int, float)):
+                return None
+            return m
+        # Collect per-request metrics
+        latencies = []
+        ttfts = []
+        itls = []
+        ttsts = []
+        output_tokens = []
+        start_times = []
+        end_times = []
+        for rec in records:
+            meta = rec.get('metadata', {})
+            metrics = rec.get('metrics', {})
+            lat = get_val(metrics, 'request_latency')
+            if lat is not None:
+                latencies.append(lat)
+            ttft = get_val(metrics, 'time_to_first_token')
+            if ttft is None:
+                ttft = get_val(metrics, 'time_to_first_output_token')
+            if ttft is not None:
+                ttfts.append(ttft)
+            itl = get_val(metrics, 'inter_token_latency')
+            if itl is not None:
+                itls.append(itl)
+            ttst = get_val(metrics, 'time_to_second_token')
+            if ttst is not None:
+                ttsts.append(ttst)
+            otc = get_val(metrics, 'output_token_count')
+            if otc is not None:
+                output_tokens.append(otc)
+            # Track timing for throughput calculation
+            rs = meta.get('request_start_ns')
+            re_ = meta.get('request_end_ns')
+            if rs is not None:
+                start_times.append(rs)
+            if re_ is not None:
+                end_times.append(re_)
+        n = len(records)
+        # Compute system throughput
+        if start_times and end_times:
+            duration_ns = max(end_times) - min(start_times)
+            duration_s = duration_ns / 1e9 if duration_ns > 0 else 1.0
+            req_throughput = n / duration_s
+            total_out_tokens = sum(output_tokens) if output_tokens else 0
+            token_throughput = total_out_tokens / duration_s
+        else:
+            req_throughput = None
+            token_throughput = None
+        # Compute percentiles
+        latencies.sort()
+        ttfts.sort()
+        itls.sort()
+        ttsts.sort()
+        print(f'║  Requests:                {n}')
+        print(f'║  Request Throughput:      {fmt(req_throughput)} req/s')
+        print(f'║  Output Token Throughput: {fmt(token_throughput)} tokens/s')
+        print('║')
+        print('║  Time to First Token (ms):')
+        print(f'║    Avg: {fmt(sum(ttfts)/len(ttfts) if ttfts else None)}  P50: {fmt(percentile(ttfts, 50))}  P90: {fmt(percentile(ttfts, 90))}  P99: {fmt(percentile(ttfts, 99))}')
+        print('║')
+        print('║  Inter-Token Latency (ms):')
+        print(f'║    Avg: {fmt(sum(itls)/len(itls) if itls else None)}  P50: {fmt(percentile(itls, 50))}  P90: {fmt(percentile(itls, 90))}  P99: {fmt(percentile(itls, 99))}')
+        print('║')
+        print('║  Request Latency (ms):')
+        print(f'║    Avg: {fmt(sum(latencies)/len(latencies) if latencies else None)}  P50: {fmt(percentile(latencies, 50))}  P90: {fmt(percentile(latencies, 90))}  P99: {fmt(percentile(latencies, 99))}')
+        print('║')
+        print('║  Time to Second Token (ms):')
+        print(f'║    Avg: {fmt(sum(ttsts)/len(ttsts) if ttsts else None)}  P50: {fmt(percentile(ttsts, 50))}  P90: {fmt(percentile(ttsts, 90))}  P99: {fmt(percentile(ttsts, 99))}')
+    else:
+        print('║  ⚠️  No JSONL results found — cannot display metrics')
+        print(f'║  Expected: {jsonl_path}')
 except Exception as e:
     print(f'║  ⚠️  Could not parse results: {e}')
-    print(f'║  Raw file: ${RESULTS_FILE}')
+    import traceback
+    traceback.print_exc(file=sys.stderr)
 "
         else
             # Fallback: display raw JSON if python3 is not available
             echo "║  (python3 not available — showing raw results)"
             echo "║"
-            cat "${RESULTS_FILE}" | head -50
+            if [ -f "${RESULTS_JSONL}" ]; then
+                head -3 "${RESULTS_JSONL}"
+            elif [ -f "${RESULTS_FILE}" ]; then
+                cat "${RESULTS_FILE}" | head -50
+            fi
         fi
         echo "╚══════════════════════════════════════════════════════════════════╝"
         echo ""
         echo "📁 Results saved to: benchmarks/${BENCHMARK_JOB_NAME}/"
         echo "☁️  S3 results: ${RESULTS_S3_PATH:-${BENCHMARK_S3_OUTPUT_PATH}}"
+        # Write marker for multi-level parent to find this results dir
+        echo "${LOCAL_RESULTS_DIR}" > "/tmp/.mlcc-benchmark-latest-${PROJECT_NAME}" 2>/dev/null || true
     else
         echo "⚠️  Could not download results from S3"
         echo "   The benchmark completed but results could not be located."
@@ -661,6 +1185,36 @@ except Exception as e:
             --output table 2>/dev/null || echo "   (could not list objects)"
     fi
+    # ── Persist benchmark results to Athena ──────────────────────────────────
+    # With CI_BENCHMARK_RESULTS_BUCKET set (it comes from the bootstrap config),
+    # hand the downloaded results to the benchmark writer, which persists them
+    # as Parquet to S3 for Athena querying.
+    # A child run of multi-level mode skips this step: the parent orchestrator
+    # persists the combined results (one row per concurrency level, no duplicates).
+    if [[ -n "${CI_BENCHMARK_RESULTS_BUCKET:-}" && "${RESULTS_DOWNLOADED}" == true && -z "${_BENCHMARK_SINGLE_LEVEL:-}" ]]; then
+        echo ""
+        echo "📊 Persisting benchmark results to Athena..."
+        # The writer receives the JSONL export when it exists, else the JSON file
+        if [[ -f "${RESULTS_JSONL}" ]]; then
+            _WRITER_INPUT="${RESULTS_JSONL}"
+        else
+            _WRITER_INPUT="${RESULTS_FILE}"
+        fi
+        # Writer script and config are resolved relative to this script's directory
+        _WRITER_DIR="$(dirname "${BASH_SOURCE[0]}")"
+        _WRITER_ARGS=(
+            write
+            --results-file "${_WRITER_INPUT}"
+            --config-file "${_WRITER_DIR}/config"
+            --project-name "${PROJECT_NAME}"
+            --workload "${BENCHMARK_WORKLOAD:-manual}"
+            --concurrency "${BENCHMARK_CONCURRENCY}"
+            --bucket "${CI_BENCHMARK_RESULTS_BUCKET}"
+            --region "${AWS_REGION:-${REGION}}"
+        )
+        # Best-effort: a writer failure is reported but never fails this script
+        if ! python3 "${_WRITER_DIR}/.benchmark_writer.py" "${_WRITER_ARGS[@]}"; then
+            echo "⚠️  Failed to persist benchmark results to Athena (non-fatal)"
+            echo "   Results remain available locally in: benchmarks/${BENCHMARK_JOB_NAME}/"
+        else
+            echo "✅ Benchmark results persisted to S3"
+        fi
+    fi
 elif [ "${JOB_STATUS}" = "Failed" ]; then
     # Display failure reason
     echo "❌ Step 4: Benchmark job failed"